diff --git a/src/context.c b/src/context.c index 45aa3ea..1d1b7b1 100644 --- a/src/context.c +++ b/src/context.c @@ -537,7 +537,9 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, */ rc = request_pool_init(&driver_data->output_pool, driver_data->video_fd, driver_data->media_fd, - output_type, 16); + output_type, 16, pixelformat, + (unsigned int)picture_width, + (unsigned int)picture_height); if (rc < 0) { status = VA_STATUS_ERROR_ALLOCATION_FAILED; goto error; diff --git a/src/picture.c b/src/picture.c index 9e703a8..9c0d26f 100644 --- a/src/picture.c +++ b/src/picture.c @@ -37,6 +37,7 @@ #include "vp8.h" #include "vp9.h" #include "av1.h" +#include "request_pool.h" #include #include @@ -55,6 +56,159 @@ #include "autoconfig.h" +/* + * iter#15 — issue #15: ensure the in-flight surface's OUTPUT mmap has + * room for `delta` more bytes appended to slices_size; if not, grow the + * pool transparently via request_pool_resize. + * + * Sequence on overflow: + * 1. Snapshot the surface's accumulated bytes to a temp heap buffer. + * 2. Release the surface's OUTPUT pool slot back to FREE (resize + * requires no slot be borrowed). + * 3. Compute new sizeimage = roundup(needed * 2, 4 KiB), and at least + * double the current source_size so geometric growth amortises + * repeated overruns at the same resolution. + * 4. Call request_pool_resize. + * 5. Re-acquire a pool slot (the new pool has fresh indices and fds). + * 6. Re-mirror surface_object->source_{index,data,size,request_fd} + * from the new slot. + * 7. Restore the saved bytes via memcpy into the new mmap. + * + * Returns VA_STATUS_SUCCESS on clean resize (or no resize needed) and + * VA_STATUS_ERROR_ALLOCATION_FAILED on heap-alloc / V4L2 / kernel + * failure — the libva client falls back to surface re-creation as + * before the resize hook landed. 
+ * + * NOTE on inline-Sync invariant: RequestEndPicture calls + * RequestSyncSurface inline, so when codec_store_buffer runs no other + * pool slot is borrowed across libva-driver-API entry points. The + * temporary release-then-reacquire of the in-flight slot here keeps + * that invariant intact across the resize. + */ +static VAStatus +codec_store_buffer_ensure_capacity(struct request_data *driver_data, + struct object_surface *surface_object, + size_t need) +{ + struct request_pool_slot *slot; + uint8_t *save_buf; + size_t save_size; + unsigned int saved_index; + size_t want_sizeimage; + unsigned int new_sizeimage; + int new_index; + int rc; + + if (need <= surface_object->source_size) + return VA_STATUS_SUCCESS; + + save_size = surface_object->slices_size; + save_buf = NULL; + if (save_size > 0) { + save_buf = malloc(save_size); + if (save_buf == NULL) { + request_log("codec_store_buffer_ensure_capacity: malloc(%zu) for resize-save failed\n", + save_size); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + memcpy(save_buf, surface_object->source_data, save_size); + } + + /* + * Temporarily release the in-flight slot. The slot's V4L2 buffer + * has NOT been QBUF'd yet (QBUF lives in RequestEndPicture, after + * this codec_store_buffer call), so the release is a clean + * busy=false flip; no kernel state is in question. The slot's + * stale request_fd does not need to be saved — the resize closes + * every slot's fd and the post-resize acquire below re-mirrors a + * fresh slot's request_fd into surface_object->request_fd. + */ + saved_index = surface_object->source_index; + request_pool_release(&driver_data->output_pool, saved_index); + + /* + * Geometric growth: at least 2× the current source_size, but no + * less than 2× the required total — so a single resize covers the + * triggering append plus comfortable headroom for the rest of + * this frame. Round up to a 4 KiB page boundary so the kernel's + * own alignment doesn't waste pages. 
Compute in size_t so the + * 2× doubling can't silently wrap at 2 GiB on 32-bit unsigned int + * (sizeimage stays bounded by V4L2's u32, but the doubling target + * could otherwise overflow before the clamp). + */ + want_sizeimage = need * 2; + if (want_sizeimage < (size_t)surface_object->source_size * 2) + want_sizeimage = (size_t)surface_object->source_size * 2; + if (want_sizeimage > 0x40000000u) /* 1 GiB hard cap — V4L2 sizeimage is u32 */ + want_sizeimage = 0x40000000u; + want_sizeimage = (want_sizeimage + 0xFFFu) & ~(size_t)0xFFFu; + new_sizeimage = (unsigned int)want_sizeimage; + + request_log("codec_store_buffer: OUTPUT-pool resize (need %zu > cap %u → new_sizeimage %u)\n", + need, surface_object->source_size, new_sizeimage); + + rc = request_pool_resize(&driver_data->output_pool, new_sizeimage); + if (rc < 0) { + /* + * Resize failed. The original slot was already released + * above, so surface_object->source_data is now pointing + * at a FREE-but-still-borrowable mmap. Restore the + * surface's slot mirror so EndPicture / DestroyContext + * unwind paths see a consistent (if partial) state. + * + * If the resize aborted early (pre-STREAMOFF), the slot + * is intact: re-acquiring the same index is the inverse + * of the temporary release above. If it aborted later + * (post-teardown), the slot's data/size were zeroed in + * place by request_pool_resize and the re-acquire flips + * busy=true on a dead slot — still safe, because the + * caller will return ERROR_ALLOCATION_FAILED and the + * libva consumer destroys the surface/context. 
+ */ + (void)request_pool_acquire(&driver_data->output_pool); + free(save_buf); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + new_index = request_pool_acquire(&driver_data->output_pool); + if (new_index < 0) { + free(save_buf); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + slot = request_pool_slot(&driver_data->output_pool, + (unsigned int)new_index); + if (slot == NULL) { + request_pool_release(&driver_data->output_pool, + (unsigned int)new_index); + free(save_buf); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + surface_object->source_index = slot->index; + surface_object->source_data = slot->data; + surface_object->source_size = slot->size; + surface_object->request_fd = slot->request_fd; + + if (need > surface_object->source_size) { + /* + * Kernel rounded the new sizeimage down below what we + * needed — drivers may clamp at their per-codec ceiling. + * Don't corrupt memory; surface the error to libva. + */ + request_log("codec_store_buffer_ensure_capacity: kernel returned sizeimage %u < required %zu\n", + surface_object->source_size, need); + free(save_buf); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + if (save_buf != NULL) { + memcpy(surface_object->source_data, save_buf, save_size); + free(save_buf); + } + + return VA_STATUS_SUCCESS; +} + static VAStatus codec_store_buffer(struct request_data *driver_data, struct object_context *context, VAProfile profile, @@ -69,30 +223,29 @@ static VAStatus codec_store_buffer(struct request_data *driver_data, * RenderPicture), we can't use a V4L2 buffer directly * and have to copy from a regular buffer. * - * Bounds check (issue #13): surface_object->source_data points - * at an OUTPUT-pool mmap of fixed size source_size, negotiated + * Capacity guard (issue #13 + #15): surface_object->source_data + * points at an OUTPUT-pool mmap of size source_size, negotiated * at S_FMT time. 
A stream-level resolution upshift can produce - * a slice larger than this allocation; without the guard, the - * memcpy walks past the mmap and SIGSEGVs (mpv --hwdec=vaapi- - * copy) or corrupts adjacent heap (Firefox RDD). Each append - * site below checks the running total against source_size and - * fails the RenderPicture call instead of corrupting memory; - * libavcodec re-creates the surface at the new resolution on - * the next BeginPicture. + * a slice larger than this allocation. Each append site below + * computes the post-append running total and calls + * codec_store_buffer_ensure_capacity, which transparently grows + * the OUTPUT pool (request_pool_resize) so the existing memcpy + * has room. The hard error path (VA_STATUS_ERROR_ALLOCATION_FAILED) + * only fires if both the heap save buffer AND the kernel-side + * grow fail — at which point libavcodec recreates the surface. */ - size_t cap = surface_object->source_size; size_t need; + VAStatus ensure_rc; if (context->h264_start_code) { static const char start_code[3] = { 0x00, 0x00, 0x01 }; need = (size_t)surface_object->slices_size + sizeof(start_code); - if (need > cap) { - request_log("codec_store_buffer: H.264 start code would overflow OUTPUT buffer (%zu > %zu) — resolution upshift mid-stream?\n", - need, cap); - return VA_STATUS_ERROR_ALLOCATION_FAILED; - } + ensure_rc = codec_store_buffer_ensure_capacity( + driver_data, surface_object, need); + if (ensure_rc != VA_STATUS_SUCCESS) + return ensure_rc; memcpy(surface_object->source_data + surface_object->slices_size, start_code, sizeof(start_code)); @@ -127,11 +280,10 @@ static VAStatus codec_store_buffer(struct request_data *driver_data, surface_object->params.vp8.picture.pic_fields.bits.key_frame == 0 ? 
10 : 3; need = (size_t)surface_object->slices_size + header_size; - if (need > cap) { - request_log("codec_store_buffer: VP8 header pad would overflow OUTPUT buffer (%zu > %zu)\n", - need, cap); - return VA_STATUS_ERROR_ALLOCATION_FAILED; - } + ensure_rc = codec_store_buffer_ensure_capacity( + driver_data, surface_object, need); + if (ensure_rc != VA_STATUS_SUCCESS) + return ensure_rc; memset(surface_object->source_data + surface_object->slices_size, 0, header_size); @@ -141,11 +293,10 @@ static VAStatus codec_store_buffer(struct request_data *driver_data, size_t payload = (size_t)buffer_object->size * buffer_object->count; need = (size_t)surface_object->slices_size + payload; - if (need > cap) { - request_log("codec_store_buffer: slice payload would overflow OUTPUT buffer (%zu > %zu) — resolution upshift mid-stream?\n", - need, cap); - return VA_STATUS_ERROR_ALLOCATION_FAILED; - } + ensure_rc = codec_store_buffer_ensure_capacity( + driver_data, surface_object, need); + if (ensure_rc != VA_STATUS_SUCCESS) + return ensure_rc; memcpy(surface_object->source_data + surface_object->slices_size, buffer_object->data, payload); diff --git a/src/request_pool.c b/src/request_pool.c index 3c5eb74..6a4b40f 100644 --- a/src/request_pool.c +++ b/src/request_pool.c @@ -21,7 +21,10 @@ #include "v4l2.h" int request_pool_init(struct request_pool *pool, int video_fd, int media_fd, - unsigned int output_type, unsigned int count) + unsigned int output_type, unsigned int count, + unsigned int pixelformat, + unsigned int picture_width, + unsigned int picture_height) { unsigned int index_base; unsigned int length; @@ -43,6 +46,16 @@ int request_pool_init(struct request_pool *pool, int video_fd, int media_fd, pool->next = 0; pool->media_fd = media_fd; /* iter7: kept for force_release re-alloc */ + /* + * iter#15: cache the S_FMT params so request_pool_resize can + * re-issue S_FMT with a sizeimage hint override on overrun. 
+ */ + pool->video_fd = video_fd; + pool->output_type = output_type; + pool->pixelformat = pixelformat; + pool->picture_width = picture_width; + pool->picture_height = picture_height; + for (i = 0; i < count; i++) pool->slots[i].request_fd = -1; @@ -94,6 +107,118 @@ error: return -1; } +int request_pool_resize(struct request_pool *pool, + unsigned int new_sizeimage_min) +{ + unsigned int index_base; + unsigned int length; + unsigned int offset; + unsigned int saved_count; + unsigned int i; + int rc; + + if (pool == NULL || !pool->initialized || pool->count == 0) + return -1; + + /* + * Pre-condition guard: no slot may be borrowed when we tear the + * pool down. The caller in codec_store_buffer temporarily releases + * the current in-flight surface's slot before invoking us; the + * inline-Sync-in-EndPicture pattern guarantees no other slot is + * borrowed elsewhere in the driver. Bail loudly if anyone breaks + * that invariant rather than corrupting in-flight V4L2 state. + */ + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].busy) { + request_log("request_pool_resize: slot %u still busy — " + "caller must release before resize\n", i); + return -1; + } + } + + saved_count = pool->count; + + /* STREAMOFF the OUTPUT queue so REQBUFS(0) is accepted. */ + rc = v4l2_set_stream(pool->video_fd, pool->output_type, false); + if (rc < 0) + return -1; + + /* + * Tear down every slot: munmap, close per-slot request_fd. Slot + * fields are zeroed in place so failure halfway is recoverable. + */ + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].data != NULL && pool->slots[i].size > 0) { + munmap(pool->slots[i].data, pool->slots[i].size); + pool->slots[i].data = NULL; + pool->slots[i].size = 0; + } + if (pool->slots[i].request_fd >= 0) { + close(pool->slots[i].request_fd); + pool->slots[i].request_fd = -1; + } + } + + /* + * Release the V4L2 OUTPUT buffer indices. 
REQBUFS(0) is the only + * way to ask the kernel to free buffers so CREATE_BUFS can re- + * allocate with a new per-buffer sizeimage. + */ + rc = v4l2_request_buffers(pool->video_fd, pool->output_type, 0); + if (rc < 0) + return -1; + + /* + * Re-issue S_FMT with the cached dimensions but a larger + * sizeimage. The kernel may round up further (driver-specific + * page / alignment rules); we accept whatever it returns and + * pick that up from per-slot v4l2_query_buffer below. + */ + rc = v4l2_set_format_sizeimage(pool->video_fd, pool->output_type, + pool->pixelformat, + pool->picture_width, + pool->picture_height, + new_sizeimage_min); + if (rc < 0) + return -1; + + rc = v4l2_create_buffers(pool->video_fd, pool->output_type, + saved_count, &index_base); + if (rc < 0) + return -1; + + for (i = 0; i < saved_count; i++) { + pool->slots[i].index = index_base + i; + pool->slots[i].busy = false; + + rc = v4l2_query_buffer(pool->video_fd, pool->output_type, + pool->slots[i].index, + &length, &offset, 1); + if (rc < 0) + return -1; + + pool->slots[i].data = mmap(NULL, length, + PROT_READ | PROT_WRITE, + MAP_SHARED, pool->video_fd, offset); + if (pool->slots[i].data == MAP_FAILED) { + pool->slots[i].data = NULL; + return -1; + } + pool->slots[i].size = length; + + pool->slots[i].request_fd = media_request_alloc(pool->media_fd); + if (pool->slots[i].request_fd < 0) + return -1; + } + + rc = v4l2_set_stream(pool->video_fd, pool->output_type, true); + if (rc < 0) + return -1; + + pool->next = 0; + return 0; +} + void request_pool_destroy(struct request_pool *pool) { unsigned int i; diff --git a/src/request_pool.h b/src/request_pool.h index d221378..00613bc 100644 --- a/src/request_pool.h +++ b/src/request_pool.h @@ -52,16 +52,71 @@ struct request_pool { int media_fd; /* iter7: kept for * force_release re-alloc */ bool initialized; + + /* + * iter#15: cached S_FMT params from request_pool_init, so + * request_pool_resize can re-S_FMT the OUTPUT queue with a new + * sizeimage 
override on a mid-session resolution upshift overrun + * without the caller having to re-thread these through six call + * sites. video_fd is also cached so the resize is fully + * self-contained — request_pool_resize takes only the pool and + * the new sizeimage hint. + */ + int video_fd; + unsigned int output_type; + unsigned int pixelformat; + unsigned int picture_width; + unsigned int picture_height; }; /* * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap * each, populate pool->slots[]. Caller must have already done - * VIDIOC_S_FMT on the OUTPUT queue. Returns 0 on success, -1 on - * failure. + * VIDIOC_S_FMT on the OUTPUT queue. The S_FMT params (pixelformat, + * picture_width, picture_height) are stashed on the pool so that + * request_pool_resize can re-issue S_FMT with the same dimensions but + * a larger sizeimage hint. Returns 0 on success, -1 on failure. */ int request_pool_init(struct request_pool *pool, int video_fd, int media_fd, - unsigned int output_type, unsigned int count); + unsigned int output_type, unsigned int count, + unsigned int pixelformat, + unsigned int picture_width, + unsigned int picture_height); + +/* + * iter#15: grow the OUTPUT pool's per-slot sizeimage in place. + * + * Issued from codec_store_buffer when an Annex-B start code / VP8 + * header pad / slice payload won't fit in the current + * surface->source_size — i.e. the stream's per-frame bitstream budget + * has outgrown the OUTPUT pool slot's mmap (typical cause: SPS-driven + * resolution upshift mid-session). + * + * Steps: + * 1. STREAMOFF the OUTPUT queue. + * 2. munmap every slot, close every per-slot media-request fd. + * 3. VIDIOC_REQBUFS(count=0) to release the V4L2 buffer indices. + * 4. S_FMT with the cached pixelformat / picture_width / + * picture_height but a sizeimage hint of new_sizeimage_min. + * 5. CREATE_BUFS with the original slot count. + * 6. Per-slot: query buffer length, mmap, alloc fresh request_fd. + * 7. STREAMON. 
/*
 * Fill *format for an S_FMT call that carries an explicit sizeimage.
 * The structure is zeroed first; width, height, pixelformat and
 * sizeimage are then written to the multi-planar or single-planar
 * member, as selected by v4l2_type_is_mplane(type). For the
 * multi-planar layout only plane 0's sizeimage is set.
 */
static void v4l2_setup_format_sizeimage(struct v4l2_format *format,
					unsigned int type,
					unsigned int width, unsigned int height,
					unsigned int pixelformat,
					unsigned int sizeimage)
{
	memset(format, 0, sizeof(*format));
	format->type = type;

	if (!v4l2_type_is_mplane(type)) {
		struct v4l2_pix_format *pix = &format->fmt.pix;

		pix->pixelformat = pixelformat;
		pix->sizeimage = sizeimage;
		pix->width = width;
		pix->height = height;
	} else {
		struct v4l2_pix_format_mplane *pix_mp = &format->fmt.pix_mp;

		pix_mp->pixelformat = pixelformat;
		pix_mp->plane_fmt[0].sizeimage = sizeimage;
		pix_mp->width = width;
		pix_mp->height = height;
	}
}

/*
 * Issue VIDIOC_S_FMT on video_fd for the given queue type, pixel
 * format and dimensions, with an explicit sizeimage value.
 *
 * A sizeimage of 0 means "no override": the call is forwarded
 * unchanged to v4l2_set_format.
 *
 * Returns 0 on success, -1 (after logging errno) when the ioctl fails.
 */
int v4l2_set_format_sizeimage(int video_fd, unsigned int type,
			      unsigned int pixelformat,
			      unsigned int width, unsigned int height,
			      unsigned int sizeimage)
{
	struct v4l2_format fmt;

	if (sizeimage == 0)
		return v4l2_set_format(video_fd, type, pixelformat, width,
				       height);

	v4l2_setup_format_sizeimage(&fmt, type, width, height, pixelformat,
				    sizeimage);

	if (ioctl(video_fd, VIDIOC_S_FMT, &fmt) >= 0)
		return 0;

	request_log("Unable to set format (sizeimage=%u) for type %d: %s\n",
		    sizeimage, type, strerror(errno));
	return -1;
}