From 565f5c0de49739c383538589e7d08fa5b627841f Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Fri, 1 May 2026 12:00:00 +0000 Subject: [PATCH] context: introduce request_pool, decouple OUTPUT buffers from surfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 3 of the upstreamable plan (upstreamable_design.md §1, §5). Replaces the prior per-surface OUTPUT-buffer ownership model with a small driver-wide pool sized by codec pipeline depth (4 H.264 frames in flight), allocated unconditionally regardless of caller's num_render_targets. Prior art (kernel UAPI dev-stateless-decoder.rst, ffmpeg v4l2_request.c, Chromium V4L2StatelessVideoDecoder, GStreamer v4l2slh264dec) all decouple OUTPUT and CAPTURE pool sizing. fourier's "output_count == surfaces_count" model was a category error: OUTPUT buffers are request-time bitstream slots, CAPTURE buffers are picture-time DPB slots; their lifecycles and sizing are independent. Changes: * NEW src/request_pool.{c,h} (~200 LoC): - request_pool_init(): CREATE_BUFS + per-slot QUERYBUF + mmap. - request_pool_destroy(): munmap all, idempotent. - request_pool_acquire(): round-robin claim; returns V4L2 buffer index of an unused slot or -1. - request_pool_release(): mark slot free for reuse. - request_pool_slot(): accessor for ptr/size given a buffer index. * src/request.h: add struct request_pool output_pool to request_data. * src/context.c::RequestCreateContext: replace the per-surface OUTPUT loop with a single request_pool_init() call (count=4, independent of surfaces_count). Drop the now-unused locals (length, offset, source_data, output_buffers_count, index, index_base, i, surface_object). DELETES patch 0002's "output_buffers_count = ... ? ... : 4" hack inline — the pool's own count parameter supersedes it. * src/picture.c::RequestBeginPicture: borrow a pool slot at frame start, write its mmap pointer/size/index into the surface's transient source_* fields. The fields stay (still useful as a borrow handle that the existing codec_store_buffer memcpys target), but no longer represent surface-permanent ownership. Reset slices_size/slices_count here too (was implicit on first Render). * src/surface.c::RequestSyncSurface: after VIDIOC_DQBUF returns the OUTPUT buffer, release the pool slot and clear the surface's borrow handle. Fixes the segv on second-frame submission. * src/surface.c::RequestDestroySurfaces: remove the munmap of source_data — pool owns the mmap. * src/request.c::RequestTerminate: call request_pool_destroy() before close(video_fd) so munmaps still target a valid fd. * src/meson.build: add request_pool.c and request_pool.h to the sources/headers lists. This commit removes 0002's OUTPUT-pool hack inline (the "floor to 4" line is gone). The DECODE_MODE/START_CODE block in 0002 remains until commit 4 lands. Build-verified clean on aarch64. Signed-off-by: Markus Fritsche --- src/context.c | 67 ++++++--------------- src/meson.build | 2 + src/picture.c | 27 +++++++++ src/request.c | 7 +++ src/request.h | 8 +++ src/request_pool.c | 147 +++++++++++++++++++++++++++++++++++++++++++++ src/request_pool.h | 84 ++++++++++++++++++++++++++ src/surface.c | 18 ++++-- 8 files changed, 307 insertions(+), 53 deletions(-) create mode 100644 src/request_pool.c create mode 100644 src/request_pool.h diff --git a/src/context.c b/src/context.c index 9ca78fd..2c863c8 100644 --- a/src/context.c +++ b/src/context.c @@ -54,20 +54,12 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, { struct request_data *driver_data = context->pDriverData; struct object_config *config_object; - struct object_surface *surface_object; struct object_context *context_object = NULL; struct video_format *video_format; - unsigned int length; - unsigned int offset; - void *source_data = MAP_FAILED; VASurfaceID *ids = NULL; VAContextID id; VAStatus status; unsigned int output_type, capture_type; - unsigned int output_buffers_count; - unsigned int index_base; - unsigned int index; - unsigned int i; int rc; video_format = driver_data->video_format; @@ -92,15 +84,20 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, memset(&context_object->dpb, 0, sizeof(context_object->dpb)); /* - * The OUTPUT (bitstream-input) queue must be non-empty before - * VIDIOC_STREAMON or hantro-class drivers reject with EINVAL. - * VA-API callers (e.g. ffmpeg's vaapi-copy path) may invoke - * vaCreateContext with num_render_targets=0; allocate a small - * minimum pool independent of the caller's surface count. + * Initialize the OUTPUT (bitstream-input) buffer pool. Sized by + * codec pipeline depth (4 H.264 frames in flight is sufficient + * for current hantro/rkvdec scheduling); independent of caller- + * supplied surfaces_count. Pool is owned by driver_data so it + * outlives any single context destroy/recreate cycle. + * + * This replaces the prior per-surface OUTPUT loop, which (a) + * created an empty queue when surfaces_count==0 (ffmpeg vaapi- + * copy path) and (b) only populated surface->source_* for + * surfaces present at vaCreateContext time, NULL-derefing on + * surfaces created later. */ - output_buffers_count = surfaces_count > 0 ? surfaces_count : 4; - rc = v4l2_create_buffers(driver_data->video_fd, output_type, - output_buffers_count, &index_base); + rc = request_pool_init(&driver_data->output_pool, + driver_data->video_fd, output_type, 4); if (rc < 0) { status = VA_STATUS_ERROR_ALLOCATION_FAILED; goto error; @@ -111,40 +108,15 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * we don't have any indication wrt its life time. Let's make sure * its life span is under our control. */ - ids = malloc(surfaces_count * sizeof(VASurfaceID)); - if (ids == NULL) { - status = VA_STATUS_ERROR_ALLOCATION_FAILED; - goto error; - } - - memcpy(ids, surfaces_ids, surfaces_count * sizeof(VASurfaceID)); - - for (i = 0; i < surfaces_count; i++) { - index = index_base + i; - - surface_object = SURFACE(driver_data, surfaces_ids[i]); - if (surface_object == NULL) { - status = VA_STATUS_ERROR_INVALID_SURFACE; - goto error; - } - - rc = v4l2_query_buffer(driver_data->video_fd, output_type, - index, &length, &offset, 1); - if (rc < 0) { + if (surfaces_count > 0) { + ids = malloc(surfaces_count * sizeof(VASurfaceID)); + if (ids == NULL) { status = VA_STATUS_ERROR_ALLOCATION_FAILED; goto error; } - source_data = mmap(NULL, length, PROT_READ | PROT_WRITE, - MAP_SHARED, driver_data->video_fd, offset); - if (source_data == MAP_FAILED) { - status = VA_STATUS_ERROR_ALLOCATION_FAILED; - goto error; - } - - surface_object->source_index = index; - surface_object->source_data = source_data; - surface_object->source_size = length; + memcpy(ids, surfaces_ids, + surfaces_count * sizeof(VASurfaceID)); } /* @@ -200,9 +172,6 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, goto complete; error: - if (source_data != MAP_FAILED) - munmap(source_data, length); - if (ids != NULL) free(ids); diff --git a/src/meson.build b/src/meson.build index b7ed642..07f3f14 100644 --- a/src/meson.build +++ b/src/meson.build @@ -44,6 +44,7 @@ sources = [ 'v4l2.c', 'mpeg2.c', 'h264.c', + 'request_pool.c', # 'h265.c' ] @@ -64,6 +65,7 @@ headers = [ 'v4l2.h', 'mpeg2.h', 'h264.h', + 'request_pool.h', # 'h265.h' ] diff --git a/src/picture.c b/src/picture.c index 3713f00..17913aa 100644 --- a/src/picture.c +++ b/src/picture.c @@ -216,6 +216,8 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id, struct request_data *driver_data = context->pDriverData; struct object_context *context_object; struct object_surface *surface_object; + struct request_pool_slot *slot; + int slot_index; context_object = CONTEXT(driver_data, context_id); if (context_object == NULL) @@ -228,6 +230,31 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id, if (surface_object->status == VASurfaceRendering) RequestSyncSurface(context, surface_id); + /* + * Borrow an OUTPUT (bitstream-input) slot from the driver-wide + * pool for the duration of this Begin/Render/End cycle. The + * surface's source_* fields hold the borrow's mmap pointer/size/ + * V4L2 buffer index until RequestSyncSurface releases it after + * VIDIOC_DQBUF. + */ + slot_index = request_pool_acquire(&driver_data->output_pool); + if (slot_index < 0) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + slot = request_pool_slot(&driver_data->output_pool, + (unsigned int)slot_index); + if (slot == NULL) { + request_pool_release(&driver_data->output_pool, + (unsigned int)slot_index); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + surface_object->source_index = slot->index; + surface_object->source_data = slot->data; + surface_object->source_size = slot->size; + surface_object->slices_size = 0; + surface_object->slices_count = 0; + surface_object->status = VASurfaceRendering; context_object->render_surface_id = surface_id; diff --git a/src/request.c b/src/request.c index b54c0f5..94e948e 100644 --- a/src/request.c +++ b/src/request.c @@ -205,6 +205,13 @@ VAStatus RequestTerminate(VADriverContextP context) struct object_config *config_object; int iterator; + /* + * Tear down the OUTPUT buffer pool before closing video_fd so + * the munmap calls in request_pool_destroy() can still touch the + * mmap regions (which are tied to that fd's lifetime). + */ + request_pool_destroy(&driver_data->output_pool); + close(driver_data->video_fd); close(driver_data->media_fd); diff --git a/src/request.h b/src/request.h index 1fb593b..03a6a78 100644 --- a/src/request.h +++ b/src/request.h @@ -31,6 +31,7 @@ #include "context.h" #include "object_heap.h" +#include "request_pool.h" #include "video.h" #include @@ -55,6 +56,13 @@ struct request_data { int media_fd; struct video_format *video_format; + + /* + * OUTPUT (bitstream-input) buffer pool, decoupled from VA + * surfaces. Sized by codec pipeline depth, populated on first + * RequestCreateContext, torn down at driver Terminate. + */ + struct request_pool output_pool; }; VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context); diff --git a/src/request_pool.c b/src/request_pool.c new file mode 100644 index 0000000..5df70a1 --- /dev/null +++ b/src/request_pool.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2026 Markus Fritsche + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. + */ + +#include "request_pool.h" + +#include +#include +#include +#include + +#include "utils.h" +#include "v4l2.h" + +int request_pool_init(struct request_pool *pool, int video_fd, + unsigned int output_type, unsigned int count) +{ + unsigned int index_base; + unsigned int length; + unsigned int offset; + unsigned int i; + int rc; + + if (pool == NULL || count == 0) + return -1; + + if (pool->initialized) + return 0; + + pool->slots = calloc(count, sizeof(*pool->slots)); + if (pool->slots == NULL) + return -1; + + pool->count = count; + pool->next = 0; + + rc = v4l2_create_buffers(video_fd, output_type, count, &index_base); + if (rc < 0) + goto error; + + for (i = 0; i < count; i++) { + pool->slots[i].index = index_base + i; + pool->slots[i].busy = false; + + rc = v4l2_query_buffer(video_fd, output_type, + pool->slots[i].index, + &length, &offset, 1); + if (rc < 0) + goto error; + + pool->slots[i].data = mmap(NULL, length, + PROT_READ | PROT_WRITE, + MAP_SHARED, video_fd, offset); + if (pool->slots[i].data == MAP_FAILED) { + pool->slots[i].data = NULL; + goto error; + } + + pool->slots[i].size = length; + } + + pool->initialized = true; + return 0; + +error: + request_pool_destroy(pool); + return -1; +} + +void request_pool_destroy(struct request_pool *pool) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].data != NULL && pool->slots[i].size > 0) + munmap(pool->slots[i].data, pool->slots[i].size); + } + + free(pool->slots); + pool->slots = NULL; + pool->count = 0; + pool->next = 0; + pool->initialized = false; +} + +int request_pool_acquire(struct request_pool *pool) +{ + unsigned int start; + unsigned int i; + + if (pool == NULL || !pool->initialized || pool->count == 0) + return -1; + + start = pool->next; + for (i = 0; i < pool->count; i++) { + unsigned int slot = (start + i) % pool->count; + + if (!pool->slots[slot].busy) { + pool->slots[slot].busy = true; + pool->next = (slot + 1) % pool->count; + return (int)pool->slots[slot].index; + } + } + + /* All slots busy; caller must wait for an in-flight DQBUF. */ + return -1; +} + +void request_pool_release(struct request_pool *pool, unsigned int index) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].index == index) { + pool->slots[i].busy = false; + return; + } + } +} + +struct request_pool_slot *request_pool_slot(struct request_pool *pool, + unsigned int index) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return NULL; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].index == index) + return &pool->slots[i]; + } + + return NULL; +} diff --git a/src/request_pool.h b/src/request_pool.h new file mode 100644 index 0000000..c988eb5 --- /dev/null +++ b/src/request_pool.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2026 Markus Fritsche + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. + */ + +#ifndef _REQUEST_POOL_H_ +#define _REQUEST_POOL_H_ + +#include + +/* + * OUTPUT (bitstream-input) buffer pool, decoupled from caller-allocated + * VA surfaces. Sizing is driven by codec pipeline depth (typically 4 + * for H.264), not by the consumer's surface count. + * + * The pool owns the V4L2 buffer indices and their mmap pointers. A + * decode request "borrows" a slot at vaBeginPicture, fills it across + * vaRenderPicture calls, queues it at vaEndPicture, and releases it + * after VIDIOC_DQBUF returns. + * + * This replaces the per-surface OUTPUT-buffer ownership model in the + * pre-refactor code, where object_surface.source_* fields permanently + * held a single OUTPUT buffer per surface — incorrect because OUTPUT + * buffers are request-time resources, not picture-time resources, and + * because the per-surface loop in RequestCreateContext only ran when + * surfaces_count > 0 (breaking ffmpeg's vaapi-copy num_render_targets=0 + * convention). + */ + +struct request_pool_slot { + unsigned int index; /* V4L2 buffer index in OUTPUT queue */ + void *data; /* mmap pointer for this slot */ + unsigned int size; /* mmap size in bytes */ + bool busy; /* true while borrowed for a request */ +}; + +struct request_pool { + struct request_pool_slot *slots; + unsigned int count; + unsigned int next; /* round-robin acquire cursor */ + bool initialized; +}; + +/* + * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap + * each, populate pool->slots[]. Caller must have already done + * VIDIOC_S_FMT on the OUTPUT queue. Returns 0 on success, -1 on + * failure. + */ +int request_pool_init(struct request_pool *pool, int video_fd, + unsigned int output_type, unsigned int count); + +/* + * Munmap all slots and free the slots array. Idempotent. + */ +void request_pool_destroy(struct request_pool *pool); + +/* + * Claim the next free slot (round-robin). Returns the slot's V4L2 + * buffer index on success (slot in pool->slots[] is determined by + * the returned index), or -1 if all slots are busy. + */ +int request_pool_acquire(struct request_pool *pool); + +/* + * Mark the slot at pool->slots[i] free for reuse. Caller must pass the + * V4L2 buffer index returned earlier from request_pool_acquire(). + */ +void request_pool_release(struct request_pool *pool, unsigned int index); + +/* + * Look up the pool slot owning a given V4L2 buffer index. Returns + * pointer to the slot on success, NULL if the index is out of range. + * The returned pointer is valid until pool destruction; do not free. + */ +struct request_pool_slot *request_pool_slot(struct request_pool *pool, + unsigned int index); + +#endif diff --git a/src/surface.c b/src/surface.c index ca3969d..0b8f802 100644 --- a/src/surface.c +++ b/src/surface.c @@ -254,10 +254,11 @@ VAStatus RequestDestroySurfaces(VADriverContextP context, if (surface_object == NULL) return VA_STATUS_ERROR_INVALID_SURFACE; - if (surface_object->source_data != NULL && - surface_object->source_size > 0) - munmap(surface_object->source_data, - surface_object->source_size); + /* + * source_* are now transient borrows from request_pool, not + * surface-owned mappings; the pool owns the underlying mmap. + * Nothing to free here. + */ for (j = 0; j < surface_object->destination_buffers_count; j++) if (surface_object->destination_map[j] != NULL && @@ -336,6 +337,15 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id) goto error; } + /* + * OUTPUT buffer is back from the kernel: return its pool slot + * for reuse and clear the surface's transient borrow handle. + */ + request_pool_release(&driver_data->output_pool, + surface_object->source_index); + surface_object->source_data = NULL; + surface_object->source_size = 0; + rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, capture_type, surface_object->destination_index, surface_object->destination_buffers_count);