From: Markus Fritsche Date: 2026-05-01 Subject: [PATCH] context: introduce request_pool, decouple OUTPUT buffers from surfaces Commit 3 of the upstreamable plan (upstreamable_design.md §1, §5). Replaces the prior per-surface OUTPUT-buffer ownership model with a small driver-wide pool sized by codec pipeline depth (4 H.264 frames in flight), allocated unconditionally regardless of caller's num_render_targets. Prior art (kernel UAPI dev-stateless-decoder.rst, ffmpeg v4l2_request.c, Chromium V4L2StatelessVideoDecoder, GStreamer v4l2slh264dec) all decouple OUTPUT and CAPTURE pool sizing. fourier's "output_count == surfaces_count" model was a category error: OUTPUT buffers are request-time bitstream slots, CAPTURE buffers are picture-time DPB slots; their lifecycles and sizing are independent. Changes: * NEW src/request_pool.{c,h} (~200 LoC): - request_pool_init(): CREATE_BUFS + per-slot QUERYBUF + mmap. - request_pool_destroy(): munmap all, idempotent. - request_pool_acquire(): round-robin claim; returns V4L2 buffer index of an unused slot or -1. - request_pool_release(): mark slot free for reuse. - request_pool_slot(): accessor for ptr/size given a buffer index. * src/request.h: add struct request_pool output_pool to request_data. * src/context.c::RequestCreateContext: replace the per-surface OUTPUT loop with a single request_pool_init() call (count=4, independent of surfaces_count). Drop the now-unused locals (length, offset, source_data, output_buffers_count, index, index_base, i, surface_object). DELETES patch 0002's "output_buffers_count = ... ? ... : 4" hack inline — the pool's own count parameter supersedes it. * src/picture.c::RequestBeginPicture: borrow a pool slot at frame start, write its mmap pointer/size/index into the surface's transient source_* fields. The fields stay (still useful as a borrow handle that the existing codec_store_buffer memcpys target), but no longer represent surface-permanent ownership. Reset slices_size/slices_count here too (was implicit on first Render). * src/surface.c::RequestSyncSurface: after VIDIOC_DQBUF returns the OUTPUT buffer, release the pool slot and clear the surface's borrow handle. Fixes the segv on second-frame submission. * src/surface.c::RequestDestroySurfaces: remove the munmap of source_data — pool owns the mmap. * src/request.c::RequestTerminate: call request_pool_destroy() before close(video_fd) so munmaps still target a valid fd. * src/meson.build: add request_pool.c and request_pool.h to the sources/headers lists. This commit removes 0002's OUTPUT-pool hack inline (the "floor to 4" line is gone). The DECODE_MODE/START_CODE block in 0002 remains until commit 4 lands. Build-verified clean on aarch64. Signed-off-by: Markus Fritsche --- --- a/src/request.h 2026-05-01 20:09:57.346428828 +0000 +++ b/src/request.h 2026-05-01 20:17:57.497514185 +0000 @@ -31,6 +31,7 @@ #include "context.h" #include "object_heap.h" +#include "request_pool.h" #include "video.h" #include @@ -55,6 +56,13 @@ int media_fd; struct video_format *video_format; + + /* + * OUTPUT (bitstream-input) buffer pool, decoupled from VA + * surfaces. Sized by codec pipeline depth, populated on first + * RequestCreateContext, torn down at driver Terminate. + */ + struct request_pool output_pool; }; VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context); --- a/src/request.c 2026-05-01 20:09:57.346428828 +0000 +++ b/src/request.c 2026-05-01 20:19:48.091143681 +0000 @@ -205,6 +205,13 @@ struct object_config *config_object; int iterator; + /* + * Tear down the OUTPUT buffer pool before closing video_fd so + * the munmap calls in request_pool_destroy() can still touch the + * mmap regions (which are tied to that fd's lifetime). + */ + request_pool_destroy(&driver_data->output_pool); + close(driver_data->video_fd); close(driver_data->media_fd); --- a/src/context.c 2026-05-01 20:09:57.346428828 +0000 +++ b/src/context.c 2026-05-01 20:18:33.738048227 +0000 @@ -54,20 +54,12 @@ { struct request_data *driver_data = context->pDriverData; struct object_config *config_object; - struct object_surface *surface_object; struct object_context *context_object = NULL; struct video_format *video_format; - unsigned int length; - unsigned int offset; - void *source_data = MAP_FAILED; VASurfaceID *ids = NULL; VAContextID id; VAStatus status; unsigned int output_type, capture_type; - unsigned int output_buffers_count; - unsigned int index_base; - unsigned int index; - unsigned int i; int rc; video_format = driver_data->video_format; @@ -92,15 +84,20 @@ memset(&context_object->dpb, 0, sizeof(context_object->dpb)); /* - * The OUTPUT (bitstream-input) queue must be non-empty before - * VIDIOC_STREAMON or hantro-class drivers reject with EINVAL. - * VA-API callers (e.g. ffmpeg's vaapi-copy path) may invoke - * vaCreateContext with num_render_targets=0; allocate a small - * minimum pool independent of the caller's surface count. + * Initialize the OUTPUT (bitstream-input) buffer pool. Sized by + * codec pipeline depth (4 H.264 frames in flight is sufficient + * for current hantro/rkvdec scheduling); independent of caller- + * supplied surfaces_count. Pool is owned by driver_data so it + * outlives any single context destroy/recreate cycle. + * + * This replaces the prior per-surface OUTPUT loop, which (a) + * created an empty queue when surfaces_count==0 (ffmpeg vaapi- + * copy path) and (b) only populated surface->source_* for + * surfaces present at vaCreateContext time, NULL-derefing on + * surfaces created later. */ - output_buffers_count = surfaces_count > 0 ? surfaces_count : 4; - rc = v4l2_create_buffers(driver_data->video_fd, output_type, - output_buffers_count, &index_base); + rc = request_pool_init(&driver_data->output_pool, + driver_data->video_fd, output_type, 4); if (rc < 0) { status = VA_STATUS_ERROR_ALLOCATION_FAILED; goto error; @@ -111,40 +108,15 @@ * we don't have any indication wrt its life time. Let's make sure * its life span is under our control. */ - ids = malloc(surfaces_count * sizeof(VASurfaceID)); - if (ids == NULL) { - status = VA_STATUS_ERROR_ALLOCATION_FAILED; - goto error; - } - - memcpy(ids, surfaces_ids, surfaces_count * sizeof(VASurfaceID)); - - for (i = 0; i < surfaces_count; i++) { - index = index_base + i; - - surface_object = SURFACE(driver_data, surfaces_ids[i]); - if (surface_object == NULL) { - status = VA_STATUS_ERROR_INVALID_SURFACE; - goto error; - } - - rc = v4l2_query_buffer(driver_data->video_fd, output_type, - index, &length, &offset, 1); - if (rc < 0) { + if (surfaces_count > 0) { + ids = malloc(surfaces_count * sizeof(VASurfaceID)); + if (ids == NULL) { status = VA_STATUS_ERROR_ALLOCATION_FAILED; goto error; } - source_data = mmap(NULL, length, PROT_READ | PROT_WRITE, - MAP_SHARED, driver_data->video_fd, offset); - if (source_data == MAP_FAILED) { - status = VA_STATUS_ERROR_ALLOCATION_FAILED; - goto error; - } - - surface_object->source_index = index; - surface_object->source_data = source_data; - surface_object->source_size = length; + memcpy(ids, surfaces_ids, + surfaces_count * sizeof(VASurfaceID)); } /* @@ -200,9 +172,6 @@ goto complete; error: - if (source_data != MAP_FAILED) - munmap(source_data, length); - if (ids != NULL) free(ids); --- a/src/picture.c 2026-05-01 20:09:57.346428828 +0000 +++ b/src/picture.c 2026-05-01 20:19:10.742593454 +0000 @@ -216,6 +216,8 @@ struct request_data *driver_data = context->pDriverData; struct object_context *context_object; struct object_surface *surface_object; + struct request_pool_slot *slot; + int slot_index; context_object = CONTEXT(driver_data, context_id); if (context_object == NULL) @@ -228,6 +230,31 @@ if (surface_object->status == VASurfaceRendering) RequestSyncSurface(context, surface_id); + /* + * Borrow an OUTPUT (bitstream-input) slot from the driver-wide + * pool for the duration of this Begin/Render/End cycle. The + * surface's source_* fields hold the borrow's mmap pointer/size/ + * V4L2 buffer index until RequestSyncSurface releases it after + * VIDIOC_DQBUF. + */ + slot_index = request_pool_acquire(&driver_data->output_pool); + if (slot_index < 0) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + slot = request_pool_slot(&driver_data->output_pool, + (unsigned int)slot_index); + if (slot == NULL) { + request_pool_release(&driver_data->output_pool, + (unsigned int)slot_index); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + surface_object->source_index = slot->index; + surface_object->source_data = slot->data; + surface_object->source_size = slot->size; + surface_object->slices_size = 0; + surface_object->slices_count = 0; + surface_object->status = VASurfaceRendering; context_object->render_surface_id = surface_id; --- a/src/surface.c 2026-05-01 20:09:57.346428828 +0000 +++ b/src/surface.c 2026-05-01 20:19:35.490958060 +0000 @@ -254,10 +254,11 @@ if (surface_object == NULL) return VA_STATUS_ERROR_INVALID_SURFACE; - if (surface_object->source_data != NULL && - surface_object->source_size > 0) - munmap(surface_object->source_data, - surface_object->source_size); + /* + * source_* are now transient borrows from request_pool, not + * surface-owned mappings; the pool owns the underlying mmap. + * Nothing to free here. + */ for (j = 0; j < surface_object->destination_buffers_count; j++) if (surface_object->destination_map[j] != NULL && @@ -336,6 +337,15 @@ goto error; } + /* + * OUTPUT buffer is back from the kernel: return its pool slot + * for reuse and clear the surface's transient borrow handle. + */ + request_pool_release(&driver_data->output_pool, + surface_object->source_index); + surface_object->source_data = NULL; + surface_object->source_size = 0; + rc = v4l2_dequeue_buffer(driver_data->video_fd, -1, capture_type, surface_object->destination_index, surface_object->destination_buffers_count); --- a/src/meson.build 2026-05-01 20:09:57.346428828 +0000 +++ b/src/meson.build 2026-05-01 20:20:04.775389455 +0000 @@ -44,6 +44,7 @@ 'v4l2.c', 'mpeg2.c', 'h264.c', + 'request_pool.c', # 'h265.c' ] @@ -64,6 +65,7 @@ 'v4l2.h', 'mpeg2.h', 'h264.h', + 'request_pool.h', # 'h265.h' ] --- a/src/request_pool.h 2025-09-03 18:38:22.431999998 +0000 +++ b/src/request_pool.h 2026-05-01 20:17:37.517219722 +0000 @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2026 Markus Fritsche + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. + */ + +#ifndef _REQUEST_POOL_H_ +#define _REQUEST_POOL_H_ + +#include + +/* + * OUTPUT (bitstream-input) buffer pool, decoupled from caller-allocated + * VA surfaces. Sizing is driven by codec pipeline depth (typically 4 + * for H.264), not by the consumer's surface count. + * + * The pool owns the V4L2 buffer indices and their mmap pointers. A + * decode request "borrows" a slot at vaBeginPicture, fills it across + * vaRenderPicture calls, queues it at vaEndPicture, and releases it + * after VIDIOC_DQBUF returns. + * + * This replaces the per-surface OUTPUT-buffer ownership model in the + * pre-refactor code, where object_surface.source_* fields permanently + * held a single OUTPUT buffer per surface — incorrect because OUTPUT + * buffers are request-time resources, not picture-time resources, and + * because the per-surface loop in RequestCreateContext only ran when + * surfaces_count > 0 (breaking ffmpeg's vaapi-copy num_render_targets=0 + * convention). + */ + +struct request_pool_slot { + unsigned int index; /* V4L2 buffer index in OUTPUT queue */ + void *data; /* mmap pointer for this slot */ + unsigned int size; /* mmap size in bytes */ + bool busy; /* true while borrowed for a request */ +}; + +struct request_pool { + struct request_pool_slot *slots; + unsigned int count; + unsigned int next; /* round-robin acquire cursor */ + bool initialized; +}; + +/* + * Allocate count OUTPUT buffers via VIDIOC_CREATE_BUFS, query and mmap + * each, populate pool->slots[]. Caller must have already done + * VIDIOC_S_FMT on the OUTPUT queue. Returns 0 on success, -1 on + * failure. + */ +int request_pool_init(struct request_pool *pool, int video_fd, + unsigned int output_type, unsigned int count); + +/* + * Munmap all slots and free the slots array. Idempotent. + */ +void request_pool_destroy(struct request_pool *pool); + +/* + * Claim the next free slot (round-robin). Returns the slot's V4L2 + * buffer index on success (slot in pool->slots[] is determined by + * the returned index), or -1 if all slots are busy. + */ +int request_pool_acquire(struct request_pool *pool); + +/* + * Mark the slot at pool->slots[i] free for reuse. Caller must pass the + * V4L2 buffer index returned earlier from request_pool_acquire(). + */ +void request_pool_release(struct request_pool *pool, unsigned int index); + +/* + * Look up the pool slot owning a given V4L2 buffer index. Returns + * pointer to the slot on success, NULL if the index is out of range. + * The returned pointer is valid until pool destruction; do not free. + */ +struct request_pool_slot *request_pool_slot(struct request_pool *pool, + unsigned int index); + +#endif --- a/src/request_pool.c 2025-09-03 18:38:22.431999998 +0000 +++ b/src/request_pool.c 2026-05-01 20:17:37.537220017 +0000 @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2026 Markus Fritsche + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. + */ + +#include "request_pool.h" + +#include +#include +#include +#include + +#include "utils.h" +#include "v4l2.h" + +int request_pool_init(struct request_pool *pool, int video_fd, + unsigned int output_type, unsigned int count) +{ + unsigned int index_base; + unsigned int length; + unsigned int offset; + unsigned int i; + int rc; + + if (pool == NULL || count == 0) + return -1; + + if (pool->initialized) + return 0; + + pool->slots = calloc(count, sizeof(*pool->slots)); + if (pool->slots == NULL) + return -1; + + pool->count = count; + pool->next = 0; + + rc = v4l2_create_buffers(video_fd, output_type, count, &index_base); + if (rc < 0) + goto error; + + for (i = 0; i < count; i++) { + pool->slots[i].index = index_base + i; + pool->slots[i].busy = false; + + rc = v4l2_query_buffer(video_fd, output_type, + pool->slots[i].index, + &length, &offset, 1); + if (rc < 0) + goto error; + + pool->slots[i].data = mmap(NULL, length, + PROT_READ | PROT_WRITE, + MAP_SHARED, video_fd, offset); + if (pool->slots[i].data == MAP_FAILED) { + pool->slots[i].data = NULL; + goto error; + } + + pool->slots[i].size = length; + } + + pool->initialized = true; + return 0; + +error: + request_pool_destroy(pool); + return -1; +} + +void request_pool_destroy(struct request_pool *pool) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].data != NULL && pool->slots[i].size > 0) + munmap(pool->slots[i].data, pool->slots[i].size); + } + + free(pool->slots); + pool->slots = NULL; + pool->count = 0; + pool->next = 0; + pool->initialized = false; +} + +int request_pool_acquire(struct request_pool *pool) +{ + unsigned int start; + unsigned int i; + + if (pool == NULL || !pool->initialized || pool->count == 0) + return -1; + + start = pool->next; + for (i = 0; i < pool->count; i++) { + unsigned int slot = (start + i) % pool->count; + + if (!pool->slots[slot].busy) { + pool->slots[slot].busy = true; + pool->next = (slot + 1) % pool->count; + return (int)pool->slots[slot].index; + } + } + + /* All slots busy; caller must wait for an in-flight DQBUF. */ + return -1; +} + +void request_pool_release(struct request_pool *pool, unsigned int index) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].index == index) { + pool->slots[i].busy = false; + return; + } + } +} + +struct request_pool_slot *request_pool_slot(struct request_pool *pool, + unsigned int index) +{ + unsigned int i; + + if (pool == NULL || pool->slots == NULL) + return NULL; + + for (i = 0; i < pool->count; i++) { + if (pool->slots[i].index == index) + return &pool->slots[i]; + } + + return NULL; +}