From 19acc76da49f741aeae5ab3b01d5ec995c23d589 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Mon, 4 May 2026 22:03:31 +0000 Subject: [PATCH] iter2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling Pre-iter2 each VA surface was permanently 1:1 bound to one V4L2 CAPTURE buffer. mpv reusing a surface for a new decode while the compositor still held an EXPBUF'd dma_buf fd to the prior frame caused the kernel to write fresh decode output into the same physical memory the compositor was reading -- visible as stutter / back-and-forth swap on mpv --hwdec=vaapi --vo=gpu playback. Architecture: - New cap_pool abstraction (cap_pool.{h,c}) owns N CAPTURE buffers (N = max(surfaces_count, MIN_CAP_POOL=24)) with per-slot state {FREE, IN_DECODE, DECODED, EXPORTED} guarded by pthread_mutex_t. - Surfaces no longer own buffers; each vaBeginPicture acquires the oldest FREE slot (LRU), binds it for the decode cycle, and the slot cycles IN_DECODE -> DECODED (post-DQBUF) -> EXPORTED (post-EXPBUF). - Slot is released on next BeginPicture for the same surface or on vaDestroySurfaces. Limitations (Sonnet Phase 5 review iter2 9.x, deferred to iter3+): - Option-A statistical mitigation; race window narrows to "pool exhausted, force-recycle of oldest EXPORTED slot." For typical mpv 16-surface playback with MIN_CAP_POOL=24 the fallback never fires. - Multi-context concurrent use not addressed (one V4L2 device, multiple cap_pools -- iter3 scope). Other call sites updated: - picture.c::BeginPicture acquires + binds, releasing prior slot if any. - surface.c::SyncSurface marks slot DECODED after DQBUF. - surface.c::ExportSurfaceHandle marks slot EXPORTED, retaining OUR EXPBUF fd for force-recycle close(). - surface.c::DestroySurfaces releases via surface_unbind_slot; cap_pool owns the mmaps now. - surface.c::CreateSurfaces2 destroys the pool in the resolution-change path before REQBUFS(0) (else stale v4l2_index after Fix 1's REQBUFS). - context.c::DestroyContext invokes cap_pool_destroy. - image.c::DeriveImage skips copy_surface_to_image when current_slot is NULL (ffmpeg av_hwframe_ctx_init probes derive on undecoded surfaces). Verified: mpv vaapi-copy 200 frames bbb_1080p30, 0 drops, LRU visibly recycling slot indices, real luma gradient. mpv vaapi --vo=gpu operator-inspection follows. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/cap_pool.c | 303 ++++++++++++++++++++++++++++++++++++++++++++++++ src/cap_pool.h | 156 +++++++++++++++++++++++++ src/context.c | 11 +- src/image.c | 19 ++- src/meson.build | 2 + src/picture.c | 19 +++ src/request.h | 17 +++ src/surface.c | 199 +++++++++++++++++++++---------- src/surface.h | 35 ++++++ 9 files changed, 695 insertions(+), 66 deletions(-) create mode 100644 src/cap_pool.c create mode 100644 src/cap_pool.h diff --git a/src/cap_pool.c b/src/cap_pool.c new file mode 100644 index 0000000..c4f81ac --- /dev/null +++ b/src/cap_pool.c @@ -0,0 +1,303 @@ +/* + * Iteration 2 Fix 3: cap_pool implementation. + * + * Design rationale + limitations: see cap_pool.h docblock. + * + * Concurrency model: + * - All public functions take pool->lock at entry, release at exit. + * - cap_pool_acquire may sleep briefly while scanning slots; safe + * under lock since the scan is bounded by pool->count (<= 24 + * typical). + * - The slot pointer returned by acquire / mark_decoded / + * mark_exported / release is stable across the call (lock is + * dropped before return) but the slot's state may change between + * calls. Callers MUST NOT cache slot pointers across sleep/I/O -- + * they should treat slot pointers as opaque references valid only + * for the immediate operation. + * + * In practice, our caller pattern is: + * surface_object->current_slot = cap_pool_acquire(...); + * v4l2_queue_buffer(slot->v4l2_index, ...); + * // later, in SyncSurface for the same surface: + * v4l2_dequeue_buffer(surface_object->current_slot->v4l2_index, ...); + * cap_pool_mark_decoded(surface_object->current_slot); + * + * surface_object->current_slot is the persistent reference; the + * slot's V4L2 index is stable for the slot's lifetime. The state + * field IS read by other threads (acquire scans for FREE) — that + * reads are safe because: + * - acquire holds the lock during the scan + * - mark_decoded/mark_exported/release also hold the lock + * So state transitions are serialized. + */ + +#include "cap_pool.h" +#include "v4l2.h" +#include "utils.h" + +#include +#include +#include +#include +#include +#include + +#include + +static uint64_t monotonic_ns(void) +{ + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return 0; + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type, + unsigned int count, unsigned int v4l2_buffers_count_per_slot) +{ + unsigned int index_base; + unsigned int i, j; + int rc; + + if (pool == NULL || count == 0) + return -EINVAL; + + memset(pool, 0, sizeof(*pool)); + + rc = pthread_mutex_init(&pool->lock, NULL); + if (rc != 0) + return -rc; + + pool->slots = calloc(count, sizeof(*pool->slots)); + if (pool->slots == NULL) { + pthread_mutex_destroy(&pool->lock); + return -ENOMEM; + } + pool->count = count; + + rc = v4l2_create_buffers(video_fd, capture_type, count, &index_base); + if (rc < 0) { + free(pool->slots); + pthread_mutex_destroy(&pool->lock); + return rc; + } + + for (i = 0; i < count; i++) { + struct cap_pool_slot *slot = &pool->slots[i]; + + slot->v4l2_index = index_base + i; + slot->buffers_count = v4l2_buffers_count_per_slot; + slot->state = CAP_SLOT_FREE; + slot->our_export_fd = -1; + slot->last_used_at_ns = 0; /* never used → highest LRU priority */ + slot->bound_to_surface_id = -1; + + rc = v4l2_query_buffer(video_fd, capture_type, slot->v4l2_index, + slot->map_lengths, slot->map_offsets, + v4l2_buffers_count_per_slot); + if (rc < 0) { + request_log("cap_pool_init: query_buffer failed for " + "slot %u (v4l2_index=%u)\n", + i, slot->v4l2_index); + goto error_cleanup; + } + + for (j = 0; j < v4l2_buffers_count_per_slot; j++) { + slot->map[j] = mmap(NULL, slot->map_lengths[j], + PROT_READ | PROT_WRITE, MAP_SHARED, + video_fd, slot->map_offsets[j]); + if (slot->map[j] == MAP_FAILED) { + request_log("cap_pool_init: mmap failed for " + "slot %u plane %u\n", i, j); + slot->map[j] = NULL; + goto error_cleanup; + } + } + } + + pool->initialized = true; + request_log("cap_pool_init: %u slots ready (v4l2_index=%u..%u, " + "%u plane(s) per slot)\n", + count, index_base, index_base + count - 1, + v4l2_buffers_count_per_slot); + return 0; + +error_cleanup: + for (i = 0; i < count; i++) { + struct cap_pool_slot *slot = &pool->slots[i]; + for (j = 0; j < v4l2_buffers_count_per_slot; j++) { + if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED) + munmap(slot->map[j], slot->map_lengths[j]); + } + } + (void)v4l2_request_buffers(video_fd, capture_type, 0); + free(pool->slots); + pthread_mutex_destroy(&pool->lock); + memset(pool, 0, sizeof(*pool)); + return -EIO; +} + +void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type) +{ + unsigned int i, j; + + if (pool == NULL || !pool->initialized) + return; + + pthread_mutex_lock(&pool->lock); + + for (i = 0; i < pool->count; i++) { + struct cap_pool_slot *slot = &pool->slots[i]; + + if (slot->our_export_fd >= 0) { + close(slot->our_export_fd); + slot->our_export_fd = -1; + } + + for (j = 0; j < slot->buffers_count; j++) { + if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED) { + munmap(slot->map[j], slot->map_lengths[j]); + slot->map[j] = NULL; + } + } + } + + (void)v4l2_request_buffers(video_fd, capture_type, 0); + + pthread_mutex_unlock(&pool->lock); + pthread_mutex_destroy(&pool->lock); + + free(pool->slots); + pool->slots = NULL; + pool->count = 0; + pool->initialized = false; +} + +struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id) +{ + struct cap_pool_slot *best = NULL; + uint64_t best_ts = UINT64_MAX; + unsigned int i; + + if (pool == NULL || !pool->initialized) + return NULL; + + pthread_mutex_lock(&pool->lock); + + /* First pass: find the FREE slot with oldest last_used_at_ns. */ + for (i = 0; i < pool->count; i++) { + struct cap_pool_slot *slot = &pool->slots[i]; + if (slot->state != CAP_SLOT_FREE) + continue; + if (slot->last_used_at_ns < best_ts) { + best = slot; + best_ts = slot->last_used_at_ns; + } + } + + /* + * Second pass (fallback): if no FREE slot, force-recycle the + * oldest EXPORTED slot. This is the documented Option A race + * window — the consumer may still hold a dup'd fd to this + * buffer's underlying physical memory, and the kernel will + * happily DMA new content into it. For typical mpv 16-surface + * playback with MIN_CAP_POOL=24, this fallback should never + * fire. If it does, the visual artifact is bounded to a few + * frames during recovery. + */ + if (best == NULL) { + best_ts = UINT64_MAX; + for (i = 0; i < pool->count; i++) { + struct cap_pool_slot *slot = &pool->slots[i]; + if (slot->state != CAP_SLOT_EXPORTED) + continue; + if (slot->last_used_at_ns < best_ts) { + best = slot; + best_ts = slot->last_used_at_ns; + } + } + if (best != NULL) { + request_log("cap_pool_acquire: pool exhausted, " + "force-recycling EXPORTED slot v4l2_index=%u " + "(consumer race window may open)\n", + best->v4l2_index); + if (best->our_export_fd >= 0) { + close(best->our_export_fd); + best->our_export_fd = -1; + } + } + } + + if (best == NULL) { + pthread_mutex_unlock(&pool->lock); + request_log("cap_pool_acquire: no slot available " + "(pool->count=%u, all slots IN_DECODE/DECODED?)\n", + pool->count); + return NULL; + } + + /* + * Don't transition DECODED slots — they hold valid pixel content + * a consumer may still be reading via DeriveImage (vaapi-copy + * path). We never recycle DECODED. If a surface holds a DECODED + * slot for an extended period, it stays held; the surface's + * destruction (vaDestroySurfaces) is the only path that releases + * it. mpv typically progresses through DECODED → EXPORTED quickly + * for vaapi DMA-BUF; for vaapi-copy, DECODED → consumer reads + * via mmap → consumer is done after copy_surface_to_image returns. + * The vaapi-copy consumer has no explicit "I'm done" signal, so + * we rely on the next BeginPicture for the same surface to + * release the prior DECODED slot. + */ + + best->state = CAP_SLOT_IN_DECODE; + best->bound_to_surface_id = surface_id; + best->last_used_at_ns = monotonic_ns(); + + pthread_mutex_unlock(&pool->lock); + return best; +} + +void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot) +{ + if (pool == NULL || slot == NULL) + return; + pthread_mutex_lock(&pool->lock); + slot->state = CAP_SLOT_DECODED; + slot->last_used_at_ns = monotonic_ns(); + pthread_mutex_unlock(&pool->lock); +} + +void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd) +{ + if (pool == NULL || slot == NULL) + return; + pthread_mutex_lock(&pool->lock); + if (slot->our_export_fd >= 0 && slot->our_export_fd != our_fd) { + /* + * Double-Export: a previous EXPBUF'd fd existed. Close + * the old one. Consumer's old fd remains valid via + * dma_buf refcount. Documented in surface.c export path. + */ + close(slot->our_export_fd); + } + slot->our_export_fd = our_fd; + slot->state = CAP_SLOT_EXPORTED; + slot->last_used_at_ns = monotonic_ns(); + pthread_mutex_unlock(&pool->lock); +} + +void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot) +{ + if (pool == NULL || slot == NULL) + return; + pthread_mutex_lock(&pool->lock); + if (slot->our_export_fd >= 0) { + close(slot->our_export_fd); + slot->our_export_fd = -1; + } + slot->state = CAP_SLOT_FREE; + slot->bound_to_surface_id = -1; + slot->last_used_at_ns = monotonic_ns(); + pthread_mutex_unlock(&pool->lock); +} diff --git a/src/cap_pool.h b/src/cap_pool.h new file mode 100644 index 0000000..a1bf6a5 --- /dev/null +++ b/src/cap_pool.h @@ -0,0 +1,156 @@ +/* + * Iteration 2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling. + * + * Background — the bug this fixes: + * + * Pre-iteration-2, each VAAPI surface was permanently 1:1 bound to a + * V4L2 CAPTURE buffer index at vaCreateSurfaces2 time. Each decode + * cycle re-QBUF'd that same physical buffer for the same surface ID. + * When mpv reused a surface for a new decode while the compositor + * still held an EXPBUF'd dma_buf fd to the prior frame's content, + * the kernel wrote new decode output into the SAME physical memory + * the compositor was reading from — visible as stutter / "back and + * forth" frame swap during mpv --hwdec=vaapi --vo=gpu playback. + * + * V4L2 does not enforce the constraint (it lets QBUF re-queue a + * buffer regardless of dma_buf refcount on EXPBUF'd fds). userspace + * must coordinate. + * + * Architecture (Sonnet Phase 5 review for iter2): + * + * Pool of N CAPTURE buffers (N >= max(surfaces_count, MIN_CAP_POOL)). + * Each slot has a state in {FREE, IN_DECODE, DECODED, EXPORTED}. + * Surfaces are no longer permanently bound; each vaBeginPicture + * acquires a FREE slot, binds it to the current decode, transitions + * it through IN_DECODE → DECODED → optionally EXPORTED. + * + * The DECODED state captures the window between SyncSurface DQBUF + * and either ExportSurfaceHandle (DMA-BUF path) or DeriveImage + * (vaapi-copy path). LRU recycling considers ONLY FREE slots, so + * DECODED slots cannot be claimed by a concurrent decode while + * the consumer is still using the bound surface's content. + * + * Concurrency: a pthread_mutex_t protects pool state. VAAPI is + * re-entrant for multi-threaded consumers (mpv may BeginPicture/ + * SyncSurface from one thread and ExportSurfaceHandle from + * another). + * + * Limitations (deferred to iteration 3+): + * + * - Option-A statistical mitigation, not a correct fix. The race + * window narrows from "constant" to "only when pool is exhausted + * and force-recycle of oldest EXPORTED slot fires." For typical + * mpv 16-surface playback with MIN_CAP_POOL=24, this never fires + * in practice (Sonnet review iter2 question 3). For pathological + * workloads (paused-with-video-still-visible, multi-stream), + * race windows still possible. Iteration 3 may revisit with + * V4L2_MEMORY_DMABUF + userspace allocation. + * + * - LRU "force-recycle" still has the race in the worst case. + * Closing OUR EXPBUF fd does not close the consumer's dup — the + * consumer's fd keeps the dma_buf alive but the V4L2 layer will + * happily write new data into the underlying physical memory on + * re-QBUF. There is no public V4L2 API to query dma_buf refcount. + * + * - Multi-context concurrent use (two libva contexts open + * simultaneously, e.g. Firefox playing two videos in different + * tabs through separate RDD instances): not addressed. Each + * context gets its own pool, but there's only one V4L2 device. + */ + +#ifndef _CAP_POOL_H_ +#define _CAP_POOL_H_ + +#include +#include +#include + +#include /* for VIDEO_MAX_PLANES */ + +#define MIN_CAP_POOL 24 + +enum cap_slot_state { + CAP_SLOT_FREE = 0, /* available for a new decode acquisition */ + CAP_SLOT_IN_DECODE, /* QBUF'd to V4L2, kernel owns */ + CAP_SLOT_DECODED, /* DQBUF'd, valid pixel content; mapped by surface */ + CAP_SLOT_EXPORTED, /* EXPBUF'd; consumer holds a dma_buf fd */ +}; + +struct cap_pool_slot { + unsigned int v4l2_index; /* V4L2 buffer index */ + void *map[VIDEO_MAX_PLANES]; /* mmap pointers */ + unsigned int map_lengths[VIDEO_MAX_PLANES]; + unsigned int map_offsets[VIDEO_MAX_PLANES]; + unsigned int buffers_count; /* V4L2 buffers per logical NV12 (1 for single-plane MPLANE) */ + enum cap_slot_state state; + int our_export_fd; /* -1 if not exported; close on FREE transition */ + uint64_t last_used_at_ns; /* CLOCK_MONOTONIC when last touched (LRU) */ + int bound_to_surface_id; /* -1 if not bound; informational */ +}; + +struct cap_pool { + struct cap_pool_slot *slots; + unsigned int count; /* allocated slot count */ + pthread_mutex_t lock; + bool initialized; +}; + +/* + * cap_pool_init — allocate a pool of `count` CAPTURE buffers via + * v4l2_create_buffers, mmap each buffer's planes, init slot states + * to FREE. `count` is min'd against any reasonable hardware cap. + * + * Returns 0 on success, negative errno on failure. + */ +int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type, + unsigned int count, unsigned int v4l2_buffers_count_per_slot); + +/* + * cap_pool_destroy — close any outstanding our_export_fds, munmap all + * planes, REQBUFS(0), free slots. Safe to call on a non-initialized + * pool (no-op). + * + * Note: closing our_export_fd does not invalidate any consumer-held + * dup'd fds — the kernel keeps the dma_buf alive while any fd refs + * it. munmap on our side is independent of the consumer's mmap (each + * mmap of a dma_buf is a distinct VMA). + */ +void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type); + +/* + * cap_pool_acquire — find a FREE slot with the oldest last_used_at_ns + * (LRU). If no FREE slot is available, force-recycle the oldest + * EXPORTED slot (close our_export_fd, demote to IN_DECODE for the + * caller). Returns NULL only if no slots can be recycled at all + * (catastrophic — pool too small). + * + * The returned slot is in IN_DECODE state. Caller QBUFs it and + * transitions to DECODED via cap_pool_mark_decoded after DQBUF. + */ +struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id); + +/* + * cap_pool_mark_decoded — IN_DECODE → DECODED. Touches last_used_at_ns. + * Called from RequestSyncSurface after successful DQBUF. + */ +void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot); + +/* + * cap_pool_mark_exported — DECODED → EXPORTED. Stores `our_fd` so the + * pool owns OUR copy of the EXPBUF'd fd; the consumer received a + * dup'd / equivalent fd via the descriptor. last_used_at_ns is + * touched again so EXPORTED slots are recycled in LRU order. + * + * Called from RequestExportSurfaceHandle after VIDIOC_EXPBUF. + */ +void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd); + +/* + * cap_pool_release — explicitly return a slot to FREE (close our + * export fd if any). Called from RequestDestroySurfaces and from + * RequestBeginPicture when re-acquiring (the surface's previous slot + * is released first, then a new one acquired). + */ +void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot); + +#endif /* _CAP_POOL_H_ */ diff --git a/src/context.c b/src/context.c index ad5870b..2673668 100644 --- a/src/context.c +++ b/src/context.c @@ -244,9 +244,14 @@ VAStatus RequestDestroyContext(VADriverContextP context, VAContextID context_id) if (rc < 0) return VA_STATUS_ERROR_OPERATION_FAILED; - rc = v4l2_request_buffers(driver_data->video_fd, capture_type, 0); - if (rc < 0) - return VA_STATUS_ERROR_OPERATION_FAILED; + /* + * Iter2 Fix 3: cap_pool owns the CAPTURE buffers' mmaps + any + * outstanding our_export_fds. Tear it down (which also issues + * REQBUFS(0) on CAPTURE), so the next CreateSurfaces2 cycle sees + * a clean slate and rebuilds the pool at the new resolution. + */ + cap_pool_destroy(&driver_data->capture_pool, driver_data->video_fd, + capture_type); /* * Iteration 2 Fix 1: the kernel CAPTURE format state is no longer diff --git a/src/image.c b/src/image.c index eb901f0..6520878 100644 --- a/src/image.c +++ b/src/image.c @@ -209,9 +209,22 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id, if (status != VA_STATUS_SUCCESS) return status; - status = copy_surface_to_image (driver_data, surface_object, image); - if (status != VA_STATUS_SUCCESS) - return status; + /* + * Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is + * bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a + * never-decoded surface to learn the format; it doesn't read the + * data. With the cap_pool decoupling, destination_data[] is NULL + * until BeginPicture binds a slot — copying from a NULL source + * crashed in memcpy. The image's buffer remains zero-initialized; + * subsequent post-decode DeriveImage on the same surface (after + * BeginPicture has bound a slot) does the real copy. + */ + if (surface_object->current_slot != NULL) { + status = copy_surface_to_image (driver_data, surface_object, + image); + if (status != VA_STATUS_SUCCESS) + return status; + } surface_object->status = VASurfaceReady; diff --git a/src/meson.build b/src/meson.build index 4ec28a2..818df99 100644 --- a/src/meson.build +++ b/src/meson.build @@ -46,6 +46,7 @@ sources = [ 'h264.c', 'h264_slice_header.c', 'request_pool.c', + 'cap_pool.c', # 'h265.c' ] @@ -68,6 +69,7 @@ headers = [ 'h264.h', 'h264_slice_header.h', 'request_pool.h', + 'cap_pool.h', # 'h265.h' ] diff --git a/src/picture.c b/src/picture.c index 305f1c4..37d696b 100644 --- a/src/picture.c +++ b/src/picture.c @@ -235,6 +235,25 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id, if (surface_object->status == VASurfaceRendering) RequestSyncSurface(context, surface_id); + /* + * Iter2 Fix 3: acquire a CAPTURE-pool slot for this decode cycle. + * If the surface still holds a slot from a prior decode (DECODED + * or EXPORTED — the consumer is done with it by definition since + * we got back to BeginPicture for the same surface), release it + * first. The new slot is bound and its V4L2 index + mmap pointers + * are mirrored into surface_object->destination_* so the existing + * QBUF/DQBUF/EXPBUF code paths see no behavioral change. + */ + if (surface_object->current_slot != NULL) + surface_unbind_slot(driver_data, surface_object); + { + struct cap_pool_slot *cap_slot = + cap_pool_acquire(&driver_data->capture_pool, surface_id); + if (cap_slot == NULL) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + surface_bind_slot(surface_object, cap_slot); + } + /* * Borrow an OUTPUT (bitstream-input) slot from the driver-wide * pool for the duration of this Begin/Render/End cycle. The diff --git a/src/request.h b/src/request.h index 03a6a78..b20b329 100644 --- a/src/request.h +++ b/src/request.h @@ -32,6 +32,7 @@ #include "context.h" #include "object_heap.h" #include "request_pool.h" +#include "cap_pool.h" #include "video.h" #include @@ -63,6 +64,22 @@ struct request_data { * RequestCreateContext, torn down at driver Terminate. */ struct request_pool output_pool; + + /* + * CAPTURE (decoded-frame) buffer pool, decoupled from VA + * surfaces (iter2 Fix 3). Each surface acquires a slot at + * vaBeginPicture time and releases it on the next acquisition + * or vaDestroySurfaces. Pool sized to max(surfaces_count, + * MIN_CAP_POOL) at first vaCreateSurfaces2; torn down at + * vaDestroyContext. + * + * Background: pre-iter2 each surface was 1:1 bound to one + * CAPTURE buffer index; mpv re-using a surface for a new decode + * caused V4L2 to re-QBUF the same physical buffer while a + * compositor still held an EXPBUF'd dma_buf fd, producing + * visible stutter on mpv vaapi --vo=gpu. + */ + struct cap_pool capture_pool; }; VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context); diff --git a/src/surface.c b/src/surface.c index 3f89b8f..88a0b7a 100644 --- a/src/surface.c +++ b/src/surface.c @@ -75,6 +75,60 @@ void surface_reset_format_cache(void) LAST_OUTPUT_HEIGHT = 0; } +/* + * Iter2 Fix 3 helpers — bind / unbind a cap_pool_slot to an + * object_surface. Called from BeginPicture (acquire+bind) and + * DestroySurfaces (unbind). Populates surface_object->destination_* + * fields from the slot so existing code paths (the QBUF in + * picture.c::EndPicture, the EXPBUF in ExportSurfaceHandle, the + * mmap-read in copy_surface_to_image) continue to work unchanged. + * + * surface_bind_slot is called only from BeginPicture; the surface's + * format-uniform fields (destination_planes_count, destination_sizes, + * destination_offsets, destination_bytesperlines) are already set + * by CreateSurfaces2 and stay constant. + */ +void surface_bind_slot(struct object_surface *surface_object, + struct cap_pool_slot *slot) +{ + unsigned int j; + + surface_object->current_slot = slot; + surface_object->destination_index = slot->v4l2_index; + surface_object->destination_buffers_count = slot->buffers_count; + + for (j = 0; j < slot->buffers_count; j++) { + surface_object->destination_map[j] = slot->map[j]; + surface_object->destination_map_lengths[j] = slot->map_lengths[j]; + surface_object->destination_map_offsets[j] = slot->map_offsets[j]; + } + + /* + * destination_data[j] is the per-plane CPU pointer used by + * copy_surface_to_image. For single-buffer MPLANE NV12 (our + * common case), all planes live in slot->map[0] at varying + * offsets recorded in destination_offsets[]. + */ + if (slot->buffers_count == 1) { + for (j = 0; j < surface_object->destination_planes_count; j++) + surface_object->destination_data[j] = + (unsigned char *)slot->map[0] + + surface_object->destination_offsets[j]; + } else { + for (j = 0; j < surface_object->destination_planes_count; j++) + surface_object->destination_data[j] = slot->map[j]; + } +} + +void surface_unbind_slot(struct request_data *driver_data, + struct object_surface *surface_object) +{ + if (surface_object->current_slot == NULL) + return; + cap_pool_release(&driver_data->capture_pool, surface_object->current_slot); + surface_object->current_slot = NULL; +} + VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, unsigned int width, unsigned int height, VASurfaceID *surfaces_ids, @@ -90,8 +144,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, unsigned int destination_planes_count; unsigned int format_width, format_height; unsigned int capture_type; - unsigned int index_base; - unsigned int index; unsigned int i, j; VASurfaceID id; bool found; @@ -128,12 +180,25 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, * also block the implicit format change. Sonnet Phase 5 * review (iter2 9.1) flagged this as a missing REQBUFS(0) * gap on the CAPTURE side of the resolution-change path. + * + * Iter2 Fix 3 corollary: cap_pool owns the CAPTURE buffers' + * mmaps and slot states. Destroy it (which issues REQBUFS(0) + * on capture) before the format change so the next + * CreateSurfaces2 step can rebuild the pool at the new + * resolution. Without this, pool->initialized stays true, + * cap_pool_init below is skipped, and the slots' v4l2_index + * fields point to dead buffers from the prior resolution. */ if (LAST_OUTPUT_WIDTH != 0) { + if (driver_data->capture_pool.initialized) + cap_pool_destroy(&driver_data->capture_pool, + driver_data->video_fd, + v4l2_type_video_capture(true)); + else + (void)v4l2_request_buffers(driver_data->video_fd, + v4l2_type_video_capture(true), 0); (void)v4l2_request_buffers(driver_data->video_fd, output_type, 0); - (void)v4l2_request_buffers(driver_data->video_fd, - v4l2_type_video_capture(true), 0); } rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat, @@ -212,58 +277,58 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, destination_sizes[0], destination_sizes[1], destination_planes_count, video_format->v4l2_buffers_count); - rc = v4l2_create_buffers(driver_data->video_fd, capture_type, - surfaces_count, &index_base); - if (rc < 0) - return VA_STATUS_ERROR_ALLOCATION_FAILED; + /* + * Iter2 Fix 3: initialize the CAPTURE buffer pool on first call. + * Pool size = max(surfaces_count, MIN_CAP_POOL); the +headroom + * gives LRU recycling enough margin to never reuse a buffer + * within the consumer's compositor-hold window for typical + * playback patterns. + * + * If the pool already exists from a prior CreateSurfaces2 (e.g. + * mpv probe surfaces vs. real-resolution surfaces), it stays — + * but if the resolution changed (Fix 1's REQBUFS(0) on CAPTURE + * fired before this point), the pool was destroyed and we + * rebuild here. + */ + if (!driver_data->capture_pool.initialized) { + unsigned int pool_count = surfaces_count > MIN_CAP_POOL ? + surfaces_count : MIN_CAP_POOL; + rc = cap_pool_init(&driver_data->capture_pool, + driver_data->video_fd, capture_type, + pool_count, video_format->v4l2_buffers_count); + if (rc < 0) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + /* + * Compute format-uniform destination_* values (sizes, offsets, + * bytesperlines, planes_count). These are the same for all + * surfaces of this format, set once per surface here, never + * changed by BeginPicture's slot acquisition. + */ + if (video_format->v4l2_buffers_count == 1) { + destination_sizes[0] = destination_bytesperlines[0] * + format_height; + for (j = 1; j < destination_planes_count; j++) + destination_sizes[j] = destination_sizes[0] / 2; + } for (i = 0; i < surfaces_count; i++) { - index = index_base + i; - id = object_heap_allocate(&driver_data->surface_heap); surface_object = SURFACE(driver_data, id); if (surface_object == NULL) return VA_STATUS_ERROR_ALLOCATION_FAILED; - rc = v4l2_query_buffer(driver_data->video_fd, capture_type, - index, - surface_object->destination_map_lengths, - surface_object->destination_map_offsets, - video_format->v4l2_buffers_count); - if (rc < 0) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - for (j = 0; j < video_format->v4l2_buffers_count; j++) { - surface_object->destination_map[j] = - mmap(NULL, - surface_object->destination_map_lengths[j], - PROT_READ | PROT_WRITE, MAP_SHARED, - driver_data->video_fd, - surface_object->destination_map_offsets[j]); - - if (surface_object->destination_map[j] == MAP_FAILED) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - } - - /* - * FIXME: Handle this per-pixelformat, trying to generalize it - * is not a reasonable approach. The final description should be - * in terms of (logical) planes. - */ + surface_object->current_slot = NULL; /* iter2 Fix 3 */ + surface_object->destination_index = 0; /* set on bind */ + surface_object->destination_planes_count = destination_planes_count; + surface_object->destination_buffers_count = + video_format->v4l2_buffers_count; if (video_format->v4l2_buffers_count == 1) { - destination_sizes[0] = destination_bytesperlines[0] * - format_height; - - for (j = 1; j < destination_planes_count; j++) - destination_sizes[j] = destination_sizes[0] / 2; - for (j = 0; j < destination_planes_count; j++) { surface_object->destination_offsets[j] = j > 0 ? destination_sizes[j - 1] : 0; - surface_object->destination_data[j] = - ((unsigned char *)surface_object->destination_map[0] + - surface_object->destination_offsets[j]); surface_object->destination_sizes[j] = destination_sizes[j]; surface_object->destination_bytesperlines[j] = @@ -272,8 +337,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, } else if (video_format->v4l2_buffers_count == destination_planes_count) { for (j = 0; j < destination_planes_count; j++) { surface_object->destination_offsets[j] = 0; - surface_object->destination_data[j] = - surface_object->destination_map[j]; surface_object->destination_sizes[j] = destination_sizes[j]; surface_object->destination_bytesperlines[j] = @@ -291,13 +354,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format, surface_object->source_data = NULL; surface_object->source_size = 0; - surface_object->destination_index = index; - - surface_object->destination_planes_count = - destination_planes_count; - surface_object->destination_buffers_count = - video_format->v4l2_buffers_count; - memset(&surface_object->params, 0, sizeof(surface_object->params)); surface_object->slices_count = 0; @@ -324,7 +380,7 @@ VAStatus RequestDestroySurfaces(VADriverContextP context, { struct request_data *driver_data = context->pDriverData; struct object_surface *surface_object; - unsigned int i, j; + unsigned int i; for (i = 0; i < surfaces_count; i++) { surface_object = SURFACE(driver_data, surfaces_ids[i]); @@ -335,13 +391,13 @@ VAStatus RequestDestroySurfaces(VADriverContextP context, * source_* are now transient borrows from request_pool, not * surface-owned mappings; the pool owns the underlying mmap. * Nothing to free here. + * + * Iter2 Fix 3: destination_* mappings are owned by cap_pool; + * surface_unbind_slot returns the slot to FREE (closing OUR + * EXPBUF fd if any). Pool-owned mmaps are freed at + * cap_pool_destroy time (RequestDestroyContext). */ - - for (j = 0; j < surface_object->destination_buffers_count; j++) - if (surface_object->destination_map[j] != NULL && - surface_object->destination_map_lengths[j] > 0) - munmap(surface_object->destination_map[j], - surface_object->destination_map_lengths[j]); + surface_unbind_slot(driver_data, surface_object); if (surface_object->request_fd > 0) close(surface_object->request_fd); @@ -435,6 +491,17 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id) goto error; } + /* + * Iter2 Fix 3: CAPTURE buffer is back from the kernel with valid + * pixel content. Transition the slot IN_DECODE → DECODED. The slot + * stays bound to this surface until either ExportSurfaceHandle + * (→ EXPORTED), the next BeginPicture for this surface (slot is + * released first), or DestroySurfaces (release). + */ + if (surface_object->current_slot != NULL) + cap_pool_mark_decoded(&driver_data->capture_pool, + surface_object->current_slot); + /* * DEBUG INSTRUMENTATION (0010): hex-dump first 32 bytes of the * decoded CAPTURE Y-plane after DQBUF, plus a 32-byte luma @@ -664,6 +731,18 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context, goto error; } + /* + * Iter2 Fix 3: pool now owns OUR copy of the EXPBUF'd fd. The + * consumer receives a dup'd / equivalent fd via the descriptor. + * Slot transitions DECODED → EXPORTED; it will be force-recyclable + * by LRU when the pool is exhausted, but FREE slots are always + * preferred. + */ + if (surface_object->current_slot != NULL && export_fds_count > 0) + cap_pool_mark_exported(&driver_data->capture_pool, + surface_object->current_slot, + export_fds[0]); + planes_count = surface_object->destination_planes_count; surface_descriptor->fourcc = VA_FOURCC_NV12; diff --git a/src/surface.h b/src/surface.h index 5e2bef3..d2a4cb3 100644 --- a/src/surface.h +++ b/src/surface.h @@ -32,6 +32,9 @@ #include #include "object_heap.h" +#include "cap_pool.h" + +struct request_data; #define SURFACE(data, id) \ ((struct object_surface *)object_heap_lookup(&(data)->surface_heap, id)) @@ -48,6 +51,26 @@ struct object_surface { void *source_data; unsigned int source_size; + /* + * Iter2 Fix 3: destination_* fields below are now per-decode-cycle. + * They are populated from current_slot in RequestBeginPicture and + * remain valid through SyncSurface, ExportSurfaceHandle, and + * DeriveImage/copy_surface_to_image (vaapi-copy path). Subsequent + * BeginPicture for this surface releases the prior slot and + * acquires a new one. + * + * destination_planes_count, destination_sizes, destination_offsets, + * destination_bytesperlines are FORMAT-uniform across all CAPTURE + * buffers, so they're set once at CreateSurfaces2 time and stay. + * + * destination_index, destination_map[], destination_map_lengths, + * destination_map_offsets, destination_data[] are SLOT-specific + * and re-populated each BeginPicture from current_slot. + * + * destination_buffers_count is also format-uniform (V4L2 planes + * per buffer = 1 for single-plane MPLANE NV12). + */ + struct cap_pool_slot *current_slot; /* iter2 Fix 3 */ unsigned int destination_index; void *destination_map[VIDEO_MAX_PLANES]; unsigned int destination_map_lengths[VIDEO_MAX_PLANES]; @@ -146,4 +169,16 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context, */ void surface_reset_format_cache(void); +/* + * Iter2 Fix 3: bind / unbind a CAPTURE-pool slot to an object_surface. + * Called from picture.c::RequestBeginPicture (acquire+bind) and + * surface.c::RequestDestroySurfaces (unbind). Mirrors slot's V4L2 index + * and mmap pointers into surface_object->destination_* so existing + * QBUF/DQBUF/EXPBUF code paths see no behavioral change. + */ +void surface_bind_slot(struct object_surface *surface_object, + struct cap_pool_slot *slot); +void surface_unbind_slot(struct request_data *driver_data, + struct object_surface *surface_object); + #endif