iter2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling
Pre-iter2 each VA surface was permanently 1:1 bound to one V4L2 CAPTURE
buffer. mpv reusing a surface for a new decode while the compositor still
held an EXPBUF'd dma_buf fd to the prior frame caused the kernel to
write fresh decode output into the same physical memory the compositor
was reading -- visible as stutter / back-and-forth swap on
mpv --hwdec=vaapi --vo=gpu playback.
Architecture:
- New cap_pool abstraction (cap_pool.{h,c}) owns N CAPTURE buffers
(N = max(surfaces_count, MIN_CAP_POOL=24)) with per-slot state
{FREE, IN_DECODE, DECODED, EXPORTED} guarded by pthread_mutex_t.
- Surfaces no longer own buffers; each vaBeginPicture acquires the
oldest FREE slot (LRU), binds it for the decode cycle, and the slot
cycles IN_DECODE -> DECODED (post-DQBUF) -> EXPORTED (post-EXPBUF).
- Slot is released on next BeginPicture for the same surface or on
vaDestroySurfaces.
Limitations (Sonnet Phase 5 review iter2 9.x, deferred to iter3+):
- This is the Option-A statistical mitigation, not a complete fix: the
race window narrows to the case "pool exhausted, force-recycle of the
oldest EXPORTED slot." For typical mpv 16-surface playback with
MIN_CAP_POOL=24 the fallback never fires.
- Multi-context concurrent use not addressed (one V4L2 device, multiple
cap_pools -- iter3 scope).
Other call sites updated:
- picture.c::BeginPicture acquires + binds, releasing prior slot if any.
- surface.c::SyncSurface marks slot DECODED after DQBUF.
- surface.c::ExportSurfaceHandle marks slot EXPORTED, retaining OUR
EXPBUF fd for force-recycle close().
- surface.c::DestroySurfaces releases via surface_unbind_slot;
cap_pool owns the mmaps now.
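- surface.c::CreateSurfaces2 destroys the pool in the resolution-change
path (Fix 1's REQBUFS(0) path) before the format change; otherwise the
pool stays initialized, its rebuild is skipped, and the slots keep stale
v4l2_index values that refer to buffers from the prior resolution.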
- context.c::DestroyContext invokes cap_pool_destroy.
- image.c::DeriveImage skips copy_surface_to_image when current_slot is
NULL (ffmpeg av_hwframe_ctx_init probes derive on undecoded surfaces).
Verified: mpv vaapi-copy 200 frames bbb_1080p30, 0 drops, LRU visibly
recycling slot indices, real luma gradient. mpv vaapi --vo=gpu
operator-inspection follows.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+139
-60
@@ -75,6 +75,60 @@ void surface_reset_format_cache(void)
|
||||
LAST_OUTPUT_HEIGHT = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3 helpers — bind / unbind a cap_pool_slot to an
|
||||
* object_surface. Called from BeginPicture (acquire+bind) and
|
||||
* DestroySurfaces (unbind). Populates surface_object->destination_*
|
||||
* fields from the slot so existing code paths (the QBUF in
|
||||
* picture.c::EndPicture, the EXPBUF in ExportSurfaceHandle, the
|
||||
* mmap-read in copy_surface_to_image) continue to work unchanged.
|
||||
*
|
||||
* surface_bind_slot is called only from BeginPicture; the surface's
|
||||
* format-uniform fields (destination_planes_count, destination_sizes,
|
||||
* destination_offsets, destination_bytesperlines) are already set
|
||||
* by CreateSurfaces2 and stay constant.
|
||||
*/
|
||||
void surface_bind_slot(struct object_surface *surface_object,
|
||||
struct cap_pool_slot *slot)
|
||||
{
|
||||
unsigned int j;
|
||||
|
||||
surface_object->current_slot = slot;
|
||||
surface_object->destination_index = slot->v4l2_index;
|
||||
surface_object->destination_buffers_count = slot->buffers_count;
|
||||
|
||||
for (j = 0; j < slot->buffers_count; j++) {
|
||||
surface_object->destination_map[j] = slot->map[j];
|
||||
surface_object->destination_map_lengths[j] = slot->map_lengths[j];
|
||||
surface_object->destination_map_offsets[j] = slot->map_offsets[j];
|
||||
}
|
||||
|
||||
/*
|
||||
* destination_data[j] is the per-plane CPU pointer used by
|
||||
* copy_surface_to_image. For single-buffer MPLANE NV12 (our
|
||||
* common case), all planes live in slot->map[0] at varying
|
||||
* offsets recorded in destination_offsets[].
|
||||
*/
|
||||
if (slot->buffers_count == 1) {
|
||||
for (j = 0; j < surface_object->destination_planes_count; j++)
|
||||
surface_object->destination_data[j] =
|
||||
(unsigned char *)slot->map[0] +
|
||||
surface_object->destination_offsets[j];
|
||||
} else {
|
||||
for (j = 0; j < surface_object->destination_planes_count; j++)
|
||||
surface_object->destination_data[j] = slot->map[j];
|
||||
}
|
||||
}
|
||||
|
||||
void surface_unbind_slot(struct request_data *driver_data,
|
||||
struct object_surface *surface_object)
|
||||
{
|
||||
if (surface_object->current_slot == NULL)
|
||||
return;
|
||||
cap_pool_release(&driver_data->capture_pool, surface_object->current_slot);
|
||||
surface_object->current_slot = NULL;
|
||||
}
|
||||
|
||||
VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
unsigned int width, unsigned int height,
|
||||
VASurfaceID *surfaces_ids,
|
||||
@@ -90,8 +144,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
unsigned int destination_planes_count;
|
||||
unsigned int format_width, format_height;
|
||||
unsigned int capture_type;
|
||||
unsigned int index_base;
|
||||
unsigned int index;
|
||||
unsigned int i, j;
|
||||
VASurfaceID id;
|
||||
bool found;
|
||||
@@ -128,12 +180,25 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
* also block the implicit format change. Sonnet Phase 5
|
||||
* review (iter2 9.1) flagged this as a missing REQBUFS(0)
|
||||
* gap on the CAPTURE side of the resolution-change path.
|
||||
*
|
||||
* Iter2 Fix 3 corollary: cap_pool owns the CAPTURE buffers'
|
||||
* mmaps and slot states. Destroy it (which issues REQBUFS(0)
|
||||
* on capture) before the format change so the next
|
||||
* CreateSurfaces2 step can rebuild the pool at the new
|
||||
* resolution. Without this, pool->initialized stays true,
|
||||
* cap_pool_init below is skipped, and the slots' v4l2_index
|
||||
* fields point to dead buffers from the prior resolution.
|
||||
*/
|
||||
if (LAST_OUTPUT_WIDTH != 0) {
|
||||
if (driver_data->capture_pool.initialized)
|
||||
cap_pool_destroy(&driver_data->capture_pool,
|
||||
driver_data->video_fd,
|
||||
v4l2_type_video_capture(true));
|
||||
else
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
v4l2_type_video_capture(true), 0);
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
output_type, 0);
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
v4l2_type_video_capture(true), 0);
|
||||
}
|
||||
|
||||
rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
|
||||
@@ -212,58 +277,58 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
destination_sizes[0], destination_sizes[1],
|
||||
destination_planes_count, video_format->v4l2_buffers_count);
|
||||
|
||||
rc = v4l2_create_buffers(driver_data->video_fd, capture_type,
|
||||
surfaces_count, &index_base);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
/*
|
||||
* Iter2 Fix 3: initialize the CAPTURE buffer pool on first call.
|
||||
* Pool size = max(surfaces_count, MIN_CAP_POOL); the +headroom
|
||||
* gives LRU recycling enough margin to never reuse a buffer
|
||||
* within the consumer's compositor-hold window for typical
|
||||
* playback patterns.
|
||||
*
|
||||
* If the pool already exists from a prior CreateSurfaces2 (e.g.
|
||||
* mpv probe surfaces vs. real-resolution surfaces), it stays —
|
||||
* but if the resolution changed (Fix 1's REQBUFS(0) on CAPTURE
|
||||
* fired before this point), the pool was destroyed and we
|
||||
* rebuild here.
|
||||
*/
|
||||
if (!driver_data->capture_pool.initialized) {
|
||||
unsigned int pool_count = surfaces_count > MIN_CAP_POOL ?
|
||||
surfaces_count : MIN_CAP_POOL;
|
||||
rc = cap_pool_init(&driver_data->capture_pool,
|
||||
driver_data->video_fd, capture_type,
|
||||
pool_count, video_format->v4l2_buffers_count);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute format-uniform destination_* values (sizes, offsets,
|
||||
* bytesperlines, planes_count). These are the same for all
|
||||
* surfaces of this format, set once per surface here, never
|
||||
* changed by BeginPicture's slot acquisition.
|
||||
*/
|
||||
if (video_format->v4l2_buffers_count == 1) {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
}
|
||||
|
||||
for (i = 0; i < surfaces_count; i++) {
|
||||
index = index_base + i;
|
||||
|
||||
id = object_heap_allocate(&driver_data->surface_heap);
|
||||
surface_object = SURFACE(driver_data, id);
|
||||
if (surface_object == NULL)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
|
||||
rc = v4l2_query_buffer(driver_data->video_fd, capture_type,
|
||||
index,
|
||||
surface_object->destination_map_lengths,
|
||||
surface_object->destination_map_offsets,
|
||||
video_format->v4l2_buffers_count);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
|
||||
for (j = 0; j < video_format->v4l2_buffers_count; j++) {
|
||||
surface_object->destination_map[j] =
|
||||
mmap(NULL,
|
||||
surface_object->destination_map_lengths[j],
|
||||
PROT_READ | PROT_WRITE, MAP_SHARED,
|
||||
driver_data->video_fd,
|
||||
surface_object->destination_map_offsets[j]);
|
||||
|
||||
if (surface_object->destination_map[j] == MAP_FAILED)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
}
|
||||
|
||||
/*
|
||||
* FIXME: Handle this per-pixelformat, trying to generalize it
|
||||
* is not a reasonable approach. The final description should be
|
||||
* in terms of (logical) planes.
|
||||
*/
|
||||
surface_object->current_slot = NULL; /* iter2 Fix 3 */
|
||||
surface_object->destination_index = 0; /* set on bind */
|
||||
surface_object->destination_planes_count = destination_planes_count;
|
||||
surface_object->destination_buffers_count =
|
||||
video_format->v4l2_buffers_count;
|
||||
|
||||
if (video_format->v4l2_buffers_count == 1) {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
|
||||
for (j = 0; j < destination_planes_count; j++) {
|
||||
surface_object->destination_offsets[j] =
|
||||
j > 0 ? destination_sizes[j - 1] : 0;
|
||||
surface_object->destination_data[j] =
|
||||
((unsigned char *)surface_object->destination_map[0] +
|
||||
surface_object->destination_offsets[j]);
|
||||
surface_object->destination_sizes[j] =
|
||||
destination_sizes[j];
|
||||
surface_object->destination_bytesperlines[j] =
|
||||
@@ -272,8 +337,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
} else if (video_format->v4l2_buffers_count == destination_planes_count) {
|
||||
for (j = 0; j < destination_planes_count; j++) {
|
||||
surface_object->destination_offsets[j] = 0;
|
||||
surface_object->destination_data[j] =
|
||||
surface_object->destination_map[j];
|
||||
surface_object->destination_sizes[j] =
|
||||
destination_sizes[j];
|
||||
surface_object->destination_bytesperlines[j] =
|
||||
@@ -291,13 +354,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
surface_object->source_data = NULL;
|
||||
surface_object->source_size = 0;
|
||||
|
||||
surface_object->destination_index = index;
|
||||
|
||||
surface_object->destination_planes_count =
|
||||
destination_planes_count;
|
||||
surface_object->destination_buffers_count =
|
||||
video_format->v4l2_buffers_count;
|
||||
|
||||
memset(&surface_object->params, 0,
|
||||
sizeof(surface_object->params));
|
||||
surface_object->slices_count = 0;
|
||||
@@ -324,7 +380,7 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
|
||||
{
|
||||
struct request_data *driver_data = context->pDriverData;
|
||||
struct object_surface *surface_object;
|
||||
unsigned int i, j;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < surfaces_count; i++) {
|
||||
surface_object = SURFACE(driver_data, surfaces_ids[i]);
|
||||
@@ -335,13 +391,13 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
|
||||
* source_* are now transient borrows from request_pool, not
|
||||
* surface-owned mappings; the pool owns the underlying mmap.
|
||||
* Nothing to free here.
|
||||
*
|
||||
* Iter2 Fix 3: destination_* mappings are owned by cap_pool;
|
||||
* surface_unbind_slot returns the slot to FREE (closing OUR
|
||||
* EXPBUF fd if any). Pool-owned mmaps are freed at
|
||||
* cap_pool_destroy time (RequestDestroyContext).
|
||||
*/
|
||||
|
||||
for (j = 0; j < surface_object->destination_buffers_count; j++)
|
||||
if (surface_object->destination_map[j] != NULL &&
|
||||
surface_object->destination_map_lengths[j] > 0)
|
||||
munmap(surface_object->destination_map[j],
|
||||
surface_object->destination_map_lengths[j]);
|
||||
surface_unbind_slot(driver_data, surface_object);
|
||||
|
||||
if (surface_object->request_fd > 0)
|
||||
close(surface_object->request_fd);
|
||||
@@ -435,6 +491,17 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: CAPTURE buffer is back from the kernel with valid
|
||||
* pixel content. Transition the slot IN_DECODE → DECODED. The slot
|
||||
* stays bound to this surface until either ExportSurfaceHandle
|
||||
* (→ EXPORTED), the next BeginPicture for this surface (slot is
|
||||
* released first), or DestroySurfaces (release).
|
||||
*/
|
||||
if (surface_object->current_slot != NULL)
|
||||
cap_pool_mark_decoded(&driver_data->capture_pool,
|
||||
surface_object->current_slot);
|
||||
|
||||
/*
|
||||
* DEBUG INSTRUMENTATION (0010): hex-dump first 32 bytes of the
|
||||
* decoded CAPTURE Y-plane after DQBUF, plus a 32-byte luma
|
||||
@@ -664,6 +731,18 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: pool now owns OUR copy of the EXPBUF'd fd. The
|
||||
* consumer receives a dup'd / equivalent fd via the descriptor.
|
||||
* Slot transitions DECODED → EXPORTED; it will be force-recyclable
|
||||
* by LRU when the pool is exhausted, but FREE slots are always
|
||||
* preferred.
|
||||
*/
|
||||
if (surface_object->current_slot != NULL && export_fds_count > 0)
|
||||
cap_pool_mark_exported(&driver_data->capture_pool,
|
||||
surface_object->current_slot,
|
||||
export_fds[0]);
|
||||
|
||||
planes_count = surface_object->destination_planes_count;
|
||||
|
||||
surface_descriptor->fourcc = VA_FOURCC_NV12;
|
||||
|
||||
Reference in New Issue
Block a user