iter2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling
Pre-iter2 each VA surface was permanently 1:1 bound to one V4L2 CAPTURE
buffer. mpv reusing a surface for a new decode while the compositor still
held an EXPBUF'd dma_buf fd to the prior frame caused the kernel to
write fresh decode output into the same physical memory the compositor
was reading -- visible as stutter / back-and-forth swap on
mpv --hwdec=vaapi --vo=gpu playback.
Architecture:
- New cap_pool abstraction (cap_pool.{h,c}) owns N CAPTURE buffers
(N = max(surfaces_count, MIN_CAP_POOL=24)) with per-slot state
{FREE, IN_DECODE, DECODED, EXPORTED} guarded by pthread_mutex_t.
- Surfaces no longer own buffers; each vaBeginPicture acquires the
oldest FREE slot (LRU), binds it for the decode cycle, and the slot
cycles IN_DECODE -> DECODED (post-DQBUF) -> EXPORTED (post-EXPBUF).
- Slot is released on next BeginPicture for the same surface or on
vaDestroySurfaces.
Limitations (Sonnet Phase 5 review iter2 9.x, deferred to iter3+):
- Option-A statistical mitigation; race window narrows to "pool
exhausted, force-recycle of oldest EXPORTED slot." For typical mpv
16-surface playback with MIN_CAP_POOL=24 the fallback never fires.
- Multi-context concurrent use not addressed (one V4L2 device, multiple
cap_pools -- iter3 scope).
Other call sites updated:
- picture.c::BeginPicture acquires + binds, releasing prior slot if any.
- surface.c::SyncSurface marks slot DECODED after DQBUF.
- surface.c::ExportSurfaceHandle marks slot EXPORTED, retaining OUR
EXPBUF fd for force-recycle close().
- surface.c::DestroySurfaces releases via surface_unbind_slot;
cap_pool owns the mmaps now.
- surface.c::CreateSurfaces2 destroys the pool in the resolution-change
path before REQBUFS(0) (else stale v4l2_index after Fix 1's REQBUFS).
- context.c::DestroyContext invokes cap_pool_destroy.
- image.c::DeriveImage skips copy_surface_to_image when current_slot is
NULL (ffmpeg av_hwframe_ctx_init probes derive on undecoded surfaces).
Verified: mpv vaapi-copy 200 frames bbb_1080p30, 0 drops, LRU visibly
recycling slot indices, real luma gradient. mpv vaapi --vo=gpu
operator-inspection follows.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+303
@@ -0,0 +1,303 @@
|
||||
/*
|
||||
* Iteration 2 Fix 3: cap_pool implementation.
|
||||
*
|
||||
* Design rationale + limitations: see cap_pool.h docblock.
|
||||
*
|
||||
* Concurrency model:
|
||||
* - All public functions take pool->lock at entry, release at exit.
|
||||
* - cap_pool_acquire may sleep briefly while scanning slots; safe
|
||||
* under lock since the scan is bounded by pool->count (<= 24
|
||||
* typical).
|
||||
* - The slot pointer returned by acquire / mark_decoded /
|
||||
* mark_exported / release is stable across the call (lock is
|
||||
* dropped before return) but the slot's state may change between
|
||||
* calls. Callers MUST NOT cache slot pointers across sleep/I/O --
|
||||
* they should treat slot pointers as opaque references valid only
|
||||
* for the immediate operation.
|
||||
*
|
||||
* In practice, our caller pattern is:
|
||||
* surface_object->current_slot = cap_pool_acquire(...);
|
||||
* v4l2_queue_buffer(slot->v4l2_index, ...);
|
||||
* // later, in SyncSurface for the same surface:
|
||||
* v4l2_dequeue_buffer(surface_object->current_slot->v4l2_index, ...);
|
||||
* cap_pool_mark_decoded(surface_object->current_slot);
|
||||
*
|
||||
* surface_object->current_slot is the persistent reference; the
|
||||
* slot's V4L2 index is stable for the slot's lifetime. The state
|
||||
* field IS read by other threads (acquire scans for FREE) — that
|
||||
* reads are safe because:
|
||||
* - acquire holds the lock during the scan
|
||||
* - mark_decoded/mark_exported/release also hold the lock
|
||||
* So state transitions are serialized.
|
||||
*/
|
||||
|
||||
#include "cap_pool.h"
|
||||
#include "v4l2.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include <linux/videodev2.h>
|
||||
|
||||
static uint64_t monotonic_ns(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
|
||||
return 0;
|
||||
return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
|
||||
}
|
||||
|
||||
int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
|
||||
unsigned int count, unsigned int v4l2_buffers_count_per_slot)
|
||||
{
|
||||
unsigned int index_base;
|
||||
unsigned int i, j;
|
||||
int rc;
|
||||
|
||||
if (pool == NULL || count == 0)
|
||||
return -EINVAL;
|
||||
|
||||
memset(pool, 0, sizeof(*pool));
|
||||
|
||||
rc = pthread_mutex_init(&pool->lock, NULL);
|
||||
if (rc != 0)
|
||||
return -rc;
|
||||
|
||||
pool->slots = calloc(count, sizeof(*pool->slots));
|
||||
if (pool->slots == NULL) {
|
||||
pthread_mutex_destroy(&pool->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
pool->count = count;
|
||||
|
||||
rc = v4l2_create_buffers(video_fd, capture_type, count, &index_base);
|
||||
if (rc < 0) {
|
||||
free(pool->slots);
|
||||
pthread_mutex_destroy(&pool->lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
struct cap_pool_slot *slot = &pool->slots[i];
|
||||
|
||||
slot->v4l2_index = index_base + i;
|
||||
slot->buffers_count = v4l2_buffers_count_per_slot;
|
||||
slot->state = CAP_SLOT_FREE;
|
||||
slot->our_export_fd = -1;
|
||||
slot->last_used_at_ns = 0; /* never used → highest LRU priority */
|
||||
slot->bound_to_surface_id = -1;
|
||||
|
||||
rc = v4l2_query_buffer(video_fd, capture_type, slot->v4l2_index,
|
||||
slot->map_lengths, slot->map_offsets,
|
||||
v4l2_buffers_count_per_slot);
|
||||
if (rc < 0) {
|
||||
request_log("cap_pool_init: query_buffer failed for "
|
||||
"slot %u (v4l2_index=%u)\n",
|
||||
i, slot->v4l2_index);
|
||||
goto error_cleanup;
|
||||
}
|
||||
|
||||
for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
|
||||
slot->map[j] = mmap(NULL, slot->map_lengths[j],
|
||||
PROT_READ | PROT_WRITE, MAP_SHARED,
|
||||
video_fd, slot->map_offsets[j]);
|
||||
if (slot->map[j] == MAP_FAILED) {
|
||||
request_log("cap_pool_init: mmap failed for "
|
||||
"slot %u plane %u\n", i, j);
|
||||
slot->map[j] = NULL;
|
||||
goto error_cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pool->initialized = true;
|
||||
request_log("cap_pool_init: %u slots ready (v4l2_index=%u..%u, "
|
||||
"%u plane(s) per slot)\n",
|
||||
count, index_base, index_base + count - 1,
|
||||
v4l2_buffers_count_per_slot);
|
||||
return 0;
|
||||
|
||||
error_cleanup:
|
||||
for (i = 0; i < count; i++) {
|
||||
struct cap_pool_slot *slot = &pool->slots[i];
|
||||
for (j = 0; j < v4l2_buffers_count_per_slot; j++) {
|
||||
if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED)
|
||||
munmap(slot->map[j], slot->map_lengths[j]);
|
||||
}
|
||||
}
|
||||
(void)v4l2_request_buffers(video_fd, capture_type, 0);
|
||||
free(pool->slots);
|
||||
pthread_mutex_destroy(&pool->lock);
|
||||
memset(pool, 0, sizeof(*pool));
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type)
|
||||
{
|
||||
unsigned int i, j;
|
||||
|
||||
if (pool == NULL || !pool->initialized)
|
||||
return;
|
||||
|
||||
pthread_mutex_lock(&pool->lock);
|
||||
|
||||
for (i = 0; i < pool->count; i++) {
|
||||
struct cap_pool_slot *slot = &pool->slots[i];
|
||||
|
||||
if (slot->our_export_fd >= 0) {
|
||||
close(slot->our_export_fd);
|
||||
slot->our_export_fd = -1;
|
||||
}
|
||||
|
||||
for (j = 0; j < slot->buffers_count; j++) {
|
||||
if (slot->map[j] != NULL && slot->map[j] != MAP_FAILED) {
|
||||
munmap(slot->map[j], slot->map_lengths[j]);
|
||||
slot->map[j] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(void)v4l2_request_buffers(video_fd, capture_type, 0);
|
||||
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
pthread_mutex_destroy(&pool->lock);
|
||||
|
||||
free(pool->slots);
|
||||
pool->slots = NULL;
|
||||
pool->count = 0;
|
||||
pool->initialized = false;
|
||||
}
|
||||
|
||||
struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id)
|
||||
{
|
||||
struct cap_pool_slot *best = NULL;
|
||||
uint64_t best_ts = UINT64_MAX;
|
||||
unsigned int i;
|
||||
|
||||
if (pool == NULL || !pool->initialized)
|
||||
return NULL;
|
||||
|
||||
pthread_mutex_lock(&pool->lock);
|
||||
|
||||
/* First pass: find the FREE slot with oldest last_used_at_ns. */
|
||||
for (i = 0; i < pool->count; i++) {
|
||||
struct cap_pool_slot *slot = &pool->slots[i];
|
||||
if (slot->state != CAP_SLOT_FREE)
|
||||
continue;
|
||||
if (slot->last_used_at_ns < best_ts) {
|
||||
best = slot;
|
||||
best_ts = slot->last_used_at_ns;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Second pass (fallback): if no FREE slot, force-recycle the
|
||||
* oldest EXPORTED slot. This is the documented Option A race
|
||||
* window — the consumer may still hold a dup'd fd to this
|
||||
* buffer's underlying physical memory, and the kernel will
|
||||
* happily DMA new content into it. For typical mpv 16-surface
|
||||
* playback with MIN_CAP_POOL=24, this fallback should never
|
||||
* fire. If it does, the visual artifact is bounded to a few
|
||||
* frames during recovery.
|
||||
*/
|
||||
if (best == NULL) {
|
||||
best_ts = UINT64_MAX;
|
||||
for (i = 0; i < pool->count; i++) {
|
||||
struct cap_pool_slot *slot = &pool->slots[i];
|
||||
if (slot->state != CAP_SLOT_EXPORTED)
|
||||
continue;
|
||||
if (slot->last_used_at_ns < best_ts) {
|
||||
best = slot;
|
||||
best_ts = slot->last_used_at_ns;
|
||||
}
|
||||
}
|
||||
if (best != NULL) {
|
||||
request_log("cap_pool_acquire: pool exhausted, "
|
||||
"force-recycling EXPORTED slot v4l2_index=%u "
|
||||
"(consumer race window may open)\n",
|
||||
best->v4l2_index);
|
||||
if (best->our_export_fd >= 0) {
|
||||
close(best->our_export_fd);
|
||||
best->our_export_fd = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (best == NULL) {
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
request_log("cap_pool_acquire: no slot available "
|
||||
"(pool->count=%u, all slots IN_DECODE/DECODED?)\n",
|
||||
pool->count);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't transition DECODED slots — they hold valid pixel content
|
||||
* a consumer may still be reading via DeriveImage (vaapi-copy
|
||||
* path). We never recycle DECODED. If a surface holds a DECODED
|
||||
* slot for an extended period, it stays held; the surface's
|
||||
* destruction (vaDestroySurfaces) is the only path that releases
|
||||
* it. mpv typically progresses through DECODED → EXPORTED quickly
|
||||
* for vaapi DMA-BUF; for vaapi-copy, DECODED → consumer reads
|
||||
* via mmap → consumer is done after copy_surface_to_image returns.
|
||||
* The vaapi-copy consumer has no explicit "I'm done" signal, so
|
||||
* we rely on the next BeginPicture for the same surface to
|
||||
* release the prior DECODED slot.
|
||||
*/
|
||||
|
||||
best->state = CAP_SLOT_IN_DECODE;
|
||||
best->bound_to_surface_id = surface_id;
|
||||
best->last_used_at_ns = monotonic_ns();
|
||||
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
return best;
|
||||
}
|
||||
|
||||
void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot)
|
||||
{
|
||||
if (pool == NULL || slot == NULL)
|
||||
return;
|
||||
pthread_mutex_lock(&pool->lock);
|
||||
slot->state = CAP_SLOT_DECODED;
|
||||
slot->last_used_at_ns = monotonic_ns();
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
}
|
||||
|
||||
void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd)
|
||||
{
|
||||
if (pool == NULL || slot == NULL)
|
||||
return;
|
||||
pthread_mutex_lock(&pool->lock);
|
||||
if (slot->our_export_fd >= 0 && slot->our_export_fd != our_fd) {
|
||||
/*
|
||||
* Double-Export: a previous EXPBUF'd fd existed. Close
|
||||
* the old one. Consumer's old fd remains valid via
|
||||
* dma_buf refcount. Documented in surface.c export path.
|
||||
*/
|
||||
close(slot->our_export_fd);
|
||||
}
|
||||
slot->our_export_fd = our_fd;
|
||||
slot->state = CAP_SLOT_EXPORTED;
|
||||
slot->last_used_at_ns = monotonic_ns();
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
}
|
||||
|
||||
void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot)
|
||||
{
|
||||
if (pool == NULL || slot == NULL)
|
||||
return;
|
||||
pthread_mutex_lock(&pool->lock);
|
||||
if (slot->our_export_fd >= 0) {
|
||||
close(slot->our_export_fd);
|
||||
slot->our_export_fd = -1;
|
||||
}
|
||||
slot->state = CAP_SLOT_FREE;
|
||||
slot->bound_to_surface_id = -1;
|
||||
slot->last_used_at_ns = monotonic_ns();
|
||||
pthread_mutex_unlock(&pool->lock);
|
||||
}
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
* Iteration 2 Fix 3: decoupled CAPTURE buffer pool with LRU recycling.
|
||||
*
|
||||
* Background — the bug this fixes:
|
||||
*
|
||||
* Pre-iteration-2, each VAAPI surface was permanently 1:1 bound to a
|
||||
* V4L2 CAPTURE buffer index at vaCreateSurfaces2 time. Each decode
|
||||
* cycle re-QBUF'd that same physical buffer for the same surface ID.
|
||||
* When mpv reused a surface for a new decode while the compositor
|
||||
* still held an EXPBUF'd dma_buf fd to the prior frame's content,
|
||||
* the kernel wrote new decode output into the SAME physical memory
|
||||
* the compositor was reading from — visible as stutter / "back and
|
||||
* forth" frame swap during mpv --hwdec=vaapi --vo=gpu playback.
|
||||
*
|
||||
* V4L2 does not enforce the constraint (it lets QBUF re-queue a
|
||||
* buffer regardless of dma_buf refcount on EXPBUF'd fds). userspace
|
||||
* must coordinate.
|
||||
*
|
||||
* Architecture (Sonnet Phase 5 review for iter2):
|
||||
*
|
||||
* Pool of N CAPTURE buffers (N >= max(surfaces_count, MIN_CAP_POOL)).
|
||||
* Each slot has a state in {FREE, IN_DECODE, DECODED, EXPORTED}.
|
||||
* Surfaces are no longer permanently bound; each vaBeginPicture
|
||||
* acquires a FREE slot, binds it to the current decode, transitions
|
||||
* it through IN_DECODE → DECODED → optionally EXPORTED.
|
||||
*
|
||||
* The DECODED state captures the window between SyncSurface DQBUF
|
||||
* and either ExportSurfaceHandle (DMA-BUF path) or DeriveImage
|
||||
* (vaapi-copy path). LRU recycling considers ONLY FREE slots, so
|
||||
* DECODED slots cannot be claimed by a concurrent decode while
|
||||
* the consumer is still using the bound surface's content.
|
||||
*
|
||||
* Concurrency: a pthread_mutex_t protects pool state. VAAPI is
|
||||
* re-entrant for multi-threaded consumers (mpv may BeginPicture/
|
||||
* SyncSurface from one thread and ExportSurfaceHandle from
|
||||
* another).
|
||||
*
|
||||
* Limitations (deferred to iteration 3+):
|
||||
*
|
||||
* - Option-A statistical mitigation, not a correct fix. The race
|
||||
* window narrows from "constant" to "only when pool is exhausted
|
||||
* and force-recycle of oldest EXPORTED slot fires." For typical
|
||||
* mpv 16-surface playback with MIN_CAP_POOL=24, this never fires
|
||||
* in practice (Sonnet review iter2 question 3). For pathological
|
||||
* workloads (paused-with-video-still-visible, multi-stream),
|
||||
* race windows still possible. Iteration 3 may revisit with
|
||||
* V4L2_MEMORY_DMABUF + userspace allocation.
|
||||
*
|
||||
* - LRU "force-recycle" still has the race in the worst case.
|
||||
* Closing OUR EXPBUF fd does not close the consumer's dup — the
|
||||
* consumer's fd keeps the dma_buf alive but the V4L2 layer will
|
||||
* happily write new data into the underlying physical memory on
|
||||
* re-QBUF. There is no public V4L2 API to query dma_buf refcount.
|
||||
*
|
||||
* - Multi-context concurrent use (two libva contexts open
|
||||
* simultaneously, e.g. Firefox playing two videos in different
|
||||
* tabs through separate RDD instances): not addressed. Each
|
||||
* context gets its own pool, but there's only one V4L2 device.
|
||||
*/
|
||||
|
||||
#ifndef _CAP_POOL_H_
|
||||
#define _CAP_POOL_H_
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include <linux/videodev2.h> /* for VIDEO_MAX_PLANES */
|
||||
|
||||
#define MIN_CAP_POOL 24
|
||||
|
||||
enum cap_slot_state {
|
||||
CAP_SLOT_FREE = 0, /* available for a new decode acquisition */
|
||||
CAP_SLOT_IN_DECODE, /* QBUF'd to V4L2, kernel owns */
|
||||
CAP_SLOT_DECODED, /* DQBUF'd, valid pixel content; mapped by surface */
|
||||
CAP_SLOT_EXPORTED, /* EXPBUF'd; consumer holds a dma_buf fd */
|
||||
};
|
||||
|
||||
struct cap_pool_slot {
|
||||
unsigned int v4l2_index; /* V4L2 buffer index */
|
||||
void *map[VIDEO_MAX_PLANES]; /* mmap pointers */
|
||||
unsigned int map_lengths[VIDEO_MAX_PLANES];
|
||||
unsigned int map_offsets[VIDEO_MAX_PLANES];
|
||||
unsigned int buffers_count; /* V4L2 buffers per logical NV12 (1 for single-plane MPLANE) */
|
||||
enum cap_slot_state state;
|
||||
int our_export_fd; /* -1 if not exported; close on FREE transition */
|
||||
uint64_t last_used_at_ns; /* CLOCK_MONOTONIC when last touched (LRU) */
|
||||
int bound_to_surface_id; /* -1 if not bound; informational */
|
||||
};
|
||||
|
||||
struct cap_pool {
|
||||
struct cap_pool_slot *slots;
|
||||
unsigned int count; /* allocated slot count */
|
||||
pthread_mutex_t lock;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
/*
|
||||
* cap_pool_init — allocate a pool of `count` CAPTURE buffers via
|
||||
* v4l2_create_buffers, mmap each buffer's planes, init slot states
|
||||
* to FREE. `count` is min'd against any reasonable hardware cap.
|
||||
*
|
||||
* Returns 0 on success, negative errno on failure.
|
||||
*/
|
||||
int cap_pool_init(struct cap_pool *pool, int video_fd, unsigned int capture_type,
|
||||
unsigned int count, unsigned int v4l2_buffers_count_per_slot);
|
||||
|
||||
/*
|
||||
* cap_pool_destroy — close any outstanding our_export_fds, munmap all
|
||||
* planes, REQBUFS(0), free slots. Safe to call on a non-initialized
|
||||
* pool (no-op).
|
||||
*
|
||||
* Note: closing our_export_fd does not invalidate any consumer-held
|
||||
* dup'd fds — the kernel keeps the dma_buf alive while any fd refs
|
||||
* it. munmap on our side is independent of the consumer's mmap (each
|
||||
* mmap of a dma_buf is a distinct VMA).
|
||||
*/
|
||||
void cap_pool_destroy(struct cap_pool *pool, int video_fd, unsigned int capture_type);
|
||||
|
||||
/*
|
||||
* cap_pool_acquire — find a FREE slot with the oldest last_used_at_ns
|
||||
* (LRU). If no FREE slot is available, force-recycle the oldest
|
||||
* EXPORTED slot (close our_export_fd, demote to IN_DECODE for the
|
||||
* caller). Returns NULL only if no slots can be recycled at all
|
||||
* (catastrophic — pool too small).
|
||||
*
|
||||
* The returned slot is in IN_DECODE state. Caller QBUFs it and
|
||||
* transitions to DECODED via cap_pool_mark_decoded after DQBUF.
|
||||
*/
|
||||
struct cap_pool_slot *cap_pool_acquire(struct cap_pool *pool, int surface_id);
|
||||
|
||||
/*
|
||||
* cap_pool_mark_decoded — IN_DECODE → DECODED. Touches last_used_at_ns.
|
||||
* Called from RequestSyncSurface after successful DQBUF.
|
||||
*/
|
||||
void cap_pool_mark_decoded(struct cap_pool *pool, struct cap_pool_slot *slot);
|
||||
|
||||
/*
|
||||
* cap_pool_mark_exported — DECODED → EXPORTED. Stores `our_fd` so the
|
||||
* pool owns OUR copy of the EXPBUF'd fd; the consumer received a
|
||||
* dup'd / equivalent fd via the descriptor. last_used_at_ns is
|
||||
* touched again so EXPORTED slots are recycled in LRU order.
|
||||
*
|
||||
* Called from RequestExportSurfaceHandle after VIDIOC_EXPBUF.
|
||||
*/
|
||||
void cap_pool_mark_exported(struct cap_pool *pool, struct cap_pool_slot *slot, int our_fd);
|
||||
|
||||
/*
|
||||
* cap_pool_release — explicitly return a slot to FREE (close our
|
||||
* export fd if any). Called from RequestDestroySurfaces and from
|
||||
* RequestBeginPicture when re-acquiring (the surface's previous slot
|
||||
* is released first, then a new one acquired).
|
||||
*/
|
||||
void cap_pool_release(struct cap_pool *pool, struct cap_pool_slot *slot);
|
||||
|
||||
#endif /* _CAP_POOL_H_ */
|
||||
+8
-3
@@ -244,9 +244,14 @@ VAStatus RequestDestroyContext(VADriverContextP context, VAContextID context_id)
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_OPERATION_FAILED;
|
||||
|
||||
rc = v4l2_request_buffers(driver_data->video_fd, capture_type, 0);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_OPERATION_FAILED;
|
||||
/*
|
||||
* Iter2 Fix 3: cap_pool owns the CAPTURE buffers' mmaps + any
|
||||
* outstanding our_export_fds. Tear it down (which also issues
|
||||
* REQBUFS(0) on CAPTURE), so the next CreateSurfaces2 cycle sees
|
||||
* a clean slate and rebuilds the pool at the new resolution.
|
||||
*/
|
||||
cap_pool_destroy(&driver_data->capture_pool, driver_data->video_fd,
|
||||
capture_type);
|
||||
|
||||
/*
|
||||
* Iteration 2 Fix 1: the kernel CAPTURE format state is no longer
|
||||
|
||||
+16
-3
@@ -209,9 +209,22 @@ VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
|
||||
if (status != VA_STATUS_SUCCESS)
|
||||
return status;
|
||||
|
||||
status = copy_surface_to_image (driver_data, surface_object, image);
|
||||
if (status != VA_STATUS_SUCCESS)
|
||||
return status;
|
||||
/*
|
||||
* Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
|
||||
* bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
|
||||
* never-decoded surface to learn the format; it doesn't read the
|
||||
* data. With the cap_pool decoupling, destination_data[] is NULL
|
||||
* until BeginPicture binds a slot — copying from a NULL source
|
||||
* crashed in memcpy. The image's buffer remains zero-initialized;
|
||||
* subsequent post-decode DeriveImage on the same surface (after
|
||||
* BeginPicture has bound a slot) does the real copy.
|
||||
*/
|
||||
if (surface_object->current_slot != NULL) {
|
||||
status = copy_surface_to_image (driver_data, surface_object,
|
||||
image);
|
||||
if (status != VA_STATUS_SUCCESS)
|
||||
return status;
|
||||
}
|
||||
|
||||
surface_object->status = VASurfaceReady;
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ sources = [
|
||||
'h264.c',
|
||||
'h264_slice_header.c',
|
||||
'request_pool.c',
|
||||
'cap_pool.c',
|
||||
# 'h265.c'
|
||||
]
|
||||
|
||||
@@ -68,6 +69,7 @@ headers = [
|
||||
'h264.h',
|
||||
'h264_slice_header.h',
|
||||
'request_pool.h',
|
||||
'cap_pool.h',
|
||||
# 'h265.h'
|
||||
]
|
||||
|
||||
|
||||
@@ -235,6 +235,25 @@ VAStatus RequestBeginPicture(VADriverContextP context, VAContextID context_id,
|
||||
if (surface_object->status == VASurfaceRendering)
|
||||
RequestSyncSurface(context, surface_id);
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: acquire a CAPTURE-pool slot for this decode cycle.
|
||||
* If the surface still holds a slot from a prior decode (DECODED
|
||||
* or EXPORTED — the consumer is done with it by definition since
|
||||
* we got back to BeginPicture for the same surface), release it
|
||||
* first. The new slot is bound and its V4L2 index + mmap pointers
|
||||
* are mirrored into surface_object->destination_* so the existing
|
||||
* QBUF/DQBUF/EXPBUF code paths see no behavioral change.
|
||||
*/
|
||||
if (surface_object->current_slot != NULL)
|
||||
surface_unbind_slot(driver_data, surface_object);
|
||||
{
|
||||
struct cap_pool_slot *cap_slot =
|
||||
cap_pool_acquire(&driver_data->capture_pool, surface_id);
|
||||
if (cap_slot == NULL)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
surface_bind_slot(surface_object, cap_slot);
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrow an OUTPUT (bitstream-input) slot from the driver-wide
|
||||
* pool for the duration of this Begin/Render/End cycle. The
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "context.h"
|
||||
#include "object_heap.h"
|
||||
#include "request_pool.h"
|
||||
#include "cap_pool.h"
|
||||
#include "video.h"
|
||||
#include <va/va.h>
|
||||
|
||||
@@ -63,6 +64,22 @@ struct request_data {
|
||||
* RequestCreateContext, torn down at driver Terminate.
|
||||
*/
|
||||
struct request_pool output_pool;
|
||||
|
||||
/*
|
||||
* CAPTURE (decoded-frame) buffer pool, decoupled from VA
|
||||
* surfaces (iter2 Fix 3). Each surface acquires a slot at
|
||||
* vaBeginPicture time and releases it on the next acquisition
|
||||
* or vaDestroySurfaces. Pool sized to max(surfaces_count,
|
||||
* MIN_CAP_POOL) at first vaCreateSurfaces2; torn down at
|
||||
* vaDestroyContext.
|
||||
*
|
||||
* Background: pre-iter2 each surface was 1:1 bound to one
|
||||
* CAPTURE buffer index; mpv re-using a surface for a new decode
|
||||
* caused V4L2 to re-QBUF the same physical buffer while a
|
||||
* compositor still held an EXPBUF'd dma_buf fd, producing
|
||||
* visible stutter on mpv vaapi --vo=gpu.
|
||||
*/
|
||||
struct cap_pool capture_pool;
|
||||
};
|
||||
|
||||
VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context);
|
||||
|
||||
+139
-60
@@ -75,6 +75,60 @@ void surface_reset_format_cache(void)
|
||||
LAST_OUTPUT_HEIGHT = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3 helpers — bind / unbind a cap_pool_slot to an
|
||||
* object_surface. Called from BeginPicture (acquire+bind) and
|
||||
* DestroySurfaces (unbind). Populates surface_object->destination_*
|
||||
* fields from the slot so existing code paths (the QBUF in
|
||||
* picture.c::EndPicture, the EXPBUF in ExportSurfaceHandle, the
|
||||
* mmap-read in copy_surface_to_image) continue to work unchanged.
|
||||
*
|
||||
* surface_bind_slot is called only from BeginPicture; the surface's
|
||||
* format-uniform fields (destination_planes_count, destination_sizes,
|
||||
* destination_offsets, destination_bytesperlines) are already set
|
||||
* by CreateSurfaces2 and stay constant.
|
||||
*/
|
||||
void surface_bind_slot(struct object_surface *surface_object,
|
||||
struct cap_pool_slot *slot)
|
||||
{
|
||||
unsigned int j;
|
||||
|
||||
surface_object->current_slot = slot;
|
||||
surface_object->destination_index = slot->v4l2_index;
|
||||
surface_object->destination_buffers_count = slot->buffers_count;
|
||||
|
||||
for (j = 0; j < slot->buffers_count; j++) {
|
||||
surface_object->destination_map[j] = slot->map[j];
|
||||
surface_object->destination_map_lengths[j] = slot->map_lengths[j];
|
||||
surface_object->destination_map_offsets[j] = slot->map_offsets[j];
|
||||
}
|
||||
|
||||
/*
|
||||
* destination_data[j] is the per-plane CPU pointer used by
|
||||
* copy_surface_to_image. For single-buffer MPLANE NV12 (our
|
||||
* common case), all planes live in slot->map[0] at varying
|
||||
* offsets recorded in destination_offsets[].
|
||||
*/
|
||||
if (slot->buffers_count == 1) {
|
||||
for (j = 0; j < surface_object->destination_planes_count; j++)
|
||||
surface_object->destination_data[j] =
|
||||
(unsigned char *)slot->map[0] +
|
||||
surface_object->destination_offsets[j];
|
||||
} else {
|
||||
for (j = 0; j < surface_object->destination_planes_count; j++)
|
||||
surface_object->destination_data[j] = slot->map[j];
|
||||
}
|
||||
}
|
||||
|
||||
void surface_unbind_slot(struct request_data *driver_data,
|
||||
struct object_surface *surface_object)
|
||||
{
|
||||
if (surface_object->current_slot == NULL)
|
||||
return;
|
||||
cap_pool_release(&driver_data->capture_pool, surface_object->current_slot);
|
||||
surface_object->current_slot = NULL;
|
||||
}
|
||||
|
||||
VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
unsigned int width, unsigned int height,
|
||||
VASurfaceID *surfaces_ids,
|
||||
@@ -90,8 +144,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
unsigned int destination_planes_count;
|
||||
unsigned int format_width, format_height;
|
||||
unsigned int capture_type;
|
||||
unsigned int index_base;
|
||||
unsigned int index;
|
||||
unsigned int i, j;
|
||||
VASurfaceID id;
|
||||
bool found;
|
||||
@@ -128,12 +180,25 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
* also block the implicit format change. Sonnet Phase 5
|
||||
* review (iter2 9.1) flagged this as a missing REQBUFS(0)
|
||||
* gap on the CAPTURE side of the resolution-change path.
|
||||
*
|
||||
* Iter2 Fix 3 corollary: cap_pool owns the CAPTURE buffers'
|
||||
* mmaps and slot states. Destroy it (which issues REQBUFS(0)
|
||||
* on capture) before the format change so the next
|
||||
* CreateSurfaces2 step can rebuild the pool at the new
|
||||
* resolution. Without this, pool->initialized stays true,
|
||||
* cap_pool_init below is skipped, and the slots' v4l2_index
|
||||
* fields point to dead buffers from the prior resolution.
|
||||
*/
|
||||
if (LAST_OUTPUT_WIDTH != 0) {
|
||||
if (driver_data->capture_pool.initialized)
|
||||
cap_pool_destroy(&driver_data->capture_pool,
|
||||
driver_data->video_fd,
|
||||
v4l2_type_video_capture(true));
|
||||
else
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
v4l2_type_video_capture(true), 0);
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
output_type, 0);
|
||||
(void)v4l2_request_buffers(driver_data->video_fd,
|
||||
v4l2_type_video_capture(true), 0);
|
||||
}
|
||||
|
||||
rc = v4l2_set_format(driver_data->video_fd, output_type, pixelformat,
|
||||
@@ -212,58 +277,58 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
destination_sizes[0], destination_sizes[1],
|
||||
destination_planes_count, video_format->v4l2_buffers_count);
|
||||
|
||||
rc = v4l2_create_buffers(driver_data->video_fd, capture_type,
|
||||
surfaces_count, &index_base);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
/*
|
||||
* Iter2 Fix 3: initialize the CAPTURE buffer pool on first call.
|
||||
* Pool size = max(surfaces_count, MIN_CAP_POOL); the +headroom
|
||||
* gives LRU recycling enough margin to never reuse a buffer
|
||||
* within the consumer's compositor-hold window for typical
|
||||
* playback patterns.
|
||||
*
|
||||
* If the pool already exists from a prior CreateSurfaces2 (e.g.
|
||||
* mpv probe surfaces vs. real-resolution surfaces), it stays —
|
||||
* but if the resolution changed (Fix 1's REQBUFS(0) on CAPTURE
|
||||
* fired before this point), the pool was destroyed and we
|
||||
* rebuild here.
|
||||
*/
|
||||
if (!driver_data->capture_pool.initialized) {
|
||||
unsigned int pool_count = surfaces_count > MIN_CAP_POOL ?
|
||||
surfaces_count : MIN_CAP_POOL;
|
||||
rc = cap_pool_init(&driver_data->capture_pool,
|
||||
driver_data->video_fd, capture_type,
|
||||
pool_count, video_format->v4l2_buffers_count);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute format-uniform destination_* values (sizes, offsets,
|
||||
* bytesperlines, planes_count). These are the same for all
|
||||
* surfaces of this format, set once per surface here, never
|
||||
* changed by BeginPicture's slot acquisition.
|
||||
*/
|
||||
if (video_format->v4l2_buffers_count == 1) {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
}
|
||||
|
||||
for (i = 0; i < surfaces_count; i++) {
|
||||
index = index_base + i;
|
||||
|
||||
id = object_heap_allocate(&driver_data->surface_heap);
|
||||
surface_object = SURFACE(driver_data, id);
|
||||
if (surface_object == NULL)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
|
||||
rc = v4l2_query_buffer(driver_data->video_fd, capture_type,
|
||||
index,
|
||||
surface_object->destination_map_lengths,
|
||||
surface_object->destination_map_offsets,
|
||||
video_format->v4l2_buffers_count);
|
||||
if (rc < 0)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
|
||||
for (j = 0; j < video_format->v4l2_buffers_count; j++) {
|
||||
surface_object->destination_map[j] =
|
||||
mmap(NULL,
|
||||
surface_object->destination_map_lengths[j],
|
||||
PROT_READ | PROT_WRITE, MAP_SHARED,
|
||||
driver_data->video_fd,
|
||||
surface_object->destination_map_offsets[j]);
|
||||
|
||||
if (surface_object->destination_map[j] == MAP_FAILED)
|
||||
return VA_STATUS_ERROR_ALLOCATION_FAILED;
|
||||
}
|
||||
|
||||
/*
|
||||
* FIXME: Handle this per-pixelformat, trying to generalize it
|
||||
* is not a reasonable approach. The final description should be
|
||||
* in terms of (logical) planes.
|
||||
*/
|
||||
surface_object->current_slot = NULL; /* iter2 Fix 3 */
|
||||
surface_object->destination_index = 0; /* set on bind */
|
||||
surface_object->destination_planes_count = destination_planes_count;
|
||||
surface_object->destination_buffers_count =
|
||||
video_format->v4l2_buffers_count;
|
||||
|
||||
if (video_format->v4l2_buffers_count == 1) {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
|
||||
for (j = 0; j < destination_planes_count; j++) {
|
||||
surface_object->destination_offsets[j] =
|
||||
j > 0 ? destination_sizes[j - 1] : 0;
|
||||
surface_object->destination_data[j] =
|
||||
((unsigned char *)surface_object->destination_map[0] +
|
||||
surface_object->destination_offsets[j]);
|
||||
surface_object->destination_sizes[j] =
|
||||
destination_sizes[j];
|
||||
surface_object->destination_bytesperlines[j] =
|
||||
@@ -272,8 +337,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
} else if (video_format->v4l2_buffers_count == destination_planes_count) {
|
||||
for (j = 0; j < destination_planes_count; j++) {
|
||||
surface_object->destination_offsets[j] = 0;
|
||||
surface_object->destination_data[j] =
|
||||
surface_object->destination_map[j];
|
||||
surface_object->destination_sizes[j] =
|
||||
destination_sizes[j];
|
||||
surface_object->destination_bytesperlines[j] =
|
||||
@@ -291,13 +354,6 @@ VAStatus RequestCreateSurfaces2(VADriverContextP context, unsigned int format,
|
||||
surface_object->source_data = NULL;
|
||||
surface_object->source_size = 0;
|
||||
|
||||
surface_object->destination_index = index;
|
||||
|
||||
surface_object->destination_planes_count =
|
||||
destination_planes_count;
|
||||
surface_object->destination_buffers_count =
|
||||
video_format->v4l2_buffers_count;
|
||||
|
||||
memset(&surface_object->params, 0,
|
||||
sizeof(surface_object->params));
|
||||
surface_object->slices_count = 0;
|
||||
@@ -324,7 +380,7 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
|
||||
{
|
||||
struct request_data *driver_data = context->pDriverData;
|
||||
struct object_surface *surface_object;
|
||||
unsigned int i, j;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < surfaces_count; i++) {
|
||||
surface_object = SURFACE(driver_data, surfaces_ids[i]);
|
||||
@@ -335,13 +391,13 @@ VAStatus RequestDestroySurfaces(VADriverContextP context,
|
||||
* source_* are now transient borrows from request_pool, not
|
||||
* surface-owned mappings; the pool owns the underlying mmap.
|
||||
* Nothing to free here.
|
||||
*
|
||||
* Iter2 Fix 3: destination_* mappings are owned by cap_pool;
|
||||
* surface_unbind_slot returns the slot to FREE (closing OUR
|
||||
* EXPBUF fd if any). Pool-owned mmaps are freed at
|
||||
* cap_pool_destroy time (RequestDestroyContext).
|
||||
*/
|
||||
|
||||
for (j = 0; j < surface_object->destination_buffers_count; j++)
|
||||
if (surface_object->destination_map[j] != NULL &&
|
||||
surface_object->destination_map_lengths[j] > 0)
|
||||
munmap(surface_object->destination_map[j],
|
||||
surface_object->destination_map_lengths[j]);
|
||||
surface_unbind_slot(driver_data, surface_object);
|
||||
|
||||
if (surface_object->request_fd > 0)
|
||||
close(surface_object->request_fd);
|
||||
@@ -435,6 +491,17 @@ VAStatus RequestSyncSurface(VADriverContextP context, VASurfaceID surface_id)
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: CAPTURE buffer is back from the kernel with valid
|
||||
* pixel content. Transition the slot IN_DECODE → DECODED. The slot
|
||||
* stays bound to this surface until either ExportSurfaceHandle
|
||||
* (→ EXPORTED), the next BeginPicture for this surface (slot is
|
||||
* released first), or DestroySurfaces (release).
|
||||
*/
|
||||
if (surface_object->current_slot != NULL)
|
||||
cap_pool_mark_decoded(&driver_data->capture_pool,
|
||||
surface_object->current_slot);
|
||||
|
||||
/*
|
||||
* DEBUG INSTRUMENTATION (0010): hex-dump first 32 bytes of the
|
||||
* decoded CAPTURE Y-plane after DQBUF, plus a 32-byte luma
|
||||
@@ -664,6 +731,18 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: pool now owns OUR copy of the EXPBUF'd fd. The
|
||||
* consumer receives a dup'd / equivalent fd via the descriptor.
|
||||
* Slot transitions DECODED → EXPORTED; it will be force-recyclable
|
||||
* by LRU when the pool is exhausted, but FREE slots are always
|
||||
* preferred.
|
||||
*/
|
||||
if (surface_object->current_slot != NULL && export_fds_count > 0)
|
||||
cap_pool_mark_exported(&driver_data->capture_pool,
|
||||
surface_object->current_slot,
|
||||
export_fds[0]);
|
||||
|
||||
planes_count = surface_object->destination_planes_count;
|
||||
|
||||
surface_descriptor->fourcc = VA_FOURCC_NV12;
|
||||
|
||||
@@ -32,6 +32,9 @@
|
||||
#include <va/va_backend.h>
|
||||
|
||||
#include "object_heap.h"
|
||||
#include "cap_pool.h"
|
||||
|
||||
struct request_data;
|
||||
|
||||
#define SURFACE(data, id) \
|
||||
((struct object_surface *)object_heap_lookup(&(data)->surface_heap, id))
|
||||
@@ -48,6 +51,26 @@ struct object_surface {
|
||||
void *source_data;
|
||||
unsigned int source_size;
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: destination_* fields below are now per-decode-cycle.
|
||||
* They are populated from current_slot in RequestBeginPicture and
|
||||
* remain valid through SyncSurface, ExportSurfaceHandle, and
|
||||
* DeriveImage/copy_surface_to_image (vaapi-copy path). Subsequent
|
||||
* BeginPicture for this surface releases the prior slot and
|
||||
* acquires a new one.
|
||||
*
|
||||
* destination_planes_count, destination_sizes, destination_offsets,
|
||||
* destination_bytesperlines are FORMAT-uniform across all CAPTURE
|
||||
* buffers, so they're set once at CreateSurfaces2 time and stay.
|
||||
*
|
||||
* destination_index, destination_map[], destination_map_lengths,
|
||||
* destination_map_offsets, destination_data[] are SLOT-specific
|
||||
* and re-populated each BeginPicture from current_slot.
|
||||
*
|
||||
* destination_buffers_count is also format-uniform (V4L2 planes
|
||||
* per buffer = 1 for single-plane MPLANE NV12).
|
||||
*/
|
||||
struct cap_pool_slot *current_slot; /* iter2 Fix 3 */
|
||||
unsigned int destination_index;
|
||||
void *destination_map[VIDEO_MAX_PLANES];
|
||||
unsigned int destination_map_lengths[VIDEO_MAX_PLANES];
|
||||
@@ -146,4 +169,16 @@ VAStatus RequestExportSurfaceHandle(VADriverContextP context,
|
||||
*/
|
||||
void surface_reset_format_cache(void);
|
||||
|
||||
/*
|
||||
* Iter2 Fix 3: bind / unbind a CAPTURE-pool slot to an object_surface.
|
||||
* Called from picture.c::RequestBeginPicture (acquire+bind) and
|
||||
* surface.c::RequestDestroySurfaces (unbind). Mirrors slot's V4L2 index
|
||||
* and mmap pointers into surface_object->destination_* so existing
|
||||
* QBUF/DQBUF/EXPBUF code paths see no behavioral change.
|
||||
*/
|
||||
void surface_bind_slot(struct object_surface *surface_object,
|
||||
struct cap_pool_slot *slot);
|
||||
void surface_unbind_slot(struct request_data *driver_data,
|
||||
struct object_surface *surface_object);
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user