From ca4dd880070255fb62665b617308d35701813338 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Thu, 14 May 2026 08:06:10 +0000 Subject: [PATCH] =?UTF-8?q?iter13=20=CE=B1-17:=20explicit=20DMA=5FBUF=5FIO?= =?UTF-8?q?CTL=5FSYNC=20around=20copy=5Fsurface=5Fto=5Fimage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit V4L2 CAPTURE buffers are V4L2_MEMORY_MMAP and mapped cached. Kernel DMA writes don't propagate to CPU cache observer; reading destination_data[] without DMA_BUF_IOCTL_SYNC(START|READ) returns stale data on RK3399 — observed as Bug 4 (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva goes through cached-mmap readback while kdirect ffmpeg-v4l2request + DRM_PRIME-mmap reads cleanly via implicit sync. Per Tomasz Figa's 2024 linaro-mm-sig discussion + feedback_rfc_v2_ vb2_dma_resv_scope.md: userspace responsibility for cache sync on cached-mmap'd V4L2 buffers. RFC v2 fence work doesn't engage this path; this ioctl pair does. Just-in-time EXPBUF + SYNC + close per copy. Per-call cost is one ioctl pair + one fd lifecycle per plane. Could cache the EXPBUF fd on cap_pool slot but doing it transient keeps lifecycle simple. Closing the EXPBUF fd is a no-op on V4L2 buffer memory. If EXPBUF or SYNC fails, fall through to existing memcpy path — preserves pre-iter13 behavior on the error branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/image.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/image.c b/src/image.c index 20cafd9..4acb662 100644 --- a/src/image.c +++ b/src/image.c @@ -31,7 +31,13 @@ #include "video.h" #include +#include #include +#include + +#include + +#include #include "tiled_yuv.h" #include "utils.h" @@ -150,11 +156,66 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data, { struct object_buffer *buffer_object; unsigned int i; + int sync_fds[VIDEO_MAX_PLANES]; + unsigned int n_sync_fds = 0; buffer_object = BUFFER(driver_data, image->buf); if (buffer_object == NULL) return VA_STATUS_ERROR_INVALID_BUFFER; + for (i = 0; i < VIDEO_MAX_PLANES; i++) + sync_fds[i] = -1; + + /* + * iter13 α-17: explicit cache sync around the CAPTURE buffer read. + * + * The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at + * cap_pool_init time with cached attributes. Kernel decode writes to + * the buffer via DMA, which doesn't propagate to the CPU's cache + * observer for that virtual mapping. Reading from + * surface_object->destination_data[] without an explicit cache + * invalidation returns stale data — observed empirically as Bug 4 + * (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went + * through the SAME readback path that kdirect ffmpeg-v4l2request + + * DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap + * implicitly handles sync). + * + * DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent + * with the producing engine's writes; END releases the sync. + * Per V4L2 + dma-buf spec, this is the userspace contract for + * cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11). + * + * Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close. + * Per-call cost is one ioctl pair + one fd open/close per plane. + * Could be optimised by caching the EXPBUF fd on the cap_pool slot, + * but doing it just-in-time keeps the lifecycle uncomplicated. The + * EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying + * pages; closing the fd is a no-op on memory. + * + * If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one + * — only true for hantro G1 oddity), we skip the sync silently. The + * existing pre-iter13 behavior is preserved on the error path. + */ + if (surface_object->current_slot != NULL && + driver_data->video_format != NULL) { + unsigned int capture_type = + v4l2_type_video_capture(driver_data->video_format->v4l2_mplane); + if (v4l2_export_buffer(driver_data->video_fd, capture_type, + surface_object->destination_index, + O_RDONLY, sync_fds, + surface_object->destination_buffers_count) >= 0) { + n_sync_fds = surface_object->destination_buffers_count; + for (i = 0; i < n_sync_fds; i++) { + struct dma_buf_sync s = { + .flags = DMA_BUF_SYNC_START | + DMA_BUF_SYNC_READ, + }; + /* failure is non-fatal: we continue with the read */ + (void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s); + } + } + } + for (i = 0; i < surface_object->destination_planes_count; i++) { #ifdef __arm__ if (!video_format_is_linear(driver_data->video_format)) @@ -173,6 +234,15 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data, #endif } + /* iter13 α-17: release cache sync. END pairs with each START. */ + for (i = 0; i < n_sync_fds; i++) { + struct dma_buf_sync s = { + .flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ, + }; + (void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s); + close(sync_fds[i]); + } + return VA_STATUS_SUCCESS; }