iter13 α-17: explicit DMA_BUF_IOCTL_SYNC around copy_surface_to_image
V4L2 CAPTURE buffers are V4L2_MEMORY_MMAP and mapped cached. Kernel DMA writes don't propagate to CPU cache observer; reading destination_data[] without DMA_BUF_IOCTL_SYNC(START|READ) returns stale data on RK3399 — observed as Bug 4 (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva goes through cached-mmap readback while kdirect ffmpeg-v4l2request + DRM_PRIME-mmap reads cleanly via implicit sync. Per Tomasz Figa's 2024 linaro-mm-sig discussion + feedback_rfc_v2_ vb2_dma_resv_scope.md: userspace responsibility for cache sync on cached-mmap'd V4L2 buffers. RFC v2 fence work doesn't engage this path; this ioctl pair does. Just-in-time EXPBUF + SYNC + close per copy. Per-call cost is one ioctl pair + one fd lifecycle per plane. Could cache the EXPBUF fd on cap_pool slot but doing it transient keeps lifecycle simple. Closing the EXPBUF fd is a no-op on V4L2 buffer memory. If EXPBUF or SYNC fails, fall through to existing memcpy path — preserves pre-iter13 behavior on the error branch. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+70
@@ -31,7 +31,13 @@
|
||||
#include "video.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
#include <linux/dma-buf.h>
|
||||
|
||||
#include "tiled_yuv.h"
|
||||
#include "utils.h"
|
||||
@@ -150,11 +156,66 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
|
||||
{
|
||||
struct object_buffer *buffer_object;
|
||||
unsigned int i;
|
||||
int sync_fds[VIDEO_MAX_PLANES];
|
||||
unsigned int n_sync_fds = 0;
|
||||
|
||||
buffer_object = BUFFER(driver_data, image->buf);
|
||||
if (buffer_object == NULL)
|
||||
return VA_STATUS_ERROR_INVALID_BUFFER;
|
||||
|
||||
for (i = 0; i < VIDEO_MAX_PLANES; i++)
|
||||
sync_fds[i] = -1;
|
||||
|
||||
/*
|
||||
* iter13 α-17: explicit cache sync around the CAPTURE buffer read.
|
||||
*
|
||||
* The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
|
||||
* cap_pool_init time with cached attributes. Kernel decode writes to
|
||||
* the buffer via DMA, which doesn't propagate to the CPU's cache
|
||||
* observer for that virtual mapping. Reading from
|
||||
* surface_object->destination_data[] without an explicit cache
|
||||
* invalidation returns stale data — observed empirically as Bug 4
|
||||
* (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
|
||||
* through the SAME readback path that kdirect ffmpeg-v4l2request +
|
||||
* DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
|
||||
* implicitly handles sync).
|
||||
*
|
||||
* DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
|
||||
* with the producing engine's writes; END releases the sync.
|
||||
* Per V4L2 + dma-buf spec, this is the userspace contract for
|
||||
* cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
|
||||
*
|
||||
* Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
|
||||
* Per-call cost is one ioctl pair + one fd open/close per plane.
|
||||
* Could be optimised by caching the EXPBUF fd on the cap_pool slot,
|
||||
* but doing it just-in-time keeps the lifecycle uncomplicated. The
|
||||
* EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
|
||||
* pages; closing the fd is a no-op on memory.
|
||||
*
|
||||
* If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
|
||||
* — only true for hantro G1 oddity), we skip the sync silently. The
|
||||
* existing pre-iter13 behavior is preserved on the error path.
|
||||
*/
|
||||
if (surface_object->current_slot != NULL &&
|
||||
driver_data->video_format != NULL) {
|
||||
unsigned int capture_type =
|
||||
v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
|
||||
if (v4l2_export_buffer(driver_data->video_fd, capture_type,
|
||||
surface_object->destination_index,
|
||||
O_RDONLY, sync_fds,
|
||||
surface_object->destination_buffers_count) >= 0) {
|
||||
n_sync_fds = surface_object->destination_buffers_count;
|
||||
for (i = 0; i < n_sync_fds; i++) {
|
||||
struct dma_buf_sync s = {
|
||||
.flags = DMA_BUF_SYNC_START |
|
||||
DMA_BUF_SYNC_READ,
|
||||
};
|
||||
/* failure is non-fatal: we continue with the read */
|
||||
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < surface_object->destination_planes_count; i++) {
|
||||
#ifdef __arm__
|
||||
if (!video_format_is_linear(driver_data->video_format))
|
||||
@@ -173,6 +234,15 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
|
||||
#endif
|
||||
}
|
||||
|
||||
/* iter13 α-17: release cache sync. END pairs with each START. */
|
||||
for (i = 0; i < n_sync_fds; i++) {
|
||||
struct dma_buf_sync s = {
|
||||
.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
|
||||
};
|
||||
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
|
||||
close(sync_fds[i]);
|
||||
}
|
||||
|
||||
return VA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user