5803cbcf6c
Five fixes for AV1 with film_grain on vpu981 (RK3588): three structural
ones (Fix 1-3) plus two follow-ups (Fix 4-5). Output
is no longer empty / crashed; frame 0 (IDR with apply_grain=1) is
bit-exact vs kdirect. Inter frames still diverge.
Fix 1 — surface.h + surface.c: linked_decode_surface_id field on
object_surface, initialized to VA_INVALID_SURFACE. When AV1 picture has
apply_grain=1, VAAPI's VADecPictureParameterBufferAV1 carries a
current_display_picture distinct from current_frame. ffmpeg-vaapi calls
vaBeginPicture on current_frame (decode surface, slot gets bound) but
vaGetImage on current_display_picture (display surface, no slot) → NULL
deref in copy_surface_to_image.
Fix 2 — av1.c: in av1_set_controls, when cur_frame != cur_display, set
display_surface->linked_decode_surface_id = current_frame. Establishes
the back-link so display surface can borrow decode surface's data.
Fix 3 — image.c copy_surface_to_image: when slot is NULL and the
surface has linked_decode_surface_id, lookup the decode surface and
mirror its destination_data[] + destination_sizes[] +
destination_planes_count. NULL guard with diagnostic log retained.
Fix 4 — av1.c fill_film_grain: when apply_grain=1, also set
V4L2_AV1_FILM_GRAIN_FLAG_UPDATE_GRAIN. Confirmed by strace-diff: kdirect
sends flags=0x0B (APPLY|UPDATE|...), libva was sending 0x09 (APPLY but
no UPDATE). Without UPDATE the kernel tries to reuse from
film_grain_params_ref_idx=0, which is never populated. Earlier reverted
because UPDATE seemed to trigger a SEGV — but that SEGV was the
unmasked NULL-slot deref; with fix 1+2+3 in place UPDATE is safe.
Fix 5 — av1.c reference_frame_ts plumbing: when a referenced surface
has timestamp=0 AND linked_decode_surface_id set, follow the link to
find the decode surface that carries the real timestamp. Display
surfaces don't get OUTPUT QBUF'd by us, so their own timestamp stays
zero.
Also: BeginPicture diagnostic log + surface_unbind_slot diagnostic log
+ v4l2.c error_idx diagnostic (kept from earlier — useful for ongoing
investigation).
Verification on ampere:
test_av1.ivf (208x208, 2 frames, no grain): bit-exact PASS sha
029ee72c214b37c1 (unchanged, no regression)
av1_larger.ivf (352x288, 10 frames, film_grain alternates):
frame 0 (key, apply_grain=1): PASS bit-exact vs kdirect
frame 4: PASS bit-exact
frames 1,2,3,5,6,7,8,9: DIFFER
Frame 0 PASS proves: SEQUENCE + FRAME + TILE_GROUP_ENTRY + FILM_GRAIN
mapping is correct for IDR. Frame 4 PASS is unexplained but encouraging.
Inter-frame divergence (frame 1+) points at: reference handling for
inter prediction is still off — either order_hints[] (still zero,
VAAPI doesn't expose per-ref), or grain-applied vs pre-grain DPB
semantics, or ref_frame_idx pointing into the wrong surface space.
Next investigation: per-frame strace diff between libva and kdirect
controls payload to spot remaining field mis-mappings on inter frames.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
414 lines
14 KiB
C
414 lines
14 KiB
C
/*
|
||
* Copyright (C) 2007 Intel Corporation
|
||
* Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
|
||
* Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
|
||
*
|
||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||
* copy of this software and associated documentation files (the
|
||
* "Software"), to deal in the Software without restriction, including
|
||
* without limitation the rights to use, copy, modify, merge, publish,
|
||
* distribute, sub license, and/or sell copies of the Software, and to
|
||
* permit persons to whom the Software is furnished to do so, subject to
|
||
* the following conditions:
|
||
*
|
||
* The above copyright notice and this permission notice (including the
|
||
* next paragraph) shall be included in all copies or substantial portions
|
||
* of the Software.
|
||
*
|
||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
|
||
* IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
|
||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
*/
|
||
|
||
#include "image.h"
|
||
#include "buffer.h"
|
||
#include "request.h"
|
||
#include "surface.h"
|
||
#include "video.h"
|
||
|
||
#include <assert.h>
|
||
#include <fcntl.h>
|
||
#include <string.h>
|
||
#include <unistd.h>
|
||
|
||
#include <sys/ioctl.h>
|
||
|
||
#include <linux/dma-buf.h>
|
||
|
||
#include "tiled_yuv.h"
|
||
#include "utils.h"
|
||
#include "v4l2.h"
|
||
|
||
/*
 * Create a VAImage (and its backing VAImageBufferType buffer) sized after
 * the format currently configured on the V4L2 CAPTURE queue.
 *
 * @context: VA driver context; pDriverData holds the request_data.
 * @format:  image format, copied verbatim into the returned VAImage.
 * @width:   image width, copied verbatim into the returned VAImage.
 * @height:  image height, copied verbatim into the returned VAImage.
 * @image:   output; zeroed and then fully populated on success.
 *
 * Returns VA_STATUS_SUCCESS on success,
 * VA_STATUS_ERROR_OPERATION_FAILED when no video format is set or the
 * V4L2 format query fails, VA_STATUS_ERROR_ALLOCATION_FAILED when the
 * image object cannot be obtained from the heap, or the status reported
 * by RequestCreateBuffer when the backing buffer cannot be created.
 */
VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
			    int width, int height, VAImage *image)
{
	struct request_data *driver_data = context->pDriverData;
	unsigned int destination_sizes[VIDEO_MAX_PLANES];
	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
	unsigned int destination_planes_count;
	unsigned int planes_count;
	unsigned int format_width, format_height;
	unsigned int size;
	unsigned int offset;
	unsigned int capture_type;
	struct video_format *video_format;
	struct object_image *image_object;
	VABufferID buffer_id;
	VAImageID id;
	VAStatus status;
	unsigned int i;
	int rc;

	video_format = driver_data->video_format;
	if (video_format == NULL)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);

	/*
	 * FIXME: This should be replaced by per-pixelformat handling to
	 * determine the logical plane offsets and sizes;
	 */
	rc = v4l2_get_format(driver_data->video_fd, capture_type,
			     &format_width, &format_height,
			     destination_bytesperlines, destination_sizes,
			     &planes_count);
	if (rc < 0)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	destination_planes_count = video_format->planes_count;
	size = 0;

	/* The size returned by V4L2 covers buffers, not logical planes. */
	for (i = 0; i < planes_count; i++)
		size += destination_sizes[i];

	/* Here we calculate the sizes assuming NV12. */

	destination_sizes[0] = destination_bytesperlines[0] * format_height;

	for (i = 1; i < destination_planes_count; i++) {
		destination_bytesperlines[i] = destination_bytesperlines[0];
		destination_sizes[i] = destination_sizes[0] / 2;
	}

	id = object_heap_allocate(&driver_data->image_heap);
	image_object = IMAGE(driver_data, id);
	if (image_object == NULL)
		return VA_STATUS_ERROR_ALLOCATION_FAILED;

	status = RequestCreateBuffer(context, 0, VAImageBufferType, size, 1,
				     NULL, &buffer_id);
	if (status != VA_STATUS_SUCCESS) {
		/* Do not leave a half-built image object behind. */
		object_heap_free(&driver_data->image_heap,
				 (struct object_base *)image_object);
		return status;
	}

	memset(image, 0, sizeof(*image));

	image->format = *format;
	image->width = width;
	image->height = height;
	image->buf = buffer_id;
	image->image_id = id;

	image->num_planes = destination_planes_count;
	image->data_size = size;

	/*
	 * Planes are laid out back to back, so each plane starts at the sum
	 * of the sizes of all planes before it. (Using only the previous
	 * plane's size is correct for two planes but wrong for any plane
	 * after the second.)
	 */
	offset = 0;
	for (i = 0; i < image->num_planes; i++) {
		image->pitches[i] = destination_bytesperlines[i];
		image->offsets[i] = offset;
		offset += destination_sizes[i];
	}

	/* Keep a driver-side copy so later lookups by image ID see it. */
	image_object->image = *image;

	return VA_STATUS_SUCCESS;
}
|
||
|
||
/*
 * Destroy an image: release its backing buffer first, then return the
 * image object to the image heap.
 *
 * Returns VA_STATUS_ERROR_INVALID_IMAGE when image_id does not resolve to
 * an image object, the status of RequestDestroyBuffer when releasing the
 * backing buffer fails (the image object is then left in place), and
 * VA_STATUS_SUCCESS otherwise.
 */
VAStatus RequestDestroyImage(VADriverContextP context, VAImageID image_id)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_image *doomed = IMAGE(driver_data, image_id);
	VAStatus rc;

	if (!doomed)
		return VA_STATUS_ERROR_INVALID_IMAGE;

	rc = RequestDestroyBuffer(context, doomed->image.buf);
	if (rc == VA_STATUS_SUCCESS)
		object_heap_free(&driver_data->image_heap,
				 (struct object_base *)doomed);

	return rc;
}
|
||
|
||
static VAStatus copy_surface_to_image (struct request_data *driver_data,
|
||
struct object_surface *surface_object,
|
||
VAImage *image)
|
||
{
|
||
struct object_buffer *buffer_object;
|
||
unsigned int i;
|
||
int sync_fds[VIDEO_MAX_PLANES];
|
||
unsigned int n_sync_fds = 0;
|
||
|
||
buffer_object = BUFFER(driver_data, image->buf);
|
||
if (buffer_object == NULL)
|
||
return VA_STATUS_ERROR_INVALID_BUFFER;
|
||
|
||
for (i = 0; i < VIDEO_MAX_PLANES; i++)
|
||
sync_fds[i] = -1;
|
||
|
||
/*
|
||
* iter13 α-17: explicit cache sync around the CAPTURE buffer read.
|
||
*
|
||
* The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
|
||
* cap_pool_init time with cached attributes. Kernel decode writes to
|
||
* the buffer via DMA, which doesn't propagate to the CPU's cache
|
||
* observer for that virtual mapping. Reading from
|
||
* surface_object->destination_data[] without an explicit cache
|
||
* invalidation returns stale data — observed empirically as Bug 4
|
||
* (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
|
||
* through the SAME readback path that kdirect ffmpeg-v4l2request +
|
||
* DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
|
||
* implicitly handles sync).
|
||
*
|
||
* DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
|
||
* with the producing engine's writes; END releases the sync.
|
||
* Per V4L2 + dma-buf spec, this is the userspace contract for
|
||
* cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
|
||
*
|
||
* Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
|
||
* Per-call cost is one ioctl pair + one fd open/close per plane.
|
||
* Could be optimised by caching the EXPBUF fd on the cap_pool slot,
|
||
* but doing it just-in-time keeps the lifecycle uncomplicated. The
|
||
* EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
|
||
* pages; closing the fd is a no-op on memory.
|
||
*
|
||
* If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
|
||
* — only true for hantro G1 oddity), we skip the sync silently. The
|
||
* existing pre-iter13 behavior is preserved on the error path.
|
||
*/
|
||
if (surface_object->current_slot != NULL &&
|
||
driver_data->video_format != NULL) {
|
||
unsigned int capture_type =
|
||
v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
|
||
if (v4l2_export_buffer(driver_data->video_fd, capture_type,
|
||
surface_object->destination_index,
|
||
O_RDONLY, sync_fds,
|
||
surface_object->destination_buffers_count) >= 0) {
|
||
n_sync_fds = surface_object->destination_buffers_count;
|
||
for (i = 0; i < n_sync_fds; i++) {
|
||
struct dma_buf_sync s = {
|
||
.flags = DMA_BUF_SYNC_START |
|
||
DMA_BUF_SYNC_READ,
|
||
};
|
||
/* failure is non-fatal: we continue with the read */
|
||
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
|
||
* AV1 film_grain: when this surface is the display surface of a
|
||
* decode (current_display_picture != current_frame with apply_grain=1),
|
||
* its slot is NULL because BeginPicture only fired on the decode
|
||
* surface. Follow the back-link set in av1_set_controls and borrow
|
||
* the decode surface's destination_data + sizes for the copy.
|
||
*/
|
||
if (surface_object->current_slot == NULL &&
|
||
surface_object->linked_decode_surface_id != VA_INVALID_SURFACE) {
|
||
struct object_surface *decode_surface =
|
||
SURFACE(driver_data,
|
||
surface_object->linked_decode_surface_id);
|
||
if (decode_surface != NULL &&
|
||
decode_surface->current_slot != NULL) {
|
||
/* Mirror the fields we read below. The surface heap
|
||
* pointer is stable for the surface's lifetime; we
|
||
* only need destination_data + destination_sizes +
|
||
* destination_planes_count from it. */
|
||
surface_object->destination_planes_count =
|
||
decode_surface->destination_planes_count;
|
||
for (i = 0; i < decode_surface->destination_planes_count; i++) {
|
||
surface_object->destination_data[i] =
|
||
decode_surface->destination_data[i];
|
||
surface_object->destination_sizes[i] =
|
||
decode_surface->destination_sizes[i];
|
||
}
|
||
}
|
||
}
|
||
|
||
for (i = 0; i < surface_object->destination_planes_count; i++) {
|
||
/* AV1 Phase 3 diag: surface NULL-deref hunt. */
|
||
if (buffer_object->data == NULL ||
|
||
surface_object->destination_data[i] == NULL) {
|
||
request_log("copy_surface_to_image NULL i=%u "
|
||
"buf_data=%p dest_data=%p dest_size=%u "
|
||
"planes=%u slot=%p linked=0x%x\n",
|
||
i, (void *)buffer_object->data,
|
||
(void *)surface_object->destination_data[i],
|
||
surface_object->destination_sizes[i],
|
||
surface_object->destination_planes_count,
|
||
(void *)surface_object->current_slot,
|
||
surface_object->linked_decode_surface_id);
|
||
return VA_STATUS_ERROR_OPERATION_FAILED;
|
||
}
|
||
#ifdef __arm__
|
||
if (!video_format_is_linear(driver_data->video_format))
|
||
tiled_to_planar(surface_object->destination_data[i],
|
||
buffer_object->data + image->offsets[i],
|
||
image->pitches[i], image->width,
|
||
i == 0 ? image->height :
|
||
image->height / 2);
|
||
else {
|
||
#endif
|
||
memcpy(buffer_object->data + image->offsets[i],
|
||
surface_object->destination_data[i],
|
||
surface_object->destination_sizes[i]);
|
||
#ifdef __arm__
|
||
}
|
||
#endif
|
||
}
|
||
|
||
/* iter13 α-17: release cache sync. END pairs with each START. */
|
||
for (i = 0; i < n_sync_fds; i++) {
|
||
struct dma_buf_sync s = {
|
||
.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
|
||
};
|
||
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
|
||
close(sync_fds[i]);
|
||
}
|
||
|
||
return VA_STATUS_SUCCESS;
|
||
}
|
||
|
||
VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
|
||
VAImage *image)
|
||
{
|
||
struct request_data *driver_data = context->pDriverData;
|
||
struct object_surface *surface_object;
|
||
struct object_buffer *buffer_object;
|
||
VAImageFormat format;
|
||
VAStatus status;
|
||
|
||
|
||
surface_object = SURFACE(driver_data, surface_id);
|
||
if (surface_object == NULL)
|
||
return VA_STATUS_ERROR_INVALID_SURFACE;
|
||
|
||
if (surface_object->status == VASurfaceRendering) {
|
||
status = RequestSyncSurface(context, surface_id);
|
||
if (status != VA_STATUS_SUCCESS)
|
||
return status;
|
||
}
|
||
|
||
/* Fully populate VAImageFormat to match QueryImageFormats output. */
|
||
memset(&format, 0, sizeof(format));
|
||
format.fourcc = VA_FOURCC_NV12;
|
||
format.byte_order = VA_LSB_FIRST;
|
||
format.bits_per_pixel = 12;
|
||
|
||
status = RequestCreateImage(context, &format, surface_object->width,
|
||
surface_object->height, image);
|
||
if (status != VA_STATUS_SUCCESS)
|
||
return status;
|
||
|
||
/*
|
||
* Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
|
||
* bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
|
||
* never-decoded surface to learn the format; it doesn't read the
|
||
* data. With the cap_pool decoupling, destination_data[] is NULL
|
||
* until BeginPicture binds a slot — copying from a NULL source
|
||
* crashed in memcpy. The image's buffer remains zero-initialized;
|
||
* subsequent post-decode DeriveImage on the same surface (after
|
||
* BeginPicture has bound a slot) does the real copy.
|
||
*/
|
||
if (surface_object->current_slot != NULL) {
|
||
status = copy_surface_to_image (driver_data, surface_object,
|
||
image);
|
||
if (status != VA_STATUS_SUCCESS)
|
||
return status;
|
||
}
|
||
|
||
surface_object->status = VASurfaceReady;
|
||
|
||
buffer_object = BUFFER(driver_data, image->buf);
|
||
buffer_object->derived_surface_id = surface_id;
|
||
|
||
return VA_STATUS_SUCCESS;
|
||
}
|
||
|
||
VAStatus RequestQueryImageFormats(VADriverContextP context,
|
||
VAImageFormat *formats, int *formats_count)
|
||
{
|
||
|
||
/*
|
||
* Populate the VAImageFormat fully per VAAPI spec for NV12 —
|
||
* not just .fourcc. Consumers (FFmpeg's hwcontext_vaapi, mpv,
|
||
* Firefox) read .byte_order and .bits_per_pixel; leaving them
|
||
* uninitialized inherits whatever caller-stack garbage is in
|
||
* the buffer and produces non-deterministic behavior. Reference:
|
||
* Mesa's gallium/frontends/va/image.c::vlVaQueryImageFormats and
|
||
* intel-vaapi-driver's i965_drv_video.c — both publish NV12
|
||
* with byte_order=VA_LSB_FIRST and bits_per_pixel=12.
|
||
*
|
||
* For YUV formats, depth/red_mask/green_mask/blue_mask/alpha_mask
|
||
* are not meaningful (those describe RGB bit layouts); leave them
|
||
* zeroed via memset before populating.
|
||
*/
|
||
memset(&formats[0], 0, sizeof(formats[0]));
|
||
formats[0].fourcc = VA_FOURCC_NV12;
|
||
formats[0].byte_order = VA_LSB_FIRST;
|
||
formats[0].bits_per_pixel = 12;
|
||
*formats_count = 1;
|
||
|
||
return VA_STATUS_SUCCESS;
|
||
}
|
||
|
||
/*
 * Image palettes are not supported by this driver; every call reports
 * VA_STATUS_ERROR_UNIMPLEMENTED and none of the arguments are examined.
 */
VAStatus RequestSetImagePalette(VADriverContextP context, VAImageID image_id,
				unsigned char *palette)
{
	(void)context;
	(void)image_id;
	(void)palette;

	return VA_STATUS_ERROR_UNIMPLEMENTED;
}
|
||
|
||
VAStatus RequestGetImage(VADriverContextP context, VASurfaceID surface_id,
|
||
int x, int y, unsigned int width, unsigned int height,
|
||
VAImageID image_id)
|
||
{
|
||
struct request_data *driver_data = context->pDriverData;
|
||
struct object_surface *surface_object;
|
||
struct object_image *image_object;
|
||
VAImage *image;
|
||
|
||
|
||
surface_object = SURFACE(driver_data, surface_id);
|
||
if (surface_object == NULL)
|
||
return VA_STATUS_ERROR_INVALID_SURFACE;
|
||
|
||
image_object = IMAGE(driver_data, image_id);
|
||
if (image_object == NULL)
|
||
return VA_STATUS_ERROR_INVALID_IMAGE;
|
||
|
||
image = &image_object->image;
|
||
if (x != 0 || y != 0 || width != image->width || height != image->height)
|
||
return VA_STATUS_ERROR_UNIMPLEMENTED;
|
||
|
||
return copy_surface_to_image (driver_data, surface_object, image);
|
||
}
|
||
|
||
VAStatus RequestPutImage(VADriverContextP context, VASurfaceID surface_id,
|
||
VAImageID image, int src_x, int src_y,
|
||
unsigned int src_width, unsigned int src_height,
|
||
int dst_x, int dst_y, unsigned int dst_width,
|
||
unsigned int dst_height)
|
||
{
|
||
return VA_STATUS_ERROR_UNIMPLEMENTED;
|
||
}
|