Files
libva-v4l2-request-fourier/src/image.c
T
claude-noether 3ffa9d0d17 iter40: Pi 5 HEVC chapter — backend integration lands, bit-exact pending
Phase 6 implementation. Backend builds clean on higgs (Debian 13
trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec,
multi-device probe finds /dev/video19 + /dev/media1, CreateContext
+ S_FMT + REQBUFS + STREAMON all succeed.

Phase 7 partial: infrastructure works, 10 frames flow through the
pipeline (correct byte counts produced — 13824000 for 1280x720 x 10
NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR
so output content is wrong (libva sha != kdirect sha). The decode
itself is failing on the rpi-hevc-dec side despite all ctrl
submissions returning success.

Code changes:
- request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots +
  has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2
  pair-of-flags pattern, naturally false on Pi).
- request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver
  probe gets an else-if branch setting the new fds (Phase 5 F3);
  request_switch_device_for_profile prefers 'p' for HEVC when
  rpi-hevc-dec present.
- context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat
  taken from video_format slot (not hardcoded NV12/NV15);
  synthetic-SPS pre-seed gated off for Pi (Phase 5 F6);
  destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND
  layout (Phase 5 F2);
  per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK);
  per-driver context_object->h264_start_code (skip prepend on Pi).
- video.c: NV12_COL128 video_format entry (8-bit SAND, single
  buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch
  fires rather than tiled_to_planar).
- nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel
  hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel
  offset). UV plane offset = 128 * ALIGN(h, 8) — within-column
  (SAND interleaves Y+UV per column, NOT plane-concatenated;
  earlier wrong formula caught by Phase 7 SEGV).
- image.c: #ifdef __arm__ extended to __arm__ || __aarch64__
  (Phase 5 F1 — guard was killing detile path on all aarch64
  hosts including fresnel iter39 NV15 path, masked because 10-bit
  never exercised); RequestCreateImage NC12 → NV12 stride override
  (linear width, not column-stride); copy_surface_to_image NC12
  detile branch (gates on fourcc + v4l2_format).
- nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers
  omit it though they have NC12).
- nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 +
  V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers).
- tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test;
  passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned;
  UV-offset helper).
- meson.build / nv12_col128 sources listed.

Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame
S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls
SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix;
field ordering differs). Likely the slice_array contents need
per-driver handling for rpi-hevc-dec's expected layout. Beyond
in-session reach.

iter38 5/5 baseline on fresnel + ampere should be unaffected (new
fd stays -1 on non-Pi hosts; all gates either short-circuit on
fd-not-present or no-op).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 19:17:14 +00:00

515 lines
18 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Copyright (C) 2007 Intel Corporation
* Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
* Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "image.h"
#include "buffer.h"
#include "request.h"
#include "surface.h"
#include "video.h"
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>
#include "nv15.h"
#include "nv12_col128.h"
#include "tiled_yuv.h"
#include "utils.h"
#include "v4l2.h"
/*
 * Create a VAImage and its backing VAImageBufferType buffer, with a
 * plane layout derived from the currently negotiated V4L2 CAPTURE
 * format.
 *
 * @context: VA driver context; pDriverData holds the request_data.
 * @format:  requested image format; only ->fourcc is inspected here, the
 *           whole struct is copied into image->format.
 * @width:   image width in pixels; used for the dense-layout stride.
 * @height:  image height in pixels; stored in the image only — plane
 *           sizes are computed from the V4L2-reported format height.
 * @image:   output; fully overwritten on success.
 *
 * Returns VA_STATUS_SUCCESS, VA_STATUS_ERROR_OPERATION_FAILED when no
 * video format is set, the format query fails or the format has more
 * planes than a VAImage can describe, VA_STATUS_ERROR_ALLOCATION_FAILED
 * when no image object can be allocated, or the status of
 * RequestCreateBuffer when the backing buffer cannot be created.
 */
VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
			    int width, int height, VAImage *image)
{
	struct request_data *driver_data = context->pDriverData;
	unsigned int destination_sizes[VIDEO_MAX_PLANES];
	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
	unsigned int destination_planes_count;
	unsigned int planes_count;
	unsigned int format_width, format_height;
	unsigned int dense_stride = 0;
	unsigned int size;
	unsigned int offset;
	unsigned int capture_type;
	int dense_layout = 0;
	struct video_format *video_format;
	struct object_image *image_object;
	VABufferID buffer_id;
	VAImageID id;
	VAStatus status;
	unsigned int i;
	int rc;

	video_format = driver_data->video_format;
	if (video_format == NULL)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	/*
	 * image->pitches[] / image->offsets[] are fixed-size arrays; refuse
	 * formats with more logical planes than they can hold instead of
	 * writing past their end.
	 */
	destination_planes_count = video_format->planes_count;
	if (destination_planes_count >
	    sizeof(image->pitches) / sizeof(image->pitches[0]))
		return VA_STATUS_ERROR_OPERATION_FAILED;

	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);

	/*
	 * FIXME: This should be replaced by per-pixelformat handling to
	 * determine the logical plane offsets and sizes;
	 */
	rc = v4l2_get_format(driver_data->video_fd, capture_type,
			     &format_width, &format_height,
			     destination_bytesperlines, destination_sizes,
			     &planes_count);
	if (rc < 0)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	/* The size returned by V4L2 covers buffers, not logical planes. */
	size = 0;
	for (i = 0; i < planes_count; i++)
		size += destination_sizes[i];

	if (format->fourcc == VA_FOURCC_P010) {
		/*
		 * iter39: P010 image overrides V4L2-side NV15 sizing. The
		 * source is the kernel-reported NV15 packed plane; the image
		 * buffer holds dense P010 (2 bytes per sample). Recompute
		 * sizes/pitches against the P010 layout so consumers
		 * (vaGetImage, vaDeriveImage) see standard P010 geometry.
		 */
		dense_layout = 1;
		dense_stride = width * 2;
	} else if (format->fourcc == VA_FOURCC_NV12 &&
		   video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) {
		/*
		 * iter40 Phase 5 review F2: NC12 source, NV12 image output.
		 * The V4L2-reported bytesperline is the NC12 column stride,
		 * NOT the linear NV12 Y stride. Override it with the linear
		 * stride (width) so the VAImage pitches reflect the
		 * detile-output layout the consumer reads.
		 */
		dense_layout = 1;
		dense_stride = width;
	}

	/*
	 * In the default (linear NV12) case the V4L2 stride is used as-is.
	 * In every case plane 0 spans stride * format_height bytes and each
	 * further plane shares that stride at half the size.
	 */
	if (dense_layout)
		destination_bytesperlines[0] = dense_stride;

	destination_sizes[0] = destination_bytesperlines[0] * format_height;
	for (i = 1; i < destination_planes_count; i++) {
		destination_bytesperlines[i] = destination_bytesperlines[0];
		destination_sizes[i] = destination_sizes[0] / 2;
	}

	/*
	 * Dense layouts do not match the V4L2 buffer sizes, so the backing
	 * buffer is sized from the recomputed logical planes instead.
	 */
	if (dense_layout) {
		size = 0;
		for (i = 0; i < destination_planes_count; i++)
			size += destination_sizes[i];
	}

	id = object_heap_allocate(&driver_data->image_heap);
	image_object = IMAGE(driver_data, id);
	if (image_object == NULL)
		return VA_STATUS_ERROR_ALLOCATION_FAILED;

	status = RequestCreateBuffer(context, 0, VAImageBufferType, size, 1,
				     NULL, &buffer_id);
	if (status != VA_STATUS_SUCCESS) {
		object_heap_free(&driver_data->image_heap,
				 (struct object_base *)image_object);
		return status;
	}

	memset(image, 0, sizeof(*image));

	image->format = *format;
	image->width = width;
	image->height = height;
	image->buf = buffer_id;
	image->image_id = id;
	image->num_planes = destination_planes_count;
	image->data_size = size;

	/*
	 * Plane offsets are cumulative: each plane starts where the previous
	 * one ends. (Using only the previous plane's size is equivalent for
	 * two planes but would overlap planes 1 and 2 of a three-plane
	 * format.)
	 */
	offset = 0;
	for (i = 0; i < image->num_planes; i++) {
		image->pitches[i] = destination_bytesperlines[i];
		image->offsets[i] = offset;
		offset += destination_sizes[i];
	}

	image_object->image = *image;

	return VA_STATUS_SUCCESS;
}
/*
 * Destroy an image: release its backing buffer first, then return the
 * image object to the image heap.
 *
 * Returns VA_STATUS_ERROR_INVALID_IMAGE when image_id does not resolve
 * to an image object. If destroying the backing buffer fails, that
 * status is returned and the image object is left allocated.
 */
VAStatus RequestDestroyImage(VADriverContextP context, VAImageID image_id)
{
	struct request_data *data = context->pDriverData;
	struct object_image *target = IMAGE(data, image_id);
	VAStatus result;

	if (!target)
		return VA_STATUS_ERROR_INVALID_IMAGE;

	result = RequestDestroyBuffer(context, target->image.buf);
	if (result == VA_STATUS_SUCCESS)
		object_heap_free(&data->image_heap,
				 (struct object_base *)target);

	return result;
}
/*
* Copy the decoded planes of a surface into the buffer backing a VAImage.
*
* For every destination plane of the surface, the data is written at
* image->offsets[i] inside the image buffer, using one of:
*   - tiled_to_planar() when the video format is not linear (only
*     compiled on 32-bit ARM),
*   - nv15_unpack_plane_to_p010() for a 10-bit session with a P010 image,
*   - nv12_col128_detile_y()/_uv() for an NV12_COL128 source with an NV12
*     image,
*   - a plain memcpy() of destination_sizes[i] bytes otherwise.
* The two conversion paths in the middle are only compiled on ARM/AArch64.
*
* The read is bracketed by a best-effort dma-buf cache sync (see the
* comment below); failures of the sync are ignored.
*
* Returns VA_STATUS_ERROR_INVALID_BUFFER when image->buf does not resolve
* to a buffer object, VA_STATUS_SUCCESS otherwise.
*
* NOTE(review): destination_data[] is not checked for NULL here; per the
* comment in RequestDeriveImage it is NULL until a CAPTURE slot is bound,
* so callers are presumably expected to check current_slot first.
*/
static VAStatus copy_surface_to_image (struct request_data *driver_data,
struct object_surface *surface_object,
VAImage *image)
{
struct object_buffer *buffer_object;
unsigned int i;
int sync_fds[VIDEO_MAX_PLANES];
unsigned int n_sync_fds = 0;
buffer_object = BUFFER(driver_data, image->buf);
if (buffer_object == NULL)
return VA_STATUS_ERROR_INVALID_BUFFER;
/* Mark every slot as "no fd"; n_sync_fds stays 0 unless the export succeeds. */
for (i = 0; i < VIDEO_MAX_PLANES; i++)
sync_fds[i] = -1;
/*
* iter13 α-17: explicit cache sync around the CAPTURE buffer read.
*
* The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
* cap_pool_init time with cached attributes. Kernel decode writes to
* the buffer via DMA, which doesn't propagate to the CPU's cache
* observer for that virtual mapping. Reading from
* surface_object->destination_data[] without an explicit cache
* invalidation returns stale data — observed empirically as Bug 4
* (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
* through the SAME readback path that kdirect ffmpeg-v4l2request +
* DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
* implicitly handles sync).
*
* DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
* with the producing engine's writes; END releases the sync.
* Per V4L2 + dma-buf spec, this is the userspace contract for
* cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
*
* Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
* Per-call cost is one ioctl pair + one fd open/close per plane.
* Could be optimised by caching the EXPBUF fd on the cap_pool slot,
* but doing it just-in-time keeps the lifecycle uncomplicated. The
* EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
* pages; closing the fd is a no-op on memory.
*
* If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
* — only true for hantro G1 oddity), we skip the sync silently. The
* existing pre-iter13 behavior is preserved on the error path.
*/
if (surface_object->current_slot != NULL &&
driver_data->video_format != NULL) {
unsigned int capture_type =
v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
if (v4l2_export_buffer(driver_data->video_fd, capture_type,
surface_object->destination_index,
O_RDONLY, sync_fds,
surface_object->destination_buffers_count) >= 0) {
n_sync_fds = surface_object->destination_buffers_count;
for (i = 0; i < n_sync_fds; i++) {
struct dma_buf_sync s = {
.flags = DMA_BUF_SYNC_START |
DMA_BUF_SYNC_READ,
};
/* failure is non-fatal: we continue with the read */
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
}
}
}
/* Per-plane copy/convert into the image buffer at image->offsets[i]. */
for (i = 0; i < surface_object->destination_planes_count; i++) {
/*
* iter40 Phase 5 review F1: guard extended from __arm__ to
* __arm__ || __aarch64__. Without this, the detile primitives
* silently compiled out on aarch64 (fresnel RK3399, ampere
* RK3588, higgs Pi CM5) and the memcpy fall-through delivered
* raw tiled bytes to NV12/P010 image consumers. iter39 5/5
* PASS masked the issue because no 10-bit path was exercised.
*/
#if defined(__arm__) || defined(__aarch64__)
/*
* Sunxi tiled_to_planar lives in tiled_yuv.S which is
* #ifdef __arm__ — symbol absent on aarch64. Keep this
* branch arm-only; aarch64 Sunxi support would need a C or
* aarch64-ASM port (no Sunxi aarch64 board in current fleet).
*/
#if defined(__arm__)
if (!video_format_is_linear(driver_data->video_format))
tiled_to_planar(surface_object->destination_data[i],
buffer_object->data + image->offsets[i],
image->pitches[i], image->width,
i == 0 ? image->height :
image->height / 2);
else
#endif
if (driver_data->is_10bit &&
image->format.fourcc == VA_FOURCC_P010) {
/*
* iter39: rkvdec emits NV15 (4×10-bit packed in 5
* bytes); the VA image buffer is dense P010 (2B/pixel,
* value in bits[15:6]). Source stride is the V4L2-
* reported NV15 bytesperline (= ceil(width/4)*5,
* possibly aligned higher by the kernel); destination
* stride is image->pitches[i] = width * 2.
*/
unsigned int plane_h = (i == 0) ? image->height
: image->height / 2;
nv15_unpack_plane_to_p010(
surface_object->destination_data[i],
(uint16_t *)(buffer_object->data + image->offsets[i]),
image->width, plane_h,
surface_object->destination_bytesperlines[i]);
} else if (driver_data->video_format != NULL &&
driver_data->video_format->v4l2_format ==
V4L2_PIX_FMT_NV12_COL128 &&
image->format.fourcc == VA_FOURCC_NV12) {
/*
* iter40: Pi 5 rpi-hevc-dec emits NV12_COL128 (SAND
* 128-pixel-wide column tiles). Detile to linear NV12
* via the per-plane primitive. surface_object->
* destination_data[i] is the V4L2 CAPTURE mmap (single
* buffer, planes_count==2): i==0 is the Y plane base,
* i==1 is the UV plane base offset within the SAME
* physical buffer (per cap_pool plane[1] offset = Y
* plane size in COL128 layout).
*
* src_col_stride = destination_bytesperlines[i] = the
* kernel-reported NC12 bytesperline (column stride,
* = ALIGN(image_h, 8) * 3/2). Same for both planes
* since column geometry is plane-agnostic.
*
* dst stride is image->pitches[i] = image->width
* (overridden in the RequestCreateImage NC12 branch).
*/
if (i == 0) {
nv12_col128_detile_y(
(uint8_t *)(buffer_object->data + image->offsets[i]),
image->pitches[i],
surface_object->destination_data[i],
surface_object->destination_bytesperlines[i],
image->width, image->height);
} else {
nv12_col128_detile_uv(
(uint8_t *)(buffer_object->data + image->offsets[i]),
image->pitches[i],
surface_object->destination_data[i],
surface_object->destination_bytesperlines[i],
image->width, image->height / 2);
}
} else {
#endif
/*
* Fallback (and the only path on non-ARM builds): raw copy of the
* plane, sized by the surface's own destination_sizes[i].
*/
memcpy(buffer_object->data + image->offsets[i],
surface_object->destination_data[i],
surface_object->destination_sizes[i]);
#if defined(__arm__) || defined(__aarch64__)
}
#endif
}
/* iter13 α-17: release cache sync. END pairs with each START. */
for (i = 0; i < n_sync_fds; i++) {
struct dma_buf_sync s = {
.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
};
(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
/* The exported fd was only needed for the sync; drop it. */
close(sync_fds[i]);
}
return VA_STATUS_SUCCESS;
}
/*
 * Derive a VAImage from a surface: create a new image matching the
 * surface dimensions (P010 for a 10-bit session, NV12 otherwise), copy
 * the decoded planes into it when a CAPTURE slot is bound, and tag the
 * image buffer with the surface it was derived from.
 *
 * If the surface is still rendering it is synced first.
 *
 * Returns VA_STATUS_ERROR_INVALID_SURFACE for an unknown surface, or the
 * failing status of the sync, image creation or copy step. On any
 * failure after the image was created, the image is destroyed again and
 * image->image_id / image->buf are set to VA_INVALID_ID so the caller
 * is not left holding (or leaking) a half-initialised image.
 */
VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
			    VAImage *image)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_surface *surface_object;
	struct object_buffer *buffer_object;
	VAImageFormat format;
	VAStatus status;

	surface_object = SURFACE(driver_data, surface_id);
	if (surface_object == NULL)
		return VA_STATUS_ERROR_INVALID_SURFACE;

	if (surface_object->status == VASurfaceRendering) {
		status = RequestSyncSurface(context, surface_id);
		if (status != VA_STATUS_SUCCESS)
			return status;
	}

	/* Fully populate VAImageFormat to match QueryImageFormats output. */
	memset(&format, 0, sizeof(format));
	if (driver_data->is_10bit) {
		/*
		 * iter39: 10-bit session derives a P010 image. NV15-source
		 * unpack happens in copy_surface_to_image.
		 */
		format.fourcc = VA_FOURCC_P010;
		format.byte_order = VA_LSB_FIRST;
		format.bits_per_pixel = 24;
	} else {
		format.fourcc = VA_FOURCC_NV12;
		format.byte_order = VA_LSB_FIRST;
		format.bits_per_pixel = 12;
	}

	status = RequestCreateImage(context, &format, surface_object->width,
				    surface_object->height, image);
	if (status != VA_STATUS_SUCCESS)
		return status;

	/*
	 * Resolve the backing buffer before touching it; a failed lookup
	 * would otherwise be dereferenced below.
	 */
	buffer_object = BUFFER(driver_data, image->buf);
	if (buffer_object == NULL) {
		status = VA_STATUS_ERROR_INVALID_BUFFER;
		goto error_image;
	}

	/*
	 * Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
	 * bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
	 * never-decoded surface to learn the format; it doesn't read the
	 * data. With the cap_pool decoupling, destination_data[] is NULL
	 * until BeginPicture binds a slot — copying from a NULL source
	 * crashed in memcpy. The image's buffer remains zero-initialized;
	 * subsequent post-decode DeriveImage on the same surface (after
	 * BeginPicture has bound a slot) does the real copy.
	 */
	if (surface_object->current_slot != NULL) {
		status = copy_surface_to_image (driver_data, surface_object,
						image);
		if (status != VA_STATUS_SUCCESS)
			goto error_image;
	}

	surface_object->status = VASurfaceReady;
	buffer_object->derived_surface_id = surface_id;

	return VA_STATUS_SUCCESS;

error_image:
	/*
	 * Best-effort cleanup: the original failure is what gets reported,
	 * so the destroy status is intentionally ignored.
	 */
	(void)RequestDestroyImage(context, image->image_id);
	image->image_id = VA_INVALID_ID;
	image->buf = VA_INVALID_ID;

	return status;
}
/*
 * Zero one VAImageFormat entry and fill in the fields consumers read.
 *
 * Consumers (FFmpeg's hwcontext_vaapi, mpv, Firefox) read .byte_order
 * and .bits_per_pixel in addition to .fourcc; leaving them uninitialized
 * would hand back caller-stack garbage. Reference implementations:
 * Mesa's vlVaQueryImageFormats and intel-vaapi-driver's
 * i965_drv_video.c.
 */
static void request_fill_image_format(VAImageFormat *entry,
				      unsigned int fourcc,
				      unsigned int bits_per_pixel)
{
	memset(entry, 0, sizeof(*entry));
	entry->fourcc = fourcc;
	entry->byte_order = VA_LSB_FIRST;
	entry->bits_per_pixel = bits_per_pixel;
}

/*
 * Report the image formats this driver can produce.
 *
 * NV12 is always listed first. P010 is appended only while the driver
 * data is flagged as 10-bit and the format table still has room.
 *
 * iter39 Option B revert (2026-05-17): P010 was briefly advertised
 * unconditionally (63fed87) so ffmpeg-vaapi's early, pre-vaCreateContext
 * query could see it, but that broke HEVC 8-bit on fresnel — ffmpeg-vaapi
 * picked P010 for the hwframe pool and EndPicture crashed when P010
 * expectations met an 8-bit NV12 CAPTURE buffer. Gating on is_10bit is
 * safe because Option B drops VAProfileHEVCMain10 + Hi10P from
 * enumeration, so 8-bit sessions get an NV12-only answer.
 */
VAStatus RequestQueryImageFormats(VADriverContextP context,
				  VAImageFormat *formats, int *formats_count)
{
	struct request_data *data = context->pDriverData;
	int count = 0;

	request_fill_image_format(&formats[count], VA_FOURCC_NV12, 12);
	count++;

	if (data->is_10bit && count < V4L2_REQUEST_MAX_IMAGE_FORMATS) {
		request_fill_image_format(&formats[count], VA_FOURCC_P010, 24);
		count++;
	}

	*formats_count = count;

	return VA_STATUS_SUCCESS;
}
/*
 * Palette upload entry point. Not implemented: every call is rejected
 * with VA_STATUS_ERROR_UNIMPLEMENTED and no argument is inspected.
 */
VAStatus RequestSetImagePalette(VADriverContextP context, VAImageID image_id,
				unsigned char *palette)
{
	(void)context;
	(void)image_id;
	(void)palette;

	return VA_STATUS_ERROR_UNIMPLEMENTED;
}
/*
 * Copy the full contents of a surface into an existing image.
 *
 * Only whole-image reads are supported: x and y must be 0 and
 * width/height must equal the image dimensions, otherwise
 * VA_STATUS_ERROR_UNIMPLEMENTED is returned.
 *
 * Returns VA_STATUS_ERROR_INVALID_SURFACE / VA_STATUS_ERROR_INVALID_IMAGE
 * for unknown ids, VA_STATUS_ERROR_OPERATION_FAILED when the surface has
 * no CAPTURE slot bound (nothing has been decoded into it yet), or the
 * status of the copy.
 */
VAStatus RequestGetImage(VADriverContextP context, VASurfaceID surface_id,
			 int x, int y, unsigned int width, unsigned int height,
			 VAImageID image_id)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_surface *surface_object;
	struct object_image *image_object;
	VAImage *image;

	surface_object = SURFACE(driver_data, surface_id);
	if (surface_object == NULL)
		return VA_STATUS_ERROR_INVALID_SURFACE;

	image_object = IMAGE(driver_data, image_id);
	if (image_object == NULL)
		return VA_STATUS_ERROR_INVALID_IMAGE;

	image = &image_object->image;

	if (x != 0 || y != 0 || width != image->width || height != image->height)
		return VA_STATUS_ERROR_UNIMPLEMENTED;

	/*
	 * Same hazard RequestDeriveImage guards against: until a CAPTURE
	 * slot is bound to the surface its destination_data[] pointers are
	 * NULL, and copy_surface_to_image would read from them. Unlike
	 * DeriveImage, the caller here explicitly asked for pixel data, so
	 * report the failure rather than silently returning an untouched
	 * image.
	 */
	if (surface_object->current_slot == NULL)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	return copy_surface_to_image (driver_data, surface_object, image);
}
/*
 * Image-to-surface upload entry point. Not implemented: every call is
 * rejected with VA_STATUS_ERROR_UNIMPLEMENTED and no argument is
 * inspected.
 */
VAStatus RequestPutImage(VADriverContextP context, VASurfaceID surface_id,
			 VAImageID image, int src_x, int src_y,
			 unsigned int src_width, unsigned int src_height,
			 int dst_x, int dst_y, unsigned int dst_width,
			 unsigned int dst_height)
{
	(void)context;
	(void)surface_id;
	(void)image;
	(void)src_x;
	(void)src_y;
	(void)src_width;
	(void)src_height;
	(void)dst_x;
	(void)dst_y;
	(void)dst_width;
	(void)dst_height;

	return VA_STATUS_ERROR_UNIMPLEMENTED;
}