From 3ffa9d0d175a3831f83188c6078b993e3985fc6e Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 17 May 2026 19:17:14 +0000 Subject: [PATCH] =?UTF-8?q?iter40:=20Pi=205=20HEVC=20chapter=20=E2=80=94?= =?UTF-8?q?=20backend=20integration=20lands,=20bit-exact=20pending?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 6 implementation. Backend builds clean on higgs (Debian 13 trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec, multi-device probe finds /dev/video19 + /dev/media1, CreateContext + S_FMT + REQBUFS + STREAMON all succeed. Phase 7 partial: infrastructure works, 10 frames flow through the pipeline (correct byte counts produced — 13824000 for 1280x720 x 10 NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR so output content is wrong (libva sha != kdirect sha). The decode itself is failing on the rpi-hevc-dec side despite all ctrl submissions returning success. Code changes: - request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots + has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2 pair-of-flags pattern, naturally false on Pi). - request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver probe gets an else-if branch setting the new fds (Phase 5 F3); request_switch_device_for_profile prefers 'p' for HEVC when rpi-hevc-dec present. - context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat taken from video_format slot (not hardcoded NV12/NV15); synthetic-SPS pre-seed gated off for Pi (Phase 5 F6); destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND layout (Phase 5 F2); per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK); per-driver context_object->h264_start_code (skip prepend on Pi). - video.c: NV12_COL128 video_format entry (8-bit SAND, single buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch fires rather than tiled_to_planar). 
- nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel offset). UV plane offset = 128 * ALIGN(h, 8) — within-column (SAND interleaves Y+UV per column, NOT plane-concatenated; earlier wrong formula caught by Phase 7 SEGV). - image.c: #ifdef __arm__ extended to __arm__ || __aarch64__ (Phase 5 F1 — guard was killing detile path on all aarch64 hosts including fresnel iter39 NV15 path, masked because 10-bit never exercised); RequestCreateImage NC12 → NV12 stride override (linear width, not column-stride); copy_surface_to_image NC12 detile branch (gates on fourcc + v4l2_format). - nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers omit it though they have NC12). - nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 + V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers). - tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test; passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned; UV-offset helper). - meson.build / nv12_col128 sources listed. Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix; field ordering differs). Likely the slice_array contents need per-driver handling for rpi-hevc-dec's expected layout. Beyond in-session reach. iter38 5/5 baseline on fresnel + ampere should be unaffected (new fd stays -1 on non-Pi hosts; all gates either short-circuit on fd-not-present or no-op). 
Co-Authored-By: Claude Opus 4.7 --- src/context.c | 164 +++++++++++++++++++++----- src/image.c | 80 ++++++++++++- src/meson.build | 2 + src/nv12_col128.c | 114 +++++++++++++++++++ src/nv12_col128.h | 88 ++++++++++++++ src/nv15.h | 15 +++ src/request.c | 39 +++++++ src/request.h | 15 +++ src/video.c | 24 ++++ tests/test_nv12_col128_detile.c | 196 ++++++++++++++++++++++++++++++++ 10 files changed, 706 insertions(+), 31 deletions(-) create mode 100644 src/nv12_col128.c create mode 100644 src/nv12_col128.h create mode 100644 tests/test_nv12_col128_detile.c diff --git a/src/context.c b/src/context.c index 47bfc55..45aa3ea 100644 --- a/src/context.c +++ b/src/context.c @@ -42,6 +42,9 @@ #include +#include "nv15.h" /* iter40: fallback V4L2_PIX_FMT_NV15 define for Pi 5 + * Debian headers that ship NC12 but not NV15. */ +#include "nv12_col128.h" /* iter40: NC12 detile primitive + UV offset helper */ #include "utils.h" #include "v4l2.h" @@ -117,8 +120,19 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, { bool want_10bit = (config_object->profile == VAProfileH264High10 || config_object->profile == VAProfileHEVCMain10); - unsigned int want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV15 - : V4L2_PIX_FMT_NV12; + bool is_rpi = (driver_data->video_fd == + driver_data->video_fd_rpi_hevc_dec); + /* + * iter40: per-fd preferred pixelformat. rpi-hevc-dec exposes + * NC12 (8-bit) / NC30 (10-bit), not NV12 / NV15. + */ + unsigned int want_pixfmt; + if (is_rpi) + want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV12_10_COL128 + : V4L2_PIX_FMT_NV12_COL128; + else + want_pixfmt = want_10bit ? 
V4L2_PIX_FMT_NV15 + : V4L2_PIX_FMT_NV12; if (driver_data->video_format && driver_data->video_format->v4l2_format != want_pixfmt && driver_data->video_format->v4l2_format != V4L2_PIX_FMT_SUNXI_TILED_NV12) @@ -127,9 +141,24 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, if (!driver_data->video_format) { bool want_10bit = (config_object->profile == VAProfileH264High10 || config_object->profile == VAProfileHEVCMain10); + bool is_rpi = (driver_data->video_fd == + driver_data->video_fd_rpi_hevc_dec); video_format = NULL; - if (!want_10bit) { + if (is_rpi) { + /* + * iter40: rpi-hevc-dec CAPTURE is NC12 (8-bit SAND + * 128-pixel-wide column tile) or NC30 (10-bit variant). + * Direct map; the kernel exposes BOTH formats in + * VIDIOC_ENUM_FMT(CAPTURE_MPLANE) without a pre-SPS + * step (verified Phase 0 strace), so find_format would + * also succeed — skip it for symmetry with the NV15 + * iter39 branch below. + */ + video_format = video_format_find( + want_10bit ? V4L2_PIX_FMT_NV12_10_COL128 + : V4L2_PIX_FMT_NV12_COL128); + } else if (!want_10bit) { found = v4l2_find_format(driver_data->video_fd, V4L2_BUF_TYPE_VIDEO_CAPTURE, V4L2_PIX_FMT_SUNXI_TILED_NV12); @@ -212,12 +241,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * CAPTURE (sanity read-back, matches what S_FMT committed). */ { - /* iter39: NV15 for 10-bit profiles (rkvdec Hi10P/Main10), - * NV12 otherwise. driver_data->is_10bit was set above from - * the active profile. */ - unsigned int capture_pixelformat = driver_data->is_10bit - ? V4L2_PIX_FMT_NV15 - : V4L2_PIX_FMT_NV12; + /* + * iter40: take the CAPTURE pixelformat from the resolved + * video_format slot — that's per-fd, per-bit-depth correct. 
+ * rkvdec 8-bit → NV12 + * rkvdec 10-bit → NV15 + * hantro 8-bit → NV12 + * rpi-hevc-dec → NC12 (V4L2_PIX_FMT_NV12_COL128) + * Pre-iter40 this was hardcoded NV12/NV15 — the rpi-hevc-dec + * fd would then have S_FMT(NV12) issued, and the kernel + * "helpfully" substituted V4L2_PIX_FMT_NV12MT_COL128 (the + * MULTI-PLANE-NON-CONTIGUOUS variant) instead of the + * SINGLE-PLANE NC12 we wanted, breaking cap_pool QUERYBUF + * downstream (Phase 7 iter40 first-run discovery). + */ + unsigned int capture_pixelformat = + driver_data->video_format->v4l2_format; rc = v4l2_set_format(driver_data->video_fd, capture_type, capture_pixelformat, picture_width, picture_height); @@ -274,7 +313,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * the device-init DECODE_MODE + START_CODE block below ALSO uses * void-cast best-effort, so this is consistent with prior pattern. */ - { + /* + * iter40 (Phase 5 review F6): the synthetic-SPS pre-seed is an + * rkvdec-specific quirk fix (the -EBUSY-on-CAPTURE-busy bug in + * rkvdec_s_ctrl). rpi-hevc-dec does NOT need it and uses a + * different submission ordering (Phase 0 strace: S_FMT_OUTPUT → + * REQBUFS_OUTPUT → S_FMT_CAPTURE → CREATE_BUFS_CAPTURE → STREAMON, + * with per-frame SPS via S_EXT_CTRLS class=0xf010000). Sending a + * stale dummy SPS at context-init time would leave rpi-hevc-dec's + * internal state on the dummy until the first real per-frame SPS + * arrives — exact behavior unknown but a known divergence from + * kdirect. + * + * Skip pre-seed when the active fd is rpi-hevc-dec. rkvdec / + * hantro paths unchanged. 
+ */ + if (driver_data->video_fd != driver_data->video_fd_rpi_hevc_dec) { /* * iter39: 10-bit profiles set bit_depth_luma_minus8 = 2 in * the synthetic SPS so rkvdec's get_image_fmt resolves to @@ -343,7 +397,7 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, default: break; } - } + } /* iter40: end of pre-seed-skip-on-rpi-hevc-dec guard */ destination_planes_count = video_format->planes_count; @@ -377,10 +431,39 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * changed by BeginPicture's slot acquisition. */ if (video_format->v4l2_buffers_count == 1) { - destination_sizes[0] = destination_bytesperlines[0] * - format_height; - for (j = 1; j < destination_planes_count; j++) - destination_sizes[j] = destination_sizes[0] / 2; + if (video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) { + /* + * iter40: NC12 SAND layout: within each 128-wide column + * the Y strip is TILE_W * ALIGN(height, 8) bytes and the + * UV strip that follows is half that size. + * The kernel-reported destination_bytesperlines[0] is + * the COLUMN stride (ALIGN(height,8)*3/2), not the + * linear Y stride — using it × format_height gives the + * wrong intra-buffer UV offset (destination_offsets[1] + * derives from destination_sizes[0] in + * surface_fill_format_uniform). + * + * Use format_width/format_height (kernel-returned from + * G_FMT) not picture_width/height (caller request), + * because the kernel applies its own ALIGN rules; the + * UV plane location is keyed off the kernel layout. 
+ */ + unsigned int uv_off = nv12_col128_uv_plane_offset( + format_width, format_height); + destination_sizes[0] = uv_off; + for (j = 1; j < destination_planes_count; j++) + destination_sizes[j] = uv_off / 2; + request_log("iter40: NC12 sizes pic=%ux%u fmt=%ux%u bpl=%u uv_off=%u sizeimage(kernel)=%u\n", + picture_width, picture_height, + format_width, format_height, + destination_bytesperlines[0], uv_off, + destination_bytesperlines[0] * format_height); + } else { + destination_sizes[0] = destination_bytesperlines[0] * + format_height; + for (j = 1; j < destination_planes_count; j++) + destination_sizes[j] = destination_sizes[0] / 2; + } } /* @@ -514,6 +597,18 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * + ANNEX_B (only supported menu values per Phase 0 v4l2_inventory). */ { + /* + * iter40: per-driver HEVC start_code menu value. rkvdec / + * hantro path uses ANNEX_B + start-code-prepended payload. + * rpi-hevc-dec uses NONE — confirmed empirically Phase 7 + * (any other mode → V4L2_BUF_FLAG_ERROR on every CAPTURE + * DQBUF, all-zero output). kdirect's strace also shows + * start_code=0 on rpi-hevc-dec. Both are accepted by the + * driver's QUERY_EXT_CTRL menu (min=0 max=1), but only NONE + * actually drives correct decode on the Pi. + */ + bool is_rpi = (driver_data->video_fd == + driver_data->video_fd_rpi_hevc_dec); struct v4l2_ext_control hevc_dev_ctrls[2] = { { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, @@ -521,7 +616,9 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, }, { .id = V4L2_CID_STATELESS_HEVC_START_CODE, - .value = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, + .value = is_rpi + ? 
0 /* V4L2_STATELESS_HEVC_START_CODE_NONE */ + : V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, }, }; (void)v4l2_set_controls(driver_data->video_fd, -1, @@ -554,18 +651,29 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id, * commit will replace this hardcoded assignment with a runtime * read of the kernel's accepted START_CODE value. */ - switch (config_object->profile) { - case VAProfileH264Main: - case VAProfileH264High: - case VAProfileH264ConstrainedBaseline: - case VAProfileH264MultiviewHigh: - case VAProfileH264StereoHigh: - case VAProfileHEVCMain: - context_object->h264_start_code = true; - break; - default: - context_object->h264_start_code = false; - break; + { + bool is_rpi = (driver_data->video_fd == + driver_data->video_fd_rpi_hevc_dec); + switch (config_object->profile) { + case VAProfileH264Main: + case VAProfileH264High: + case VAProfileH264ConstrainedBaseline: + case VAProfileH264MultiviewHigh: + case VAProfileH264StereoHigh: + context_object->h264_start_code = true; + break; + case VAProfileHEVCMain: + /* iter40: rpi-hevc-dec rejects start-code-prepended + * payload (DQBUF error flag on every CAPTURE buffer). + * Gate to match the per-driver START_CODE menu value + * set above: NONE on rpi → no prepend; ANNEX_B on + * rkvdec → prepend. 
*/ + context_object->h264_start_code = !is_rpi; + break; + default: + context_object->h264_start_code = false; + break; + } } rc = v4l2_set_stream(driver_data->video_fd, output_type, true); diff --git a/src/image.c b/src/image.c index f20e6ca..0451a29 100644 --- a/src/image.c +++ b/src/image.c @@ -40,6 +40,7 @@ #include #include "nv15.h" +#include "nv12_col128.h" #include "tiled_yuv.h" #include "utils.h" #include "v4l2.h" @@ -104,6 +105,25 @@ VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format, size = 0; for (i = 0; i < destination_planes_count; i++) size += destination_sizes[i]; + } else if (format->fourcc == VA_FOURCC_NV12 && + video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) { + /* + * iter40 Phase 5 review F2: NC12 source, NV12 image output. + * V4L2-reported destination_bytesperlines[0] is the NC12 + * column stride (= ALIGN(height,8) * 3/2 — e.g. 1080 for + * 1280×720), NOT the linear NV12 Y stride. Override to the + * linear stride (width) so VAImage pitches reflect the + * detile-output layout the consumer reads. + */ + destination_bytesperlines[0] = width; + destination_sizes[0] = destination_bytesperlines[0] * format_height; + for (i = 1; i < destination_planes_count; i++) { + destination_bytesperlines[i] = destination_bytesperlines[0]; + destination_sizes[i] = destination_sizes[0] / 2; + } + size = 0; + for (i = 0; i < destination_planes_count; i++) + size += destination_sizes[i]; } else { /* NV12: V4L2 stride is correct, sizes derived from height. */ destination_sizes[0] = destination_bytesperlines[0] * format_height; @@ -236,14 +256,31 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data, } for (i = 0; i < surface_object->destination_planes_count; i++) { -#ifdef __arm__ + /* + * iter40 Phase 5 review F1: guard extended from __arm__ to + * __arm__ || __aarch64__. 
Without this, the detile primitives + * silently compiled out on aarch64 (fresnel RK3399, ampere + * RK3588, higgs Pi CM5) and the memcpy fall-through delivered + * raw tiled bytes to NV12/P010 image consumers. iter39 5/5 + * PASS masked the issue because no 10-bit path was exercised. + */ +#if defined(__arm__) || defined(__aarch64__) + /* + * Sunxi tiled_to_planar lives in tiled_yuv.S which is + * #ifdef __arm__ — symbol absent on aarch64. Keep this + * branch arm-only; aarch64 Sunxi support would need a C or + * aarch64-ASM port (no Sunxi aarch64 board in current fleet). + */ +#if defined(__arm__) if (!video_format_is_linear(driver_data->video_format)) tiled_to_planar(surface_object->destination_data[i], buffer_object->data + image->offsets[i], image->pitches[i], image->width, i == 0 ? image->height : image->height / 2); - else if (driver_data->is_10bit && + else +#endif + if (driver_data->is_10bit && image->format.fourcc == VA_FOURCC_P010) { /* * iter39: rkvdec emits NV15 (4×10-bit packed in 5 @@ -260,12 +297,49 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data, (uint16_t *)(buffer_object->data + image->offsets[i]), image->width, plane_h, surface_object->destination_bytesperlines[i]); + } else if (driver_data->video_format != NULL && + driver_data->video_format->v4l2_format == + V4L2_PIX_FMT_NV12_COL128 && + image->format.fourcc == VA_FOURCC_NV12) { + /* + * iter40: Pi 5 rpi-hevc-dec emits NV12_COL128 (SAND + * 128-pixel-wide column tiles). Detile to linear NV12 + * via the per-plane primitive. surface_object-> + * destination_data[i] is the V4L2 CAPTURE mmap (single + * buffer, planes_count==2): i==0 is the Y plane base, + * i==1 is the UV plane base offset within the SAME + * physical buffer (per cap_pool plane[1] offset = Y + * plane size in COL128 layout). + * + * src_col_stride = destination_bytesperlines[i] = the + * kernel-reported NC12 bytesperline (column stride, + * = ALIGN(image_h, 8) * 3/2). 
Same for both planes + * since column geometry is plane-agnostic. + * + * dst stride is image->pitches[i] = image->width + * (overridden in RequestCreateImage NC12 branch above). + */ + if (i == 0) { + nv12_col128_detile_y( + (uint8_t *)(buffer_object->data + image->offsets[i]), + image->pitches[i], + surface_object->destination_data[i], + surface_object->destination_bytesperlines[i], + image->width, image->height); + } else { + nv12_col128_detile_uv( + (uint8_t *)(buffer_object->data + image->offsets[i]), + image->pitches[i], + surface_object->destination_data[i], + surface_object->destination_bytesperlines[i], + image->width, image->height / 2); + } } else { #endif memcpy(buffer_object->data + image->offsets[i], surface_object->destination_data[i], surface_object->destination_sizes[i]); -#ifdef __arm__ +#if defined(__arm__) || defined(__aarch64__) } #endif } diff --git a/src/meson.build b/src/meson.build index c05b63e..6943ce2 100644 --- a/src/meson.build +++ b/src/meson.build @@ -52,6 +52,7 @@ sources = [ 'vp9.c', 'codec.c', 'nv15.c', + 'nv12_col128.c', # Vendored GStreamer 1.28.2 H.265 parser + utilities (LGPL v2.1+, # see src/h265_parser/gst_compat.h for sourcing notes + per-iter2 @@ -88,6 +89,7 @@ headers = [ 'vp9.h', 'codec.h', 'nv15.h', + 'nv12_col128.h', # Internal mirror of Linux 7.0 V4L2 HEVC EXT_SPS_*_RPS UAPI defs # (allows building against pre-7.0 linux-api-headers; redundant diff --git a/src/nv12_col128.c b/src/nv12_col128.c new file mode 100644 index 0000000..7817043 --- /dev/null +++ b/src/nv12_col128.c @@ -0,0 +1,114 @@ +/* + * V4L2_PIX_FMT_NV12_COL128 → linear NV12 detile primitive. Pi 5 / CM5 + * rpi-hevc-dec CAPTURE. iter40 (2026-05-17). + * + * Math derived from kernel hevc_d_video.c (size formula) + + * ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h (per-pixel offset). 
Each + * output row is copied with one memcpy per 128-byte tile column it + * touches; the last copy in a row is shortened when the width is not + * a multiple of 128. + * + * No NEON / SIMD here — correctness first. Each output row generates + * (width / 128) + ~1 memcpys of up to 128 bytes; for 1920x1080 that's + * ~17000 small memcpys per frame, fine for Phase 1 PoC. + */ + +#include "nv12_col128.h" + +#include + +/* + * Tile column width in bytes. The 'COL128' name embeds this; if it ever + * varies, take it from V4L2_PIX_FMT_NV12_COL128's kernel definition. + */ +#define NC12_TILE_W 128 + +/* + * Common Y / UV plane detile — the layout is identical (single-byte per + * pixel, column-major 128-wide tiles). The only thing that varies is + * what plane the caller passes in. width here is plane width in bytes + * (= image width for both Y and CbCr-interleaved NV12 UV); height is + * plane height in pixels (image height for Y, image height / 2 for UV). + */ +static void nv12_col128_detile_plane(uint8_t *dst, unsigned int dst_stride, + const uint8_t *src, + unsigned int src_col_stride, + unsigned int width, unsigned int height) +{ + unsigned int y, x; + + for (y = 0; y < height; y++) { + uint8_t *drow = dst + y * dst_stride; + x = 0; + while (x < width) { + unsigned int col = x / NC12_TILE_W; + unsigned int in_col = x % NC12_TILE_W; + unsigned int n = NC12_TILE_W - in_col; + if (n > width - x) + n = width - x; + /* + * Source byte = base + col*128*col_stride + y*128 + in_col + * Copy n contiguous bytes (all within this tile column, + * since n is capped at the remaining width-in-column). 
+ */ + const uint8_t *p = src + + (size_t)col * NC12_TILE_W * src_col_stride + + (size_t)y * NC12_TILE_W + + in_col; + memcpy(drow + x, p, n); + x += n; + } + } +} + +void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride, + const uint8_t *src_y, unsigned int src_col_stride, + unsigned int width, unsigned int height) +{ + nv12_col128_detile_plane(dst, dst_stride, src_y, src_col_stride, + width, height); +} + +void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride, + const uint8_t *src_uv, unsigned int src_col_stride, + unsigned int width, unsigned int uv_height) +{ + /* UV plane (CbCr interleaved): byte-width equals Y-plane width + * (one Cb + one Cr per 2x2 Y block → 2 bytes per 2 horizontal Y + * samples → 1 byte per Y pixel horizontally). Height is half. */ + nv12_col128_detile_plane(dst, dst_stride, src_uv, src_col_stride, + width, uv_height); +} + +unsigned int nv12_col128_uv_plane_offset(unsigned int image_width, + unsigned int image_height) +{ + unsigned int aligned_h = (image_height + 7) & ~7u; + + /* + * In the COL128 SAND layout, Y and UV are NOT separate planes + * concatenated end-to-end. Within EACH 128-pixel-wide column: + * first 128 * height bytes = Y data for this column strip + * next 128 * height / 2 bytes = UV data for this column strip + * total 128 * bytesperline (= 128 * height * 3/2) bytes per column + * + * The "UV plane base" pointer (data[1] in AVFrame convention) is + * just data[0] + (128 * height) — the offset of the UV bytes + * WITHIN the first column. All subsequent UV bytes are reached by + * the same column-stride arithmetic the Y plane uses (col * + * 128 * bytesperline + y * 128 + in_col), so passing this offset + * pointer + iterating y over [0, height/2) traverses all UV rows + * across all columns correctly. + * + * Earlier wrong formula was num_columns * 128 * aligned_h (i.e. + * sizeof(linear Y plane)) — that pushed past the end of the SAND + * buffer because the layout isn't planes-end-to-end. 
+ * + * Cross-check: kernel sizeimage = bytesperline * width = + * (aligned_h * 3/2) * num_columns * 128 = num_columns * 128 * + * aligned_h * 3/2. Per column: 128 * aligned_h * 3/2. Y portion + * per column: 128 * aligned_h. UV portion per column: half of Y. + * Sum across columns: matches sizeimage. + */ + return NC12_TILE_W * aligned_h; +} diff --git a/src/nv12_col128.h b/src/nv12_col128.h new file mode 100644 index 0000000..17798fb --- /dev/null +++ b/src/nv12_col128.h @@ -0,0 +1,88 @@ +/* + * V4L2_PIX_FMT_NV12_COL128 (NC12) SAND-tiled → linear NV12 detile. + * + * Pi 5 / CM5 (BCM2712) rpi-hevc-dec CAPTURE format. iter40 (2026-05-17). + * + * Layout (kernel drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c + * size-formula + ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h per-pixel + * offset math): + * + * width ALIGN(image_width, 128) -- columns are 128 px wide + * height ALIGN(image_height, 8) + * col_stride (= bytesperline) = height * 3 / 2 + * (bytes per [128-wide column] vertical unit incl. Y + UV) + * sizeimage = col_stride * width = total bytes + * + * For pixel (x, y) in the Y plane: + * col = x / 128 + * in_col_x = x % 128 + * offset = col * col_stride * 128 + y * 128 + in_col_x + * + * UV data starts at offset (128 * height) within each column — the same + * per-column layout, h/2 rows tall (CbCr interleaved). + * + * The primitive copies the entire image extent at once. width/height are + * the cropped consumer-visible dimensions; src_col_stride is the kernel- + * reported bytesperline (i.e. ALIGN(height,8) * 3/2). + */ + +#ifndef _NV12_COL128_H_ +#define _NV12_COL128_H_ + +#include + +#include + +/* + * Pre-Pi-kernel headers (Arch ALARM linux-api-headers, older mainline + * kernel-headers packages) may not define V4L2_PIX_FMT_NV12_COL128. The + * fourcc is Pi-specific. Provide a private fallback so the backend + * builds on hosts that target NON-Pi codecs too. 
+ */ +#ifndef V4L2_PIX_FMT_NV12_COL128 +#define V4L2_PIX_FMT_NV12_COL128 \ + ((unsigned int)('N') | ((unsigned int)('C') << 8) | \ + ((unsigned int)('1') << 16) | ((unsigned int)('2') << 24)) +#endif + +#ifndef V4L2_PIX_FMT_NV12_10_COL128 +/* 10-bit SAND variant: 3 pixels packed into 4 bytes in 128-byte / 96-pixel + * wide columns. iter40 references the fourcc for completeness; the 10-bit + * Pi 5 HEVC chapter (Main10) is post-iter40. */ +#define V4L2_PIX_FMT_NV12_10_COL128 \ + ((unsigned int)('N') | ((unsigned int)('C') << 8) | \ + ((unsigned int)('3') << 16) | ((unsigned int)('0') << 24)) +#endif + +/* Detile the Y plane of an NC12 source to a linear NV12 Y plane. + * dst : pointer to linear NV12 Y plane (caller-owned, dst_stride * height bytes) + * dst_stride : linear Y plane stride in bytes (= width for plain NV12) + * src_y : pointer to start of NC12 Y plane (= NC12 buffer base) + * src_col_stride: kernel-reported bytesperline (= ALIGN(height,8) * 3/2) + * width, height: cropped image dimensions in pixels + */ +void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride, + const uint8_t *src_y, unsigned int src_col_stride, + unsigned int width, unsigned int height); + +/* Detile the UV plane (CbCr interleaved, half-height) of an NC12 source. + * dst : pointer to linear NV12 UV plane + * dst_stride : linear UV plane stride in bytes (= width for NV12) + * src_uv : pointer to start of NC12 UV plane (= src_y + Y-plane-size) + * src_col_stride: same as Y plane (same column geometry) + * width : Y-plane width in pixels (UV plane has same byte width) + * uv_height : UV plane height = height / 2 + */ +void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride, + const uint8_t *src_uv, unsigned int src_col_stride, + unsigned int width, unsigned int uv_height); + +/* Compute the offset of the UV plane within an NC12 buffer. 
+ * image_width, image_height: cropped image dimensions in pixels + * Returns: byte offset from buffer start to UV plane start + * (= 128 * ALIGN(image_height, 8); image_width is unused) + */ +unsigned int nv12_col128_uv_plane_offset(unsigned int image_width, + unsigned int image_height); + +#endif /* _NV12_COL128_H_ */ diff --git a/src/nv15.h b/src/nv15.h index 3c8605a..039d620 100644 --- a/src/nv15.h +++ b/src/nv15.h @@ -27,6 +27,21 @@ #include +#include + +/* + * Older or downstream linux-api-headers / kernel-headers packages may + * not define V4L2_PIX_FMT_NV15. Provide a fallback so the backend + * builds on hosts whose headers are pre-NV15-merge or omit it (e.g. + * Pi 5 Debian trixie 6.12.62 headers include NC12 but not NV15). + * Same numeric value as mainline. + */ +#ifndef V4L2_PIX_FMT_NV15 +#define V4L2_PIX_FMT_NV15 \ + ((unsigned int)('N') | ((unsigned int)('V') << 8) | \ + ((unsigned int)('1') << 16) | ((unsigned int)('5') << 24)) +#endif + /* * Unpack one plane of V4L2_PIX_FMT_NV15 (4 × 10-bit values packed into * 5 consecutive bytes, LSB-first) into VA_FOURCC_P010 (16-bit per pixel, diff --git a/src/request.c b/src/request.c index 20884be..ed97fe0 100644 --- a/src/request.c +++ b/src/request.c @@ -93,6 +93,7 @@ static const char * const known_decoder_drivers[] = { "rkvdec", "hantro-vpu", + "rpi-hevc-dec", /* iter40: Pi 5 / CM5 stateless HEVC */ "cedrus", "sun4i_csi", NULL @@ -431,12 +432,31 @@ int request_switch_device_for_profile(struct request_data *driver_data, char kind = request_device_kind_for_profile(profile); int target_video, target_media; + /* + * iter40: HEVC override when rpi-hevc-dec is probed. The static + * table (request_device_kind_for_profile) maps HEVC → 'r' (rkvdec) + * because that's the canonical RK path. On Pi 5 there's no rkvdec + * — rpi-hevc-dec is the only decoder. When BOTH would be present + * (hypothetical mixed board), prefer rpi-hevc-dec for HEVC. 
+ * + * Other rkvdec-routed profiles (VP9, H.264) stay on 'r' because + * rpi-hevc-dec is HEVC-only. + */ + if ((profile == VAProfileHEVCMain || profile == VAProfileHEVCMain10) && + driver_data->video_fd_rpi_hevc_dec >= 0 && + driver_data->media_fd_rpi_hevc_dec >= 0) { + kind = 'p'; + } + if (kind == 'r') { target_video = driver_data->video_fd_rkvdec; target_media = driver_data->media_fd_rkvdec; } else if (kind == 'h') { target_video = driver_data->video_fd_hantro; target_media = driver_data->media_fd_hantro; + } else if (kind == 'p') { + target_video = driver_data->video_fd_rpi_hevc_dec; + target_media = driver_data->media_fd_rpi_hevc_dec; } else { return -1; } @@ -624,6 +644,8 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context) driver_data->media_fd_rkvdec = -1; driver_data->video_fd_hantro = -1; driver_data->media_fd_hantro = -1; + driver_data->video_fd_rpi_hevc_dec = -1; + driver_data->media_fd_rpi_hevc_dec = -1; /* * iter38: probe BOTH rkvdec and hantro-vpu so a single libva session @@ -654,6 +676,15 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context) alt_driver = "rkvdec"; driver_data->video_fd_hantro = video_fd; driver_data->media_fd_hantro = media_fd; + } else if (strcmp(info.driver, "rpi-hevc-dec") == 0) { + /* iter40: Pi 5 / CM5 — sole decoder is rpi-hevc-dec. + * No alt driver to probe; the rkvdec / hantro slots + * stay -1 and HEVC routes to 'p' via the override in + * request_switch_device_for_profile. 
*/ + primary_driver = "rpi-hevc-dec"; + alt_driver = NULL; + driver_data->video_fd_rpi_hevc_dec = video_fd; + driver_data->media_fd_rpi_hevc_dec = media_fd; } } @@ -693,11 +724,19 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context) probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rkvdec); driver_data->has_hevc_ext_sps_rps_hantro = probe_hevc_ext_sps_rps_controls(driver_data->video_fd_hantro); + driver_data->has_hevc_ext_sps_rps_rpi_hevc_dec = + probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rpi_hevc_dec); if (driver_data->has_hevc_ext_sps_rps_rkvdec) { request_log("iter2: kernel registers HEVC EXT_SPS_{ST,LT}_RPS " "controls on rkvdec fd (will route through " "vendored GStreamer parser)\n"); } + if (driver_data->video_fd_rpi_hevc_dec >= 0) { + request_log("iter40: also opened rpi-hevc-dec at video_fd=%d " + "media_fd=%d (Pi 5 HEVC stateless)\n", + driver_data->video_fd_rpi_hevc_dec, + driver_data->media_fd_rpi_hevc_dec); + } status = VA_STATUS_SUCCESS; goto complete; diff --git a/src/request.h b/src/request.h index 2d67e2f..6c3c9a2 100644 --- a/src/request.h +++ b/src/request.h @@ -78,6 +78,15 @@ struct request_data { int media_fd_rkvdec; int video_fd_hantro; int media_fd_hantro; + /* + * iter40: third multi-device-probe slot for rpi-hevc-dec (Pi 5 / + * CM5 / BCM2712). V4L2 stateless HEVC; CAPTURE is NC12/NC30 SAND + * 128-pixel-wide column tiled (Pi-specific). On Pi 5 this is the + * ONLY decoder slot; on RK hosts it stays -1 and HEVC routes to + * rkvdec as before. + */ + int video_fd_rpi_hevc_dec; + int media_fd_rpi_hevc_dec; /* * iter2 (ampere-kernel-decoders campaign) — per-fd probe result @@ -98,6 +107,12 @@ struct request_data { */ bool has_hevc_ext_sps_rps_rkvdec; bool has_hevc_ext_sps_rps_hantro; + /* iter40: rpi-hevc-dec doesn't expose EXT_SPS_*_RPS controls + * (verified Phase 0 higgs probe: QUERY_EXT_CTRL on 0xa97 → EINVAL). 
+ * Probed for consistency with the iter2 pair-of-flags pattern; + * stays false on Pi 5 and the iter2 vendored-parser path naturally + * doesn't engage. */ + bool has_hevc_ext_sps_rps_rpi_hevc_dec; /* * iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in diff --git a/src/video.c b/src/video.c index 04cbf25..f160a92 100644 --- a/src/video.c +++ b/src/video.c @@ -31,6 +31,8 @@ #include #include +#include "nv12_col128.h" /* fallback V4L2_PIX_FMT_NV12_COL128 define */ +#include "nv15.h" /* fallback V4L2_PIX_FMT_NV15 define */ #include "utils.h" #include "video.h" @@ -55,6 +57,28 @@ static struct video_format formats[] = { .planes_count = 2, .bpp = 24, }, + { + /* + * iter40: Pi 5 / CM5 rpi-hevc-dec CAPTURE format. 8-bit NV12 + * stored as 128-pixel-wide column tiles (SAND128 layout). + * Pi-specific; not in mainline drm_fourcc.h (uses NV12 + a + * BROADCOM_SAND128 modifier for DRM_PRIME). Our consumer path + * always detiles to linear NV12 in copy_surface_to_image, so + * we don't expose the SAND modifier downstream — drm_format is + * still DRM_FORMAT_NV12 and drm_modifier MOD_NONE so the + * format-is-linear gate doesn't pull us into tiled_to_planar + * (Sunxi-specific). image.c branches on v4l2_format == + * V4L2_PIX_FMT_NV12_COL128 to invoke the dedicated detile. + */ + .description = "NV12 SAND128 (8-bit, rpi-hevc-dec)", + .v4l2_format = V4L2_PIX_FMT_NV12_COL128, + .v4l2_buffers_count = 1, + .v4l2_mplane = true, + .drm_format = DRM_FORMAT_NV12, + .drm_modifier = DRM_FORMAT_MOD_NONE, + .planes_count = 2, + .bpp = 16, + }, // Code to handle this DRM_FORMAT is __arm__ only #ifdef __arm__ { diff --git a/tests/test_nv12_col128_detile.c b/tests/test_nv12_col128_detile.c new file mode 100644 index 0000000..87d1b39 --- /dev/null +++ b/tests/test_nv12_col128_detile.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2026 claude-noether + * + * MIT-licensed per project. iter40 self-test for nv12_col128 detile. 
+ *
+ * Build an NC12-tiled source buffer from a known linear NV12 image,
+ * run the detile primitive, assert output matches the original. No
+ * hardware needed — pure bit-layout verification of the kernel math
+ * (drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
+ * V4L2_PIX_FMT_NV12_COL128 case + ffmpeg/Kynesim per-pixel offset).
+ *
+ * Build:
+ *   cc -Wall -Werror -O2 -o test_nv12_col128_detile \
+ *      tests/test_nv12_col128_detile.c src/nv12_col128.c
+ *
+ * Exit 0 = all checks pass. Exit 1 = first mismatch (details on stderr)
+ * or an allocation failure.
+ */
+
+#include "../src/nv12_col128.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define TILE_W 128
+
+/* Round v up to the next multiple of a. The mask form is only correct
+ * when a is a power of two; every caller in this file passes 128 or 8. */
+static unsigned int align_up(unsigned int v, unsigned int a)
+{
+	return (v + a - 1) & ~(a - 1);
+}
+
+/* Return ptr unchanged, or print a diagnostic and exit(1) when it is
+ * NULL. Used instead of assert() so an allocation failure is still
+ * caught when the test is compiled with -DNDEBUG. */
+static void *alloc_or_die(void *ptr, const char *what)
+{
+	if (ptr == NULL) {
+		fprintf(stderr, "FAIL out of memory allocating %s\n", what);
+		exit(1);
+	}
+	return ptr;
+}
+
+/* Pack a linear plane (width × height bytes, stride=width) into NC12
+ * layout: each 128-wide column held contiguously, columns at offsets
+ * col * col_stride * 128, and row y of a column at y * 128 inside it.
+ * col_stride is the kernel-reported bytesperline
+ * = ALIGN(height, 8) * 3/2.
+ *
+ * Bytes that no source pixel maps to (right-hand padding of the last
+ * column, rows past `height`) stay zero because the buffer comes from
+ * calloc().
+ *
+ * Writes the column stride to *out_col_stride and the buffer size in
+ * bytes to *out_size. Returns the buffer; the caller free()s it. */
+static uint8_t *pack_to_nc12(const uint8_t *linear,
+			     unsigned int width, unsigned int height,
+			     unsigned int *out_col_stride,
+			     size_t *out_size)
+{
+	unsigned int aligned_w = align_up(width, TILE_W);
+	unsigned int aligned_h = align_up(height, 8);
+	unsigned int col_stride = aligned_h * 3 / 2;
+	unsigned int num_cols = aligned_w / TILE_W;
+	size_t total = (size_t)col_stride * aligned_w;
+	uint8_t *buf;
+	unsigned int col, y, in_col;
+
+	buf = alloc_or_die(calloc(1, total), "tiled buffer");
+
+	for (col = 0; col < num_cols; col++) {
+		uint8_t *col_base = buf + (size_t)col * TILE_W * col_stride;
+		for (y = 0; y < height; y++) {
+			for (in_col = 0; in_col < TILE_W; in_col++) {
+				unsigned int x = col * TILE_W + in_col;
+				/* Last column may be only partly covered
+				 * by the image. */
+				if (x >= width)
+					break;
+				col_base[(size_t)y * TILE_W + in_col] =
+					linear[(size_t)y * width + x];
+			}
+		}
+	}
+
+	*out_col_stride = col_stride;
+	*out_size = total;
+	return buf;
+}
+
+/* Round-trip check for the luma detile: fill a width × height linear
+ * plane with a position-dependent pattern, tile it with pack_to_nc12(),
+ * run nv12_col128_detile_y() with a destination stride of `width`, and
+ * compare byte for byte. Prints PASS, or prints FAIL and exits 1 on the
+ * first mismatch. */
+static void test_detile_y(unsigned int width, unsigned int height)
+{
+	uint8_t *linear, *tiled, *recovered;
+	unsigned int col_stride;
+	size_t tile_size, i;
+
+	linear = alloc_or_die(malloc((size_t)width * height),
+			      "linear Y plane");
+	/* Distinctive content per pixel: y * 17 + x * 13 — avoids byte-
+	 * aliasing patterns that could mask off-by-one bugs. */
+	for (unsigned int y = 0; y < height; y++)
+		for (unsigned int x = 0; x < width; x++)
+			linear[(size_t)y * width + x] = (uint8_t)(y * 17 + x * 13);
+
+	tiled = pack_to_nc12(linear, width, height, &col_stride, &tile_size);
+
+	recovered = alloc_or_die(calloc(1, (size_t)width * height),
+				 "recovered Y plane");
+
+	nv12_col128_detile_y(recovered, width, tiled, col_stride, width, height);
+
+	for (i = 0; i < (size_t)width * height; i++) {
+		if (recovered[i] != linear[i]) {
+			fprintf(stderr,
+				"FAIL %ux%u Y: pixel %zu (x=%zu y=%zu) "
+				"linear=0x%02x recovered=0x%02x\n",
+				width, height, i,
+				i % width, i / width,
+				linear[i], recovered[i]);
+			free(linear); free(tiled); free(recovered);
+			exit(1);
+		}
+	}
+	printf("PASS %ux%u Y plane (%u columns, col_stride=%u, tile_size=%zu)\n",
+	       width, height, align_up(width, TILE_W) / TILE_W,
+	       col_stride, tile_size);
+
+	free(linear);
+	free(tiled);
+	free(recovered);
+}
+
+/* Round-trip check for the chroma detile, for a frame of the given
+ * width × height. The chroma plane is treated as width bytes by
+ * height / 2 rows and is packed on its own, so col_stride here is
+ * derived from the half height, not from the full frame. This checks
+ * the per-column addressing of nv12_col128_detile_uv(); where the
+ * chroma data starts inside a real frame is covered by
+ * test_uv_offset(). Prints PASS, or prints FAIL and exits 1 on the
+ * first mismatch. */
+static void test_detile_uv(unsigned int width, unsigned int height)
+{
+	unsigned int uv_h = height / 2;
+	uint8_t *linear, *tiled, *recovered;
+	unsigned int col_stride;
+	size_t tile_size, i;
+
+	linear = alloc_or_die(malloc((size_t)width * uv_h),
+			      "linear UV plane");
+	for (unsigned int y = 0; y < uv_h; y++)
+		for (unsigned int x = 0; x < width; x++)
+			linear[(size_t)y * width + x] = (uint8_t)(y * 23 + x * 7);
+
+	tiled = pack_to_nc12(linear, width, uv_h, &col_stride, &tile_size);
+
+	recovered = alloc_or_die(calloc(1, (size_t)width * uv_h),
+				 "recovered UV plane");
+
+	nv12_col128_detile_uv(recovered, width, tiled, col_stride, width, uv_h);
+
+	for (i = 0; i < (size_t)width * uv_h; i++) {
+		if (recovered[i] != linear[i]) {
+			fprintf(stderr,
+				"FAIL %ux%u UV: pixel %zu linear=0x%02x recovered=0x%02x\n",
+				width, height, i,
+				linear[i], recovered[i]);
+			free(linear); free(tiled); free(recovered);
+			exit(1);
+		}
+	}
+	printf("PASS %ux%u UV plane\n", width, height);
+
+	free(linear);
+	free(tiled);
+	free(recovered);
+}
+
+/* Check nv12_col128_uv_plane_offset() against the documented formula
+ * 128 * ALIGN(height, 8). Exits 1 on the first wrong value. */
+static void test_uv_offset(void)
+{
+	/* Per the SAND COL128 layout, Y and UV are interleaved within
+	 * EACH column (not concatenated as separate planes), so the UV
+	 * plane base pointer is offset by 128 * ALIGN(height, 8) — the
+	 * Y portion of column 0. NOT 128 * height * num_columns (the
+	 * size of all Y across all columns), which was an earlier wrong
+	 * formula caught by Phase 7 SEGV on higgs. */
+	unsigned int off = nv12_col128_uv_plane_offset(1280, 720);
+	if (off != 128u * 720) {
+		fprintf(stderr, "FAIL UV offset 1280×720: got %u expected %u\n",
+			off, 128u * 720);
+		exit(1);
+	}
+	printf("PASS UV offset 1280×720 = %u\n", off);
+
+	off = nv12_col128_uv_plane_offset(1366, 768);
+	if (off != 128u * 768) {
+		fprintf(stderr, "FAIL UV offset 1366×768: got %u expected %u\n",
+			off, 128u * 768);
+		exit(1);
+	}
+	printf("PASS UV offset 1366×768 (column-misaligned width)\n");
+
+	/* 720 and 768 are already multiples of 8, so the two cases above
+	 * cannot tell 128 * height from 128 * ALIGN(height, 8). 724 is
+	 * not a multiple of 8 and must round up to 728. */
+	off = nv12_col128_uv_plane_offset(1280, 724);
+	if (off != 128u * 728) {
+		fprintf(stderr, "FAIL UV offset 1280×724: got %u expected %u\n",
+			off, 128u * 728);
+		exit(1);
+	}
+	printf("PASS UV offset 1280×724 (height not a multiple of 8)\n");
+}
+
+/* Run every case in order. Each test exits 1 on its first failure, so
+ * reaching the end means everything passed. */
+int main(void)
+{
+	/* Phase 3 fixture sizes — all 128-aligned, 8-line-aligned. */
+	test_detile_y(640, 360);
+	test_detile_y(1280, 720);
+	test_detile_y(1920, 1080);
+
+	/* Phase 5 review F4: column-misaligned width (1366 → 1408 padding). */
+	test_detile_y(1366, 768);
+
+	/* UV plane (half-height) at each width. The half heights 180 and
+	 * 540 are not multiples of 8, so these also exercise the row
+	 * padding in pack_to_nc12(). */
+	test_detile_uv(640, 360);
+	test_detile_uv(1280, 720);
+	test_detile_uv(1920, 1080);
+	test_detile_uv(1366, 768);
+
+	test_uv_offset();
+
+	printf("All NC12 detile asserts pass.\n");
+	return 0;
+}