iter40: Pi 5 HEVC chapter — backend integration lands, bit-exact pending

Phase 6 implementation. Backend builds clean on higgs (Debian 13
trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec,
multi-device probe finds /dev/video19 + /dev/media1, CreateContext
+ S_FMT + REQBUFS + STREAMON all succeed.

Phase 7 partial: infrastructure works, 10 frames flow through the
pipeline (correct byte counts produced — 13824000 for 1280x720 x 10
NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR
so output content is wrong (libva sha != kdirect sha). The decode
itself is failing on the rpi-hevc-dec side despite all ctrl
submissions returning success.

Code changes:
- request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots +
  has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2
  pair-of-flags pattern, naturally false on Pi).
- request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver
  probe gets an else-if branch setting the new fds (Phase 5 F3);
  request_switch_device_for_profile prefers 'p' for HEVC when
  rpi-hevc-dec present.
- context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat
  taken from video_format slot (not hardcoded NV12/NV15);
  synthetic-SPS pre-seed gated off for Pi (Phase 5 F6);
  destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND
  layout (Phase 5 F2);
  per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK);
  per-driver context_object->h264_start_code (skip prepend on Pi).
- video.c: NV12_COL128 video_format entry (8-bit SAND, single
  buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch
  fires rather than tiled_to_planar).
- nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel
  hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel
  offset). UV plane offset = 128 * ALIGN(h, 8) — within-column
  (SAND interleaves Y+UV per column, NOT plane-concatenated;
  earlier wrong formula caught by Phase 7 SEGV).
- image.c: #ifdef __arm__ extended to __arm__ || __aarch64__
  (Phase 5 F1 — guard was killing detile path on all aarch64
  hosts including fresnel iter39 NV15 path, masked because 10-bit
  never exercised); RequestCreateImage NC12 → NV12 stride override
  (linear width, not column-stride); copy_surface_to_image NC12
  detile branch (gates on fourcc + v4l2_format).
- nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers
  omit it though they have NC12).
- nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 +
  V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers).
- tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test;
  passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned;
  UV-offset helper).
- meson.build: nv12_col128 sources listed.

Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame
S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls
SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix;
field ordering differs). Likely the slice_array contents need
per-driver handling for rpi-hevc-dec's expected layout. Beyond
in-session reach.

iter38 5/5 baseline on fresnel + ampere should be unaffected (new
fd stays -1 on non-Pi hosts; all gates either short-circuit on
fd-not-present or no-op).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 19:17:14 +00:00
parent f1be489c75
commit 3ffa9d0d17
10 changed files with 706 additions and 31 deletions
+196
View File
@@ -0,0 +1,196 @@
/*
* Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
*
* MIT-licensed per project. iter40 self-test for nv12_col128 detile.
*
* Build an NC12-tiled source buffer from a known linear NV12 image,
* run the detile primitive, assert output matches the original. No
* hardware needed — pure bit-layout verification of the kernel math
* (drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
* V4L2_PIX_FMT_NV12_COL128 case + ffmpeg/Kynesim per-pixel offset).
*
* Build:
* cc -Wall -Werror -O2 -o test_nv12_col128_detile \
* tests/test_nv12_col128_detile.c src/nv12_col128.c
*
* Exit 0 = all asserts pass.
*/
#include "../src/nv12_col128.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Width in bytes of one NC12 column (the "128" in COL128). */
#define TILE_W 128

/*
 * Round v up to the next multiple of a.
 *
 * Implemented with a bit mask, so the result is only a true round-up
 * when a is a power of two (every call in this file passes 8 or TILE_W).
 */
static unsigned int align_up(unsigned int v, unsigned int a)
{
	const unsigned int mask = a - 1;

	return (v + mask) & ~mask;
}
/*
 * Pack a linear plane (width × height bytes, stride = width) into the
 * NC12 column layout that the detile primitives take as input.
 *
 * The plane is cut into TILE_W-byte-wide columns and each column is
 * stored contiguously: row y of column c starts at byte
 *
 *     c * TILE_W * col_stride + y * TILE_W
 *
 * with col_stride = ALIGN(height, 8) * 3 / 2 (the kernel-reported
 * bytesperline formula). Bytes that no source pixel maps to — the right
 * padding of a partial last column and the rows from height up to
 * col_stride — are left zero.
 *
 * linear          source plane, width * height bytes
 * width, height   dimensions of the source plane in bytes / rows
 * out_col_stride  receives col_stride
 * out_size        receives the allocation size in bytes,
 *                 col_stride * ALIGN(width, TILE_W)
 *
 * Returns a calloc()ed buffer that the caller must free(). If the
 * allocation fails a FAIL line is written to stderr and the process
 * exits with status 1.
 */
static uint8_t *pack_to_nc12(const uint8_t *linear,
			     unsigned int width, unsigned int height,
			     unsigned int *out_col_stride,
			     size_t *out_size)
{
	unsigned int aligned_w = align_up(width, TILE_W);
	unsigned int aligned_h = align_up(height, 8);
	unsigned int col_stride = aligned_h * 3 / 2;
	unsigned int num_cols = aligned_w / TILE_W;
	size_t total = (size_t)col_stride * aligned_w;
	uint8_t *buf = calloc(1, total);

	/* Checked explicitly instead of with assert(): assert() expands to
	 * nothing when NDEBUG is defined, which would turn an allocation
	 * failure into a write through NULL below. */
	if (buf == NULL) {
		fprintf(stderr, "FAIL pack_to_nc12: cannot allocate %zu bytes\n",
			total);
		exit(1);
	}

	for (unsigned int col = 0; col < num_cols; col++) {
		uint8_t *col_base = buf + (size_t)col * TILE_W * col_stride;
		unsigned int x0 = col * TILE_W;
		/* x0 < width for every column (the last column starts at
		 * aligned_w - TILE_W, which is below width), so this cannot
		 * wrap. Only the last column can be narrower than TILE_W. */
		unsigned int run = width - x0;

		if (run > TILE_W)
			run = TILE_W;

		for (unsigned int y = 0; y < height; y++)
			memcpy(col_base + (size_t)y * TILE_W,
			       linear + (size_t)y * width + x0, run);
	}

	*out_col_stride = col_stride;
	*out_size = total;
	return buf;
}
/*
 * Round-trip self-test for the luma (Y) detile path.
 *
 * Fills a width × height linear plane with a position-dependent byte
 * pattern, packs it into NC12 columns with pack_to_nc12(), runs
 * nv12_col128_detile_y() on the packed buffer and compares the output
 * byte-for-byte against the original plane.
 *
 * Prints a PASS line to stdout on success. On the first mismatching
 * byte, or if a working buffer cannot be allocated, prints a FAIL line
 * to stderr and exits the process with status 1.
 */
static void test_detile_y(unsigned int width, unsigned int height)
{
	const size_t plane_size = (size_t)width * height;
	uint8_t *linear, *tiled, *recovered;
	unsigned int col_stride;
	size_t tile_size, i;

	/* Explicit checks rather than assert(): assert() is compiled out
	 * under NDEBUG, and the buffers are written to immediately. */
	linear = malloc(plane_size);
	recovered = calloc(1, plane_size);
	if (linear == NULL || recovered == NULL) {
		fprintf(stderr, "FAIL %ux%u Y: out of memory\n", width, height);
		free(linear);
		free(recovered);
		exit(1);
	}

	/* Distinctive content per pixel: y * 17 + x * 13 (truncated to one
	 * byte) — avoids byte-aliasing patterns that could mask off-by-one
	 * bugs. */
	for (unsigned int y = 0; y < height; y++)
		for (unsigned int x = 0; x < width; x++)
			linear[(size_t)y * width + x] = (uint8_t)(y * 17 + x * 13);

	tiled = pack_to_nc12(linear, width, height, &col_stride, &tile_size);

	/* Arguments presumably are (dst, dst stride, src, column stride,
	 * width, height) — confirm against nv12_col128.h. */
	nv12_col128_detile_y(recovered, width, tiled, col_stride, width, height);

	for (i = 0; i < plane_size; i++) {
		if (recovered[i] != linear[i]) {
			fprintf(stderr,
				"FAIL %ux%u Y: pixel %zu (x=%zu y=%zu) "
				"linear=0x%02x recovered=0x%02x\n",
				width, height, i,
				i % width, i / width,
				linear[i], recovered[i]);
			free(linear);
			free(tiled);
			free(recovered);
			exit(1);
		}
	}

	printf("PASS %ux%u Y plane (%u columns, col_stride=%u, tile_size=%zu)\n",
	       width, height, align_up(width, TILE_W) / TILE_W,
	       col_stride, tile_size);

	free(linear);
	free(tiled);
	free(recovered);
}
/*
 * Round-trip self-test for the chroma (UV) detile path.
 *
 * Treats the UV plane as width bytes × (height / 2) rows, fills it with
 * a position-dependent byte pattern, packs it into NC12 columns with
 * pack_to_nc12(), runs nv12_col128_detile_uv() on the packed buffer and
 * compares the output byte-for-byte against the original plane.
 *
 * Prints a PASS line to stdout on success. On the first mismatching
 * byte, or if a working buffer cannot be allocated, prints a FAIL line
 * to stderr and exits the process with status 1.
 */
static void test_detile_uv(unsigned int width, unsigned int height)
{
	const unsigned int uv_h = height / 2;
	const size_t plane_size = (size_t)width * uv_h;
	uint8_t *linear, *tiled, *recovered;
	unsigned int col_stride;
	size_t tile_size, i;

	/* Explicit checks rather than assert(): assert() is compiled out
	 * under NDEBUG, and the buffers are written to immediately. */
	linear = malloc(plane_size);
	recovered = calloc(1, plane_size);
	if (linear == NULL || recovered == NULL) {
		fprintf(stderr, "FAIL %ux%u UV: out of memory\n", width, height);
		free(linear);
		free(recovered);
		exit(1);
	}

	/* Different multipliers from the Y test so the two patterns cannot
	 * be confused with each other. */
	for (unsigned int y = 0; y < uv_h; y++)
		for (unsigned int x = 0; x < width; x++)
			linear[(size_t)y * width + x] = (uint8_t)(y * 23 + x * 7);

	/* NOTE(review): the plane is packed with uv_h as its height, so
	 * col_stride here is derived from height / 2, not from the full
	 * frame height. The test is self-consistent, but the stride differs
	 * from what a full-frame buffer would report — confirm that is the
	 * intended coverage. */
	tiled = pack_to_nc12(linear, width, uv_h, &col_stride, &tile_size);

	/* Arguments presumably are (dst, dst stride, src, column stride,
	 * width, rows) — confirm against nv12_col128.h. */
	nv12_col128_detile_uv(recovered, width, tiled, col_stride, width, uv_h);

	for (i = 0; i < plane_size; i++) {
		if (recovered[i] != linear[i]) {
			fprintf(stderr,
				"FAIL %ux%u UV: pixel %zu linear=0x%02x recovered=0x%02x\n",
				width, height, i,
				linear[i], recovered[i]);
			free(linear);
			free(tiled);
			free(recovered);
			exit(1);
		}
	}

	printf("PASS %ux%u UV plane\n", width, height);

	free(linear);
	free(tiled);
	free(recovered);
}
/*
 * Check nv12_col128_uv_plane_offset() against hand-computed values for
 * two frame sizes; prints a PASS line per size, or a FAIL line to
 * stderr followed by exit(1) on the first wrong value.
 *
 * Per the SAND COL128 layout, Y and UV are interleaved within EACH
 * column (not concatenated as separate planes), so the UV plane base
 * pointer is offset by 128 * ALIGN(height, 8) — the Y portion of
 * column 0. NOT 128 * height * num_columns (the size of all Y across
 * all columns), which was an earlier wrong formula caught by Phase 7
 * SEGV on higgs.
 *
 * Both heights used here are already multiples of 8, so the expected
 * values are simply 128 * height.
 */
static void test_uv_offset(void)
{
	const unsigned int want_1280x720 = 128u * 720;
	const unsigned int want_1366x768 = 128u * 768;
	unsigned int got;

	got = nv12_col128_uv_plane_offset(1280, 720);
	if (got != want_1280x720) {
		fprintf(stderr, "FAIL UV offset 1280×720: got %u expected %u\n",
			got, want_1280x720);
		exit(1);
	}
	printf("PASS UV offset 1280×720 = %u\n", got);

	/* Width 1366 is not a multiple of the 128-byte column width. */
	got = nv12_col128_uv_plane_offset(1366, 768);
	if (got != want_1366x768) {
		fprintf(stderr, "FAIL UV offset 1366×768: got %u expected %u\n",
			got, want_1366x768);
		exit(1);
	}
	printf("PASS UV offset 1366×768 (column-misaligned width)\n");
}
/*
 * Test driver: runs the Y round-trip for every frame size, then the UV
 * round-trip for the same sizes, then the UV-offset check. Each test
 * exits the process on failure, so reaching the end means everything
 * passed. Returns 0.
 */
int main(void)
{
	/* Frame sizes in run order. The first three are the Phase 3
	 * fixture sizes — all 128-aligned, 8-line-aligned. The last is the
	 * Phase 5 review F4 case: a column-misaligned width
	 * (1366 → 1408 padding). */
	static const struct {
		unsigned int width;
		unsigned int height;
	} sizes[] = {
		{  640,  360 },
		{ 1280,  720 },
		{ 1920, 1080 },
		{ 1366,  768 },
	};
	const size_t count = sizeof sizes / sizeof sizes[0];
	size_t k;

	for (k = 0; k < count; k++)
		test_detile_y(sizes[k].width, sizes[k].height);

	/* UV plane (half-height) at each width. */
	for (k = 0; k < count; k++)
		test_detile_uv(sizes[k].width, sizes[k].height);

	test_uv_offset();

	printf("All NC12 detile asserts pass.\n");
	return 0;
}