iter40: Pi 5 HEVC chapter — backend integration lands, bit-exact pending
Phase 6 implementation. Backend builds clean on higgs (Debian 13 trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec, multi-device probe finds /dev/video19 + /dev/media1, CreateContext + S_FMT + REQBUFS + STREAMON all succeed. Phase 7 partial: infrastructure works, 10 frames flow through the pipeline (correct byte counts produced — 13824000 for 1280x720 x 10 NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR so output content is wrong (libva sha != kdirect sha). The decode itself is failing on the rpi-hevc-dec side despite all ctrl submissions returning success. Code changes: - request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots + has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2 pair-of-flags pattern, naturally false on Pi). - request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver probe gets an else-if branch setting the new fds (Phase 5 F3); request_switch_device_for_profile prefers 'p' for HEVC when rpi-hevc-dec present. - context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat taken from video_format slot (not hardcoded NV12/NV15); synthetic-SPS pre-seed gated off for Pi (Phase 5 F6); destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND layout (Phase 5 F2); per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK); per-driver context_object->h264_start_code (skip prepend on Pi). - video.c: NV12_COL128 video_format entry (8-bit SAND, single buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch fires rather than tiled_to_planar). - nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel offset). UV plane offset = 128 * ALIGN(h, 8) — within-column (SAND interleaves Y+UV per column, NOT plane-concatenated; earlier wrong formula caught by Phase 7 SEGV). 
- image.c: #ifdef __arm__ extended to __arm__ || __aarch64__ (Phase 5 F1 — guard was killing detile path on all aarch64 hosts including fresnel iter39 NV15 path, masked because 10-bit never exercised); RequestCreateImage NC12 → NV12 stride override (linear width, not column-stride); copy_surface_to_image NC12 detile branch (gates on fourcc + v4l2_format). - nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers omit it though they have NC12). - nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 + V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers). - tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test; passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned; UV-offset helper). - meson.build / nv12_col128 sources listed. Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix; field ordering differs). Likely the slice_array contents need per-driver handling for rpi-hevc-dec's expected layout. Beyond in-session reach. iter38 5/5 baseline on fresnel + ampere should be unaffected (new fd stays -1 on non-Pi hosts; all gates either short-circuit on fd-not-present or no-op). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
|
||||
*
|
||||
* MIT-licensed per project. iter40 self-test for nv12_col128 detile.
|
||||
*
|
||||
* Build an NC12-tiled source buffer from a known linear NV12 image,
|
||||
* run the detile primitive, assert output matches the original. No
|
||||
* hardware needed — pure bit-layout verification of the kernel math
|
||||
* (drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
|
||||
* V4L2_PIX_FMT_NV12_COL128 case + ffmpeg/Kynesim per-pixel offset).
|
||||
*
|
||||
* Build:
|
||||
* cc -Wall -Werror -O2 -o test_nv12_col128_detile \
|
||||
* tests/test_nv12_col128_detile.c src/nv12_col128.c
|
||||
*
|
||||
* Exit 0 = all asserts pass.
|
||||
*/
|
||||
|
||||
#include "../src/nv12_col128.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Width, in bytes, of one NC12 (SAND COL128) column. */
#define TILE_W 128
|
||||
|
||||
/*
 * Round v up to the next multiple of a.
 *
 * Uses mask arithmetic, so the result is only a true "next multiple"
 * when a is a power of two; this file only passes 128 and 8.
 */
static unsigned int align_up(unsigned int v, unsigned int a)
{
	const unsigned int mask = a - 1;

	return (v + mask) & ~mask;
}
|
||||
|
||||
/* Pack a linear plane (width × height bytes, stride=width) into NC12
|
||||
* layout: each 128-wide column held contiguously, columns at offsets
|
||||
* col * col_stride * 128. col_stride is the kernel-reported bytesperline
|
||||
* = ALIGN(height, 8) * 3/2. Returns the buffer + sizes. */
|
||||
static uint8_t *pack_to_nc12(const uint8_t *linear,
|
||||
unsigned int width, unsigned int height,
|
||||
unsigned int *out_col_stride,
|
||||
size_t *out_size)
|
||||
{
|
||||
unsigned int aligned_w = align_up(width, TILE_W);
|
||||
unsigned int aligned_h = align_up(height, 8);
|
||||
unsigned int col_stride = aligned_h * 3 / 2;
|
||||
unsigned int num_cols = aligned_w / TILE_W;
|
||||
size_t total = (size_t)col_stride * aligned_w;
|
||||
uint8_t *buf;
|
||||
unsigned int col, y, in_col;
|
||||
|
||||
buf = calloc(1, total);
|
||||
assert(buf != NULL);
|
||||
|
||||
for (col = 0; col < num_cols; col++) {
|
||||
uint8_t *col_base = buf + (size_t)col * TILE_W * col_stride;
|
||||
for (y = 0; y < height; y++) {
|
||||
for (in_col = 0; in_col < TILE_W; in_col++) {
|
||||
unsigned int x = col * TILE_W + in_col;
|
||||
if (x >= width)
|
||||
break;
|
||||
col_base[(size_t)y * TILE_W + in_col] =
|
||||
linear[(size_t)y * width + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_col_stride = col_stride;
|
||||
*out_size = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void test_detile_y(unsigned int width, unsigned int height)
|
||||
{
|
||||
uint8_t *linear, *tiled, *recovered;
|
||||
unsigned int col_stride;
|
||||
size_t tile_size, i;
|
||||
|
||||
linear = malloc((size_t)width * height);
|
||||
assert(linear != NULL);
|
||||
/* Distinctive content per pixel: y * 17 + x * 13 — avoids byte-
|
||||
* aliasing patterns that could mask off-by-one bugs. */
|
||||
for (unsigned int y = 0; y < height; y++)
|
||||
for (unsigned int x = 0; x < width; x++)
|
||||
linear[(size_t)y * width + x] = (uint8_t)(y * 17 + x * 13);
|
||||
|
||||
tiled = pack_to_nc12(linear, width, height, &col_stride, &tile_size);
|
||||
|
||||
recovered = calloc(1, (size_t)width * height);
|
||||
assert(recovered != NULL);
|
||||
|
||||
nv12_col128_detile_y(recovered, width, tiled, col_stride, width, height);
|
||||
|
||||
for (i = 0; i < (size_t)width * height; i++) {
|
||||
if (recovered[i] != linear[i]) {
|
||||
fprintf(stderr,
|
||||
"FAIL %ux%u Y: pixel %zu (x=%zu y=%zu) "
|
||||
"linear=0x%02x recovered=0x%02x\n",
|
||||
width, height, i,
|
||||
i % width, i / width,
|
||||
linear[i], recovered[i]);
|
||||
free(linear); free(tiled); free(recovered);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
printf("PASS %ux%u Y plane (%u columns, col_stride=%u, tile_size=%zu)\n",
|
||||
width, height, align_up(width, TILE_W) / TILE_W,
|
||||
col_stride, tile_size);
|
||||
|
||||
free(linear);
|
||||
free(tiled);
|
||||
free(recovered);
|
||||
}
|
||||
|
||||
/*
 * Round-trip check for nv12_col128_detile_uv(): same scheme as the Y
 * test, but on a plane of width x (height / 2) bytes, i.e. the
 * half-height chroma plane of a width x height frame.
 *
 * Prints a PASS line on success. On the first mismatching byte prints a
 * FAIL line to stderr and exits with status 1.
 */
static void test_detile_uv(unsigned int width, unsigned int height)
{
	const unsigned int chroma_rows = height / 2;
	const size_t nbytes = (size_t)width * chroma_rows;
	unsigned int col_stride;
	size_t tile_size;
	uint8_t *linear;
	uint8_t *tiled;
	uint8_t *recovered;

	linear = malloc(nbytes);
	assert(linear != NULL);

	/* Pattern row * 23 + column * 7, deliberately different from the
	 * one used for the Y plane. */
	for (unsigned int row = 0; row < chroma_rows; row++) {
		uint8_t *line = linear + (size_t)row * width;

		for (unsigned int px = 0; px < width; px++)
			line[px] = (uint8_t)(row * 23 + px * 7);
	}

	tiled = pack_to_nc12(linear, width, chroma_rows, &col_stride, &tile_size);

	recovered = calloc(1, nbytes);
	assert(recovered != NULL);

	nv12_col128_detile_uv(recovered, width, tiled, col_stride, width, chroma_rows);

	for (size_t idx = 0; idx < nbytes; idx++) {
		if (recovered[idx] == linear[idx])
			continue;

		fprintf(stderr,
			"FAIL %ux%u UV: pixel %zu linear=0x%02x recovered=0x%02x\n",
			width, height, idx,
			linear[idx], recovered[idx]);
		free(linear);
		free(tiled);
		free(recovered);
		exit(1);
	}

	printf("PASS %ux%u UV plane\n", width, height);

	free(linear);
	free(tiled);
	free(recovered);
}
|
||||
|
||||
/*
 * Query the UV plane offset for a width x height frame and require it to
 * equal `expected`. On mismatch prints a FAIL line to stderr and exits
 * with status 1; otherwise returns the offset so the caller can report it.
 */
static unsigned int expect_uv_offset(unsigned int width, unsigned int height,
				     unsigned int expected)
{
	unsigned int off = nv12_col128_uv_plane_offset(width, height);

	if (off != expected) {
		fprintf(stderr, "FAIL UV offset %u×%u: got %u expected %u\n",
			width, height, off, expected);
		exit(1);
	}
	return off;
}

/*
 * Verify nv12_col128_uv_plane_offset().
 *
 * Per the SAND COL128 layout, Y and UV are interleaved within EACH
 * column (not concatenated as separate planes), so the UV plane base
 * pointer is offset by 128 * ALIGN(height, 8) — the Y portion of
 * column 0. NOT 128 * height * num_columns (the size of all Y across
 * all columns), which was an earlier wrong formula caught by Phase 7
 * SEGV on higgs.
 */
static void test_uv_offset(void)
{
	unsigned int off;

	off = expect_uv_offset(1280, 720, 128u * 720);
	printf("PASS UV offset 1280×720 = %u\n", off);

	(void)expect_uv_offset(1366, 768, 128u * 768);
	printf("PASS UV offset 1366×768 (column-misaligned width)\n");

	/* 720 and 768 are already multiples of 8, so the cases above cannot
	 * tell 128 * ALIGN(height, 8) apart from a plain 128 * height.
	 * 540 rounds up to 544 and pins the alignment step. */
	(void)expect_uv_offset(960, 540, 128u * 544);
	printf("PASS UV offset 960×540 (8-line-misaligned height)\n");
}
|
||||
|
||||
/*
 * Run every detile check: the Y-plane round trip for each frame size,
 * then the UV-plane round trip for the same sizes, then the UV offset
 * helper. Any failing check exits with status 1 from inside the test
 * function; reaching the end returns 0.
 */
int main(void)
{
	static const struct {
		unsigned int width;
		unsigned int height;
	} sizes[] = {
		/* Phase 3 fixture sizes — all 128-aligned, 8-line-aligned. */
		{  640,  360 },
		{ 1280,  720 },
		{ 1920, 1080 },
		/* Phase 5 review F4: column-misaligned width
		 * (1366 → 1408 padding). */
		{ 1366,  768 },
	};
	const size_t count = sizeof sizes / sizeof sizes[0];
	size_t i;

	for (i = 0; i < count; i++)
		test_detile_y(sizes[i].width, sizes[i].height);

	/* UV plane (half-height) at each width. */
	for (i = 0; i < count; i++)
		test_detile_uv(sizes[i].width, sizes[i].height);

	test_uv_offset();

	printf("All NC12 detile asserts pass.\n");
	return 0;
}
|
||||
Reference in New Issue
Block a user