Files
daedalus-decoder/tests/test_idct_bitexact.c
T
claude-noether a7a0d56ecd Stage 2 PR-a: predicted samples plumbing — caller-supplied per-MB pixels
First concrete deliverable on the daedalus-decoder Stage 2 path post
the 2026-05-25 architecture re-pin (memory: dejavu / frame-major UMA).

Q2 decision: CPU intra prediction.  libavcodec's existing NEON intra
prediction kernels generate predicted samples per MB; daedalus-decoder
accepts those samples through the API and uses them as the IDCT-add
starting state.  FFmpeg's `idct_add` semantics — dst += idct(coeffs);
clip255 — fold DESIGN.md's Stage 3 reconstruction into the existing
Stage 1 IDCT dispatch for free.  No new GPU work.

API change
----------

`daedalus_decoder_mb_input` gains a `const uint8_t *predicted` field:

    predicted [  0 .. 256) — 16×16 luma, row-major raster
    predicted [256 .. 320) — 8×8  Cb,   row-major raster
    predicted [320 .. 384) — 8×8  Cr,   row-major raster

NULL is legal and equivalent to all-zero predicted samples — preserves
the existing IDCT-isolation test contract.

Internal changes
----------------

  - `daedalus_decoder` gains predicted_y (W×H) and predicted_uv (planar
    Cb||Cr, W×H/2) buffers allocated at create, zeroed at end of every
    flush_frame so NULL `mb->predicted` is indistinguishable from
    explicit zeros from one frame to the next.
  - `append_mb` splats mb->predicted into predicted_y/_uv at raster
    (mb_y*16, mb_x*16) for luma and (mb_y*8, mb_x*8) for each chroma
    component.
  - `flush_frame` replaces `calloc(scratch_y)` and `calloc(scratch_uv)`
    with `malloc + memcpy from predicted_y/_uv` — the IDCT dispatch
    then writes residual on top, clip-adding to the predicted samples
    in place.

Test
----

`test_idct_bitexact` extended:

  - Generates random predicted samples (uint8_t) per MB alongside the
    existing random coeffs.
  - Pre-fills the reference ref_y / ref_cb / ref_cr planes with those
    same predicted samples at the corresponding raster positions
    BEFORE applying ref_idct4_add / ref_idct8_add per block.
  - Compares GPU output to reference byte-for-byte.

Result on hertz (Pi 5 V3D 7.1), all three substrates:

  test_idct_bitexact 320 240 0xfeedface5a5a5a5a {cpu, qpu, auto}
    Y bytes diff:  0/76800 (0.0000%)
    Cb bytes diff: 0/19200 (0.0000%)
    Cr bytes diff: 0/19200 (0.0000%)
  BIT-EXACT PASS on all three substrates

Catches any silent drift between substrates and any predicted-samples
plumbing mistake on either the API or the dispatch side.

Followups
---------

  - Stage 2 PR-b: deblock dispatch in flush_frame.
  - Stage 2 daemon refactor (parallel, daedalus-v4l2 daemon): replace
    avcodec_send_packet/receive_frame with a libavcodec-parser-only
    path that drives daedalus_decoder_append_mb in raster order +
    flush_frame at slice boundary.
2026-05-25 23:01:20 +02:00

447 lines
18 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
* scaled luma + chroma IDCT 4×4 / 8×8 dispatch + Stage 2 predicted-
* samples plumbing.
*
* Generates a frame of random coefficients AND random predicted
* samples per MB, runs daedalus_decoder (which writes the predicted
* samples into its frame-scoped predicted_y/_uv buffers via
* append_mb, then pre-fills the IDCT dispatch scratch from them in
* flush_frame), and compares every output byte against an inline C
* reference that mirrors the H.264 §8.5.12.1 1D butterfly applied
* to the same predicted+coeffs inputs.
*
* Why "bit-exact": the GPU shader and the C reference apply the same
* integer arithmetic. Any rounding / sign / overflow disagreement is
* a bug. Pass = every output byte matches.
*
* Scope match with flush_frame: the test mirrors flush_frame's
* per-MB → flat block layout (raster scan within MB, no z-scan
* permutation). That keeps the test focused on IDCT correctness;
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
* layout is a separate concern (handled in the eventual libavcodec-
* intercept patch).
*
* Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
* Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
* transform_8x8=0 (16 luma 4x4 blocks); both partitions are
* exercised in the same frame so the flush_frame partitioning logic
* is also under test, not just the underlying shaders. Random coeffs
* for all components; reference IDCT applied per block. The chroma
* compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
*
* Not in scope (covered by other tests / future PRs):
* - Chroma DC / Intra16x16 DC Hadamard pre-pass
* - bit-exactness against real H.264 streams (test-vector PR)
* - deblock (lands in Stage 2 PR-b after this one)
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Deterministic PRNG used to generate the random coefficients and
 * predicted samples: plain xorshift64 with the 13 / 7 / 17 shift triple
 * (there is no output multiply, so this is not the "xorshift64*"
 * variant).  The entire generator state lives in xs64_state, which the
 * caller must seed before the first draw.  An all-zero state is a fixed
 * point: xs64() then returns 0 forever. */
static uint64_t xs64_state;

/* Advance the generator by one step and return the new 64-bit state. */
static uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    xs64_state = s;
    return s;
}
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
* then column pass; +32 rounding, >>6, add to the predicted samples
* already in dst (random per-MB bytes in this test, no longer zero),
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
* fair-use for test purposes — same algorithm, no copy of code). */
/* Saturate v to the unsigned 8-bit sample range: values below 0 become
 * 0, values above 255 become 255, everything else passes through. */
static int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
/* One 1-D pass of the 4-point inverse-transform butterfly.
 *
 *   d   — the four input values of one row or column
 *   out — receives the four transformed values
 *
 * All four intermediates are computed from d before anything is stored
 * to out.  No rounding or scaling happens here; the callers apply
 * (+32) >> 6 after both passes. */
static void h264_idct4_butterfly(const int d[4], int out[4])
{
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_lo    = (d[1] >> 1) - d[3];
    const int odd_hi    = d[1] + (d[3] >> 1);

    /* Outer pair (0, 3) shares even_sum/odd_hi; inner pair (1, 2)
     * shares even_diff/odd_lo. */
    out[0] = even_sum + odd_hi;
    out[3] = even_sum - odd_hi;
    out[1] = even_diff + odd_lo;
    out[2] = even_diff - odd_lo;
}
/* One 1-D pass of the 8-point butterfly per H.264 §8.5.13.2.
 * Transcribed from daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+
 * in the original — algorithm reproduced here for test purposes, no
 * copy of code).
 *
 *   d — the eight input values of one row or column
 *   g — receives the eight transformed values
 *
 * The transform runs in three stages; g is only written in the last
 * one, after every read of d has completed.  No rounding or scaling is
 * applied here. */
static void h264_idct8_butterfly(const int d[8], int g[8])
{
    int s1[8];   /* stage-1 values */
    int s2[8];   /* stage-2 values */

    /* Stage 1: even slots are built from the even-indexed inputs... */
    s1[0] = d[0] + d[4];
    s1[2] = d[0] - d[4];
    s1[4] = (d[2] >> 1) - d[6];
    s1[6] = d[2] + (d[6] >> 1);
    /* ...odd slots from the odd-indexed inputs. */
    s1[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
    s1[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
    s1[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
    s1[7] = d[3] + d[5] + d[1] + (d[1] >> 1);

    /* Stage 2: even and odd halves are still combined separately. */
    s2[0] = s1[0] + s1[6];
    s2[6] = s1[0] - s1[6];
    s2[2] = s1[2] + s1[4];
    s2[4] = s1[2] - s1[4];
    s2[1] = s1[1] + (s1[7] >> 2);
    s2[7] = s1[7] - (s1[1] >> 2);
    s2[3] = s1[3] + (s1[5] >> 2);
    s2[5] = (s1[3] >> 2) - s1[5];

    /* Stage 3: output k and its mirror 7-k are the sum and difference
     * of even slot 2k and odd slot 7-2k. */
    for (int k = 0; k < 4; k++) {
        const int even = s2[2 * k];
        const int odd  = s2[7 - 2 * k];

        g[k]     = even + odd;
        g[7 - k] = even - odd;
    }
}
/* Reference 8x8 inverse transform + add.
 *
 *   dst    — top-left pixel of the 8x8 destination area; its current
 *            contents are the starting values the residual is added to
 *   stride — distance in bytes between vertically adjacent dst pixels
 *   block  — 64 coefficients, COLUMN-MAJOR: block[c*8 + r] is the
 *            coefficient at (row=r, col=c)
 *
 * Runs the 8-point butterfly along each row, then along each column of
 * the row-pass result, rounds with (+32) >> 6, adds to dst and clips
 * the sum to [0, 255].  block is read only. */
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    int horiz[8][8];      /* row-pass result, indexed [row][col] */
    int in[8], out[8];

    /* Row pass: for a fixed row y, gather that row's coefficients out
     * of the column-major block. */
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            in[x] = block[x * 8 + y];
        }
        h264_idct8_butterfly(in, horiz[y]);
    }

    /* Column pass; each finished column is rounded, added and clipped
     * straight into dst. */
    for (int x = 0; x < 8; x++) {
        for (int y = 0; y < 8; y++) {
            in[y] = horiz[y][x];
        }
        h264_idct8_butterfly(in, out);
        for (int y = 0; y < 8; y++) {
            uint8_t *px = &dst[y * stride + x];

            *px = (uint8_t) clip_u8(*px + ((out[y] + 32) >> 6));
        }
    }
}
/* Reference 4x4 inverse transform + add.
 *
 *   dst    — top-left pixel of the 4x4 destination area; its current
 *            contents are the starting values the residual is added to
 *   stride — distance in bytes between vertically adjacent dst pixels
 *   block  — 16 coefficients, COLUMN-MAJOR (matches FFmpeg +
 *            daedalus-fourier): block[c*4 + r] is the coefficient at
 *            (row=r, col=c)
 *
 * Runs the 4-point butterfly along each row, then along each column of
 * the row-pass result, rounds with (+32) >> 6, adds to dst and clips
 * the sum to [0, 255].  block is read only. */
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    int horiz[4][4];      /* row-pass result, indexed [row][col] */
    int in[4], out[4];

    /* Row pass: for a fixed row y, gather that row's coefficients out
     * of the column-major block. */
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            in[x] = block[x * 4 + y];
        }
        h264_idct4_butterfly(in, horiz[y]);
    }

    /* Column pass; each finished column is rounded, added to whatever
     * dst already holds, and clipped. */
    for (int x = 0; x < 4; x++) {
        for (int y = 0; y < 4; y++) {
            in[y] = horiz[y][x];
        }
        h264_idct4_butterfly(in, out);
        for (int y = 0; y < 4; y++) {
            uint8_t *px = &dst[y * stride + x];

            *px = (uint8_t) clip_u8(*px + ((out[y] + 32) >> 6));
        }
    }
}
/*
 * Test driver.
 *
 *   usage: test_idct_bitexact [width [height [seed [auto|cpu|qpu]]]]
 *
 * Generates random coefficients and predicted samples for every MB,
 * feeds them through daedalus_decoder (append_mb per MB, then one
 * flush_frame), builds the expected Y / Cb / Cr planes with the inline
 * reference IDCT-add, and compares byte for byte.
 *
 * Returns 0 on a bit-exact match, or when the decoder cannot be created
 * (reported as SKIP).  Returns 1 on bad arguments, allocation or decoder
 * call failure, or any output mismatch.  Every exit taken after the
 * decoder exists goes through the `done` label so the decoder and all
 * buffers are released on every path.
 */
int main(int argc, char **argv)
{
    /* Everything released at `done` is declared NULL-initialised up
     * front, so an error path can jump there from anywhere. */
    int rc = 1;
    daedalus_decoder *dec = NULL;
    int16_t (*per_mb_coeffs)[384] = NULL;
    uint8_t (*per_mb_predicted)[384] = NULL;
    uint8_t *mb_8x8 = NULL;
    uint8_t *gpu_y = NULL;
    uint8_t *gpu_uv = NULL;
    uint8_t *ref_y = NULL;
    uint8_t *ref_cb = NULL;
    uint8_t *ref_cr = NULL;

    /* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
     * the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
    int width  = argc > 1 ? atoi(argv[1]) : 320;
    int height = argc > 2 ? atoi(argv[2]) : 240;

    /* Coded dims must be mod-16; 320×240 is canonical QVGA.  Anything
     * else would leave rows/columns of the reference planes unwritten
     * (the loops below only cover whole MBs) and then compare them. */
    if (width <= 0 || height <= 0 || width % 16 != 0 || height % 16 != 0) {
        fprintf(stderr, "bad dimensions %dx%d (want positive multiples of 16)\n",
                width, height);
        return 1;
    }

    uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
    if (seed == 0) {
        /* All-zero state is a fixed point of xs64(): every draw would
         * be 0 and the frame would be constant instead of random. */
        fprintf(stderr, "seed 0 is degenerate for xs64; using default seed\n");
        seed = 0xfeedface5a5a5a5aULL;
    }
    xs64_state = seed;

    /* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
     * dispatch substrate.  Both substrates must produce IDENTICAL
     * output (the V3D shaders are bit-exact gates against the same
     * spec the NEON path implements); the ctest suite runs the QVGA
     * test once per substrate to catch any silent drift. */
    daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
    const char *sub_name = "auto";
    if (argc > 4) {
        if (!strcmp(argv[4], "cpu")) {
            sub = DAEDALUS_DECODER_SUBSTRATE_CPU;
            sub_name = "cpu";
        } else if (!strcmp(argv[4], "qpu")) {
            sub = DAEDALUS_DECODER_SUBSTRATE_QPU;
            sub_name = "qpu";
        } else if (!strcmp(argv[4], "auto")) {
            /* default */
        } else {
            fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]);
            return 1;
        }
    }

    /* Frame geometry and all function-scope working state. */
    int mb_w = width / 16;
    int mb_h = height / 16;
    int n_mbs = mb_w * mb_h;
    size_t y_size = (size_t) width * (size_t) height;
    size_t uv_size = y_size / 2;
    size_t chroma_w = (size_t) width / 2;
    size_t chroma_h = (size_t) height / 2;
    size_t chroma_plane_size = chroma_w * chroma_h;
    int n_8x8_mbs = 0, n_4x4_mbs = 0;
    size_t y_diffs = 0, y_first_diff = 0;
    size_t cb_diffs = 0, cr_diffs = 0;
    size_t cb_first = 0, cr_first = 0;
    struct daedalus_decoder_mb_input mb = {0};
    int16_t block_scratch[64]; /* large enough for 8x8 */

    /* %llx + unsigned long long: prints all 64 seed bits even where
     * long is only 32 bits wide. */
    printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%llx\n",
           width, height, n_mbs, (unsigned long long) seed);

    dec = daedalus_decoder_create(width, height);
    if (!dec) {
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        return 0;
    }
    if (daedalus_decoder_set_substrate(dec, sub) != 0) {
        fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
        goto done;
    }
    printf("substrate: %s\n", sub_name);

    /* Build the per-MB inputs.  Each MB gets 384 random coeffs (256
     * luma + 64 Cb + 64 Cr) in [-512, 511] — same range as the
     * daedalus-fourier cycle-6 M1 gate uses.  Plus random predicted
     * samples (uint8 each) to exercise the Stage 2 predicted-samples
     * plumbing — when this is non-zero, flush_frame must pre-fill the
     * IDCT-dispatch scratch from dec->predicted_y / dec->predicted_uv
     * (Stage 2 PR-a) rather than from calloc-zero (the Stage 1 scaffold
     * contract).  The reference path mirrors this by pre-filling ref_y /
     * ref_cb / ref_cr from the same predicted bytes BEFORE the per-block
     * ref_idct*_add calls — so the test catches any mismatch between
     * caller-supplied predicted and what reaches the GPU's IDCT-add
     * starting state. */
    per_mb_coeffs = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
    per_mb_predicted = malloc((size_t) n_mbs * sizeof(*per_mb_predicted));
    if (!per_mb_coeffs || !per_mb_predicted) {
        fprintf(stderr, "alloc fail\n");
        goto done;
    }
    for (int i = 0; i < n_mbs; i++) {
        for (int k = 0; k < 384; k++) {
            /* Random coeffs in [-512, 511] for all of luma + Cb + Cr. */
            per_mb_coeffs[i][k] = (int16_t)((int)(xs64() % 1024) - 512);
            /* Random predicted samples in [0, 255]. */
            per_mb_predicted[i][k] = (uint8_t)(xs64() & 0xff);
        }
    }

    /* Per-MB transform mode (deterministic split: every odd raster MB
     * is 8x8, every even is 4x4 — exercises BOTH partitions in the
     * same frame so the flush_frame partitioning logic is under test). */
    mb_8x8 = malloc((size_t) n_mbs);
    if (!mb_8x8) {
        fprintf(stderr, "alloc fail\n");
        goto done;
    }
    for (int i = 0; i < n_mbs; i++) {
        mb_8x8[i] = (i & 1) ? 1 : 0;
    }

    /* Append in raster order. */
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int idx = my * mb_w + mx;

            mb.mb_x = (uint16_t) mx;
            mb.mb_y = (uint16_t) my;
            mb.coeffs = per_mb_coeffs[idx];
            mb.predicted = per_mb_predicted[idx];
            mb.transform_8x8 = mb_8x8[idx];
            if (mb_8x8[idx]) {
                n_8x8_mbs++;
            } else {
                n_4x4_mbs++;
            }
            if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                fprintf(stderr, "append (%d,%d) failed\n", mx, my);
                goto done;
            }
        }
    }
    printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);

    /* Flush — exercise BOTH the luma path (out_y) and the chroma path
     * (out_uv set to non-NULL so flush_frame runs the chroma dispatch
     * + NV12 interleave). */
    gpu_y = calloc(1, y_size);
    gpu_uv = calloc(1, uv_size);
    if (!gpu_y || !gpu_uv) {
        fprintf(stderr, "alloc fail\n");
        goto done;
    }
    int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
                                           gpu_uv, (size_t) width);
    if (frc != 0) {
        fprintf(stderr, "flush_frame rc=%d\n", frc);
        goto done;
    }

    /* Compute the reference output: same per-MB → flat raster block
     * layout as flush_frame uses.  Branch per MB on transform_8x8.
     *
     * ref_y is pre-filled with each MB's 16×16 luma predicted samples
     * at raster (my*16, mx*16), then ref_idct4_add/8_add overlay the
     * residual via FFmpeg `idct_add` semantics (dst += idct(coeffs);
     * clip255).  This mirrors what flush_frame does on the GPU side:
     * scratch_y starts from dec->predicted_y, IDCT-add writes back. */
    ref_y = malloc(y_size);
    if (!ref_y) {
        fprintf(stderr, "alloc fail\n");
        goto done;
    }
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;
            const uint8_t *p_y = per_mb_predicted[mb_idx]; /* [0..256) */

            for (int r = 0; r < 16; r++) {
                memcpy(&ref_y[((size_t) my * 16 + r) * (size_t) width
                              + (size_t) mx * 16],
                       &p_y[r * 16], 16);
            }
        }
    }
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;

            if (mb_8x8[mb_idx]) {
                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        int block_in_mb = sb_y * 2 + sb_x;

                        memcpy(block_scratch,
                               &per_mb_coeffs[mb_idx][block_in_mb * 64],
                               64 * sizeof(int16_t));
                        size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
                        size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
                        ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
                                      width, block_scratch);
                    }
                }
            } else {
                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
                for (int sb_y = 0; sb_y < 4; sb_y++) {
                    for (int sb_x = 0; sb_x < 4; sb_x++) {
                        int block_in_mb = sb_y * 4 + sb_x;

                        memcpy(block_scratch,
                               &per_mb_coeffs[mb_idx][block_in_mb * 16],
                               16 * sizeof(int16_t));
                        size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
                        ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
                                      width, block_scratch);
                    }
                }
            }
        }
    }

    /* Build the chroma reference: separate planar Cb and Cr (W/2 by
     * H/2), each block IDCT'd into its plane.  Chroma per-MB layout
     * matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
     * within each component (sb_y * 2 + sb_x). */
    ref_cb = malloc(chroma_plane_size);
    ref_cr = malloc(chroma_plane_size);
    if (!ref_cb || !ref_cr) {
        fprintf(stderr, "alloc fail\n");
        goto done;
    }
    /* Pre-fill ref_cb / ref_cr with per-MB 8x8 chroma predicted samples
     * (mirrors the predicted-samples plumbing on the chroma path). */
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;
            const uint8_t *p_cb = per_mb_predicted[mb_idx] + 256;
            const uint8_t *p_cr = per_mb_predicted[mb_idx] + 256 + 64;

            for (int r = 0; r < 8; r++) {
                size_t off = ((size_t) my * 8 + r) * chroma_w + (size_t) mx * 8;

                memcpy(&ref_cb[off], &p_cb[r * 8], 8);
                memcpy(&ref_cr[off], &p_cr[r * 8], 8);
            }
        }
    }
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;

            for (int comp = 0; comp < 2; comp++) {
                uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
                size_t coeff_base = 256u + (size_t) comp * 64u;

                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        int block_in_comp = sb_y * 2 + sb_x;

                        memcpy(block_scratch,
                               &per_mb_coeffs[mb_idx][coeff_base +
                                                      (size_t) block_in_comp * 16],
                               16 * sizeof(int16_t));
                        size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
                        ref_idct4_add(&plane[px_y * chroma_w + px_x],
                                      (ptrdiff_t) chroma_w, block_scratch);
                    }
                }
            }
        }
    }

    /* Y compare. */
    for (size_t i = 0; i < y_size; i++) {
        if (gpu_y[i] != ref_y[i]) {
            if (y_diffs == 0) {
                y_first_diff = i;
            }
            y_diffs++;
        }
    }
    printf("Y bytes total: %zu\n", y_size);
    printf("Y bytes diff: %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
    if (y_diffs) {
        printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
               y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
    }

    /* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
    for (size_t r = 0; r < chroma_h; r++) {
        const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
        const uint8_t *cb_row = ref_cb + r * chroma_w;
        const uint8_t *cr_row = ref_cr + r * chroma_w;

        for (size_t c = 0; c < chroma_w; c++) {
            uint8_t gpu_cb = gpu_row[c * 2 + 0];
            uint8_t gpu_cr = gpu_row[c * 2 + 1];

            if (gpu_cb != cb_row[c]) {
                if (cb_diffs == 0) {
                    cb_first = r * chroma_w + c;
                }
                cb_diffs++;
            }
            if (gpu_cr != cr_row[c]) {
                if (cr_diffs == 0) {
                    cr_first = r * chroma_w + c;
                }
                cr_diffs++;
            }
        }
    }
    printf("Cb bytes total: %zu diff: %zu (%.4f%%)\n",
           chroma_plane_size, cb_diffs,
           100.0 * cb_diffs / chroma_plane_size);
    printf("Cr bytes total: %zu diff: %zu (%.4f%%)\n",
           chroma_plane_size, cr_diffs,
           100.0 * cr_diffs / chroma_plane_size);
    if (cb_diffs) {
        size_t r = cb_first / chroma_w, c = cb_first % chroma_w;

        printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
               r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
    }
    if (cr_diffs) {
        size_t r = cr_first / chroma_w, c = cr_first % chroma_w;

        printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
               r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
    }

    if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
        printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
        rc = 0;
    } else {
        fprintf(stderr, "BIT-EXACT FAIL\n");
    }

done:
    /* Single exit: free(NULL) is a no-op, so partially-completed runs
     * release exactly what they managed to allocate. */
    free(ref_cr);
    free(ref_cb);
    free(ref_y);
    free(gpu_uv);
    free(gpu_y);
    free(mb_8x8);
    free(per_mb_predicted);
    free(per_mb_coeffs);
    if (dec) {
        daedalus_decoder_destroy(dec);
    }
    return rc;
}