Files
daedalus-decoder/tests/test_idct_bitexact.c
T
claude-noether 44ca4e550f phase1: substrate selector API + cross-substrate bit-exact ctest
Surfaces daedalus-fourier's substrate-override capability at the
decoder boundary.  Lets tests run on CPU-only hosts (CI runners,
x86 dev boxes) AND cross-checks V3D shader output against NEON
reference on hosts that have both.

API additions (pre-0.1 ABI, additive):

  - daedalus_decoder_substrate enum { AUTO, CPU, QPU }
    (mirrors daedalus_substrate; isolated for ABI reasons).
  - daedalus_decoder_set_substrate(dec, sub) setter, same
    mid-frame-change restrictions as set_output_format.
  - Default remains AUTO — the only sensible choice for production.

Internal:

  - flush_frame now calls daedalus_dispatch_h264_idct{4,8} with an
    explicit substrate instead of daedalus_recipe_dispatch_*.  Mapped
    via a small map_substrate() helper.  No perf delta on AUTO (recipe
    layer was just doing the same dispatch under the hood).

Test changes:

  - test_smoke: new EXPECTs for set_substrate (valid + bogus).
  - test_idct_bitexact: new argv[4] takes "auto" (default), "cpu", or
    "qpu" to force the substrate.
  - CMakeLists.txt: new ctest entry `idct_bitexact_cpu` re-runs the
    QVGA case forcing the CPU path.  Catches silent drift between
    the V3D shader and the NEON reference; both must produce
    identical output for the same coefficient input (and they do —
    see ctest log below).

Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):

  $ ctest --test-dir build --output-on-failure
      Start 1: smoke
  1/4 Test #1: smoke ............................   Passed    0.10 sec
      Start 2: idct_bitexact
  2/4 Test #2: idct_bitexact ....................   Passed    0.03 sec
      Start 3: idct_bitexact_cpu
  3/4 Test #3: idct_bitexact_cpu ................   Passed    0.03 sec
      Start 4: idct_bitexact_1080p
  4/4 Test #4: idct_bitexact_1080p ..............   Passed    0.06 sec

  100% tests passed, 0 tests failed out of 4

CPU substrate produces byte-identical Y + Cb + Cr planes against the
same C reference that the AUTO/QPU path matches — confirming the V3D
shaders and the daedalus-fourier NEON path agree at the spec level.

Why we plumbed the lower-level dispatch instead of leaving recipe in
place: recipe is just a thin wrapper that calls dispatch with
AUTO.  Once we needed substrate control, the wrapper became a
liability (would have required adding a parallel recipe API for each
substrate); going direct is simpler and the AUTO path is unchanged.

Coverage note: idct_bitexact_cpu runs at QVGA (300 MBs); not also at
1080p because the CPU path's wall time scales linearly with block
count and a 1080p CPU run is ~0.5s on hertz — fine standalone but
slows ctest enough that it would tempt opt-in gating.  The bit-exact
content is the same regardless of frame size; the 1080p variant only
exists to gate index-arithmetic bugs that surface above small int
boundaries.
2026-05-24 23:07:45 +02:00

399 lines
16 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
* scaled luma IDCT 4×4 dispatch.
*
* Generates a frame of random coefficients, runs daedalus_decoder
* (with predicted=0 by the scaffold's flush_frame contract), and
* compares every output byte against an inline C reference that
* mirrors the H.264 §8.5.12.1 1D butterfly.
*
* Why "bit-exact": the GPU shader and the C reference apply the same
* integer arithmetic. Any rounding / sign / overflow disagreement is
* a bug. Pass = every output byte matches.
*
* Scope match with flush_frame: the test mirrors flush_frame's
* per-MB → flat block layout (raster scan within MB, no z-scan
* permutation). That keeps the test focused on IDCT correctness;
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
* layout is a separate concern (handled in the eventual libavcodec-
* intercept patch).
*
* Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
* Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
* transform_8x8=0 (16 luma 4x4 blocks); both partitions are
* exercised in the same frame so the flush_frame partitioning logic
* is also under test, not just the underlying shaders. Random coeffs
* for all components; reference IDCT applied per block. The chroma
* compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
*
* Not in scope (covered by other tests / future PRs):
* - Chroma DC / Intra16x16 DC Hadamard pre-pass
* - bit-exactness against real H.264 streams (test-vector PR)
* - non-zero predicted pixels (intra prediction lands in Stage 2a)
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* xorshift64* for deterministic random coefficient generation. */
static uint64_t xs64_state;
static uint64_t xs64(void)
{
uint64_t x = xs64_state;
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
return xs64_state = x;
}
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
* then column pass; +32 rounding, >>6, add to predicted (=0 here),
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
* fair-use for test purposes — same algorithm, no copy of code). */
static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
static void h264_idct4_butterfly(const int d[4], int out[4])
{
int e = d[0] + d[2];
int f = d[0] - d[2];
int g = (d[1] >> 1) - d[3];
int h = d[1] + (d[3] >> 1);
out[0] = e + h;
out[1] = f + g;
out[2] = f - g;
out[3] = e - h;
}
/* 1D 8-point butterfly per H.264 §8.5.13.2. Transcribed from
* daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original —
* algorithm reproduced here for test purposes, no copy of code). */
static void h264_idct8_butterfly(const int d[8], int g[8])
{
int e[8], f[8];
e[0] = d[0] + d[4];
e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
e[2] = d[0] - d[4];
e[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
e[4] = (d[2] >> 1) - d[6];
e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
e[6] = d[2] + (d[6] >> 1);
e[7] = d[3] + d[5] + d[1] + (d[1] >> 1);
f[0] = e[0] + e[6];
f[1] = e[1] + (e[7] >> 2);
f[2] = e[2] + e[4];
f[3] = e[3] + (e[5] >> 2);
f[4] = e[2] - e[4];
f[5] = (e[3] >> 2) - e[5];
f[6] = e[0] - e[6];
f[7] = e[7] - (e[1] >> 2);
g[0] = f[0] + f[7];
g[1] = f[2] + f[5];
g[2] = f[4] + f[3];
g[3] = f[6] + f[1];
g[4] = f[6] - f[1];
g[5] = f[4] - f[3];
g[6] = f[2] - f[5];
g[7] = f[0] - f[7];
}
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
/* block layout COLUMN-MAJOR: block[c*8 + r] = coef at (row=r, col=c). */
int tmp[8][8];
for (int r = 0; r < 8; r++) {
int d[8];
for (int c = 0; c < 8; c++) d[c] = block[c * 8 + r];
int g[8];
h264_idct8_butterfly(d, g);
for (int c = 0; c < 8; c++) tmp[r][c] = g[c];
}
int col_out[8][8];
for (int c = 0; c < 8; c++) {
int d[8];
for (int r = 0; r < 8; r++) d[r] = tmp[r][c];
int g[8];
h264_idct8_butterfly(d, g);
for (int r = 0; r < 8; r++) col_out[r][c] = g[r];
}
for (int r = 0; r < 8; r++)
for (int c = 0; c < 8; c++)
dst[r * stride + c] = (uint8_t) clip_u8(
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
}
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
/* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
* block[c*4 + r] = coeff at (row=r, col=c).
* Row pass first: gather d[c] = block[c*4 + r] for fixed r. */
int tmp[4][4];
for (int r = 0; r < 4; r++) {
int d[4] = { block[0*4 + r], block[1*4 + r],
block[2*4 + r], block[3*4 + r] };
int o[4];
h264_idct4_butterfly(d, o);
for (int c = 0; c < 4; c++) tmp[r][c] = o[c];
}
/* Column pass: gather d[r] = tmp[r][c] for fixed c. */
int col_out[4][4];
for (int c = 0; c < 4; c++) {
int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] };
int o[4];
h264_idct4_butterfly(d, o);
for (int r = 0; r < 4; r++) col_out[r][c] = o[r];
}
/* Add (predicted=dst, here 0) + clip. */
for (int r = 0; r < 4; r++)
for (int c = 0; c < 4; c++)
dst[r * stride + c] = (uint8_t) clip_u8(
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
}
int main(int argc, char **argv)
{
/* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
* the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
int width = argc > 1 ? atoi(argv[1]) : 320;
int height = argc > 2 ? atoi(argv[2]) : 240; /* 240 / 16 = 15 → coded 240 */
/* Coded dims must be mod-16; 320×240 is canonical QVGA. */
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
xs64_state = seed;
/* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
* dispatch substrate. Both substrates must produce IDENTICAL
* output (the V3D shaders are bit-exact gates against the same
* spec the NEON path implements); the ctest suite runs the QVGA
* test once per substrate to catch any silent drift. */
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
const char *sub_name = "auto";
if (argc > 4) {
if (!strcmp(argv[4], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
else if (!strcmp(argv[4], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
else if (!strcmp(argv[4], "auto")) { /* default */ }
else {
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[4]);
return 1;
}
}
int mb_w = width / 16;
int mb_h = height / 16;
int n_mbs = mb_w * mb_h;
printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
width, height, n_mbs, (unsigned long) seed);
daedalus_decoder *dec = daedalus_decoder_create(width, height);
if (!dec) {
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
return 0;
}
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
return 1;
}
printf("substrate: %s\n", sub_name);
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
* random coeffs in [-512, 511] — same range as the daedalus-fourier
* cycle-6 M1 gate uses. */
int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
if (!per_mb_coeffs) { fprintf(stderr, "alloc fail\n"); return 1; }
for (int mb = 0; mb < n_mbs; mb++) {
for (int i = 0; i < 384; i++) {
/* Random coeffs in [-512, 511] for all of luma + Cb + Cr.
* Same range as the daedalus-fourier cycle-6 M1 gate. */
per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
}
}
/* Per-MB transform mode (deterministic split: every odd raster MB
* is 8x8, every even is 4x4 — exercises BOTH partitions in the
* same frame so the flush_frame partitioning logic is under test). */
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; }
for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0;
/* Append in raster order. */
struct daedalus_decoder_mb_input mb = {0};
int n_8x8_mbs = 0, n_4x4_mbs = 0;
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int idx = my * mb_w + mx;
mb.mb_x = (uint16_t) mx;
mb.mb_y = (uint16_t) my;
mb.coeffs = per_mb_coeffs[idx];
mb.transform_8x8 = mb_8x8[idx];
if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++;
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
return 1;
}
}
}
printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);
/* Flush — exercise BOTH the luma path (out_y) and the chroma path
* (out_uv set to non-NULL so flush_frame runs the chroma dispatch
* + NV12 interleave). */
size_t y_size = (size_t) width * height;
size_t uv_size = (size_t) width * height / 2;
uint8_t *gpu_y = calloc(1, y_size);
uint8_t *gpu_uv = calloc(1, uv_size);
if (!gpu_y || !gpu_uv) return 1;
int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
gpu_uv, (size_t) width);
if (frc != 0) {
fprintf(stderr, "flush_frame rc=%d\n", frc);
return 1;
}
/* Compute the reference output: same per-MB → flat raster block
* layout as flush_frame uses. Branch per MB on transform_8x8. */
uint8_t *ref_y = calloc(1, y_size);
if (!ref_y) return 1;
int16_t block_scratch[64]; /* large enough for 8x8 */
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int mb_idx = my * mb_w + mx;
if (mb_8x8[mb_idx]) {
/* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
int block_in_mb = sb_y * 2 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][block_in_mb * 64],
64 * sizeof(int16_t));
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
width, block_scratch);
}
}
} else {
/* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
for (int sb_y = 0; sb_y < 4; sb_y++) {
for (int sb_x = 0; sb_x < 4; sb_x++) {
int block_in_mb = sb_y * 4 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][block_in_mb * 16],
16 * sizeof(int16_t));
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
width, block_scratch);
}
}
}
}
}
/* Build the chroma reference: separate planar Cb and Cr (W/2 by
* H/2), each block IDCT'd into its plane. Chroma per-MB layout
* matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
* within each component (sb_y * 2 + sb_x). */
size_t chroma_w = (size_t) width / 2;
size_t chroma_h = (size_t) height / 2;
size_t chroma_plane_size = chroma_w * chroma_h;
uint8_t *ref_cb = calloc(1, chroma_plane_size);
uint8_t *ref_cr = calloc(1, chroma_plane_size);
if (!ref_cb || !ref_cr) return 1;
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int mb_idx = my * mb_w + mx;
for (int comp = 0; comp < 2; comp++) {
uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
size_t coeff_base = 256u + (size_t) comp * 64u;
for (int sb_y = 0; sb_y < 2; sb_y++) {
for (int sb_x = 0; sb_x < 2; sb_x++) {
int block_in_comp = sb_y * 2 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][coeff_base +
(size_t) block_in_comp * 16],
16 * sizeof(int16_t));
size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
ref_idct4_add(&plane[px_y * chroma_w + px_x],
(ptrdiff_t) chroma_w, block_scratch);
}
}
}
}
}
/* Y compare. */
size_t y_diffs = 0, y_first_diff = 0;
for (size_t i = 0; i < y_size; i++) {
if (gpu_y[i] != ref_y[i]) {
if (y_diffs == 0) y_first_diff = i;
y_diffs++;
}
}
printf("Y bytes total: %zu\n", y_size);
printf("Y bytes diff: %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
if (y_diffs) {
printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
}
/* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
size_t cb_diffs = 0, cr_diffs = 0;
size_t cb_first = 0, cr_first = 0;
for (size_t r = 0; r < chroma_h; r++) {
const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
const uint8_t *cb_row = ref_cb + r * chroma_w;
const uint8_t *cr_row = ref_cr + r * chroma_w;
for (size_t c = 0; c < chroma_w; c++) {
uint8_t gpu_cb = gpu_row[c * 2 + 0];
uint8_t gpu_cr = gpu_row[c * 2 + 1];
if (gpu_cb != cb_row[c]) {
if (cb_diffs == 0) cb_first = r * chroma_w + c;
cb_diffs++;
}
if (gpu_cr != cr_row[c]) {
if (cr_diffs == 0) cr_first = r * chroma_w + c;
cr_diffs++;
}
}
}
printf("Cb bytes total: %zu diff: %zu (%.4f%%)\n",
chroma_plane_size, cb_diffs,
100.0 * cb_diffs / chroma_plane_size);
printf("Cr bytes total: %zu diff: %zu (%.4f%%)\n",
chroma_plane_size, cr_diffs,
100.0 * cr_diffs / chroma_plane_size);
if (cb_diffs) {
size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
}
if (cr_diffs) {
size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
}
free(ref_cr);
free(ref_cb);
free(ref_y);
free(gpu_uv);
free(gpu_y);
free(mb_8x8);
free(per_mb_coeffs);
daedalus_decoder_destroy(dec);
if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
return 0;
}
fprintf(stderr, "BIT-EXACT FAIL\n");
return 1;
}