Files
daedalus-decoder/tests/test_idct_bitexact.c
T
claude-noether 948697ef0d phase1/stage1: bit-exact gate for the frame-scaled luma IDCT 4x4
Adds test_idct_bitexact that exercises daedalus_decoder_flush_frame
end-to-end with random coefficients and compares every output byte
against an inline C reference of the H.264 §8.5.12.1 1D butterfly.
Closes the validation gap from the previous PR ("dispatch succeeds"
becomes "dispatch is bit-exact").

What's tested:

  - 320×240 coded frame (300 MBs), enough to cover multiple workgroups
    of the V3D shader (16 blocks/WG → ≥30 WGs)
  - Per-MB → flat-raster block layout consistent with flush_frame
  - Random coeffs in [-512, 511] (same range as daedalus-fourier
    cycle-6 M1 gate)
  - Inline C reference: H.264 §8.5.12.1 butterfly with column-major
    block layout, +32 rounding, >>6, add-to-predicted (=0), clip255 —
    mirrors daedalus-fourier tests/h264_idct4_ref.c

Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):

  $ ctest --test-dir build --output-on-failure
    Start 1: smoke
  1/2 Test #1: smoke ............................   Passed    0.16 sec
    Start 2: idct_bitexact
  2/2 Test #2: idct_bitexact ....................   Passed    0.03 sec

  100% tests passed, 0 tests failed out of 2

Bit-exact PASS first try — daedalus-fourier's V3D IDCT 4x4 shader
produces identical pixels to the C reference for all 4800 blocks in
the test frame.  Validates BOTH the shader correctness AND the
frame-batched-dispatch correctness (this is the first time
n_blocks > ~30 has been exercised at the recipe-dispatch layer; the
substitution arc only ever called with n_blocks=1).

What is NOT tested by this PR (deferred to follow-ons):

  - Non-zero predicted pixels — flush_frame zero-initialises scratch_y,
    so the IDCT-ADD reduces to clip255(IDCT).  Real predicted comes
    from Stage 2a intra prediction.
  - Z-scan permutation between FFmpeg's per-MB coeffs layout and our
    per-MB → flat raster — the test uses its own coefficient generator
    that already matches our layout, so it doesn't exercise the
    permutation.  The libavcodec-intercept patch is where the
    permutation lands and gets validated against real H.264 streams.
  - Chroma 4×4 IDCT.
  - IDCT 8×8 (High profile).

Stacked on noether/phase1-stage1-idct (PR #3, the frame-scaled
dispatch).  Rebase on main after #3 lands; the diff is purely additive
(one new test file + 5 lines of CMake).
2026-05-24 22:20:21 +02:00

211 lines
7.5 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
* scaled luma IDCT 4×4 dispatch.
*
* Generates a frame of random coefficients, runs daedalus_decoder
* (with predicted=0 by the scaffold's flush_frame contract), and
* compares every output byte against an inline C reference that
* mirrors the H.264 §8.5.12.1 1D butterfly.
*
* Why "bit-exact": the GPU shader and the C reference apply the same
* integer arithmetic. Any rounding / sign / overflow disagreement is
* a bug. Pass = every output byte matches.
*
* Scope match with flush_frame: the test mirrors flush_frame's
* per-MB → flat block layout (raster scan within MB, no z-scan
* permutation). That keeps the test focused on IDCT correctness;
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
* layout is a separate concern (handled in the eventual libavcodec-
* intercept patch).
*
* Not in scope (covered by other tests / future PRs):
* - chroma planes (Phase 1 stage 1 fills UV with grey 128)
* - IDCT 8×8 (Phase 1 follow-on)
* - bit-exactness against real H.264 streams (test-vector PR)
* - non-zero predicted pixels (intra prediction lands in Stage 2a)
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* xorshift64* for deterministic random coefficient generation. */
static uint64_t xs64_state;
static uint64_t xs64(void)
{
uint64_t x = xs64_state;
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
return xs64_state = x;
}
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
* then column pass; +32 rounding, >>6, add to predicted (=0 here),
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
* fair-use for test purposes — same algorithm, no copy of code). */
static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
static void h264_idct4_butterfly(const int d[4], int out[4])
{
int e = d[0] + d[2];
int f = d[0] - d[2];
int g = (d[1] >> 1) - d[3];
int h = d[1] + (d[3] >> 1);
out[0] = e + h;
out[1] = f + g;
out[2] = f - g;
out[3] = e - h;
}
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
/* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
* block[c*4 + r] = coeff at (row=r, col=c).
* Row pass first: gather d[c] = block[c*4 + r] for fixed r. */
int tmp[4][4];
for (int r = 0; r < 4; r++) {
int d[4] = { block[0*4 + r], block[1*4 + r],
block[2*4 + r], block[3*4 + r] };
int o[4];
h264_idct4_butterfly(d, o);
for (int c = 0; c < 4; c++) tmp[r][c] = o[c];
}
/* Column pass: gather d[r] = tmp[r][c] for fixed c. */
int col_out[4][4];
for (int c = 0; c < 4; c++) {
int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] };
int o[4];
h264_idct4_butterfly(d, o);
for (int r = 0; r < 4; r++) col_out[r][c] = o[r];
}
/* Add (predicted=dst, here 0) + clip. */
for (int r = 0; r < 4; r++)
for (int c = 0; c < 4; c++)
dst[r * stride + c] = (uint8_t) clip_u8(
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
}
int main(int argc, char **argv)
{
/* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
* the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
int width = argc > 1 ? atoi(argv[1]) : 320;
int height = argc > 2 ? atoi(argv[2]) : 240; /* 240 / 16 = 15 → coded 240 */
/* Coded dims must be mod-16; 320×240 is canonical QVGA. */
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
xs64_state = seed;
int mb_w = width / 16;
int mb_h = height / 16;
int n_mbs = mb_w * mb_h;
printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
width, height, n_mbs, (unsigned long) seed);
daedalus_decoder *dec = daedalus_decoder_create(width, height);
if (!dec) {
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
return 0;
}
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
* random coeffs in [-512, 511] — same range as the daedalus-fourier
* cycle-6 M1 gate uses. */
int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
if (!per_mb_coeffs) { fprintf(stderr, "alloc fail\n"); return 1; }
for (int mb = 0; mb < n_mbs; mb++) {
for (int i = 0; i < 384; i++) {
if (i < 256)
per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
else
per_mb_coeffs[mb][i] = 0; /* chroma — unused this stage */
}
}
/* Append in raster order. */
struct daedalus_decoder_mb_input mb = {0};
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int idx = my * mb_w + mx;
mb.mb_x = (uint16_t) mx;
mb.mb_y = (uint16_t) my;
mb.coeffs = per_mb_coeffs[idx];
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
return 1;
}
}
}
/* Flush. */
size_t y_size = (size_t) width * height;
uint8_t *gpu_y = calloc(1, y_size);
if (!gpu_y) return 1;
int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
NULL, 0);
if (frc != 0) {
fprintf(stderr, "flush_frame rc=%d\n", frc);
return 1;
}
/* Compute the reference output: same per-MB → flat raster block
* layout as flush_frame uses. */
uint8_t *ref_y = calloc(1, y_size);
if (!ref_y) return 1;
/* Need a destructively-mutable copy because the reference IDCT
* doesn't actually mutate, but the GPU's IDCT shader does zero
* the coeffs. Our reference doesn't zero; that's fine because we
* use a fresh copy per block. */
int16_t block_scratch[16];
for (int my = 0; my < mb_h; my++) {
for (int mx = 0; mx < mb_w; mx++) {
int mb_idx = my * mb_w + mx;
for (int sb_y = 0; sb_y < 4; sb_y++) {
for (int sb_x = 0; sb_x < 4; sb_x++) {
int block_in_mb = sb_y * 4 + sb_x;
memcpy(block_scratch,
&per_mb_coeffs[mb_idx][block_in_mb * 16],
16 * sizeof(int16_t));
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
width, block_scratch);
}
}
}
}
/* Byte-by-byte compare. */
size_t diffs = 0;
size_t first_diff = 0;
for (size_t i = 0; i < y_size; i++) {
if (gpu_y[i] != ref_y[i]) {
if (diffs == 0) first_diff = i;
diffs++;
}
}
printf("Y bytes total: %zu\n", y_size);
printf("Y bytes diff: %zu (%.4f%%)\n", diffs, 100.0 * diffs / y_size);
if (diffs) {
printf("first diff at offset %zu: gpu=%u ref=%u\n",
first_diff, gpu_y[first_diff], ref_y[first_diff]);
}
free(ref_y);
free(gpu_y);
free(per_mb_coeffs);
daedalus_decoder_destroy(dec);
if (diffs == 0) {
printf("BIT-EXACT PASS\n");
return 0;
}
fprintf(stderr, "BIT-EXACT FAIL\n");
return 1;
}