/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-scaled
 * IDCT dispatch (luma 4×4 / 8×8 and chroma 4×4).
 *
 * Generates a frame of random coefficients, runs daedalus_decoder
 * (with predicted=0 by the scaffold's flush_frame contract), and
 * compares every output byte against an inline C reference that
 * mirrors the H.264 §8.5.12.1 1D butterfly.
 *
 * Why "bit-exact": the GPU shader and the C reference apply the same
 * integer arithmetic.  Any rounding / sign / overflow disagreement is
 * a bug.  Pass = every output byte matches.
 *
 * Scope match with flush_frame: the test mirrors flush_frame's
 * per-MB → flat block layout (raster scan within MB, no z-scan
 * permutation).  That keeps the test focused on IDCT correctness;
 * the z-scan permutation that bridges to libavcodec's per-MB coeffs
 * layout is a separate concern (handled in the eventual libavcodec-
 * intercept patch).
 *
 * Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
 * Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
 * transform_8x8=0 (16 luma 4x4 blocks); both partitions are
 * exercised in the same frame so the flush_frame partitioning logic
 * is also under test, not just the underlying shaders.  Random coeffs
 * for all components; reference IDCT applied per block.  The chroma
 * compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
 *
 * Not in scope (covered by other tests / future PRs):
 *   - Chroma DC / Intra16x16 DC Hadamard pre-pass
 *   - bit-exactness against real H.264 streams (test-vector PR)
 *   - non-zero predicted pixels (intra prediction lands in Stage 2a)
 *
 * Usage: test_idct_bitexact [width [height [seed [auto|cpu|qpu]]]]
 * Exit status: 0 on pass (or when the decoder cannot be created, which
 * is reported as SKIP), 1 on any mismatch or error.
 */

#include "daedalus_decoder.h"

#include <errno.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum {
    MB_SIZE            = 16,    /* luma macroblock edge, in pixels */
    CHROMA_MB_SIZE     = 8,     /* chroma edge per macroblock, in pixels */
    LUMA_COEFFS_PER_MB = 256,   /* 16 4x4 blocks or 4 8x8 blocks */
    CHROMA_COEFFS      = 64,    /* 4 4x4 blocks per chroma component */
    COEFFS_PER_MB      = 384,   /* luma + Cb + Cr */
    MAX_DIM            = 16384  /* generous cap; keeps int/size_t math overflow-free */
};

static const uint64_t default_seed = UINT64_C(0xfeedface5a5a5a5a);

/* Plain xorshift64 (shift triple 13/7/17, no output multiply) for
 * deterministic random coefficient generation.  The state must be
 * non-zero: zero is a fixed point and would yield a constant stream. */
static uint64_t xs64_state;

static uint64_t xs64(void)
{
    uint64_t x = xs64_state;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    xs64_state = x;
    return x;
}

/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
 * then column pass; +32 rounding, >>6, add to predicted (=0 here),
 * clip to u8.  Bit-exact-equivalent transcription of daedalus-fourier
 * tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
 * fair-use for test purposes — same algorithm, no copy of code).
 *
 * NOTE: the butterflies right-shift possibly-negative ints, which is
 * implementation-defined in C; like the original, this reference
 * relies on the compiler implementing it as an arithmetic shift. */

/* Clamp v to the [0, 255] range of an 8-bit sample. */
static int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}

/* Final stage shared by both transforms: round the residual (+32, >>6),
 * add it to the pixel already stored at *px, and clip. */
static void add_residual_clip(uint8_t *px, int residual)
{
    *px = (uint8_t) clip_u8(*px + ((residual + 32) >> 6));
}

/* 1D 4-point butterfly: reads d[0..3], writes out[0..3]. */
static void h264_idct4_butterfly(const int d[4], int out[4])
{
    int e = d[0] + d[2];
    int f = d[0] - d[2];
    int g = (d[1] >> 1) - d[3];
    int h = d[1] + (d[3] >> 1);

    out[0] = e + h;
    out[1] = f + g;
    out[2] = f - g;
    out[3] = e - h;
}

/* 1D 8-point butterfly per H.264 §8.5.13.2.  Transcribed from
 * daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original —
 * algorithm reproduced here for test purposes, no copy of code).
 * Reads d[0..7], writes g[0..7]. */
static void h264_idct8_butterfly(const int d[8], int g[8])
{
    int e[8], f[8];

    e[0] = d[0] + d[4];
    e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
    e[2] = d[0] - d[4];
    e[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
    e[4] = (d[2] >> 1) - d[6];
    e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
    e[6] = d[2] + (d[6] >> 1);
    e[7] = d[3] + d[5] + d[1] + (d[1] >> 1);

    f[0] = e[0] + e[6];
    f[1] = e[1] + (e[7] >> 2);
    f[2] = e[2] + e[4];
    f[3] = e[3] + (e[5] >> 2);
    f[4] = e[2] - e[4];
    f[5] = (e[3] >> 2) - e[5];
    f[6] = e[0] - e[6];
    f[7] = e[7] - (e[1] >> 2);

    g[0] = f[0] + f[7];
    g[1] = f[2] + f[5];
    g[2] = f[4] + f[3];
    g[3] = f[6] + f[1];
    g[4] = f[6] - f[1];
    g[5] = f[4] - f[3];
    g[6] = f[2] - f[5];
    g[7] = f[0] - f[7];
}

/* Reference 8x8 inverse transform, added in place to the 8x8 pixel
 * area at dst (row pitch `stride` bytes).
 * block layout COLUMN-MAJOR: block[c*8 + r] = coef at (row=r, col=c). */
static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride,
                          const int16_t *block)
{
    int tmp[8][8];
    int col_out[8][8];

    /* Row pass: gather d[c] = block[c*8 + r] for fixed r. */
    for (int r = 0; r < 8; r++) {
        int d[8];
        for (int c = 0; c < 8; c++)
            d[c] = block[c * 8 + r];
        h264_idct8_butterfly(d, tmp[r]);
    }

    /* Column pass: gather d[r] = tmp[r][c] for fixed c. */
    for (int c = 0; c < 8; c++) {
        int d[8], g[8];
        for (int r = 0; r < 8; r++)
            d[r] = tmp[r][c];
        h264_idct8_butterfly(d, g);
        for (int r = 0; r < 8; r++)
            col_out[r][c] = g[r];
    }

    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
            add_residual_clip(&dst[r * stride + c], col_out[r][c]);
}

/* Reference 4x4 inverse transform, added in place to the 4x4 pixel
 * area at dst (row pitch `stride` bytes).
 * block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
 *   block[c*4 + r] = coeff at (row=r, col=c). */
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride,
                          const int16_t *block)
{
    int tmp[4][4];
    int col_out[4][4];

    /* Row pass: gather d[c] = block[c*4 + r] for fixed r. */
    for (int r = 0; r < 4; r++) {
        int d[4];
        for (int c = 0; c < 4; c++)
            d[c] = block[c * 4 + r];
        h264_idct4_butterfly(d, tmp[r]);
    }

    /* Column pass: gather d[r] = tmp[r][c] for fixed c. */
    for (int c = 0; c < 4; c++) {
        int d[4], o[4];
        for (int r = 0; r < 4; r++)
            d[r] = tmp[r][c];
        h264_idct4_butterfly(d, o);
        for (int r = 0; r < 4; r++)
            col_out[r][c] = o[r];
    }

    /* Add (predicted=dst, here 0) + clip. */
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            add_residual_clip(&dst[r * stride + c], col_out[r][c]);
}

/* Parse a frame dimension from a decimal string.  Accepts only a
 * complete number in [MB_SIZE, MAX_DIM].  Returns 0 and stores the
 * value in *out on success, -1 otherwise (*out untouched). */
static int parse_dim(const char *s, int *out)
{
    char *end;

    errno = 0;
    long v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE)
        return -1;
    if (v < MB_SIZE || v > MAX_DIM)
        return -1;
    *out = (int) v;
    return 0;
}

/* Build the luma reference plane: for every MB, apply the reference
 * IDCT to its first LUMA_COEFFS_PER_MB coefficients, either as four
 * 8x8 blocks or sixteen 4x4 blocks (blocks in raster order within the
 * MB), depending on mb_8x8[mb].  `coeffs` holds COEFFS_PER_MB values
 * per MB in raster MB order; ref_y must be zero-initialised with row
 * pitch `stride`. */
static void build_luma_ref(uint8_t *ref_y, size_t stride,
                           int mb_w, int mb_h,
                           const int16_t *coeffs, const uint8_t *mb_8x8)
{
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            size_t mb_idx = (size_t) my * (size_t) mb_w + (size_t) mx;
            const int16_t *luma = coeffs + mb_idx * COEFFS_PER_MB;
            int edge = mb_8x8[mb_idx] ? 8 : 4;      /* block edge in pixels */
            int per_side = MB_SIZE / edge;          /* blocks per MB row */

            for (int sb_y = 0; sb_y < per_side; sb_y++) {
                for (int sb_x = 0; sb_x < per_side; sb_x++) {
                    int block_in_mb = sb_y * per_side + sb_x;
                    const int16_t *block = luma + block_in_mb * edge * edge;
                    size_t px_y = (size_t) my * MB_SIZE + (size_t) (sb_y * edge);
                    size_t px_x = (size_t) mx * MB_SIZE + (size_t) (sb_x * edge);
                    uint8_t *dst = &ref_y[px_y * stride + px_x];

                    if (edge == 8)
                        ref_idct8_add(dst, (ptrdiff_t) stride, block);
                    else
                        ref_idct4_add(dst, (ptrdiff_t) stride, block);
                }
            }
        }
    }
}

/* Build the planar chroma references.  Chroma per-MB layout matches
 * flush_frame: 4 Cb blocks then 4 Cr blocks (starting right after the
 * luma coefficients), raster order within each component
 * (sb_y * 2 + sb_x).  Both planes must be zero-initialised and
 * `chroma_w` bytes wide. */
static void build_chroma_ref(uint8_t *ref_cb, uint8_t *ref_cr,
                             size_t chroma_w, int mb_w, int mb_h,
                             const int16_t *coeffs)
{
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            size_t mb_idx = (size_t) my * (size_t) mb_w + (size_t) mx;
            const int16_t *chroma =
                coeffs + mb_idx * COEFFS_PER_MB + LUMA_COEFFS_PER_MB;

            for (int comp = 0; comp < 2; comp++) {
                uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
                const int16_t *comp_coeffs = chroma + comp * CHROMA_COEFFS;

                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        int block_in_comp = sb_y * 2 + sb_x;
                        size_t px_y = (size_t) my * CHROMA_MB_SIZE + (size_t) (sb_y * 4);
                        size_t px_x = (size_t) mx * CHROMA_MB_SIZE + (size_t) (sb_x * 4);

                        ref_idct4_add(&plane[px_y * chroma_w + px_x],
                                      (ptrdiff_t) chroma_w,
                                      comp_coeffs + block_in_comp * 16);
                    }
                }
            }
        }
    }
}

/* Count mismatching samples between a decoder output plane and a
 * tightly packed reference plane of w x h samples.  Sample (r, c) of
 * the output lives at got[r * got_stride + c * got_step], so
 * got_step = 2 walks one component of an interleaved NV12 UV row.
 * On the first mismatch, *first_diff receives r * w + c; it is left
 * untouched when the planes agree. */
static size_t count_plane_diffs(const uint8_t *got, size_t got_stride,
                                size_t got_step, const uint8_t *ref,
                                size_t w, size_t h, size_t *first_diff)
{
    size_t diffs = 0;

    for (size_t r = 0; r < h; r++) {
        const uint8_t *got_row = got + r * got_stride;
        const uint8_t *ref_row = ref + r * w;

        for (size_t c = 0; c < w; c++) {
            if (got_row[c * got_step] == ref_row[c])
                continue;
            if (diffs == 0)
                *first_diff = r * w + c;
            diffs++;
        }
    }
    return diffs;
}

int main(int argc, char **argv)
{
    /* Everything released at `out:` is declared (and nulled) up front so
     * every error path can share one cleanup sequence. */
    int rc = 1;
    daedalus_decoder *dec = NULL;
    int16_t (*per_mb_coeffs)[COEFFS_PER_MB] = NULL;
    uint8_t *mb_8x8 = NULL;
    uint8_t *gpu_y = NULL, *gpu_uv = NULL;
    uint8_t *ref_y = NULL, *ref_cb = NULL, *ref_cr = NULL;

    /* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
     * the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs).
     * Coded dims must be mod-16; 320×240 is canonical QVGA. */
    int width = 320;
    int height = 240;   /* 240 / 16 = 15 → coded 240 */

    if (argc > 1 && parse_dim(argv[1], &width) != 0) {
        fprintf(stderr, "bad width '%s' (want %d..%d)\n",
                argv[1], (int) MB_SIZE, (int) MAX_DIM);
        goto out;
    }
    if (argc > 2 && parse_dim(argv[2], &height) != 0) {
        fprintf(stderr, "bad height '%s' (want %d..%d)\n",
                argv[2], (int) MB_SIZE, (int) MAX_DIM);
        goto out;
    }
    if (width % MB_SIZE != 0 || height % MB_SIZE != 0) {
        /* Still allowed: the reference leaves pixels outside whole
         * macroblocks at zero and expects the decoder to do the same. */
        fprintf(stderr,
                "warning: %dx%d is not a multiple of %d; pixels outside "
                "whole macroblocks are expected to stay zero\n",
                width, height, (int) MB_SIZE);
    }

    uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : default_seed;
    if (seed == 0) {
        /* Zero never leaves zero under xorshift: every coefficient
         * would come out identical. */
        fprintf(stderr, "seed 0 is degenerate for xorshift64; "
                        "using the default seed instead\n");
        seed = default_seed;
    }
    xs64_state = seed;

    /* Optional 4th argv: "auto" (default) / "cpu" / "qpu" to pin the
     * dispatch substrate.  Both substrates must produce IDENTICAL
     * output (the V3D shaders are bit-exact gates against the same
     * spec the NEON path implements); the ctest suite runs the QVGA
     * test once per substrate to catch any silent drift. */
    daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
    const char *sub_name = "auto";
    if (argc > 4) {
        if (!strcmp(argv[4], "cpu")) {
            sub = DAEDALUS_DECODER_SUBSTRATE_CPU;
            sub_name = "cpu";
        } else if (!strcmp(argv[4], "qpu")) {
            sub = DAEDALUS_DECODER_SUBSTRATE_QPU;
            sub_name = "qpu";
        } else if (strcmp(argv[4], "auto") != 0) {
            fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n",
                    argv[4]);
            goto out;
        }
    }

    int mb_w = width / MB_SIZE;
    int mb_h = height / MB_SIZE;
    int n_mbs = mb_w * mb_h;
    printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%" PRIx64 "\n",
           width, height, n_mbs, seed);

    dec = daedalus_decoder_create(width, height);
    if (!dec) {
        /* Missing hardware is not a test failure. */
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        rc = 0;
        goto out;
    }
    if (daedalus_decoder_set_substrate(dec, sub) != 0) {
        fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
        goto out;
    }
    printf("substrate: %s\n", sub_name);

    /* Build the per-MB inputs: COEFFS_PER_MB random coeffs in
     * [-512, 511] for all of luma + Cb + Cr — same range as the
     * daedalus-fourier cycle-6 M1 gate uses. */
    per_mb_coeffs = malloc((size_t) n_mbs * sizeof *per_mb_coeffs);
    /* Per-MB transform mode (deterministic split: every odd raster MB
     * is 8x8, every even is 4x4 — exercises BOTH partitions in the
     * same frame so the flush_frame partitioning logic is under test). */
    mb_8x8 = malloc((size_t) n_mbs);
    if (!per_mb_coeffs || !mb_8x8) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }
    for (int i = 0; i < n_mbs; i++) {
        for (int k = 0; k < COEFFS_PER_MB; k++)
            per_mb_coeffs[i][k] = (int16_t) ((int) (xs64() % 1024) - 512);
        mb_8x8[i] = (uint8_t) (i & 1);
    }

    /* Append in raster order. */
    struct daedalus_decoder_mb_input mb = {0};
    int n_8x8_mbs = 0, n_4x4_mbs = 0;
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int idx = my * mb_w + mx;

            mb.mb_x = (uint16_t) mx;
            mb.mb_y = (uint16_t) my;
            mb.coeffs = per_mb_coeffs[idx];
            mb.transform_8x8 = mb_8x8[idx];
            if (mb_8x8[idx])
                n_8x8_mbs++;
            else
                n_4x4_mbs++;
            if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                fprintf(stderr, "append (%d,%d) failed\n", mx, my);
                goto out;
            }
        }
    }
    printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);

    /* Flush — exercise BOTH the luma path (out_y) and the chroma path
     * (out_uv set to non-NULL so flush_frame runs the chroma dispatch
     * + NV12 interleave). */
    size_t y_size = (size_t) width * (size_t) height;
    size_t uv_size = y_size / 2;
    size_t chroma_w = (size_t) width / 2;
    size_t chroma_h = (size_t) height / 2;
    size_t chroma_plane_size = chroma_w * chroma_h;

    gpu_y = calloc(1, y_size);
    gpu_uv = calloc(1, uv_size);
    ref_y = calloc(1, y_size);
    ref_cb = calloc(1, chroma_plane_size);
    ref_cr = calloc(1, chroma_plane_size);
    if (!gpu_y || !gpu_uv || !ref_y || !ref_cb || !ref_cr) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }

    int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
                                           gpu_uv, (size_t) width);
    if (frc != 0) {
        fprintf(stderr, "flush_frame rc=%d\n", frc);
        goto out;
    }

    /* Compute the reference output: same per-MB → flat raster block
     * layout as flush_frame uses. */
    build_luma_ref(ref_y, (size_t) width, mb_w, mb_h,
                   &per_mb_coeffs[0][0], mb_8x8);
    build_chroma_ref(ref_cb, ref_cr, chroma_w, mb_w, mb_h,
                     &per_mb_coeffs[0][0]);

    /* Y compare. */
    size_t y_first_diff = 0;
    size_t y_diffs = count_plane_diffs(gpu_y, (size_t) width, 1, ref_y,
                                       (size_t) width, (size_t) height,
                                       &y_first_diff);
    printf("Y bytes total: %zu\n", y_size);
    printf("Y bytes diff: %zu (%.4f%%)\n", y_diffs,
           100.0 * (double) y_diffs / (double) y_size);
    if (y_diffs) {
        printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
               y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
    }

    /* UV compare — deinterleave NV12 back into Cb/Cr and compare:
     * Cb sits at even bytes of each UV row, Cr at odd bytes. */
    size_t cb_first = 0, cr_first = 0;
    size_t cb_diffs = count_plane_diffs(gpu_uv, (size_t) width, 2, ref_cb,
                                        chroma_w, chroma_h, &cb_first);
    size_t cr_diffs = count_plane_diffs(gpu_uv + 1, (size_t) width, 2, ref_cr,
                                        chroma_w, chroma_h, &cr_first);
    printf("Cb bytes total: %zu diff: %zu (%.4f%%)\n", chroma_plane_size,
           cb_diffs, 100.0 * (double) cb_diffs / (double) chroma_plane_size);
    printf("Cr bytes total: %zu diff: %zu (%.4f%%)\n", chroma_plane_size,
           cr_diffs, 100.0 * (double) cr_diffs / (double) chroma_plane_size);
    if (cb_diffs) {
        size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
        printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n", r, c,
               gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
    }
    if (cr_diffs) {
        size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
        printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n", r, c,
               gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
    }

    if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
        printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
        rc = 0;
    } else {
        fprintf(stderr, "BIT-EXACT FAIL\n");
    }

out:
    free(ref_cr);
    free(ref_cb);
    free(ref_y);
    free(gpu_uv);
    free(gpu_y);
    free(mb_8x8);
    free(per_mb_coeffs);
    if (dec)
        daedalus_decoder_destroy(dec);
    return rc;
}