phase1/stage1: bit-exact gate for the frame-scaled IDCT 4×4 #4
+5
-1
@@ -112,9 +112,13 @@ enable_testing()
|
|||||||
add_executable(test_smoke tests/test_smoke.c)
|
add_executable(test_smoke tests/test_smoke.c)
|
||||||
target_link_libraries(test_smoke PRIVATE daedalus_decoder)
|
target_link_libraries(test_smoke PRIVATE daedalus_decoder)
|
||||||
target_compile_options(test_smoke PRIVATE -O2)
|
target_compile_options(test_smoke PRIVATE -O2)
|
||||||
|
|
||||||
add_test(NAME smoke COMMAND test_smoke)
|
add_test(NAME smoke COMMAND test_smoke)
|
||||||
|
|
||||||
|
add_executable(test_idct_bitexact tests/test_idct_bitexact.c)
|
||||||
|
target_link_libraries(test_idct_bitexact PRIVATE daedalus_decoder)
|
||||||
|
target_compile_options(test_idct_bitexact PRIVATE -O2)
|
||||||
|
add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
|
||||||
|
|
||||||
# ---- Install ------------------------------------------------------
|
# ---- Install ------------------------------------------------------
|
||||||
#
|
#
|
||||||
# Library + public header. Stage 2/3 will add a pkg-config file and
|
# Library + public header. Stage 2/3 will add a pkg-config file and
|
||||||
|
|||||||
@@ -0,0 +1,210 @@
|
|||||||
|
/* SPDX-License-Identifier: BSD-2-Clause */
|
||||||
|
/*
|
||||||
|
* test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
|
||||||
|
* scaled luma IDCT 4×4 dispatch.
|
||||||
|
*
|
||||||
|
* Generates a frame of random coefficients, runs daedalus_decoder
|
||||||
|
* (with predicted=0 by the scaffold's flush_frame contract), and
|
||||||
|
* compares every output byte against an inline C reference that
|
||||||
|
* mirrors the H.264 §8.5.12.1 1D butterfly.
|
||||||
|
*
|
||||||
|
* Why "bit-exact": the GPU shader and the C reference apply the same
|
||||||
|
* integer arithmetic. Any rounding / sign / overflow disagreement is
|
||||||
|
* a bug. Pass = every output byte matches.
|
||||||
|
*
|
||||||
|
* Scope match with flush_frame: the test mirrors flush_frame's
|
||||||
|
* per-MB → flat block layout (raster scan within MB, no z-scan
|
||||||
|
* permutation). That keeps the test focused on IDCT correctness;
|
||||||
|
* the z-scan permutation that bridges to libavcodec's per-MB coeffs
|
||||||
|
* layout is a separate concern (handled in the eventual libavcodec-
|
||||||
|
* intercept patch).
|
||||||
|
*
|
||||||
|
* Not in scope (covered by other tests / future PRs):
|
||||||
|
* - chroma planes (Phase 1 stage 1 fills UV with grey 128)
|
||||||
|
* - IDCT 8×8 (Phase 1 follow-on)
|
||||||
|
* - bit-exactness against real H.264 streams (test-vector PR)
|
||||||
|
* - non-zero predicted pixels (intra prediction lands in Stage 2a)
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "daedalus_decoder.h"
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/* xorshift64* for deterministic random coefficient generation. */
|
||||||
|
static uint64_t xs64_state;
|
||||||
|
static uint64_t xs64(void)
|
||||||
|
{
|
||||||
|
uint64_t x = xs64_state;
|
||||||
|
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
|
||||||
|
return xs64_state = x;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
|
||||||
|
* then column pass; +32 rounding, >>6, add to predicted (=0 here),
|
||||||
|
* clip to u8. Bit-exact-equivalent transcription of daedalus-fourier
|
||||||
|
* tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
|
||||||
|
* fair-use for test purposes — same algorithm, no copy of code). */
|
||||||
|
static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
|
||||||
|
|
||||||
|
static void h264_idct4_butterfly(const int d[4], int out[4])
|
||||||
|
{
|
||||||
|
int e = d[0] + d[2];
|
||||||
|
int f = d[0] - d[2];
|
||||||
|
int g = (d[1] >> 1) - d[3];
|
||||||
|
int h = d[1] + (d[3] >> 1);
|
||||||
|
out[0] = e + h;
|
||||||
|
out[1] = f + g;
|
||||||
|
out[2] = f - g;
|
||||||
|
out[3] = e - h;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
|
||||||
|
{
|
||||||
|
/* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
|
||||||
|
* block[c*4 + r] = coeff at (row=r, col=c).
|
||||||
|
* Row pass first: gather d[c] = block[c*4 + r] for fixed r. */
|
||||||
|
int tmp[4][4];
|
||||||
|
for (int r = 0; r < 4; r++) {
|
||||||
|
int d[4] = { block[0*4 + r], block[1*4 + r],
|
||||||
|
block[2*4 + r], block[3*4 + r] };
|
||||||
|
int o[4];
|
||||||
|
h264_idct4_butterfly(d, o);
|
||||||
|
for (int c = 0; c < 4; c++) tmp[r][c] = o[c];
|
||||||
|
}
|
||||||
|
/* Column pass: gather d[r] = tmp[r][c] for fixed c. */
|
||||||
|
int col_out[4][4];
|
||||||
|
for (int c = 0; c < 4; c++) {
|
||||||
|
int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] };
|
||||||
|
int o[4];
|
||||||
|
h264_idct4_butterfly(d, o);
|
||||||
|
for (int r = 0; r < 4; r++) col_out[r][c] = o[r];
|
||||||
|
}
|
||||||
|
/* Add (predicted=dst, here 0) + clip. */
|
||||||
|
for (int r = 0; r < 4; r++)
|
||||||
|
for (int c = 0; c < 4; c++)
|
||||||
|
dst[r * stride + c] = (uint8_t) clip_u8(
|
||||||
|
dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
/* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
|
||||||
|
* the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
|
||||||
|
int width = argc > 1 ? atoi(argv[1]) : 320;
|
||||||
|
int height = argc > 2 ? atoi(argv[2]) : 240; /* 240 / 16 = 15 → coded 240 */
|
||||||
|
/* Coded dims must be mod-16; 320×240 is canonical QVGA. */
|
||||||
|
|
||||||
|
uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
|
||||||
|
xs64_state = seed;
|
||||||
|
|
||||||
|
int mb_w = width / 16;
|
||||||
|
int mb_h = height / 16;
|
||||||
|
int n_mbs = mb_w * mb_h;
|
||||||
|
printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
|
||||||
|
width, height, n_mbs, (unsigned long) seed);
|
||||||
|
|
||||||
|
daedalus_decoder *dec = daedalus_decoder_create(width, height);
|
||||||
|
if (!dec) {
|
||||||
|
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Build the per-MB inputs. Each MB gets 16 luma 4×4 blocks of
|
||||||
|
* random coeffs in [-512, 511] — same range as the daedalus-fourier
|
||||||
|
* cycle-6 M1 gate uses. */
|
||||||
|
int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
|
||||||
|
if (!per_mb_coeffs) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||||||
|
|
||||||
|
for (int mb = 0; mb < n_mbs; mb++) {
|
||||||
|
for (int i = 0; i < 384; i++) {
|
||||||
|
if (i < 256)
|
||||||
|
per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
|
||||||
|
else
|
||||||
|
per_mb_coeffs[mb][i] = 0; /* chroma — unused this stage */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Append in raster order. */
|
||||||
|
struct daedalus_decoder_mb_input mb = {0};
|
||||||
|
for (int my = 0; my < mb_h; my++) {
|
||||||
|
for (int mx = 0; mx < mb_w; mx++) {
|
||||||
|
int idx = my * mb_w + mx;
|
||||||
|
mb.mb_x = (uint16_t) mx;
|
||||||
|
mb.mb_y = (uint16_t) my;
|
||||||
|
mb.coeffs = per_mb_coeffs[idx];
|
||||||
|
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
|
||||||
|
fprintf(stderr, "append (%d,%d) failed\n", mx, my);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Flush. */
|
||||||
|
size_t y_size = (size_t) width * height;
|
||||||
|
uint8_t *gpu_y = calloc(1, y_size);
|
||||||
|
if (!gpu_y) return 1;
|
||||||
|
int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
|
||||||
|
NULL, 0);
|
||||||
|
if (frc != 0) {
|
||||||
|
fprintf(stderr, "flush_frame rc=%d\n", frc);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute the reference output: same per-MB → flat raster block
|
||||||
|
* layout as flush_frame uses. */
|
||||||
|
uint8_t *ref_y = calloc(1, y_size);
|
||||||
|
if (!ref_y) return 1;
|
||||||
|
/* Need a destructively-mutable copy because the reference IDCT
|
||||||
|
* doesn't actually mutate, but the GPU's IDCT shader does zero
|
||||||
|
* the coeffs. Our reference doesn't zero; that's fine because we
|
||||||
|
* use a fresh copy per block. */
|
||||||
|
int16_t block_scratch[16];
|
||||||
|
for (int my = 0; my < mb_h; my++) {
|
||||||
|
for (int mx = 0; mx < mb_w; mx++) {
|
||||||
|
int mb_idx = my * mb_w + mx;
|
||||||
|
for (int sb_y = 0; sb_y < 4; sb_y++) {
|
||||||
|
for (int sb_x = 0; sb_x < 4; sb_x++) {
|
||||||
|
int block_in_mb = sb_y * 4 + sb_x;
|
||||||
|
memcpy(block_scratch,
|
||||||
|
&per_mb_coeffs[mb_idx][block_in_mb * 16],
|
||||||
|
16 * sizeof(int16_t));
|
||||||
|
size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
|
||||||
|
size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
|
||||||
|
ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
|
||||||
|
width, block_scratch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Byte-by-byte compare. */
|
||||||
|
size_t diffs = 0;
|
||||||
|
size_t first_diff = 0;
|
||||||
|
for (size_t i = 0; i < y_size; i++) {
|
||||||
|
if (gpu_y[i] != ref_y[i]) {
|
||||||
|
if (diffs == 0) first_diff = i;
|
||||||
|
diffs++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("Y bytes total: %zu\n", y_size);
|
||||||
|
printf("Y bytes diff: %zu (%.4f%%)\n", diffs, 100.0 * diffs / y_size);
|
||||||
|
if (diffs) {
|
||||||
|
printf("first diff at offset %zu: gpu=%u ref=%u\n",
|
||||||
|
first_diff, gpu_y[first_diff], ref_y[first_diff]);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(ref_y);
|
||||||
|
free(gpu_y);
|
||||||
|
free(per_mb_coeffs);
|
||||||
|
daedalus_decoder_destroy(dec);
|
||||||
|
|
||||||
|
if (diffs == 0) {
|
||||||
|
printf("BIT-EXACT PASS\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "BIT-EXACT FAIL\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user