From 854bdeda20de7d0ff977c5493c84a5d9620ae83e Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 11:18:59 +0200 Subject: [PATCH] h264: chroma DC 2x2 Hadamard pre-pass primitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the H.264 §8.5.11.1 chroma DC Hadamard transform. In 4:2:0 chroma, the four DC coefficients (one from each chroma 4x4 AC block within an MB) go through a 2x2 Hadamard before quant-scaling and before being added back to each block's [0,0] coefficient prior to the 4x4 AC IDCT. This PR ships the pure Hadamard transform: f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1] f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1] f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1] f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1] implemented as the 2-stage row+col butterfly (1:1 with the NEON SIMD shape upstream). Operates in-place on int16[4]. What this does NOT do (deferred to caller-side composition): - QP-dependent scaling per §8.5.11.2. The scale depends on QP_C (with chroma_qp_offset adjustment), so the formula has branches (>=6 vs <6) and looks up LevelScale4x4 table values. The libavcodec intercept patch composes Hadamard + scale + shift itself since the scale shape varies by codec-level context (slice header chroma_qp_offset, PPS chroma_qp_offset, second_chroma_qp_offset for the chroma_qp_index_offset). - A separate inverse-transform entry point (the decode-time inverse and the encoder's forward transform are the same Hadamard up to scaling, although the spec conceptually distinguishes them in §8.5.11; we expose only the matrix). 
Test design (tests/test_chroma_dc_hadamard.c): 7 cases, all spec-derived hand-computations: - all-uniform 5 → [20, 0, 0, 0] - col gradient [0,10,0,10] → [20, -20, 0, 0] - row gradient [0,0,10,10] → [20, 0, -20, 0] - anti-diagonal [10,0,0,10] → [20, 0, 0, 20] - asymmetric [1,2,3,4] → [10, -2, -4, 0] - sign-alternating [-5,5,-5,5] → [0, -20, 0, 0] - double-Hadamard invariant: H·H = 4·I, so applying twice gives [4*c[0], 4*c[1], 4*c[2], 4*c[3]] for any input. The double-Hadamard test is the strongest correctness gate: any single sign error in the butterfly would break the H·H = 4·I algebraic property, surfacing immediately. All 7 PASS first try. Verified on hertz: $ ./build/test_chroma_dc_hadamard all-uniform 5 PASS col gradient [0,10,0,10] PASS row gradient [0,0,10,10] PASS anti-diagonal [10,0,0,10] PASS asymmetric [1,2,3,4] PASS sign-alternating [-5,5,-5,5] PASS double-Hadamard = 4*orig PASS ALL chroma DC Hadamard tests PASS With this primitive the H.264 8-bit 4:2:0 pixel-math primitive matrix is complete in fourier: - IDCT 4x4 (luma + chroma) ✓ - IDCT 8x8 (luma, High profile) ✓ - Chroma DC Hadamard 2x2 ✓ (this PR) - Deblock (8 variants) ✓ - Intra prediction (26 modes) ✓ - MC qpel (30 dispatches) ✓ What remains for the libavcodec intercept patch: CABAC/CAVLC entropy decode, SPS/PPS parsing, slice header parsing, MB type / QP / CBP / intra mode prediction. All of that lives at the intercept layer (it's spec-derived from the bitstream syntax, not pixel-math); the intercept patch will call into these fourier primitives once the metadata is decoded. 
---
 CMakeLists.txt                      |  11 ++-
 tests/h264_chroma_dc_hadamard_ref.c |  53 +++++++++++++
 tests/test_chroma_dc_hadamard.c     | 118 ++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 tests/h264_chroma_dc_hadamard_ref.c
 create mode 100644 tests/test_chroma_dc_hadamard.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f37451..8a9d6ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -564,14 +564,21 @@ add_executable(test_intra_pred_chroma8x8
 target_compile_options(test_intra_pred_chroma8x8 PRIVATE -O2)
 
 # H.264 Intra_8x8 luma prediction (High profile, 9 modes + 1-2-1
-# reference-sample pre-filter). This PR ships the pre-filter + the
-# 3 simple modes (V, H, DC); the 6 directional modes follow.
+# reference-sample pre-filter).
 add_executable(test_intra_pred_8x8_luma
     tests/test_intra_pred_8x8_luma.c
     tests/h264_intra_pred_8x8_luma_ref.c
 )
 target_compile_options(test_intra_pred_8x8_luma PRIVATE -O2)
 
+# H.264 chroma DC 2x2 Hadamard pre-pass primitive. Pure transform,
+# no QP-dependent scaling (that's caller-side composition).
+add_executable(test_chroma_dc_hadamard
+    tests/test_chroma_dc_hadamard.c
+    tests/h264_chroma_dc_hadamard_ref.c
+)
+target_compile_options(test_chroma_dc_hadamard PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
diff --git a/tests/h264_chroma_dc_hadamard_ref.c b/tests/h264_chroma_dc_hadamard_ref.c
new file mode 100644
index 0000000..3d5ddb4
--- /dev/null
+++ b/tests/h264_chroma_dc_hadamard_ref.c
@@ -0,0 +1,53 @@
+/*
+ * Standalone bit-exact C reference for the H.264 chroma DC 2x2
+ * Hadamard transform (per H.264 §8.5.11.1).
+ *
+ * In 4:2:0 chroma, the four DC coefficients (one from each chroma
+ * 4x4 AC block within an MB) are arranged into a 2x2 block:
+ *
+ *   c[0,0]  c[0,1]      block (0,0) DC   block (0,1) DC
+ *   c[1,0]  c[1,1]      block (1,0) DC   block (1,1) DC
+ *
+ * The 2x2 Hadamard transform:
+ *
+ *   f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
+ *   f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
+ *   f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
+ *   f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
+ *
+ * Equivalently expressed as 2-stage butterflies (row then col), which
+ * the NEON impl uses for SIMD friendliness — we present that form
+ * here too so the QPU/NEON ports are 1:1.
+ *
+ * Output f[] replaces the input c[]. The QP-dependent scaling per
+ * §8.5.11.2 happens AFTER this primitive — the intercept patch
+ * composes Hadamard + LevelScale + shift itself, since the scaling
+ * shape depends on QP and on whether we're in the chroma_qp_offset
+ * adjustment regime.
+ *
+ * Input/output layout:
+ *   c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
+ *
+ * License: BSD-2-Clause. Algorithm is in the H.264 spec.
+ */
+#include <stdint.h>
+
+void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
+{
+    /* Stage 1: butterfly along rows.
+     *   t[0] = c[0,0] + c[0,1] = c[0] + c[1]
+     *   t[1] = c[0,0] - c[0,1] = c[0] - c[1]
+     *   t[2] = c[1,0] + c[1,1] = c[2] + c[3]
+     *   t[3] = c[1,0] - c[1,1] = c[2] - c[3]
+     */
+    int t0 = c[0] + c[1];
+    int t1 = c[0] - c[1];
+    int t2 = c[2] + c[3];
+    int t3 = c[2] - c[3];
+
+    /* Stage 2: butterfly along cols. */
+    c[0] = (int16_t)(t0 + t2); /* f[0,0] = t0+t2 = sum of all 4 */
+    c[1] = (int16_t)(t1 + t3); /* f[0,1] = (c0-c1) + (c2-c3) */
+    c[2] = (int16_t)(t0 - t2); /* f[1,0] = (c0+c1) - (c2+c3) */
+    c[3] = (int16_t)(t1 - t3); /* f[1,1] = (c0-c1) - (c2-c3) */
+}
diff --git a/tests/test_chroma_dc_hadamard.c b/tests/test_chroma_dc_hadamard.c
new file mode 100644
index 0000000..4a9b4b2
--- /dev/null
+++ b/tests/test_chroma_dc_hadamard.c
@@ -0,0 +1,118 @@
+/*
+ * Tests the H.264 chroma DC 2x2 Hadamard primitive against
+ * spec-derived expected outputs.
+ *
+ *   f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]   "sum"
+ *   f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]   "col-diff"
+ *   f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]   "row-diff"
+ *   f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]   "anti-diag"
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4]);
+
+static int check(const char *name, int16_t in[4], int16_t expect[4])
+{
+    int16_t c[4]; memcpy(c, in, sizeof(c));
+    daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+    int fail = 0;
+    for (int i = 0; i < 4; i++) {
+        if (c[i] != expect[i]) {
+            fprintf(stderr, "%s: c[%d] = %d, expected %d\n",
+                    name, i, c[i], expect[i]);
+            fail = 1;
+        }
+    }
+    if (!fail) printf(" %-32s PASS\n", name);
+    else printf(" %-32s FAIL\n", name);
+    return fail;
+}
+
+int main(void)
+{
+    int fail = 0; /* number of failed cases; each case adds 0 or 1 */
+
+    /* Test 1: All-same input.
+     *   c = [5, 5, 5, 5]
+     *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0, f[1,1] = 0
+     */
+    { int16_t in[4] = { 5, 5, 5, 5 };
+      int16_t ex[4] = { 20, 0, 0, 0 };
+      fail += check("all-uniform 5", in, ex); }
+
+    /* Test 2: Single-axis variation (col 1 = 0, col 2 = 10).
+     *   c = [0, 10, 0, 10]
+     *   f[0,0] = 0+10+0+10 = 20
+     *   f[0,1] = 0-10+0-10 = -20
+     *   f[1,0] = 0+10-0-10 = 0
+     *   f[1,1] = 0-10-0+10 = 0
+     */
+    { int16_t in[4] = { 0, 10, 0, 10 };
+      int16_t ex[4] = { 20, -20, 0, 0 };
+      fail += check("col gradient [0,10,0,10]", in, ex); }
+
+    /* Test 3: Row gradient.
+     *   c = [0, 0, 10, 10]
+     *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0-20 = -20, f[1,1] = 0
+     */
+    { int16_t in[4] = { 0, 0, 10, 10 };
+      int16_t ex[4] = { 20, 0, -20, 0 };
+      fail += check("row gradient [0,0,10,10]", in, ex); }
+
+    /* Test 4: Anti-diagonal pattern.
+     *   c = [10, 0, 0, 10]
+     *   f[0,0] = 20
+     *   f[0,1] = 10-0+0-10 = 0
+     *   f[1,0] = 10+0-0-10 = 0
+     *   f[1,1] = 10-0-0+10 = 20
+     */
+    { int16_t in[4] = { 10, 0, 0, 10 };
+      int16_t ex[4] = { 20, 0, 0, 20 };
+      fail += check("anti-diagonal [10,0,0,10]", in, ex); }
+
+    /* Test 5: Asymmetric — all bands non-zero.
+     *   c = [1, 2, 3, 4]
+     *   f[0,0] = 10
+     *   f[0,1] = 1-2+3-4 = -2
+     *   f[1,0] = 1+2-3-4 = -4
+     *   f[1,1] = 1-2-3+4 = 0
+     */
+    { int16_t in[4] = { 1, 2, 3, 4 };
+      int16_t ex[4] = { 10, -2, -4, 0 };
+      fail += check("asymmetric [1,2,3,4]", in, ex); }
+
+    /* Test 6: Negative inputs (Hadamard is linear, so signs preserve).
+     *   c = [-5, 5, -5, 5]
+     *   f[0,0] = -5+5-5+5 = 0
+     *   f[0,1] = -5-5-5-5 = -20
+     *   f[1,0] = -5+5+5-5 = 0
+     *   f[1,1] = -5-5+5+5 = 0
+     */
+    { int16_t in[4] = { -5, 5, -5, 5 };
+      int16_t ex[4] = { 0, -20, 0, 0 };
+      fail += check("sign-alternating [-5,5,-5,5]", in, ex); }
+
+    /* Test 7: Inverse-property check. H * H = 4*I for the unscaled
+     * 2x2 Hadamard. So applying twice multiplies each by 4.
+     *   c = [1, 2, 3, 4]
+     *   First Hadamard:  [10, -2, -4, 0]
+     *   Second Hadamard: [4, 8, 12, 16]
+     */
+    { int16_t in[4] = { 1, 2, 3, 4 };
+      int16_t ex[4] = { 4, 8, 12, 16 };
+      int16_t c[4]; memcpy(c, in, sizeof(c));
+      daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+      daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+      int local_fail = 0;
+      for (int i = 0; i < 4; i++) if (c[i] != ex[i]) local_fail = 1;
+      printf(" %-32s %s\n", "double-Hadamard = 4*orig",
+             local_fail ? "FAIL" : "PASS");
+      fail += local_fail;
+    }
+
+    if (fail == 0) printf("\nALL chroma DC Hadamard tests PASS\n");
+    else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? 1 : 0;
+}
-- 
2.47.3