From 854bdeda20de7d0ff977c5493c84a5d9620ae83e Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 11:18:59 +0200
Subject: [PATCH] h264: chroma DC 2x2 Hadamard pre-pass primitive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the H.264 §8.5.11.1 chroma DC Hadamard transform.  In 4:2:0
chroma, the four DC coefficients (one from each chroma 4x4 AC block
within an MB) go through a 2x2 Hadamard before quant-scaling and
before being added back to each block's [0,0] coefficient prior to
the 4x4 AC IDCT.

This PR ships the pure Hadamard transform:

  f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
  f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
  f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
  f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]

implemented as the 2-stage row+col butterfly (1:1 with the NEON
SIMD shape upstream).  Operates in-place on int16[4].

What this does NOT do (deferred to caller-side composition):

  - QP-dependent scaling per §8.5.11.2.  The scale depends on
    QP_C (with chroma_qp_offset adjustment), so the formula has
    branches (>=6 vs <6) and looks up LevelScale4x4 table values.
    The libavcodec intercept patch composes Hadamard + scale +
    shift itself since the scale shape varies by codec-level
    context (slice header chroma_qp_offset, PPS chroma_qp_offset,
    second_chroma_qp_offset for the chroma_qp_index_offset).
  - A separate inverse-transform entry point.  The matrix applied at
    decode time is the same Hadamard as in the FORWARD direction up
    to scaling; the spec distinguishes the two conceptually in
    §8.5.11, but we expose only the matrix.

Test design (tests/test_chroma_dc_hadamard.c):

  7 cases, all spec-derived hand-computations:
    - all-uniform 5 → [20, 0, 0, 0]
    - col gradient [0,10,0,10] → [20, -20, 0, 0]
    - row gradient [0,0,10,10] → [20, 0, -20, 0]
    - anti-diagonal [10,0,0,10] → [20, 0, 0, 20]
    - asymmetric [1,2,3,4] → [10, -2, -4, 0]
    - sign-alternating [-5,5,-5,5] → [0, -20, 0, 0]
    - double-Hadamard invariant: H·H = 4·I, so applying twice
      gives [4*c[0], 4*c[1], 4*c[2], 4*c[3]] for any input.

The double-Hadamard test is the strongest correctness gate: any
single sign error in the butterfly would break the H·H = 4·I
algebraic property, surfacing immediately.  All 7 PASS first try.

Verified on hertz:

  $ ./build/test_chroma_dc_hadamard
    all-uniform 5                    PASS
    col gradient [0,10,0,10]         PASS
    row gradient [0,0,10,10]         PASS
    anti-diagonal [10,0,0,10]        PASS
    asymmetric [1,2,3,4]             PASS
    sign-alternating [-5,5,-5,5]     PASS
    double-Hadamard = 4*orig         PASS

  ALL chroma DC Hadamard tests PASS

With this primitive the H.264 8-bit 4:2:0 pixel-math primitive
matrix is complete in fourier:
  - IDCT 4x4 (luma + chroma) ✓
  - IDCT 8x8 (luma, High profile) ✓
  - Chroma DC Hadamard 2x2 ✓ (this PR)
  - Deblock (8 variants) ✓
  - Intra prediction (26 modes) ✓
  - MC qpel (30 dispatches) ✓

What remains for the libavcodec intercept patch: CABAC/CAVLC entropy
decode, SPS/PPS parsing, slice header parsing, MB type / QP / CBP /
intra mode prediction.  All of that lives at the intercept layer
(it's spec-derived from the bitstream syntax, not pixel-math); the
intercept patch will call into these fourier primitives once the
metadata is decoded.
---
 CMakeLists.txt                      |  11 ++-
 tests/h264_chroma_dc_hadamard_ref.c |  53 +++++++++++++
 tests/test_chroma_dc_hadamard.c     | 118 ++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 tests/h264_chroma_dc_hadamard_ref.c
 create mode 100644 tests/test_chroma_dc_hadamard.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f37451..8a9d6ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -564,14 +564,21 @@ add_executable(test_intra_pred_chroma8x8
 target_compile_options(test_intra_pred_chroma8x8 PRIVATE -O2)
 
 # H.264 Intra_8x8 luma prediction (High profile, 9 modes + 1-2-1
-# reference-sample pre-filter).  This PR ships the pre-filter + the
-# 3 simple modes (V, H, DC); the 6 directional modes follow.
+# reference-sample pre-filter).
 add_executable(test_intra_pred_8x8_luma
     tests/test_intra_pred_8x8_luma.c
     tests/h264_intra_pred_8x8_luma_ref.c
 )
 target_compile_options(test_intra_pred_8x8_luma PRIVATE -O2)
 
+# H.264 chroma DC 2x2 Hadamard pre-pass primitive.  Pure transform,
+# no QP-dependent scaling (that's caller-side composition).
+add_executable(test_chroma_dc_hadamard
+    tests/test_chroma_dc_hadamard.c
+    tests/h264_chroma_dc_hadamard_ref.c
+)
+target_compile_options(test_chroma_dc_hadamard PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
diff --git a/tests/h264_chroma_dc_hadamard_ref.c b/tests/h264_chroma_dc_hadamard_ref.c
new file mode 100644
index 0000000..3d5ddb4
--- /dev/null
+++ b/tests/h264_chroma_dc_hadamard_ref.c
@@ -0,0 +1,53 @@
+/*
+ * Standalone bit-exact C reference for the H.264 chroma DC 2x2
+ * Hadamard transform (per H.264 §8.5.11.1).
+ *
+ * In 4:2:0 chroma, the four DC coefficients (one from each chroma
+ * 4x4 AC block within an MB) are arranged into a 2x2 block:
+ *
+ *     c[0,0]  c[0,1]      block (0,0) DC   block (0,1) DC
+ *     c[1,0]  c[1,1]      block (1,0) DC   block (1,1) DC
+ *
+ * The 2x2 Hadamard transform:
+ *
+ *     f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
+ *     f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
+ *     f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
+ *     f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
+ *
+ * Equivalently expressed as 2-stage butterflies (row then col), which
+ * the NEON impl uses for SIMD friendliness — we present that form
+ * here too so the QPU/NEON ports are 1:1.
+ *
+ * Output f[] replaces the input c[].  The QP-dependent scaling per
+ * §8.5.11.2 happens AFTER this primitive — the intercept patch
+ * composes Hadamard + LevelScale + shift itself, since the scaling
+ * shape depends on QP and on whether we're in the chroma_qp_offset
+ * adjustment regime.
+ *
+ * Input/output layout:
+ *   c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
+ *
+ * License: BSD-2-Clause.  Algorithm is in the H.264 spec.
+ */
+#include <stdint.h>
+
+void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
+{
+    /* Stage 1: butterfly along rows (intermediates are int, not int16_t).
+     *   t0 = c[0,0] + c[0,1]   = c[0] + c[1]
+     *   t1 = c[0,0] - c[0,1]   = c[0] - c[1]
+     *   t2 = c[1,0] + c[1,1]   = c[2] + c[3]
+     *   t3 = c[1,0] - c[1,1]   = c[2] - c[3]
+     */
+    int t0 = c[0] + c[1];
+    int t1 = c[0] - c[1];
+    int t2 = c[2] + c[3];
+    int t3 = c[2] - c[3];
+
+    /* Stage 2: butterfly along cols; narrowed to int16_t, no range check. */
+    c[0] = (int16_t)(t0 + t2);   /* f[0,0] = t0+t2 = sum of all 4 */
+    c[1] = (int16_t)(t1 + t3);   /* f[0,1] = (c0-c1) + (c2-c3) */
+    c[2] = (int16_t)(t0 - t2);   /* f[1,0] = (c0+c1) - (c2+c3) */
+    c[3] = (int16_t)(t1 - t3);   /* f[1,1] = (c0-c1) - (c2-c3) */
+}
diff --git a/tests/test_chroma_dc_hadamard.c b/tests/test_chroma_dc_hadamard.c
new file mode 100644
index 0000000..4a9b4b2
--- /dev/null
+++ b/tests/test_chroma_dc_hadamard.c
@@ -0,0 +1,118 @@
+/*
+ * Tests the H.264 chroma DC 2x2 Hadamard primitive against
+ * spec-derived expected outputs.
+ *
+ *   f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]    "sum"
+ *   f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]    "col-diff"
+ *   f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]    "row-diff"
+ *   f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]    "anti-diag"
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4]);
+
+/* Transform a copy of in[]; return 1 (and log) if it differs from expect[]. */
+static int check(const char *name, const int16_t in[4], const int16_t expect[4])
+{
+    int16_t c[4] = { in[0], in[1], in[2], in[3] };   /* transform is in-place */
+    int fail = 0;
+    daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+    for (int i = 0; i < 4; i++) {
+        if (c[i] != expect[i]) {
+            fprintf(stderr, "%s: c[%d] = %d, expected %d\n",
+                    name, i, c[i], expect[i]);
+            fail = 1;
+        }
+    }
+    printf("  %-32s %s\n", name, fail ? "FAIL" : "PASS");
+    return fail;
+}
+
+int main(void)
+{
+    int fail = 0;   /* number of failed cases; reported before exit */
+
+    /* Test 1: All-same input.
+     *   c = [5, 5, 5, 5]
+     *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0, f[1,1] = 0
+     */
+    { int16_t in[4] = { 5, 5, 5, 5 };
+      int16_t ex[4] = { 20, 0, 0, 0 };
+      fail += check("all-uniform 5", in, ex); }
+
+    /* Test 2: Single-axis variation (col 0 = 0, col 1 = 10).
+     *   c = [0, 10, 0, 10]
+     *   f[0,0] = 0+10+0+10 = 20
+     *   f[0,1] = 0-10+0-10 = -20
+     *   f[1,0] = 0+10-0-10 = 0
+     *   f[1,1] = 0-10-0+10 = 0
+     */
+    { int16_t in[4] = { 0, 10, 0, 10 };
+      int16_t ex[4] = { 20, -20, 0, 0 };
+      fail += check("col gradient [0,10,0,10]", in, ex); }
+
+    /* Test 3: Row gradient.
+     *   c = [0, 0, 10, 10]
+     *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0-20 = -20, f[1,1] = 0
+     */
+    { int16_t in[4] = { 0, 0, 10, 10 };
+      int16_t ex[4] = { 20, 0, -20, 0 };
+      fail += check("row gradient [0,0,10,10]", in, ex); }
+
+    /* Test 4: Anti-diagonal pattern.
+     *   c = [10, 0, 0, 10]
+     *   f[0,0] = 20
+     *   f[0,1] = 10-0+0-10 = 0
+     *   f[1,0] = 10+0-0-10 = 0
+     *   f[1,1] = 10-0-0+10 = 20
+     */
+    { int16_t in[4] = { 10, 0, 0, 10 };
+      int16_t ex[4] = { 20, 0, 0, 20 };
+      fail += check("anti-diagonal [10,0,0,10]", in, ex); }
+
+    /* Test 5: Asymmetric — all bands non-zero.
+     *   c = [1, 2, 3, 4]
+     *   f[0,0] = 10
+     *   f[0,1] = 1-2+3-4 = -2
+     *   f[1,0] = 1+2-3-4 = -4
+     *   f[1,1] = 1-2-3+4 = 0
+     */
+    { int16_t in[4] = { 1, 2, 3, 4 };
+      int16_t ex[4] = { 10, -2, -4, 0 };
+      fail += check("asymmetric [1,2,3,4]", in, ex); }
+
+    /* Test 6: Negative inputs; only the col-diff band f[0,1] is non-zero.
+     *   c = [-5, 5, -5, 5]
+     *   f[0,0] = -5+5-5+5 = 0
+     *   f[0,1] = -5-5-5-5 = -20
+     *   f[1,0] = -5+5+5-5 = 0
+     *   f[1,1] = -5-5+5+5 = 0
+     */
+    { int16_t in[4] = { -5, 5, -5, 5 };
+      int16_t ex[4] = { 0, -20, 0, 0 };
+      fail += check("sign-alternating [-5,5,-5,5]", in, ex); }
+
+    /* Test 7: Inverse-property check.  H * H = 4*I for the unscaled
+     * 2x2 Hadamard.  So applying twice multiplies each by 4.
+     *   c = [1, 2, 3, 4]
+     *   First Hadamard:  [10, -2, -4, 0]
+     *   Second Hadamard: [4, 8, 12, 16]
+     */
+    { int16_t in[4] = { 1, 2, 3, 4 };
+      int16_t ex[4] = { 4, 8, 12, 16 };
+      int16_t c[4]; memcpy(c, in, sizeof(c));
+      daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+      daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
+      int local_fail = 0;
+      for (int i = 0; i < 4; i++) if (c[i] != ex[i]) local_fail = 1;
+      printf("  %-32s %s\n", "double-Hadamard = 4*orig",
+             local_fail ? "FAIL" : "PASS");
+      fail += local_fail;
+    }
+
+    if (fail == 0) printf("\nALL chroma DC Hadamard tests PASS\n");
+    else           fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? 1 : 0;
+}
-- 
2.47.3