From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Sun, 25 May 2026 13:00:00 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma DC Hadamard through daedalus-fourier

Substitutes H264DSPContext.chroma_dc_dequant_idct in the
4:2:0 / bit_depth=8 init path with a wrapper that composes
the daedalus chroma DC Hadamard primitive (fourier PR #25)
with the qmul scaling that FFmpeg fuses into the same function.

Bit-exact against ff_h264_chroma_dc_dequant_idct_8_c.
Hadamard correctness gated by fourier PR #23 test suite.

4:2:2 chroma stays on the in-tree 422 variant (same
gating shape as 0009 chroma deblock substitution).

Requires daedalus-fourier commit b9f9ff2 or later (PR #25
exposing the public Hadamard symbol).  Pin bumps in PKGBUILD
and build-deb.sh come in the same commit.
---
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
--- a/libavcodec/aarch64/h264_idct_daedalus.c	2026-05-25 13:38:32.019491484 +0200
+++ libavcodec/aarch64/h264_idct_daedalus.c	2026-05-25 13:38:32.033821507 +0200
@@ -1,5 +1,5 @@
 /*
- * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims.
+ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
  *
  * Routes H264DSPContext.idct_add           → daedalus_recipe_dispatch_h264_idct4
  *        H264DSPContext.idct8_add          → daedalus_recipe_dispatch_h264_idct8
@@ -9,6 +9,7 @@
  *        H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h
  *        H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra
  *        H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra
+ *        H264DSPContext.chroma_dc_dequant_idct   → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul
  * instead of the in-tree ff_h264_*_neon assembly.  The recipe layer
  * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
  * is CPU primary with QPU opportunistic — the ctx below is no-QPU,
@@ -60,6 +61,7 @@
                                                 int alpha, int beta);
 void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
                                                 int alpha, int beta);
+void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
 {
@@ -187,3 +189,32 @@
     daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride,
                                                         1, &meta);
 }
+
+/* Composes daedalus_h264_chroma_dc_hadamard_2x2 with the qmul scaling
+ * that FFmpeg's reference does in one fused function (h264idct_template.c
+ * ff_h264_chroma_dc_dequant_idct).
+ *
+ * The 4 DC coefficients are scattered across the per-MB coefficient
+ * buffer at offsets [r*stride + c*xStride] (stride=32, xStride=16).
+ * Extract into a contiguous int16[4], run the Hadamard, then apply
+ * the qmul scale and write back to the original positions.
+ *
+ * The scale multiplies as unsigned (like the reference's SUINT) so a
+ * large dc * qmul product wraps instead of being signed-overflow UB.
+ *
+ * No daedalus ctx needed; the Hadamard is a pure stateless primitive.
+ */
+void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul)
+{
+    enum { stride = 32, xStride = 16 };
+    static const uint8_t pos[4] = { 0, xStride, stride, stride + xStride };
+    int16_t dc[4];
+
+    for (int i = 0; i < 4; i++)
+        dc[i] = block[pos[i]];
+
+    daedalus_h264_chroma_dc_hadamard_2x2(dc);
+
+    for (int i = 0; i < 4; i++)
+        block[pos[i]] = (int16_t)((int)((unsigned)dc[i] * (unsigned)qmul) >> 7);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c	2026-05-25 13:38:32.020346459 +0200
+++ libavcodec/aarch64/h264dsp_init_aarch64.c	2026-05-25 13:38:32.033909804 +0200
@@ -41,6 +41,7 @@
                                            int beta);
 void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
                                                 int alpha, int beta);
+void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
 void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
 void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride,
@@ -135,6 +136,7 @@
         c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
 
         if (chroma_format_idc <= 1) {
+            c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus;
             c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus;
             c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
             c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
--
2.47.3