From 1b286ddb4efaca26ec9b9e290e989fec77dc1c77 Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Fri, 22 May 2026 10:18:21 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 8x8 IDCT through
 daedalus-fourier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

H264DSPContext.idct8_add (called per 8x8 block from the High-profile
intra-8x8-DCT decode path in h264_mb.c) now dispatches through
daedalus_recipe_dispatch_h264_idct8 instead of ff_h264_idct8_add_neon.

The recipe layer picks the substrate; for cycle 7 (H.264 IDCT 8x8)
the recipe is CPU NEON, so this is effectively a NEON-to-NEON
substitution layered on top of the cycle-6 IDCT 4x4 wiring.  Same
pthread_once global context, same destructive-zero semantics; FFmpeg
column-major 8x8 storage block[r + 8*c] matches daedalus's convention.

The bulk path c->idct8_add4 (used for inter 8x8-DCT macroblocks) remains
on the in-tree NEON .S code and will be batched through
daedalus_recipe_dispatch_h264_idct8 with n_blocks>1 in a follow-up.
The DC-only path c->idct8_dc_add is likewise left on
ff_h264_idct8_dc_add_neon by this patch.

Bit-exact against ff_h264_idct8_add_neon (daedalus-fourier cycle 7
green).

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 7.
---
 libavcodec/aarch64/h264_idct_daedalus.c   | 29 ++++++++++++++++-------
 libavcodec/aarch64/h264dsp_init_aarch64.c |  3 ++-
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
index 538d223..cbb98af 100644
--- a/libavcodec/aarch64/h264_idct_daedalus.c
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
@@ -1,14 +1,16 @@
 /*
- * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
+ * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
  *
- * Routes H264DSPContext.idct_add through
- * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
- * The recipe layer picks the substrate (CPU NEON by default for
- * cycle 6; future cycles may dispatch to V3D opportunistically).
+ * Routes H264DSPContext.idct_add  → daedalus_recipe_dispatch_h264_idct4
+ *        H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
+ * instead of the in-tree ff_h264_idct{,8}_add_neon assembly.  The
+ * recipe layer picks the substrate (CPU NEON by default for cycles
+ * 6 + 7; future cycles may dispatch to V3D opportunistically).
  *
- * FFmpeg's 4x4 block memory layout matches daedalus's column-major
- * convention: block[r + 4*c] = coefficient at (row r, col c).  Both
- * sides destructively zero the block after the transform.
+ * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
+ * column-major convention: block[r + N*c] = coefficient at
+ * (row r, col c) for N ∈ {4, 8}.  Both sides destructively zero the
+ * block after the transform.
  *
  * The library context is process-global and lazily initialised under
  * pthread_once.  We pick the no-QPU constructor here because
@@ -37,6 +39,7 @@ static void daedalus_ctx_init_once(void)
 }
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
 {
@@ -47,3 +50,13 @@ void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
     daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride,
                                         block, 1, &meta);
 }
+
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
+{
+    static const daedalus_h264_block_meta meta = { .dst_off = 0 };
+    /* One-time init; presumably sets up the shared context g_dctx. */
+    pthread_once(&g_dctx_once, daedalus_ctx_init_once);
+    /* The literal 1 is presumably n_blocks: one 8x8 block per call. */
+    daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
+                                        block, 1, &meta);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b993df2..741e551 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -79,6 +79,7 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                             const uint8_t nnzc[15 * 8]);
 
 void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                              int16_t *block, int stride,
@@ -146,7 +147,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->idct_add16intra = ff_h264_idct_add16intra_neon;
         if (chroma_format_idc <= 1)
             c->idct_add8   = ff_h264_idct_add8_neon;
-        c->idct8_add       = ff_h264_idct8_add_neon;
+        c->idct8_add       = ff_h264_idct8_add_daedalus;
         c->idct8_dc_add    = ff_h264_idct8_dc_add_neon;
         c->idct8_add4      = ff_h264_idct8_add4_neon;
     } else if (have_neon(cpu_flags) && bit_depth == 10) {
-- 
2.47.3