From 68731c41d7ea68be0e912b128cb4e71fb56e8263 Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Fri, 22 May 2026 12:15:16 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-v deblock through
 daedalus-fourier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

H264DSPContext.v_loop_filter_luma (non-intra bS<4 vertical luma
deblock, called per macroblock-row edge from the slice deblock
loop) now dispatches through
daedalus_recipe_dispatch_h264_deblock_luma_v instead of
ff_h264_v_loop_filter_luma_neon.

The recipe layer picks the substrate; for cycle 8 the daedalus
docstring marks the kernel "CPU primary; QPU opportunistic", but
the libavcodec.so context here is built with
daedalus_ctx_create_no_qpu — process-global pthread_once init,
shared with cycles 6/7.  QPU opportunism stays gated off until a
follow-up adds an explicit feature flag (no implicit Vulkan init
in arbitrary host processes).  In the meantime cycle 8 is a
plumbing-only substitution, NEON-to-NEON via the daedalus recipe.

Intra (bS=4) loop filter — c->v_loop_filter_luma_intra — stays on
the in-tree NEON .S code; daedalus's daedalus_h264_deblock_meta
only covers the non-intra path per its docstring.

FFmpeg's `int alpha`, `int beta` and `int8_t tc0[4]` arguments map
onto daedalus_h264_deblock_meta (int32_t alpha/beta + inline
int8_t tc0[4]).  pix already points to row 0 of the bottom block per
FFmpeg's deblock convention, satisfying daedalus's
`dst_off >= 4 * dst_stride` constraint.

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8.
---
 libavcodec/aarch64/h264_idct_daedalus.c   | 36 +++++++++++++++++++----
 libavcodec/aarch64/h264dsp_init_aarch64.c |  4 ++-
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
index cbb98af..92365fa 100644
--- a/libavcodec/aarch64/h264_idct_daedalus.c
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
@@ -1,11 +1,14 @@
 /*
- * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
+ * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims.
  *
- * Routes H264DSPContext.idct_add  → daedalus_recipe_dispatch_h264_idct4
- *        H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
- * instead of the in-tree ff_h264_idct{,8}_add_neon assembly.  The
- * recipe layer picks the substrate (CPU NEON by default for cycles
- * 6 + 7; future cycles may dispatch to V3D opportunistically).
+ * Routes H264DSPContext.idct_add           → daedalus_recipe_dispatch_h264_idct4
+ *        H264DSPContext.idct8_add          → daedalus_recipe_dispatch_h264_idct8
+ *        H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v
+ * instead of the in-tree ff_h264_*_neon assembly.  The recipe layer
+ * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
+ * is CPU primary with QPU opportunistic — the ctx below is no-QPU,
+ * so cycle 8 stays on the CPU NEON path until a separate change
+ * gates QPU init on a daedalus-fourier feature flag).
  *
  * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
  * column-major convention: block[r + N*c] = coefficient at
@@ -40,6 +43,8 @@ static void daedalus_ctx_init_once(void)
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0);
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
 {
@@ -60,3 +65,22 @@ void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
     daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
                                         block, 1, &meta);
 }
+
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0)
+{
+    /* Pack FFmpeg's alpha/beta/tc0 arguments into the daedalus edge meta.
+     * NOTE(review): dst_off is 0 with pix passed as the base; confirm this
+     * meets any `dst_off >= 4 * dst_stride` requirement of the daedalus API. */
+    daedalus_h264_deblock_meta edge_meta = {
+        .dst_off = 0,
+        .alpha   = alpha,
+        .beta    = beta,
+        .tc0     = { tc0[0], tc0[1], tc0[2], tc0[3] },
+    };
+
+    /* One-time creation of the context shared by the shims in this file. */
+    pthread_once(&g_dctx_once, daedalus_ctx_init_once);
+    daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride,
+                                                 1, &edge_meta);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 741e551..85ac381 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -27,6 +27,8 @@
 
 void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
 void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
@@ -114,7 +116,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags) && bit_depth == 8) {
-        c->v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
+        c->v_loop_filter_luma   = ff_h264_v_loop_filter_luma_daedalus;
         c->h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
         c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
-- 
2.47.3