marfrit-packages/arch/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 12:00:00 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-h deblock through daedalus-fourier

Sibling of 0005 (which substituted v_loop_filter_luma).  Same
NEON-to-NEON substitution: H264DSPContext.h_loop_filter_luma →
daedalus_recipe_dispatch_h264_deblock_luma_h.  The H kernel landed
in daedalus-fourier PR #9 (CPU NEON only — no QPU shader yet).

The daedalus ctx used by libavcodec.so stays no-QPU, following the
existing 0003-0005 / 0007 pattern: we cannot assume Vulkan is
available in arbitrary host processes (firefox-fourier RDD,
mpv-fourier, etc.).

Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S
code; daedalus_h264_deblock_meta only covers the non-intra path.
An intra-h substitution can land once daedalus-fourier exposes a
dispatch helper (the kernel already exists internally per PR #11).

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 H.
---
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
--- a/libavcodec/aarch64/h264_idct_daedalus.c	2026-05-25 13:09:33.694760715 +0200
+++ libavcodec/aarch64/h264_idct_daedalus.c	2026-05-25 13:09:33.715603719 +0200
@@ -1,9 +1,10 @@
 /*
- * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims.
+ * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims.
  *
  * Routes H264DSPContext.idct_add           → daedalus_recipe_dispatch_h264_idct4
  *        H264DSPContext.idct8_add          → daedalus_recipe_dispatch_h264_idct8
  *        H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v
+ *        H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h
  * instead of the in-tree ff_h264_*_neon assembly.  The recipe layer
  * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
  * is CPU primary with QPU opportunistic — the ctx below is no-QPU,
@@ -45,6 +46,8 @@
 void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
                                          int alpha, int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0);

 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
 {
@@ -84,3 +87,22 @@
     daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride,
                                                  1, &meta);
 }
+
+/* Non-intra luma-h deblock shim: forwards to the daedalus-fourier recipe. */
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0)
+{
+    daedalus_h264_deblock_meta meta = { .dst_off = 0,
+                                        .alpha   = alpha,
+                                        .beta    = beta };
+    int i;
+
+    /* Carry the caller's four tc0 entries over into the dispatch meta. */
+    for (i = 0; i < 4; i++)
+        meta.tc0[i] = tc0[i];
+
+    /* Make sure daedalus_ctx_init_once has run before g_dctx is used. */
+    pthread_once(&g_dctx_once, daedalus_ctx_init_once);
+    daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride,
+                                                 1, &meta);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c	2026-05-25 13:09:33.695937103 +0200
+++ libavcodec/aarch64/h264dsp_init_aarch64.c	2026-05-25 13:09:33.715541700 +0200
@@ -31,6 +31,8 @@
                                          int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                      int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
+                                         int alpha, int beta, int8_t *tc0);
 void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                            int beta);
 void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
@@ -117,7 +119,7 @@

     if (have_neon(cpu_flags) && bit_depth == 8) {
         c->v_loop_filter_luma   = ff_h264_v_loop_filter_luma_daedalus;
-        c->h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+        c->h_loop_filter_luma   = ff_h264_h_loop_filter_luma_daedalus;
         c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
         c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;

--
2.47.3