From 624f83e87796c93ddcf49ffe1626c04154fa4b5c Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 13:10:05 +0200 Subject: [PATCH] ffmpeg-v4l2-request-fourier: route H.264 luma-h deblock through daedalus-fourier (0008) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds patch 0008 to the substitution arc, mirroring 0005's V variant for H.264 non-intra bS<4 horizontal luma deblock. H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h The H kernel was added to daedalus-fourier in PR #9 (vendored ff_h264_h_loop_filter_luma_neon, wired through the same CPU-dispatch pattern as V). Recipe table routes AUTO to CPU NEON (no QPU shader for H yet), so this is a NEON-to-NEON substitution via the daedalus recipe layer — same shape as 0005. The libavcodec.so ctx remains no-QPU (daedalus_ctx_create_no_qpu), matching the existing 0003/0004/0005/0007 patches. Higher-cycle QPU init waits for a feature-flag gating change in a separate PR. Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S code; daedalus_h264_deblock_meta covers the non-intra path only. A follow-up can route intra once daedalus-fourier exposes the intra-h dispatch (the kernel already exists internally per fourier PR #11). Wires the new patch into both arch/PKGBUILD and debian/build-deb.sh sequences. Verified the patch applies cleanly on top of 0001-0007 against the pinned upstream commit b57fbbe5 on hertz. --- ...h264-deblock-luma-h-daedalus-fourier.patch | 92 +++++++++++++++++++ arch/ffmpeg-v4l2-request-fourier/PKGBUILD | 6 +- ...h264-deblock-luma-h-daedalus-fourier.patch | 92 +++++++++++++++++++ .../ffmpeg-v4l2-request-fourier/build-deb.sh | 1 + 4 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 arch/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch create mode 100644 debian/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch diff --git a/arch/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch b/arch/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch new file mode 100644 index 000000000..f747cbc8c --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch @@ -0,0 +1,92 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-h deblock through daedalus-fourier + +Sibling of 0005 (which substituted v_loop_filter_luma). Same +NEON-to-NEON substitution: H264DSPContext.h_loop_filter_luma → +daedalus_recipe_dispatch_h264_deblock_luma_h. The H kernel landed +in daedalus-fourier PR #9 (CPU NEON only — no QPU shader yet). + +libavcodec.so ctx is no-QPU per the existing 0003-0005 / 0007 +pattern; we cannot assume Vulkan in arbitrary host processes +(firefox-fourier RDD, mpv-fourier, etc.). + +Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S +code; daedalus_h264_deblock_meta only covers the non-intra path. +An intra-h substitution can land once daedalus-fourier exposes a +dispatch helper (the kernel already exists internally per PR #11). + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 H. +--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.694760715 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.715603719 +0200 +@@ -1,9 +1,10 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 + * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v ++ * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -45,6 +46,8 @@ + void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride); + void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -84,3 +87,22 @@ + daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.695937103 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.715541700 +0200 +@@ -31,6 +31,8 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); + void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, +@@ -117,7 +119,7 @@ + + if (have_neon(cpu_flags) && bit_depth == 8) { + c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus; +- c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; ++ c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_daedalus; + c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; + c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + +-- +2.47.3 + diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index 27c41ce2e..2f538563a 100644 --- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -94,8 +94,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0004-h264-idct8-daedalus-fourier.patch' '0005-h264-deblock-luma-v-daedalus-fourier.patch' '0006-h264-restore-low-delay.patch' - '0007-h264-qpel-mc20-daedalus-fourier.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + '0007-h264-qpel-mc20-daedalus-fourier.patch' + '0008-h264-deblock-luma-h-daedalus-fourier.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -113,6 +114,7 @@ prepare() { patch -Np1 -i "${srcdir}/0005-h264-deblock-luma-v-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0006-h264-restore-low-delay.patch" patch -Np1 -i "${srcdir}/0007-h264-qpel-mc20-daedalus-fourier.patch" + patch -Np1 -i "${srcdir}/0008-h264-deblock-luma-h-daedalus-fourier.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch b/debian/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch new file mode 100644 index 000000000..f747cbc8c --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0008-h264-deblock-luma-h-daedalus-fourier.patch @@ -0,0 +1,92 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-h deblock through daedalus-fourier + +Sibling of 0005 (which substituted v_loop_filter_luma). Same +NEON-to-NEON substitution: H264DSPContext.h_loop_filter_luma → +daedalus_recipe_dispatch_h264_deblock_luma_h. The H kernel landed +in daedalus-fourier PR #9 (CPU NEON only — no QPU shader yet). + +libavcodec.so ctx is no-QPU per the existing 0003-0005 / 0007 +pattern; we cannot assume Vulkan in arbitrary host processes +(firefox-fourier RDD, mpv-fourier, etc.). + +Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S +code; daedalus_h264_deblock_meta only covers the non-intra path. +An intra-h substitution can land once daedalus-fourier exposes a +dispatch helper (the kernel already exists internally per PR #11). + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 H. +--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.694760715 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.715603719 +0200 +@@ -1,9 +1,10 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 + * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v ++ * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -45,6 +46,8 @@ + void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride); + void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -84,3 +87,22 @@ + daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.695937103 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.715541700 +0200 +@@ -31,6 +31,8 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); + void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, +@@ -117,7 +119,7 @@ + + if (have_neon(cpu_flags) && bit_depth == 8) { + c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus; +- c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; ++ c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_daedalus; + c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; + c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + +-- +2.47.3 + diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index e5e7ace66..4deab98f5 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ -74,6 +74,7 @@ patch -Np1 -i "$HERE/0004-h264-idct8-daedalus-fourier.patch" patch -Np1 -i "$HERE/0005-h264-deblock-luma-v-daedalus-fourier.patch" patch -Np1 -i "$HERE/0006-h264-restore-low-delay.patch" patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch" +patch -Np1 -i "$HERE/0008-h264-deblock-luma-h-daedalus-fourier.patch" # --- daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so