From babb280410b3c3f43cda0fa2c834be78bc9742ac Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 13:16:45 +0200 Subject: [PATCH] ffmpeg-v4l2-request-fourier: route H.264 chroma v/h deblock through daedalus-fourier (0009) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chroma siblings of 0005 (luma_v) and 0008 (luma_h). Same NEON-to-NEON pattern via the daedalus recipe layer: H264DSPContext.v_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_v H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h Both kernels landed in daedalus-fourier PR #10. Recipe table routes AUTO to CPU NEON (no chroma QPU shaders yet), so this is plumbing-only and stays bit-exact against the in-tree NEON. Intra chroma (bS=4) loop filters remain on in-tree NEON; daedalus_h264_deblock_meta covers the non-intra (bS<4) path. Verified the patch applies cleanly on top of 0001-0008 against the pinned upstream commit b57fbbe5 on hertz. Wires the new patch into both arch/PKGBUILD and debian/build-deb.sh. 
--- ...h264-deblock-chroma-daedalus-fourier.patch | 127 ++++++++++++++++++ arch/ffmpeg-v4l2-request-fourier/PKGBUILD | 6 +- ...h264-deblock-chroma-daedalus-fourier.patch | 127 ++++++++++++++++++ .../ffmpeg-v4l2-request-fourier/build-deb.sh | 1 + 4 files changed, 259 insertions(+), 2 deletions(-) create mode 100644 arch/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch create mode 100644 debian/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch diff --git a/arch/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch b/arch/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch new file mode 100644 index 000000000..1e0a0ca50 --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma v/h deblock through daedalus-fourier + +Chroma siblings of 0005 (luma_v) and 0008 (luma_h). Same +NEON-to-NEON pattern via the daedalus recipe layer: + + H264DSPContext.v_loop_filter_chroma → + daedalus_recipe_dispatch_h264_deblock_chroma_v + H264DSPContext.h_loop_filter_chroma → + daedalus_recipe_dispatch_h264_deblock_chroma_h + +Both kernels landed in daedalus-fourier PR #10. Recipe table +routes AUTO to CPU NEON (no chroma QPU shaders yet), so this +is plumbing-only and stays bit-exact against the in-tree NEON. + +Intra chroma (bS=4) loop filters remain on in-tree NEON; +daedalus_h264_deblock_meta covers the non-intra (bS<4) path. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 chroma. 
+--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:15:45.995368233 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:15:46.015839177 +0200 +@@ -1,10 +1,12 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h + chroma v/h deblock — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +- * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v +- * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h ++ * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v ++ * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h ++ * H264DSPContext.v_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_v ++ * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h + * instead of the in-tree ff_h264_*_neon assembly. 
The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -48,6 +50,10 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -106,3 +112,41 @@ + daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_chroma_v(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} ++ ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_chroma_h(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:15:45.996482360 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:15:46.025604910 +0200 +@@ -39,8 +39,12 @@ + int 
beta); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, +@@ -123,11 +127,11 @@ + c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; + c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + +- c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; ++ c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_daedalus; + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; + + if (chroma_format_idc <= 1) { +- c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; ++ c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus; + c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; + c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; + } else { +-- +2.47.3 + diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index 2f538563a..43442f40c 100644 --- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -95,8 +95,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0005-h264-deblock-luma-v-daedalus-fourier.patch' '0006-h264-restore-low-delay.patch' '0007-h264-qpel-mc20-daedalus-fourier.patch' - '0008-h264-deblock-luma-h-daedalus-fourier.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + 
'0008-h264-deblock-luma-h-daedalus-fourier.patch' + '0009-h264-deblock-chroma-daedalus-fourier.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -115,6 +116,7 @@ prepare() { patch -Np1 -i "${srcdir}/0006-h264-restore-low-delay.patch" patch -Np1 -i "${srcdir}/0007-h264-qpel-mc20-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0008-h264-deblock-luma-h-daedalus-fourier.patch" + patch -Np1 -i "${srcdir}/0009-h264-deblock-chroma-daedalus-fourier.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch b/debian/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch new file mode 100644 index 000000000..1e0a0ca50 --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0009-h264-deblock-chroma-daedalus-fourier.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma v/h deblock through daedalus-fourier + +Chroma siblings of 0005 (luma_v) and 0008 (luma_h). Same +NEON-to-NEON pattern via the daedalus recipe layer: + + H264DSPContext.v_loop_filter_chroma → + daedalus_recipe_dispatch_h264_deblock_chroma_v + H264DSPContext.h_loop_filter_chroma → + daedalus_recipe_dispatch_h264_deblock_chroma_h + +Both kernels landed in daedalus-fourier PR #10. Recipe table +routes AUTO to CPU NEON (no chroma QPU shaders yet), so this +is plumbing-only and stays bit-exact against the in-tree NEON. + +Intra chroma (bS=4) loop filters remain on in-tree NEON; +daedalus_h264_deblock_meta covers the non-intra (bS<4) path. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 chroma. 
+--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:15:45.995368233 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:15:46.015839177 +0200 +@@ -1,10 +1,12 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h + chroma v/h deblock — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +- * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v +- * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h ++ * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v ++ * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h ++ * H264DSPContext.v_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_v ++ * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h + * instead of the in-tree ff_h264_*_neon assembly. 
The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -48,6 +50,10 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -106,3 +112,41 @@ + daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_chroma_v(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} ++ ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ meta.tc0[0] = tc0[0]; ++ meta.tc0[1] = tc0[1]; ++ meta.tc0[2] = tc0[2]; ++ meta.tc0[3] = tc0[3]; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_chroma_h(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:15:45.996482360 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:15:46.025604910 +0200 +@@ -39,8 +39,12 @@ + int 
beta); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); ++void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, +@@ -123,11 +127,11 @@ + c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; + c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + +- c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; ++ c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_daedalus; + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; + + if (chroma_format_idc <= 1) { +- c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; ++ c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus; + c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; + c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; + } else { +-- +2.47.3 + diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index 4deab98f5..dbeb48473 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ -75,6 +75,7 @@ patch -Np1 -i "$HERE/0005-h264-deblock-luma-v-daedalus-fourier.patch" patch -Np1 -i "$HERE/0006-h264-restore-low-delay.patch" patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch" patch -Np1 -i "$HERE/0008-h264-deblock-luma-h-daedalus-fourier.patch" +patch -Np1 -i "$HERE/0009-h264-deblock-chroma-daedalus-fourier.patch" # --- 
daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so