diff --git a/arch/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch b/arch/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch new file mode 100644 index 000000000..8a9f1b993 --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch @@ -0,0 +1,126 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:30:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma intra deblock through daedalus-fourier + +Adds the bS=4 intra-strength variants of the already-substituted +luma_v / luma_h deblock (0005, 0008). Macroblock edges with an +intra-coded MB on either side get bS=4 per H.264 §8.7.2.1; field-MB +horizontal edges and edges inside an intra MB get bS=3 instead. + + H264DSPContext.v_loop_filter_luma_intra → + daedalus_recipe_dispatch_h264_deblock_luma_v_intra + H264DSPContext.h_loop_filter_luma_intra → + daedalus_recipe_dispatch_h264_deblock_luma_h_intra + +Both kernels landed in daedalus-fourier PR #11. Recipe table +routes AUTO to CPU NEON (no intra QPU shaders yet) — plumbing- +only NEON-to-NEON via daedalus, bit-exact against the in-tree +FFmpeg NEON path. + +Signature differs from bS<4: no tc0 argument. The wrapper +passes daedalus_h264_deblock_meta with alpha/beta set; tc0[] is +ignored by the intra dispatch (bS=4 hardcodes the strength). + +Chroma intra variants are deferred to a follow-up PR because the +chroma path has a 4:2:0 / 4:2:2 split (chroma_format_idc gating) +that needs explicit conditional substitution to avoid running +the 4:2:0-only daedalus dispatch on 4:2:2 chroma. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 intra. 
+--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:18:54.992244965 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:20:12.338122217 +0200 +@@ -1,5 +1,5 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h + chroma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +@@ -7,6 +7,8 @@ + * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h + * H264DSPContext.v_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_v + * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h ++ * H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra ++ * H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra + * instead of the in-tree ff_h264_*_neon assembly. 
The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -54,6 +56,10 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -150,3 +156,34 @@ + daedalus_recipe_dispatch_h264_deblock_chroma_h(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ /* tc0[] is ignored by the intra-strength dispatch (bS=4 hardcodes the strength). 
*/ ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_v_intra(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} ++ ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:18:54.993349573 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:20:12.338265830 +0200 +@@ -35,8 +35,12 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, +@@ -124,8 +128,8 @@ + if (have_neon(cpu_flags) && bit_depth == 8) { + c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus; + c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_daedalus; +- c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; +- c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; ++ c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_daedalus; ++ c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_daedalus; + + c->v_loop_filter_chroma = 
ff_h264_v_loop_filter_chroma_daedalus; + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; +-- +2.47.3 + diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index 43442f40c..242d66e94 100644 --- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -96,8 +96,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0006-h264-restore-low-delay.patch' '0007-h264-qpel-mc20-daedalus-fourier.patch' '0008-h264-deblock-luma-h-daedalus-fourier.patch' - '0009-h264-deblock-chroma-daedalus-fourier.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + '0009-h264-deblock-chroma-daedalus-fourier.patch' + '0010-h264-deblock-luma-intra-daedalus-fourier.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -117,6 +118,7 @@ prepare() { patch -Np1 -i "${srcdir}/0007-h264-qpel-mc20-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0008-h264-deblock-luma-h-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0009-h264-deblock-chroma-daedalus-fourier.patch" + patch -Np1 -i "${srcdir}/0010-h264-deblock-luma-intra-daedalus-fourier.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch b/debian/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch new file mode 100644 index 000000000..8a9f1b993 --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0010-h264-deblock-luma-intra-daedalus-fourier.patch @@ -0,0 +1,126 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 12:30:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma intra deblock through daedalus-fourier + +Adds the bS=4 intra-strength variants of the already-substituted +luma_v / luma_h deblock (0005, 0008). 
Macroblock edges with an +intra-coded MB on either side get bS=4 per H.264 §8.7.2.1; field-MB +horizontal edges and edges inside an intra MB get bS=3 instead. + + H264DSPContext.v_loop_filter_luma_intra → + daedalus_recipe_dispatch_h264_deblock_luma_v_intra + H264DSPContext.h_loop_filter_luma_intra → + daedalus_recipe_dispatch_h264_deblock_luma_h_intra + +Both kernels landed in daedalus-fourier PR #11. Recipe table +routes AUTO to CPU NEON (no intra QPU shaders yet) — plumbing- +only NEON-to-NEON via daedalus, bit-exact against the in-tree +FFmpeg NEON path. + +Signature differs from bS<4: no tc0 argument. The wrapper +passes daedalus_h264_deblock_meta with alpha/beta set; tc0[] is +ignored by the intra dispatch (bS=4 hardcodes the strength). + +Chroma intra variants are deferred to a follow-up PR because the +chroma path has a 4:2:0 / 4:2:2 split (chroma_format_idc gating) +that needs explicit conditional substitution to avoid running +the 4:2:0-only daedalus dispatch on 4:2:2 chroma. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 intra. +--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:18:54.992244965 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:20:12.338122217 +0200 +@@ -1,5 +1,5 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h + chroma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims. 
+ * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +@@ -7,6 +7,8 @@ + * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h + * H264DSPContext.v_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_v + * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h ++ * H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra ++ * H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -54,6 +56,10 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_h_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -150,3 +156,34 @@ + daedalus_recipe_dispatch_h264_deblock_chroma_h(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ /* tc0[] is ignored by the intra-strength dispatch (bS=4 hardcodes the strength). 
*/ ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_v_intra(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} ++ ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta) ++{ ++ daedalus_h264_deblock_meta meta = { ++ .dst_off = 0, ++ .alpha = alpha, ++ .beta = beta, ++ }; ++ ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ ++ daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:18:54.993349573 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:20:12.338265830 +0200 +@@ -35,8 +35,12 @@ + int alpha, int beta, int8_t *tc0); + void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); ++void ff_h264_v_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); ++void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, ++ int alpha, int beta); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, +@@ -124,8 +128,8 @@ + if (have_neon(cpu_flags) && bit_depth == 8) { + c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus; + c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_daedalus; +- c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; +- c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; ++ c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_daedalus; ++ c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_daedalus; + + c->v_loop_filter_chroma = 
ff_h264_v_loop_filter_chroma_daedalus; + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; +-- +2.47.3 + diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index dbeb48473..9566ef14c 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ -76,6 +76,7 @@ patch -Np1 -i "$HERE/0006-h264-restore-low-delay.patch" patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch" patch -Np1 -i "$HERE/0008-h264-deblock-luma-h-daedalus-fourier.patch" patch -Np1 -i "$HERE/0009-h264-deblock-chroma-daedalus-fourier.patch" +patch -Np1 -i "$HERE/0010-h264-deblock-luma-intra-daedalus-fourier.patch" # --- daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so