From d8aa3aae8d38f17c87b11c1a5f29b4762a6dea4b Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 13:39:54 +0200 Subject: [PATCH] ffmpeg-v4l2-request-fourier: route H.264 chroma DC Hadamard through daedalus-fourier (0011) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Substitutes H264DSPContext.chroma_dc_dequant_idct in the 4:2:0 / bit_depth=8 init path with a wrapper that composes the daedalus chroma DC Hadamard primitive (daedalus-fourier PR #25) with the qmul scaling FFmpeg's reference does in one fused function (h264idct_template.c::ff_h264_chroma_dc_dequant_idct). Algorithm per H.264 §8.5.11.1 / §8.5.11.2: 1. Extract 4 DCs from the scattered positions in the per-MB coefficient buffer (stride=32, xStride=16) 2. 2x2 Hadamard transform (daedalus primitive) 3. qmul scale + >> 7, write back to original positions Bit-exact against ff_h264_chroma_dc_dequant_idct_8_c. The Hadamard itself is gated by the fourier PR #23 7-case test suite (including the H·H = 4·I algebraic invariant), and the public-API parity test added in PR #25 confirms the src/ symbol matches the test ref. 4:2:2 chroma stays on the in-tree ff_h264_chroma422_dc_dequant_idct_c path — same chroma_format_idc<=1 gating shape as 0009 chroma deblock. Pin bump: _daedalus_fourier_commit / DAEDALUS_FOURIER_COMMIT bumped to b9f9ff2a (post-PR #25) so the build picks up the public daedalus_h264_chroma_dc_hadamard_2x2 symbol. Verified the patch applies cleanly on top of 0001-0010 against the pinned upstream commit b57fbbe5 on hertz. 
--- ...-chroma-dc-hadamard-daedalus-fourier.patch | 101 ++++++++++++++++++ arch/ffmpeg-v4l2-request-fourier/PKGBUILD | 8 +- ...-chroma-dc-hadamard-daedalus-fourier.patch | 101 ++++++++++++++++++ .../ffmpeg-v4l2-request-fourier/build-deb.sh | 3 +- 4 files changed, 209 insertions(+), 4 deletions(-) create mode 100644 arch/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch create mode 100644 debian/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch diff --git a/arch/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch b/arch/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch new file mode 100644 index 000000000..cf2642f20 --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch @@ -0,0 +1,101 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 13:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma DC Hadamard through daedalus-fourier + +Substitutes H264DSPContext.chroma_dc_dequant_idct in the +4:2:0 / bit_depth=8 init path with a wrapper that composes +the daedalus chroma DC Hadamard primitive (fourier PR #25) +with qmul scaling FFmpeg does in one fused function. + +Bit-exact against ff_h264_chroma_dc_dequant_idct_8_c. +Hadamard correctness gated by fourier PR #23 test suite. + +4:2:2 chroma stays on the in-tree 422 variant (same +gating shape as 0009 chroma deblock substitution). + +Requires daedalus-fourier commit b9f9ff2 or later (PR #25 +exposing the public Hadamard symbol). Pin bumps in PKGBUILD +and build-deb.sh come in the same commit. 
+--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.019491484 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.033821507 +0200 +@@ -1,5 +1,5 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims. + * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +@@ -9,6 +9,7 @@ + * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h + * H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra + * H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra ++ * H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -60,6 +61,7 @@ + int alpha, int beta); + void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -187,3 +189,32 @@ + daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++/* Composes daedalus_h264_chroma_dc_hadamard_2x2 with the qmul scaling ++ * that FFmpeg's reference does in one fused function (h264idct_template.c ++ * ff_h264_chroma_dc_dequant_idct). 
++ * ++ * The 4 DC coefficients are scattered across the per-MB coefficient ++ * buffer at offsets [r*stride + c*xStride] (stride=32, xStride=16). ++ * Extract into a contiguous int16[4], run the Hadamard, then apply ++ * the qmul scale and write back to the original positions. ++ * ++ * No daedalus ctx needed; the Hadamard is a pure stateless primitive. ++ */ ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul) ++{ ++ enum { stride = 32, xStride = 16 }; ++ int16_t dc[4]; ++ ++ dc[0] = block[stride*0 + xStride*0]; ++ dc[1] = block[stride*0 + xStride*1]; ++ dc[2] = block[stride*1 + xStride*0]; ++ dc[3] = block[stride*1 + xStride*1]; ++ ++ daedalus_h264_chroma_dc_hadamard_2x2(dc); ++ /* Multiply as unsigned so dc * qmul wraps instead of overflowing a signed int (UB). */ ++ block[stride*0 + xStride*0] = (int16_t)((int)((unsigned)dc[0] * qmul) >> 7); ++ block[stride*0 + xStride*1] = (int16_t)((int)((unsigned)dc[1] * qmul) >> 7); ++ block[stride*1 + xStride*0] = (int16_t)((int)((unsigned)dc[2] * qmul) >> 7); ++ block[stride*1 + xStride*1] = (int16_t)((int)((unsigned)dc[3] * qmul) >> 7); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.020346459 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.033909804 +0200 +@@ -41,6 +41,7 @@ + int beta); + void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, +@@ -135,6 +136,7 @@ + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; + + if (chroma_format_idc <= 1) { ++ c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus; + c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus; + c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; + 
c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; +-- +2.47.3 + diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index 242d66e94..d8e4e35a4 100644 --- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -30,7 +30,7 @@ epoch=2 # daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API # gains daedalus_recipe_dispatch_h264_qpel_mc20 + DAEDALUS_KERNEL_H264_QPEL_MC20). # Cycle 9 closes the libavcodec.so substitution arc started at cycle 6. -_daedalus_fourier_commit='209a4218bcb98b91c04f07ad61513bb04adb13ad' +_daedalus_fourier_commit='b9f9ff2a89c068aea54dcb52b543afddad28311e' # PR #25 — public chroma DC Hadamard symbol pkgdesc='FFmpeg with V4L2 Request API hwaccel (Rockchip / Allwinner stateless decode)' arch=('aarch64') url='https://github.com/Kwiboo/FFmpeg' @@ -97,8 +97,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0007-h264-qpel-mc20-daedalus-fourier.patch' '0008-h264-deblock-luma-h-daedalus-fourier.patch' '0009-h264-deblock-chroma-daedalus-fourier.patch' - '0010-h264-deblock-luma-intra-daedalus-fourier.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + '0010-h264-deblock-luma-intra-daedalus-fourier.patch' + '0011-h264-chroma-dc-hadamard-daedalus-fourier.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -119,6 +120,7 @@ prepare() { patch -Np1 -i "${srcdir}/0008-h264-deblock-luma-h-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0009-h264-deblock-chroma-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0010-h264-deblock-luma-intra-daedalus-fourier.patch" + patch -Np1 -i "${srcdir}/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch 
b/debian/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch new file mode 100644 index 000000000..cf2642f20 --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch @@ -0,0 +1,101 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: claude-noether +Date: Sun, 25 May 2026 13:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma DC Hadamard through daedalus-fourier + +Substitutes H264DSPContext.chroma_dc_dequant_idct in the +4:2:0 / bit_depth=8 init path with a wrapper that composes +the daedalus chroma DC Hadamard primitive (fourier PR #25) +with qmul scaling FFmpeg does in one fused function. + +Bit-exact against ff_h264_chroma_dc_dequant_idct_8_c. +Hadamard correctness gated by fourier PR #23 test suite. + +4:2:2 chroma stays on the in-tree 422 variant (same +gating shape as 0009 chroma deblock substitution). + +Requires daedalus-fourier commit b9f9ff2 or later (PR #25 +exposing the public Hadamard symbol). Pin bumps in PKGBUILD +and build-deb.sh come in the same commit. +--- +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.019491484 +0200 ++++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.033821507 +0200 +@@ -1,5 +1,5 @@ + /* +- * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims. ++ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims. 
+ * + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 +@@ -9,6 +9,7 @@ + * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h + * H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra + * H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra ++ * H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, +@@ -60,6 +61,7 @@ + int alpha, int beta); + void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul); + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) + { +@@ -187,3 +189,32 @@ + daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride, + 1, &meta); + } ++ ++/* Composes daedalus_h264_chroma_dc_hadamard_2x2 with the qmul scaling ++ * that FFmpeg's reference does in one fused function (h264idct_template.c ++ * ff_h264_chroma_dc_dequant_idct). ++ * ++ * The 4 DC coefficients are scattered across the per-MB coefficient ++ * buffer at offsets [r*stride + c*xStride] (stride=32, xStride=16). ++ * Extract into a contiguous int16[4], run the Hadamard, then apply ++ * the qmul scale and write back to the original positions. ++ * ++ * No daedalus ctx needed; the Hadamard is a pure stateless primitive. 
++ */ ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul) ++{ ++ enum { stride = 32, xStride = 16 }; ++ int16_t dc[4]; ++ ++ dc[0] = block[stride*0 + xStride*0]; ++ dc[1] = block[stride*0 + xStride*1]; ++ dc[2] = block[stride*1 + xStride*0]; ++ dc[3] = block[stride*1 + xStride*1]; ++ ++ daedalus_h264_chroma_dc_hadamard_2x2(dc); ++ /* Multiply as unsigned so dc * qmul wraps instead of overflowing a signed int (UB). */ ++ block[stride*0 + xStride*0] = (int16_t)((int)((unsigned)dc[0] * qmul) >> 7); ++ block[stride*0 + xStride*1] = (int16_t)((int)((unsigned)dc[1] * qmul) >> 7); ++ block[stride*1 + xStride*0] = (int16_t)((int)((unsigned)dc[2] * qmul) >> 7); ++ block[stride*1 + xStride*1] = (int16_t)((int)((unsigned)dc[3] * qmul) >> 7); ++} +diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c +--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.020346459 +0200 ++++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.033909804 +0200 +@@ -41,6 +41,7 @@ + int beta); + void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); ++void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul); + void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); + void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride, +@@ -135,6 +136,7 @@ + c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; + + if (chroma_format_idc <= 1) { ++ c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus; + c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus; + c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; + c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; +-- +2.47.3 + diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index 9566ef14c..9fb2bb1d5 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ 
-45,7 +45,7 @@ PKGREL=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution # DAEDALUS_KERNEL_H264_QPEL_MC20. Cycle 9 plumbs the last H.264 NEON # kernel through the recipe layer. Daemon-side build (debian/daedalus-v4l2) # can bump in a follow-up; this PR only changes the libavcodec.so consumer. -DAEDALUS_FOURIER_COMMIT=209a4218bcb98b91c04f07ad61513bb04adb13ad +DAEDALUS_FOURIER_COMMIT=b9f9ff2a89c068aea54dcb52b543afddad28311e # PR #25 — public chroma DC Hadamard HERE=$(dirname "$(readlink -f "$0")") @@ -77,6 +77,7 @@ patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch" patch -Np1 -i "$HERE/0008-h264-deblock-luma-h-daedalus-fourier.patch" patch -Np1 -i "$HERE/0009-h264-deblock-chroma-daedalus-fourier.patch" patch -Np1 -i "$HERE/0010-h264-deblock-luma-intra-daedalus-fourier.patch" +patch -Np1 -i "$HERE/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch" # --- daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so -- 2.47.3