From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 13:00:00 +0200 Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma DC Hadamard through daedalus-fourier Substitutes H264DSPContext.chroma_dc_dequant_idct in the 4:2:0 / bit_depth=8 init path with a wrapper that composes the daedalus chroma DC Hadamard primitive (fourier PR #25) with the qmul scaling that FFmpeg performs in the same fused function. Bit-exact against ff_h264_chroma_dc_dequant_idct_8_c. Hadamard correctness is gated by the fourier PR #23 test suite. 4:2:2 chroma stays on the in-tree 422 variant (same gating shape as the 0009 chroma deblock substitution). Requires daedalus-fourier commit b9f9ff2 or later (PR #25 exposing the public Hadamard symbol). Pin bumps in PKGBUILD and build-deb.sh come in the same commit. --- diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c --- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.019491484 +0200 +++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:38:32.033821507 +0200 @@ -1,5 +1,5 @@ /* - * H.264 4x4 / 8x8 IDCT + luma v/h (inter + intra) + chroma v/h deblock — daedalus-fourier substitution shims. + * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims. * * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 @@ -9,6 +9,7 @@ * H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h * H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra * H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra + * H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul * instead of the in-tree ff_h264_*_neon assembly. 
The recipe layer * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 * is CPU primary with QPU opportunistic — the ctx below is no-QPU, @@ -60,6 +61,7 @@ int alpha, int beta); void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); +void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul); void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) { @@ -187,3 +189,32 @@ daedalus_recipe_dispatch_h264_deblock_luma_h_intra(g_dctx, pix, (size_t)stride, 1, &meta); } + +/* Composes daedalus_h264_chroma_dc_hadamard_2x2 with the qmul scaling + * that FFmpeg's reference does in one fused function (h264idct_template.c + * ff_h264_chroma_dc_dequant_idct). + * + * The 4 DC coefficients are scattered across the per-MB coefficient + * buffer at offsets [r*stride + c*xStride] (stride=32, xStride=16). + * Extract into a contiguous int16[4], run the Hadamard, then apply + * the qmul scale and write back to the original positions. + * + * No daedalus ctx needed; the Hadamard is a pure stateless primitive. 
+ */
+void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul)
+{
+    enum { stride = 32, xStride = 16 };
+    static const uint8_t pos[4] = { 0, xStride, stride, stride + xStride };
+    int16_t dc[4];
+    int i;
+
+    for (i = 0; i < 4; i++)
+        dc[i] = block[pos[i]];
+
+    daedalus_h264_chroma_dc_hadamard_2x2(dc);
+
+    /* Multiply in unsigned: dc[i] * qmul can exceed INT_MAX for large qmul,
+     * and signed overflow is undefined; the low 32 bits are unchanged. */
+    for (i = 0; i < 4; i++)
+        block[pos[i]] = (int16_t)((int)((unsigned)dc[i] * (unsigned)qmul) >> 7);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.020346459 +0200
+++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:38:32.033909804 +0200
@@ -41,6 +41,7 @@
                                                int beta);
 void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta);
+void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
 void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
                                        int beta, int8_t *tc0);
 void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride,
@@ -135,6 +136,7 @@
         c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
 
         if (chroma_format_idc <= 1) {
+            c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus;
             c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus;
             c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
             c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
-- 
2.47.3