From 1b286ddb4efaca26ec9b9e290e989fec77dc1c77 Mon Sep 17 00:00:00 2001
From: Markus Fritsche
Date: Fri, 22 May 2026 10:18:21 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 8x8 IDCT through daedalus-fourier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

H264DSPContext.idct8_add (called per 8x8 block from the High-profile
intra-8x8-DCT decode path in h264_mb.c) now dispatches through
daedalus_recipe_dispatch_h264_idct8 instead of ff_h264_idct8_add_neon.
The recipe layer picks the substrate; for cycle 7 (H.264 IDCT 8x8) the
recipe is CPU NEON, so this is effectively a NEON-to-NEON substitution
layered on top of the cycle-6 IDCT 4x4 wiring.

Same pthread_once global context, same destructive-zero semantics;
FFmpeg column-major 8x8 storage block[r + 8*c] matches daedalus's
convention.

Bulk path c->idct8_add4 (used for inter 8x8-DCT macroblocks) remains on
the in-tree NEON .S code and will be batched through
daedalus_recipe_dispatch_h264_idct8 with n_blocks>1 in a follow-up.

Bit-exact against ff_h264_idct8_add_neon (daedalus-fourier cycle 7
green).

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 7.
---
 libavcodec/aarch64/h264_idct_daedalus.c   | 29 ++++++++++++++++-------
 libavcodec/aarch64/h264dsp_init_aarch64.c |  3 ++-
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
index 538d223..cbb98af 100644
--- a/libavcodec/aarch64/h264_idct_daedalus.c
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
@@ -1,14 +1,16 @@
 /*
- * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
+ * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
  *
- * Routes H264DSPContext.idct_add through
- * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
- * The recipe layer picks the substrate (CPU NEON by default for
- * cycle 6; future cycles may dispatch to V3D opportunistically).
+ * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
+ *        H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
+ * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The
+ * recipe layer picks the substrate (CPU NEON by default for cycles
+ * 6 + 7; future cycles may dispatch to V3D opportunistically).
  *
- * FFmpeg's 4x4 block memory layout matches daedalus's column-major
- * convention: block[r + 4*c] = coefficient at (row r, col c). Both
- * sides destructively zero the block after the transform.
+ * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
+ * column-major convention: block[r + N*c] = coefficient at
+ * (row r, col c) for N ∈ {4, 8}. Both sides destructively zero the
+ * block after the transform.
  *
  * The library context is process-global and lazily initialised under
  * pthread_once. We pick the no-QPU constructor here because
@@ -37,6 +39,7 @@ static void daedalus_ctx_init_once(void)
 }
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 
 void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
 {
@@ -47,3 +50,13 @@ void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
     daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride,
                                         block, 1, &meta);
 }
+
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
+{
+    static const daedalus_h264_block_meta meta = { .dst_off = 0 };
+
+    pthread_once(&g_dctx_once, daedalus_ctx_init_once);
+
+    daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
+                                        block, 1, &meta);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b993df2..741e551 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -79,6 +79,7 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                             const uint8_t nnzc[15 * 8]);
 
 void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                              int16_t *block, int stride,
@@ -146,7 +147,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->idct_add16intra = ff_h264_idct_add16intra_neon;
         if (chroma_format_idc <= 1)
             c->idct_add8 = ff_h264_idct_add8_neon;
-        c->idct8_add = ff_h264_idct8_add_neon;
+        c->idct8_add = ff_h264_idct8_add_daedalus;
         c->idct8_dc_add = ff_h264_idct8_dc_add_neon;
         c->idct8_add4 = ff_h264_idct8_add4_neon;
     } else if (have_neon(cpu_flags) && bit_depth == 10) {
-- 
2.47.3