From 68731c41d7ea68be0e912b128cb4e71fb56e8263 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Fri, 22 May 2026 12:15:16 +0200 Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-v deblock through daedalus-fourier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit H264DSPContext.v_loop_filter_luma (the non-intra, bS<4 vertical luma deblock, called per macroblock-row edge from the slice deblock loop) now dispatches through daedalus_recipe_dispatch_h264_deblock_luma_v instead of ff_h264_v_loop_filter_luma_neon. The recipe layer picks the substrate; for cycle 8 the daedalus docstring marks the kernel "CPU primary; QPU opportunistic", but the context used inside libavcodec.so is created with daedalus_ctx_create_no_qpu (a process-global pthread_once init, shared with cycles 6/7). QPU opportunism stays gated off until a follow-up adds an explicit feature flag (no implicit Vulkan init in arbitrary host processes). In the meantime, cycle 8 is a plumbing-only substitution, NEON-to-NEON via the daedalus recipe. The intra (bS=4) loop filter — c->v_loop_filter_luma_intra — stays on the in-tree NEON .S code; daedalus's daedalus_h264_deblock_meta only covers the non-intra path per its docstring. Argument mapping: FFmpeg's `int alpha`, `int beta` and `int8_t tc0[4]` → daedalus_h264_deblock_meta (int32_t alpha/beta + inline int8_t tc0[4]). pix already points to row 0 of the bottom block per FFmpeg's deblock convention, satisfying daedalus's `dst_off >= 4 * dst_stride` constraint. Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8. 
--- libavcodec/aarch64/h264_idct_daedalus.c | 36 +++++++++++++++++++---- libavcodec/aarch64/h264dsp_init_aarch64.c | 4 ++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c index cbb98af..92365fa 100644 --- a/libavcodec/aarch64/h264_idct_daedalus.c +++ b/libavcodec/aarch64/h264_idct_daedalus.c @@ -1,11 +1,14 @@ /* - * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims. + * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims. * - * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 - * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 - * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The - * recipe layer picks the substrate (CPU NEON by default for cycles - * 6 + 7; future cycles may dispatch to V3D opportunistically). + * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4 + * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8 + * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v + * instead of the in-tree ff_h264_*_neon assembly. The recipe layer + * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8 + * is CPU primary with QPU opportunistic — the ctx below is no-QPU, + * so cycle 8 stays on the CPU NEON path until a separate change + * gates QPU init on a daedalus-fourier feature flag). 
* * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's * column-major convention: block[r + N*c] = coefficient at @@ -40,6 +43,8 @@ static void daedalus_ctx_init_once(void) void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride); void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride); +void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) { @@ -60,3 +65,22 @@ void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride) daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride, block, 1, &meta); } + +void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0) +{ + daedalus_h264_deblock_meta meta = { + .dst_off = 0, + .alpha = alpha, + .beta = beta, + }; + meta.tc0[0] = tc0[0]; + meta.tc0[1] = tc0[1]; + meta.tc0[2] = tc0[2]; + meta.tc0[3] = tc0[3]; + + pthread_once(&g_dctx_once, daedalus_ctx_init_once); + + daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride, + 1, &meta); +} diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index 741e551..85ac381 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -27,6 +27,8 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, @@ -114,7 +116,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags) && bit_depth == 8) { - c->v_loop_filter_luma 
= ff_h264_v_loop_filter_luma_neon; + c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus; c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; -- 2.47.3