From f760c0541586f43334c02611fcb4c212c08ad576 Mon Sep 17 00:00:00 2001
From: Markus Fritsche
Date: Thu, 21 May 2026 21:40:22 +0200
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 4x4 IDCT through
 daedalus-fourier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

H264DSPContext.idct_add (called per 4x4 block from the intra-4x4 decode
path in h264_mb.c) now dispatches through
daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
The recipe layer picks the substrate; for cycle 6 (H.264 IDCT 4x4) the
recipe is CPU NEON, so this is effectively a NEON-to-NEON substitution
with one extra dispatch call and recipe-table lookup.

Provides the first end-to-end exercise of the daedalus-fourier kernel
pack inside the libavcodec.so decode hot path; follow-up patches wire
IDCT 8x8, luma-v deblock, and qpel mc20.

The library context is process-global, lazily initialised under
pthread_once on first call. We pick the no-QPU constructor because
libavcodec.so is loaded into arbitrary host processes (firefox-fourier,
mpv-fourier, daedalus_v4l2_daemon, ...) and we cannot assume the host
has a usable Vulkan instance. Higher cycles (deblock luma-v, MC) that
benefit from the QPU will provision their own recipe-selected context
once that path is wired.

Bulk paths (idct_add16, idct_add16intra, idct_add8 — used for
non-intra4x4 macroblocks) remain on the stock NEON .S implementations
and will be batched through daedalus_recipe_dispatch_h264_idct4 with
n_blocks>1 in a follow-up.

Bit-exact against ff_h264_idct_add_neon (daedalus-fourier cycle 6
green; see marfrit/daedalus-fourier/CYCLE_LOGS.md).

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2.

---
 libavcodec/aarch64/Makefile               |  3 +-
 libavcodec/aarch64/h264_idct_daedalus.c   | 76 +++++++++++++++++++++++
 libavcodec/aarch64/h264dsp_init_aarch64.c |  3 +-
 3 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/aarch64/h264_idct_daedalus.c

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 41ab025..7b95fb1 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -3,7 +3,8 @@ OBJS-$(CONFIG_AC3DSP) += aarch64/ac3dsp_init_aarch64.o
 OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_init_aarch64.o
 OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
 OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
+OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \
+                          aarch64/h264_idct_daedalus.o
 OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o
 OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
new file mode 100644
index 0000000..538d223
--- /dev/null
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
@@ -0,0 +1,76 @@
+/*
+ * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
+ *
+ * Routes H264DSPContext.idct_add through
+ * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
+ * The recipe layer picks the substrate (CPU NEON by default for
+ * cycle 6; future cycles may dispatch to V3D opportunistically).
+ *
+ * FFmpeg's 4x4 block memory layout matches daedalus's column-major
+ * convention: block[r + 4*c] = coefficient at (row r, col c). Both
+ * sides destructively zero the block after the transform.
+ *
+ * The library context is process-global and lazily initialised under
+ * pthread_once. We pick the no-QPU constructor here because
+ * libavcodec.so is loaded into arbitrary host processes
+ * (firefox-fourier, mpv-fourier, daedalus_v4l2_daemon, ...) and we
+ * cannot assume the host has a usable Vulkan instance. Higher cycles
+ * (deblock, MC) that benefit from the QPU initialise their own
+ * recipe-selected context once that path is wired.
+ *
+ * If no context is available (the constructor returned NULL), calls
+ * fall back to ff_h264_idct_add_neon so decoding keeps working.
+ */
+
+#include <pthread.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* NOTE(review): the header name was lost in extraction; <daedalus.h> is a
+ * guess — confirm against daedalus-fourier. */
+#include <daedalus.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/h264dsp.h"
+
+/* Process-wide context: NULL until first use, and still NULL afterwards if
+ * daedalus_ctx_create_no_qpu() returned NULL. */
+static daedalus_ctx *g_dctx;
+static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
+
+/* Stock NEON kernel, used when no daedalus context is available. */
+void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
+
+/* Runs exactly once via pthread_once; creates the no-QPU context. */
+static av_cold void daedalus_ctx_init_once(void)
+{
+    g_dctx = daedalus_ctx_create_no_qpu();
+}
+
+/**
+ * 4x4 IDCT + add for a single block, dispatched through daedalus.
+ *
+ * @param dst    destination pixels the transformed block is added to
+ * @param block  coefficients of the 4x4 block
+ * @param stride row stride of dst, as passed by H264DSPContext.idct_add
+ */
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
+{
+    /* Single-block dispatch: destination offset 0. */
+    static const daedalus_h264_block_meta meta = { .dst_off = 0 };
+
+    pthread_once(&g_dctx_once, daedalus_ctx_init_once);
+
+    /* Context creation failed: stay on the stock kernel instead of
+     * handing a NULL context to the dispatcher. */
+    if (!g_dctx) {
+        ff_h264_idct_add_neon(dst, block, stride);
+        return;
+    }
+
+    /* NOTE(review): (size_t)stride wraps for a negative stride — confirm
+     * this path only ever sees positive strides. */
+    daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride,
+                                        block, 1, &meta);
+}
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index c684574..b993df2 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -66,6 +66,7 @@ void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride
                                     int weights, int offset);
 
 void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
 void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
                              int16_t *block, int stride,
@@ -139,7 +140,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->biweight_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
         c->biweight_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
 
-        c->idct_add = ff_h264_idct_add_neon;
+        c->idct_add = ff_h264_idct_add_daedalus;
         c->idct_dc_add = ff_h264_idct_dc_add_neon;
         c->idct_add16 = ff_h264_idct_add16_neon;
         c->idct_add16intra = ff_h264_idct_add16intra_neon;
-- 
2.47.3