e641d679d3
First cycle of the libavcodec.so substitution arc (reauktion/daedalus-v4l2#11 step 2). H264DSPContext.idct_add — called per 4×4 block from the intra-4×4 decode path in libavcodec/h264_mb.c — now dispatches through daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon. ## What - Add 0003-h264-idct4-daedalus-fourier.patch (in both arch/ and debian/ ffmpeg-v4l2-request-fourier/). Creates libavcodec/aarch64/h264_idct_daedalus.c (ff_h264_idct_add_daedalus shim + lazy pthread_once context init via daedalus_ctx_create_no_qpu), patches libavcodec/aarch64/h264dsp_init_aarch64.c to wire c->idct_add to the shim, adds the new .o to libavcodec/aarch64/Makefile. - arch/PKGBUILD + debian/build-deb.sh: fetch + build daedalus-fourier (pinned at d87239d — lockstep with the daedalus-v4l2 daemon's inline build) with -DCMAKE_POSITION_INDEPENDENT_CODE=ON into a per-build temp prefix, then pass --extra-cflags=-I.../include --extra-ldflags=-L.../lib --extra-libs="-ldaedalus_core -lvulkan -lpthread" to FFmpeg configure. daedalus_core.a is static-linked into libavcodec.so.62. - debian/control Depends gains libvulkan1 (daedalus_core PUBLIC-links Vulkan::Vulkan for the queryable QPU substrate; the no-QPU constructor still works at runtime but the loader needs libvulkan.so.1 present to dlopen libavcodec.so.62). - arch/PKGBUILD depends gains vulkan-icd-loader, makedepends gains cmake / ninja / vulkan-headers. ## Why The recipe layer picks the substrate; for cycle 6 (H.264 IDCT 4×4) the recipe is CPU NEON, so this is effectively a NEON-to-NEON substitution with one extra dispatch call and recipe-table lookup. The point of this first cycle isn't perf wins — it's plumbing. Once the path is wired and stable, follow-up patches batch through the bulk paths (idct_add16 / idct_add16intra / idct_add8) and stack cycles 7/8/9 (IDCT 8×8, luma-v deblock, qpel mc20). 
Bit-exact against ff_h264_idct_add_neon (daedalus-fourier cycle 6 green; FFmpeg's 4×4 block storage matches daedalus's column-major convention). ## Scope NOT covered - Bulk paths (idct_add16 / idct_add16intra / idct_add8) — most IDCT 4×4 calls in real H.264 streams go through these, not the per-block c->idct_add path; intra-4×4-only macroblocks are a minority. Batched substitution lands in a follow-up. - High-bit-depth (10-bit) path — not touched; 8-bit only. - Cycles 7/8/9 — separate PRs. ## SONAME Unchanged. libavcodec.so.62 / libavformat.so.62 / libavutil.so.60. No daedalus-v4l2-dkms or daedalus-v4l2 bump required. ## Refs - reauktion/daedalus-v4l2 issue #11 (substitution arc): reauktion/daedalus-v4l2#11 - marfrit/daedalus-fourier cycle 6 close (H.264 IDCT 4×4 NEON green)
138 lines
6.1 KiB
Diff
138 lines
6.1 KiB
Diff
From f760c0541586f43334c02611fcb4c212c08ad576 Mon Sep 17 00:00:00 2001
|
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
|
Date: Thu, 21 May 2026 21:40:22 +0200
|
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 4x4 IDCT through
|
|
daedalus-fourier
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
H264DSPContext.idct_add (called per 4x4 block from the intra-4x4
|
|
decode path in h264_mb.c) now dispatches through
|
|
daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
|
|
|
|
The recipe layer picks the substrate; for cycle 6 (H.264 IDCT 4x4)
|
|
the recipe is CPU NEON, so this is effectively a NEON-to-NEON
|
|
substitution with one extra dispatch call and recipe-table lookup.
|
|
Provides the first end-to-end exercise of the daedalus-fourier
|
|
kernel pack inside the libavcodec.so decode hot path; follow-up
|
|
patches wire IDCT 8x8, luma-v deblock, and qpel mc20.
|
|
|
|
The library context is process-global, lazily initialised under
|
|
pthread_once on first call. We pick the no-QPU constructor because
|
|
libavcodec.so is loaded into arbitrary host processes
|
|
(firefox-fourier, mpv-fourier, daedalus_v4l2_daemon, ...) and we
|
|
cannot assume the host has a usable Vulkan instance. Higher cycles
|
|
(deblock luma-v, MC) that benefit from the QPU will provision their
|
|
own recipe-selected context once that path is wired.
|
|
|
|
Bulk paths (idct_add16, idct_add16intra, idct_add8 — used for
|
|
non-intra4x4 macroblocks) remain on the stock NEON .S implementations
|
|
and will be batched through daedalus_recipe_dispatch_h264_idct4 with
|
|
n_blocks>1 in a follow-up.
|
|
|
|
Bit-exact against ff_h264_idct_add_neon (daedalus-fourier cycle 6
|
|
green; see marfrit/daedalus-fourier/CYCLE_LOGS.md).
|
|
|
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2.
|
|
---
|
|
libavcodec/aarch64/Makefile | 3 +-
|
|
libavcodec/aarch64/h264_idct_daedalus.c | 49 +++++++++++++++++++++++
|
|
libavcodec/aarch64/h264dsp_init_aarch64.c | 3 +-
|
|
3 files changed, 53 insertions(+), 2 deletions(-)
|
|
create mode 100644 libavcodec/aarch64/h264_idct_daedalus.c
|
|
|
|
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
|
|
index 41ab025..7b95fb1 100644
|
|
--- a/libavcodec/aarch64/Makefile
|
|
+++ b/libavcodec/aarch64/Makefile
|
|
@@ -3,7 +3,8 @@ OBJS-$(CONFIG_AC3DSP) += aarch64/ac3dsp_init_aarch64.o
|
|
OBJS-$(CONFIG_FDCTDSP) += aarch64/fdctdsp_init_aarch64.o
|
|
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
|
|
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
|
|
-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
|
|
+OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \
|
|
+ aarch64/h264_idct_daedalus.o
|
|
OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o
|
|
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
|
|
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
|
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
|
new file mode 100644
|
|
index 0000000..538d223
|
|
--- /dev/null
|
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
|
@@ -0,0 +1,49 @@
|
|
+/*
|
|
+ * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
|
|
+ *
|
|
+ * Routes H264DSPContext.idct_add through
|
|
+ * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
|
|
+ * The recipe layer picks the substrate (CPU NEON by default for
|
|
+ * cycle 6; future cycles may dispatch to V3D opportunistically).
|
|
+ *
|
|
+ * FFmpeg's 4x4 block memory layout matches daedalus's column-major
|
|
+ * convention: block[r + 4*c] = coefficient at (row r, col c). Both
|
|
+ * sides destructively zero the block after the transform.
|
|
+ *
|
|
+ * The library context is process-global and lazily initialised under
|
|
+ * pthread_once. We pick the no-QPU constructor here because
|
|
+ * libavcodec.so is loaded into arbitrary host processes
|
|
+ * (firefox-fourier, mpv-fourier, daedalus_v4l2_daemon, ...) and we
|
|
+ * cannot assume the host has a usable Vulkan instance. Higher cycles
|
|
+ * (deblock, MC) that benefit from the QPU will initialise their own
|
|
+ * recipe-selected context once that path is wired.
|
|
+ */
|
|
+
|
|
+#include <pthread.h>
|
|
+#include <stddef.h>
|
|
+#include <stdint.h>
|
|
+
|
|
+#include <daedalus.h>
|
|
+
|
|
+#include "libavutil/attributes.h"
|
|
+#include "libavcodec/h264dsp.h"
|
|
+
|
|
+static daedalus_ctx *g_dctx; /* process-global context; assigned only in daedalus_ctx_init_once() */
|
|
+static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; /* guards the one-time creation of g_dctx */
|
|
+
|
|
+static void daedalus_ctx_init_once(void) /* pthread_once init routine: creates g_dctx via the no-QPU constructor */
|
|
+{
|
|
+ g_dctx = daedalus_ctx_create_no_qpu(); /* NOTE(review): result is not checked here — confirm the dispatch call tolerates a failed (NULL?) context */
|
|
+}
|
|
+
|
|
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride); /* prototype precedes the definition; h264dsp_init_aarch64.c carries its own copy */
|
|
+
|
|
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride) /* H264DSPContext.idct_add shim: one 4x4 block per call */
|
|
+{
|
|
+ static const daedalus_h264_block_meta meta = { .dst_off = 0 }; /* presumably an offset relative to dst — confirm against daedalus.h */
|
|
+
|
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once); /* lazily create the shared context on first use */
|
|
+
|
|
+ daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride, /* NOTE(review): int -> size_t cast; a negative stride would wrap — confirm callers never pass one */
|
|
+ block, 1, &meta); /* 1 block, described by the single meta entry */
|
|
+}
|
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
index c684574..b993df2 100644
|
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
@@ -66,6 +66,7 @@ void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride
|
|
int weights, int offset);
|
|
|
|
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
+void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
|
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
|
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
|
int16_t *block, int stride,
|
|
@@ -139,7 +140,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
|
c->biweight_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
|
|
c->biweight_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
|
|
|
|
- c->idct_add = ff_h264_idct_add_neon;
|
|
+ c->idct_add = ff_h264_idct_add_daedalus;
|
|
c->idct_dc_add = ff_h264_idct_dc_add_neon;
|
|
c->idct_add16 = ff_h264_idct_add16_neon;
|
|
c->idct_add16intra = ff_h264_idct_add16intra_neon;
|
|
--
|
|
2.47.3
|
|
|