diff --git a/arch/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch b/arch/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch new file mode 100644 index 000000000..7a238aa0c --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch @@ -0,0 +1,85 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Markus Fritsche +Date: Mon, 25 May 2026 21:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264: use QPU-capable daedalus ctx (bench + shows 4.30x faster on Pi 5) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Patches 0003 (IDCT 4x4) and 0007 (qpel mc20) created the libavcodec.so +process-global daedalus_ctx via daedalus_ctx_create_no_qpu(). Rationale +at the time: cycle 6/9 had only CPU NEON paths, so a QPU-capable ctx +would have meant pointless Vulkan init in every host process (firefox- +fourier, mpv-fourier, daedalus_v4l2_daemon, ...). + +Two things changed since: + + 1. Every H.264 hot-path primitive now has a V3D7 compute shader. + IDCT 4x4/8x8 (cycles 6, 7), 8 deblock variants (luma+chroma x V+H + x inter+intra), 30 qpel positions (15 put_ + 15 avg_). See + daedalus-fourier PRs #28-#35. + + 2. Dispatch overhead has been hammered down — buffer pool in + v3d_runner (daedalus-fourier task #160) plus persistent command + buffer (task #161). daedalus-fourier PR #36 bench measures the + 1080p worst-case sum on hertz (Pi 5 V3D 7.1, 30 iters x 5 warmup): + + kernel CPU ns/op QPU ns/op winner + IDCT 4x4 luma 10.79 2.47 QPU 4.36x + IDCT 8x8 luma 29.69 9.23 QPU 3.22x + Deblock luma_v 17.58 10.21 QPU 1.72x + Deblock luma_h 38.41 9.98 QPU 3.85x + qpel mc20 (8x8) 28.24 9.66 QPU 2.92x + qpel mc02 (8x8) 16.96 20.54 CPU 1.21x + qpel mc22 (8x8) 71.58 9.64 QPU 7.43x + + 1080p worst-case sum (IDCT4 + deblock luma + qpel mc22): + CPU NEON only: 5.57 ms + QPU only: 1.30 ms (CPU/QPU sum ratio = 4.30x) + +PR #10's verdict (CPU 4x faster than QPU at IDCT) is reversed. 
Switch +the substitution context to daedalus_ctx_create() in both H.264 TUs +(h264_idct_daedalus.c, h264_qpel_daedalus.c) so the recipe layer can +actually route through the now-faster QPU path. + +daedalus_ctx_create() probes for a usable Vulkan device and falls back +to no_qpu mode if unavailable, so this is safe on hosts without V3D +(x86 reauktion build runners, debian-aarch64 builders without renderD, +etc.). Hosts WITH V3D (Pi 5 deployment targets) get the speedup. + +The remaining qpel mc02 anomaly (single-axis vertical filter, 1.21x +CPU) is bench-flagged for a v2 shader follow-up; the recipe entry +stays QPU since the policy decree (2026-05-23 substrate decree) holds +and the gap is marginal. + +Refs reauktion/daedalus-fourier!36. +--- + libavcodec/aarch64/h264_idct_daedalus.c | 2 +- + libavcodec/aarch64/h264_qpel_daedalus.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c ++++ b/libavcodec/aarch64/h264_idct_daedalus.c +@@ -32,7 +32,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; + + static void daedalus_ctx_init_once(void) + { +- g_dctx = daedalus_ctx_create_no_qpu(); ++ g_dctx = daedalus_ctx_create(); + } + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride); +diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c +--- a/libavcodec/aarch64/h264_qpel_daedalus.c ++++ b/libavcodec/aarch64/h264_qpel_daedalus.c +@@ -38,7 +38,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; + + static void daedalus_ctx_init_once(void) + { +- g_dctx = daedalus_ctx_create_no_qpu(); ++ g_dctx = daedalus_ctx_create(); + } + + void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index f2e145c35..f60b3ba96 100644 
--- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -24,7 +24,7 @@ _srcname=FFmpeg _version='8.1' _commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24 pkgver=8.1.r123329.b57fbbe -pkgrel=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution (cycle 9, 2026-05-23) +pkgrel=11 # pkgrel=11 — libavcodec.so daedalus ctx flipped no_qpu → qpu-capable (PR #36 bench: QPU 4.30x, 2026-05-25) epoch=2 # daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API @@ -99,8 +99,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0009-h264-deblock-chroma-daedalus-fourier.patch' '0010-h264-deblock-luma-intra-daedalus-fourier.patch' '0011-h264-chroma-dc-hadamard-daedalus-fourier.patch' - '0012-h264-qpel-rest-daedalus-fourier.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + '0012-h264-qpel-rest-daedalus-fourier.patch' + '0013-h264-ctx-qpu-capable.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -123,6 +124,7 @@ prepare() { patch -Np1 -i "${srcdir}/0010-h264-deblock-luma-intra-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0012-h264-qpel-rest-daedalus-fourier.patch" + patch -Np1 -i "${srcdir}/0013-h264-ctx-qpu-capable.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch b/debian/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch new file mode 100644 index 000000000..7a238aa0c --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0013-h264-ctx-qpu-capable.patch @@ -0,0 +1,85 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Markus Fritsche +Date: Mon, 25 May 2026 21:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264: use 
QPU-capable daedalus ctx (bench + shows 4.30x faster on Pi 5) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Patches 0003 (IDCT 4x4) and 0007 (qpel mc20) created the libavcodec.so +process-global daedalus_ctx via daedalus_ctx_create_no_qpu(). Rationale +at the time: cycle 6/9 had only CPU NEON paths, so a QPU-capable ctx +would have meant pointless Vulkan init in every host process (firefox- +fourier, mpv-fourier, daedalus_v4l2_daemon, ...). + +Two things changed since: + + 1. Every H.264 hot-path primitive now has a V3D7 compute shader. + IDCT 4x4/8x8 (cycles 6, 7), 8 deblock variants (luma+chroma x V+H + x inter+intra), 30 qpel positions (15 put_ + 15 avg_). See + daedalus-fourier PRs #28-#35. + + 2. Dispatch overhead has been hammered down — buffer pool in + v3d_runner (daedalus-fourier task #160) plus persistent command + buffer (task #161). daedalus-fourier PR #36 bench measures the + 1080p worst-case sum on hertz (Pi 5 V3D 7.1, 30 iters x 5 warmup): + + kernel CPU ns/op QPU ns/op winner + IDCT 4x4 luma 10.79 2.47 QPU 4.36x + IDCT 8x8 luma 29.69 9.23 QPU 3.22x + Deblock luma_v 17.58 10.21 QPU 1.72x + Deblock luma_h 38.41 9.98 QPU 3.85x + qpel mc20 (8x8) 28.24 9.66 QPU 2.92x + qpel mc02 (8x8) 16.96 20.54 CPU 1.21x + qpel mc22 (8x8) 71.58 9.64 QPU 7.43x + + 1080p worst-case sum (IDCT4 + deblock luma + qpel mc22): + CPU NEON only: 5.57 ms + QPU only: 1.30 ms (CPU/QPU sum ratio = 4.30x) + +PR #10's verdict (CPU 4x faster than QPU at IDCT) is reversed. Switch +the substitution context to daedalus_ctx_create() in both H.264 TUs +(h264_idct_daedalus.c, h264_qpel_daedalus.c) so the recipe layer can +actually route through the now-faster QPU path. + +daedalus_ctx_create() probes for a usable Vulkan device and falls back +to no_qpu mode if unavailable, so this is safe on hosts without V3D +(x86 reauktion build runners, debian-aarch64 builders without renderD, +etc.). 
Hosts WITH V3D (Pi 5 deployment targets) get the speedup. + +The remaining qpel mc02 anomaly (single-axis vertical filter, 1.21x +CPU) is bench-flagged for a v2 shader follow-up; the recipe entry +stays QPU since the policy decree (2026-05-23 substrate decree) holds +and the gap is marginal. + +Refs reauktion/daedalus-fourier!36. +--- + libavcodec/aarch64/h264_idct_daedalus.c | 2 +- + libavcodec/aarch64/h264_qpel_daedalus.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c +--- a/libavcodec/aarch64/h264_idct_daedalus.c ++++ b/libavcodec/aarch64/h264_idct_daedalus.c +@@ -32,7 +32,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; + + static void daedalus_ctx_init_once(void) + { +- g_dctx = daedalus_ctx_create_no_qpu(); ++ g_dctx = daedalus_ctx_create(); + } + + void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride); +diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c +--- a/libavcodec/aarch64/h264_qpel_daedalus.c ++++ b/libavcodec/aarch64/h264_qpel_daedalus.c +@@ -38,7 +38,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; + + static void daedalus_ctx_init_once(void) + { +- g_dctx = daedalus_ctx_create_no_qpu(); ++ g_dctx = daedalus_ctx_create(); + } + + void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index a9e7131af..9e6ae42ff 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ -33,7 +33,7 @@ FFMPEG_VERSION=8.1 # epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie); # +rfourier suffix to avoid colliding with upstream/Debian rebuilds. 
PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe -PKGREL=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution +PKGREL=11 # pkgrel=11 — libavcodec.so daedalus ctx flipped no_qpu → qpu-capable (PR #36 bench: QPU 4.30x); pkgrel=10 notes follow: # (cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes # the libavcodec.so substitution sequence 6 IDCT4 / 7 IDCT8 / # 8 luma-v deblock / 9 qpel mc20). Pulls daedalus-fourier PR #2 @@ -79,6 +79,7 @@ patch -Np1 -i "$HERE/0009-h264-deblock-chroma-daedalus-fourier.patch" patch -Np1 -i "$HERE/0010-h264-deblock-luma-intra-daedalus-fourier.patch" patch -Np1 -i "$HERE/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch" patch -Np1 -i "$HERE/0012-h264-qpel-rest-daedalus-fourier.patch" +patch -Np1 -i "$HERE/0013-h264-ctx-qpu-capable.patch" # --- daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so