forked from marfrit/marfrit-packages
Compare commits
4 Commits
6047c04f7f
...
9c70ffffe7
| Author | SHA1 | Date | |
|---|---|---|---|
| 9c70ffffe7 | |||
| 520f2fce33 | |||
| 875156782e | |||
| 8f9487d355 |
+120
@@ -0,0 +1,120 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: claude-noether <claude-noether@noreply.localhost>
|
||||||
|
Date: Mon, 25 May 2026 14:30:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma intra deblock (4:2:0) through daedalus-fourier
|
||||||
|
|
||||||
|
Substitutes c->v_loop_filter_chroma_intra and c->h_loop_filter_chroma_intra
|
||||||
|
with daedalus wrappers in the bit_depth=8 / chroma_format_idc<=1 (4:2:0)
|
||||||
|
branch. 4:2:2 stays on the in-tree NEON path (the daedalus chroma intra
|
||||||
|
dispatch is 4:2:0-only).
|
||||||
|
|
||||||
|
The fourier dispatches were exposed in PR #11 (DEFINE_INTRA_DISPATCH
|
||||||
|
macro generates the public daedalus_dispatch_h264_deblock_chroma_*_intra
|
||||||
|
symbols + recipe wrappers).
|
||||||
|
|
||||||
|
Re-architects the chroma init: v_loop_filter_chroma_intra was previously
|
||||||
|
assigned unconditionally to the NEON variant (which works for both 4:2:0
|
||||||
|
and 4:2:2). We now assign it INSIDE both branches of the chroma_format_idc
|
||||||
|
conditional, with the 4:2:0 branch picking daedalus and the 4:2:2 branch
|
||||||
|
keeping NEON. No regression for 4:2:2 streams.
|
||||||
|
|
||||||
|
Same NEON-to-NEON via recipe shape as 0010 luma intra.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc chroma intra.
|
||||||
|
---
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.267156263 +0200
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.287745931 +0200
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h (inter+intra) deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
* Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
* H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
@@ -9,6 +9,8 @@
|
||||||
|
* H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h
|
||||||
|
* H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra
|
||||||
|
* H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra
|
||||||
|
+ * H264DSPContext.v_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_v_intra
|
||||||
|
+ * H264DSPContext.h_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_h_intra
|
||||||
|
* H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul
|
||||||
|
* instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
||||||
|
* picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
||||||
|
@@ -61,6 +63,10 @@
|
||||||
|
int alpha, int beta);
|
||||||
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
@@ -218,3 +224,30 @@
|
||||||
|
block[stride*1 + xStride*0] = (int16_t)((int)dc[2] * qmul >> 7);
|
||||||
|
block[stride*1 + xStride*1] = (int16_t)((int)dc[3] * qmul >> 7);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ /* tc0[] unused for intra (bS=4 hardcodes the strength). */
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.268311057 +0200
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.287886563 +0200
|
||||||
|
@@ -42,6 +42,10 @@
|
||||||
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
@@ -133,14 +137,15 @@
|
||||||
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_daedalus;
|
||||||
|
|
||||||
|
c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_daedalus;
|
||||||
|
- c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
||||||
|
|
||||||
|
if (chroma_format_idc <= 1) {
|
||||||
|
c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus;
|
||||||
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_daedalus;
|
||||||
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus;
|
||||||
|
- c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
|
||||||
|
+ c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_daedalus;
|
||||||
|
c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
|
||||||
|
} else {
|
||||||
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
||||||
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
|
||||||
|
c->h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon;
|
||||||
|
c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Mon, 25 May 2026 21:00:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264: use QPU-capable daedalus ctx (bench
|
||||||
|
shows 4.30x faster on Pi 5)
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Patches 0003 (IDCT 4x4) and 0007 (qpel mc20) created the libavcodec.so
|
||||||
|
process-global daedalus_ctx via daedalus_ctx_create_no_qpu(). Rationale
|
||||||
|
at the time: cycle 6/9 had only CPU NEON paths, so a QPU-capable ctx
|
||||||
|
would have meant pointless Vulkan init in every host process (firefox-
|
||||||
|
fourier, mpv-fourier, daedalus_v4l2_daemon, ...).
|
||||||
|
|
||||||
|
Two things changed since:
|
||||||
|
|
||||||
|
1. Every H.264 hot-path primitive now has a V3D7 compute shader.
|
||||||
|
IDCT 4x4/8x8 (cycles 6, 7), 8 deblock variants (luma+chroma x V+H
|
||||||
|
x inter+intra), 30 qpel positions (15 put_ + 15 avg_). See
|
||||||
|
daedalus-fourier PRs #28-#35.
|
||||||
|
|
||||||
|
2. Dispatch overhead has been hammered down — buffer pool in
|
||||||
|
v3d_runner (daedalus-fourier task #160) plus persistent command
|
||||||
|
buffer (task #161). daedalus-fourier PR #36 bench measures the
|
||||||
|
1080p worst-case sum on hertz (Pi 5 V3D 7.1, 30 iters x 5 warmup):
|
||||||
|
|
||||||
|
kernel CPU ns/op QPU ns/op winner
|
||||||
|
IDCT 4x4 luma 10.79 2.47 QPU 4.36x
|
||||||
|
IDCT 8x8 luma 29.69 9.23 QPU 3.22x
|
||||||
|
Deblock luma_v 17.58 10.21 QPU 1.72x
|
||||||
|
Deblock luma_h 38.41 9.98 QPU 3.85x
|
||||||
|
qpel mc20 (8x8) 28.24 9.66 QPU 2.92x
|
||||||
|
qpel mc02 (8x8) 16.96 20.54 CPU 1.21x
|
||||||
|
qpel mc22 (8x8) 71.58 9.64 QPU 7.43x
|
||||||
|
|
||||||
|
1080p worst-case sum (IDCT4 + deblock luma + qpel mc22):
|
||||||
|
CPU NEON only: 5.57 ms
|
||||||
|
QPU only: 1.30 ms (CPU/QPU sum ratio = 4.30x)
|
||||||
|
|
||||||
|
PR #10's verdict (CPU 4x faster than QPU at IDCT) is reversed. Switch
|
||||||
|
the substitution context to daedalus_ctx_create() in both H.264 TUs
|
||||||
|
(h264_idct_daedalus.c, h264_qpel_daedalus.c) so the recipe layer can
|
||||||
|
actually route through the now-faster QPU path.
|
||||||
|
|
||||||
|
daedalus_ctx_create() probes for a usable Vulkan device and falls back
|
||||||
|
to no_qpu mode if unavailable, so this is safe on hosts without V3D
|
||||||
|
(x86 reauktion build runners, debian-aarch64 builders without renderD,
|
||||||
|
etc.). Hosts WITH V3D (Pi 5 deployment targets) get the speedup.
|
||||||
|
|
||||||
|
The remaining qpel mc02 anomaly (single-axis vertical filter, 1.21x
|
||||||
|
CPU) is bench-flagged for a v2 shader follow-up; the recipe entry
|
||||||
|
stays QPU since the policy decree (2026-05-23 substrate decree) holds
|
||||||
|
and the gap is marginal.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-fourier!36.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 2 +-
|
||||||
|
libavcodec/aarch64/h264_qpel_daedalus.c | 2 +-
|
||||||
|
2 files changed, 2 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -32,7 +32,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
|
||||||
|
static void daedalus_ctx_init_once(void)
|
||||||
|
{
|
||||||
|
- g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+ g_dctx = daedalus_ctx_create();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
@@ -38,7 +38,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
|
||||||
|
static void daedalus_ctx_init_once(void)
|
||||||
|
{
|
||||||
|
- g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+ g_dctx = daedalus_ctx_create();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
@@ -24,7 +24,7 @@ _srcname=FFmpeg
|
|||||||
_version='8.1'
|
_version='8.1'
|
||||||
_commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24
|
_commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24
|
||||||
pkgver=8.1.r123329.b57fbbe
|
pkgver=8.1.r123329.b57fbbe
|
||||||
pkgrel=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution (cycle 9, 2026-05-23)
|
pkgrel=11 # pkgrel=11 — H.264 chroma intra deblock (4:2:0) daedalus-fourier substitution (0013) + libavcodec.so daedalus ctx flipped no_qpu → qpu-capable (0014; PR #36 bench: QPU 4.30x on 1080p hot-path sum, 2026-05-25)
|
||||||
epoch=2
|
epoch=2
|
||||||
|
|
||||||
# daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API
|
# daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API
|
||||||
@@ -99,8 +99,10 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}"
|
|||||||
'0009-h264-deblock-chroma-daedalus-fourier.patch'
|
'0009-h264-deblock-chroma-daedalus-fourier.patch'
|
||||||
'0010-h264-deblock-luma-intra-daedalus-fourier.patch'
|
'0010-h264-deblock-luma-intra-daedalus-fourier.patch'
|
||||||
'0011-h264-chroma-dc-hadamard-daedalus-fourier.patch'
|
'0011-h264-chroma-dc-hadamard-daedalus-fourier.patch'
|
||||||
'0012-h264-qpel-rest-daedalus-fourier.patch')
|
'0012-h264-qpel-rest-daedalus-fourier.patch'
|
||||||
sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP')
|
'0013-h264-deblock-chroma-intra-daedalus-fourier.patch'
|
||||||
|
'0014-h264-ctx-qpu-capable.patch')
|
||||||
|
sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP')
|
||||||
|
|
||||||
pkgver() {
|
pkgver() {
|
||||||
cd "${_srcname}"
|
cd "${_srcname}"
|
||||||
@@ -123,6 +125,8 @@ prepare() {
|
|||||||
patch -Np1 -i "${srcdir}/0010-h264-deblock-luma-intra-daedalus-fourier.patch"
|
patch -Np1 -i "${srcdir}/0010-h264-deblock-luma-intra-daedalus-fourier.patch"
|
||||||
patch -Np1 -i "${srcdir}/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch"
|
patch -Np1 -i "${srcdir}/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch"
|
||||||
patch -Np1 -i "${srcdir}/0012-h264-qpel-rest-daedalus-fourier.patch"
|
patch -Np1 -i "${srcdir}/0012-h264-qpel-rest-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0013-h264-deblock-chroma-intra-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0014-h264-ctx-qpu-capable.patch"
|
||||||
}
|
}
|
||||||
|
|
||||||
build() {
|
build() {
|
||||||
|
|||||||
+120
@@ -0,0 +1,120 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: claude-noether <claude-noether@noreply.localhost>
|
||||||
|
Date: Mon, 25 May 2026 14:30:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma intra deblock (4:2:0) through daedalus-fourier
|
||||||
|
|
||||||
|
Substitutes c->v_loop_filter_chroma_intra and c->h_loop_filter_chroma_intra
|
||||||
|
with daedalus wrappers in the bit_depth=8 / chroma_format_idc<=1 (4:2:0)
|
||||||
|
branch. 4:2:2 stays on the in-tree NEON path (the daedalus chroma intra
|
||||||
|
dispatch is 4:2:0-only).
|
||||||
|
|
||||||
|
The fourier dispatches were exposed in PR #11 (DEFINE_INTRA_DISPATCH
|
||||||
|
macro generates the public daedalus_dispatch_h264_deblock_chroma_*_intra
|
||||||
|
symbols + recipe wrappers).
|
||||||
|
|
||||||
|
Re-architects the chroma init: v_loop_filter_chroma_intra was previously
|
||||||
|
assigned unconditionally to the NEON variant (which works for both 4:2:0
|
||||||
|
and 4:2:2). We now assign it INSIDE both branches of the chroma_format_idc
|
||||||
|
conditional, with the 4:2:0 branch picking daedalus and the 4:2:2 branch
|
||||||
|
keeping NEON. No regression for 4:2:2 streams.
|
||||||
|
|
||||||
|
Same NEON-to-NEON via recipe shape as 0010 luma intra.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc chroma intra.
|
||||||
|
---
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.267156263 +0200
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.287745931 +0200
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h (inter+intra) deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
* Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
* H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
@@ -9,6 +9,8 @@
|
||||||
|
* H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h
|
||||||
|
* H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra
|
||||||
|
* H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra
|
||||||
|
+ * H264DSPContext.v_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_v_intra
|
||||||
|
+ * H264DSPContext.h_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_h_intra
|
||||||
|
* H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul
|
||||||
|
* instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
||||||
|
* picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
||||||
|
@@ -61,6 +63,10 @@
|
||||||
|
int alpha, int beta);
|
||||||
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
@@ -218,3 +224,30 @@
|
||||||
|
block[stride*1 + xStride*0] = (int16_t)((int)dc[2] * qmul >> 7);
|
||||||
|
block[stride*1 + xStride*1] = (int16_t)((int)dc[3] * qmul >> 7);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ /* tc0[] unused for intra (bS=4 hardcodes the strength). */
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.268311057 +0200
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.287886563 +0200
|
||||||
|
@@ -42,6 +42,10 @@
|
||||||
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
||||||
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta);
|
||||||
|
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
@@ -133,14 +137,15 @@
|
||||||
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_daedalus;
|
||||||
|
|
||||||
|
c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_daedalus;
|
||||||
|
- c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
||||||
|
|
||||||
|
if (chroma_format_idc <= 1) {
|
||||||
|
c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus;
|
||||||
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_daedalus;
|
||||||
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus;
|
||||||
|
- c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
|
||||||
|
+ c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_daedalus;
|
||||||
|
c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
|
||||||
|
} else {
|
||||||
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
||||||
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
|
||||||
|
c->h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon;
|
||||||
|
c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Mon, 25 May 2026 21:00:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264: use QPU-capable daedalus ctx (bench
|
||||||
|
shows 4.30x faster on Pi 5)
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Patches 0003 (IDCT 4x4) and 0007 (qpel mc20) created the libavcodec.so
|
||||||
|
process-global daedalus_ctx via daedalus_ctx_create_no_qpu(). Rationale
|
||||||
|
at the time: cycle 6/9 had only CPU NEON paths, so a QPU-capable ctx
|
||||||
|
would have meant pointless Vulkan init in every host process (firefox-
|
||||||
|
fourier, mpv-fourier, daedalus_v4l2_daemon, ...).
|
||||||
|
|
||||||
|
Two things changed since:
|
||||||
|
|
||||||
|
1. Every H.264 hot-path primitive now has a V3D7 compute shader.
|
||||||
|
IDCT 4x4/8x8 (cycles 6, 7), 8 deblock variants (luma+chroma x V+H
|
||||||
|
x inter+intra), 30 qpel positions (15 put_ + 15 avg_). See
|
||||||
|
daedalus-fourier PRs #28-#35.
|
||||||
|
|
||||||
|
2. Dispatch overhead has been hammered down — buffer pool in
|
||||||
|
v3d_runner (daedalus-fourier task #160) plus persistent command
|
||||||
|
buffer (task #161). daedalus-fourier PR #36 bench measures the
|
||||||
|
1080p worst-case sum on hertz (Pi 5 V3D 7.1, 30 iters x 5 warmup):
|
||||||
|
|
||||||
|
kernel CPU ns/op QPU ns/op winner
|
||||||
|
IDCT 4x4 luma 10.79 2.47 QPU 4.36x
|
||||||
|
IDCT 8x8 luma 29.69 9.23 QPU 3.22x
|
||||||
|
Deblock luma_v 17.58 10.21 QPU 1.72x
|
||||||
|
Deblock luma_h 38.41 9.98 QPU 3.85x
|
||||||
|
qpel mc20 (8x8) 28.24 9.66 QPU 2.92x
|
||||||
|
qpel mc02 (8x8) 16.96 20.54 CPU 1.21x
|
||||||
|
qpel mc22 (8x8) 71.58 9.64 QPU 7.43x
|
||||||
|
|
||||||
|
1080p worst-case sum (IDCT4 + deblock luma + qpel mc22):
|
||||||
|
CPU NEON only: 5.57 ms
|
||||||
|
QPU only: 1.30 ms (CPU/QPU sum ratio = 4.30x)
|
||||||
|
|
||||||
|
PR #10's verdict (CPU 4x faster than QPU at IDCT) is reversed. Switch
|
||||||
|
the substitution context to daedalus_ctx_create() in both H.264 TUs
|
||||||
|
(h264_idct_daedalus.c, h264_qpel_daedalus.c) so the recipe layer can
|
||||||
|
actually route through the now-faster QPU path.
|
||||||
|
|
||||||
|
daedalus_ctx_create() probes for a usable Vulkan device and falls back
|
||||||
|
to no_qpu mode if unavailable, so this is safe on hosts without V3D
|
||||||
|
(x86 reauktion build runners, debian-aarch64 builders without renderD,
|
||||||
|
etc.). Hosts WITH V3D (Pi 5 deployment targets) get the speedup.
|
||||||
|
|
||||||
|
The remaining qpel mc02 anomaly (single-axis vertical filter, 1.21x
|
||||||
|
CPU) is bench-flagged for a v2 shader follow-up; the recipe entry
|
||||||
|
stays QPU since the policy decree (2026-05-23 substrate decree) holds
|
||||||
|
and the gap is marginal.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-fourier!36.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 2 +-
|
||||||
|
libavcodec/aarch64/h264_qpel_daedalus.c | 2 +-
|
||||||
|
2 files changed, 2 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -32,7 +32,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
|
||||||
|
static void daedalus_ctx_init_once(void)
|
||||||
|
{
|
||||||
|
- g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+ g_dctx = daedalus_ctx_create();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
--- a/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
@@ -38,7 +38,7 @@ static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
|
||||||
|
static void daedalus_ctx_init_once(void)
|
||||||
|
{
|
||||||
|
- g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+ g_dctx = daedalus_ctx_create();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
+3
-1
@@ -33,7 +33,7 @@ FFMPEG_VERSION=8.1
|
|||||||
# epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie);
|
# epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie);
|
||||||
# +rfourier suffix to avoid colliding with upstream/Debian rebuilds.
|
# +rfourier suffix to avoid colliding with upstream/Debian rebuilds.
|
||||||
PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe
|
PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe
|
||||||
PKGREL=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution
|
PKGREL=11 # pkgrel=11 — H.264 chroma intra deblock (4:2:0) daedalus-fourier substitution (0013) + libavcodec.so daedalus ctx flipped no_qpu → qpu-capable (0014; PR #36 bench: QPU 4.30x on 1080p hot-path sum, 2026-05-25)
|
||||||
# (cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes
|
# (cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes
|
||||||
# the libavcodec.so substitution sequence 6 IDCT4 / 7 IDCT8 /
|
# the libavcodec.so substitution sequence 6 IDCT4 / 7 IDCT8 /
|
||||||
# 8 luma-v deblock / 9 qpel mc20). Pulls daedalus-fourier PR #2
|
# 8 luma-v deblock / 9 qpel mc20). Pulls daedalus-fourier PR #2
|
||||||
@@ -79,6 +79,8 @@ patch -Np1 -i "$HERE/0009-h264-deblock-chroma-daedalus-fourier.patch"
|
|||||||
patch -Np1 -i "$HERE/0010-h264-deblock-luma-intra-daedalus-fourier.patch"
|
patch -Np1 -i "$HERE/0010-h264-deblock-luma-intra-daedalus-fourier.patch"
|
||||||
patch -Np1 -i "$HERE/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch"
|
patch -Np1 -i "$HERE/0011-h264-chroma-dc-hadamard-daedalus-fourier.patch"
|
||||||
patch -Np1 -i "$HERE/0012-h264-qpel-rest-daedalus-fourier.patch"
|
patch -Np1 -i "$HERE/0012-h264-qpel-rest-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "$HERE/0013-h264-deblock-chroma-intra-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "$HERE/0014-h264-ctx-qpu-capable.patch"
|
||||||
|
|
||||||
# --- daedalus-fourier: fetch + build static .a with PIC, install to a
|
# --- daedalus-fourier: fetch + build static .a with PIC, install to a
|
||||||
# per-build prefix; libavcodec.so links it into the shared object so
|
# per-build prefix; libavcodec.so links it into the shared object so
|
||||||
|
|||||||
Reference in New Issue
Block a user