624f83e877
Adds patch 0008 to the substitution arc: the horizontal (H) counterpart of
0005's V variant, covering the H.264 non-intra (bS<4) horizontal luma deblock.
H264DSPContext.h_loop_filter_luma →
daedalus_recipe_dispatch_h264_deblock_luma_h
The H kernel was added to daedalus-fourier in PR #9 (vendored
ff_h264_h_loop_filter_luma_neon, wired through the same CPU-dispatch
pattern as V). Recipe table routes AUTO to CPU NEON (no QPU shader
for H yet), so this is a NEON-to-NEON substitution via the daedalus
recipe layer — same shape as 0005.
The libavcodec.so ctx remains no-QPU (daedalus_ctx_create_no_qpu),
matching the existing 0003/0004/0005/0007 patches. Higher-cycle
QPU init waits for a feature-flag gating change in a separate PR.
Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S
code; daedalus_h264_deblock_meta covers the non-intra path only.
A follow-up can route intra once daedalus-fourier exposes the
intra-h dispatch (the kernel already exists internally per fourier
PR #11).
Wires the new patch into both arch/PKGBUILD and debian/build-deb.sh
sequences. Verified the patch applies cleanly on top of 0001-0007
against the pinned upstream commit b57fbbe5 on hertz.
93 lines
4.5 KiB
Diff
93 lines
4.5 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: claude-noether <claude-noether@noreply.localhost>
|
|
Date: Mon, 25 May 2026 12:00:00 +0200
|
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-h deblock through daedalus-fourier
|
|
|
|
Sibling of 0005 (which substituted v_loop_filter_luma). Same
|
|
NEON-to-NEON substitution: H264DSPContext.h_loop_filter_luma →
|
|
daedalus_recipe_dispatch_h264_deblock_luma_h. The H kernel landed
|
|
in daedalus-fourier PR #9 (CPU NEON only — no QPU shader yet).
|
|
|
|
libavcodec.so ctx is no-QPU per the existing 0003-0005 / 0007
|
|
pattern; we cannot assume Vulkan in arbitrary host processes
|
|
(firefox-fourier RDD, mpv-fourier, etc.).
|
|
|
|
Intra (bS=4) h_loop_filter_luma_intra stays on the in-tree NEON .S
|
|
code; daedalus_h264_deblock_meta only covers the non-intra path.
|
|
An intra-h substitution can land once daedalus-fourier exposes a
|
|
dispatch helper (the kernel already exists internally per fourier PR #11).
|
|
|
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8 H.
|
|
---
|
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.694760715 +0200
|
|
+++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 13:09:33.715603719 +0200
|
|
@@ -1,9 +1,10 @@
|
|
/*
|
|
- * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims.
|
|
+ * H.264 4x4 / 8x8 IDCT + luma v/h deblock — daedalus-fourier substitution shims.
|
|
*
|
|
* Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
|
* H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
|
* H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v
|
|
+ * H264DSPContext.h_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_h
|
|
* instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
|
* picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
|
* is CPU primary with QPU opportunistic — the ctx below is no-QPU,
|
|
@@ -45,6 +46,8 @@
|
|
void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
|
void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
int alpha, int beta, int8_t *tc0);
|
|
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta, int8_t *tc0);
|
|
|
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
|
{
|
|
@@ -84,3 +87,22 @@
|
|
daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride,
|
|
1, &meta);
|
|
}
|
|
+
|
|
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta, int8_t *tc0)
|
|
+{
|
|
+ daedalus_h264_deblock_meta meta = {
|
|
+ .dst_off = 0,
|
|
+ .alpha = alpha,
|
|
+ .beta = beta,
|
|
+ };
|
|
+ meta.tc0[0] = tc0[0];
|
|
+ meta.tc0[1] = tc0[1];
|
|
+ meta.tc0[2] = tc0[2];
|
|
+ meta.tc0[3] = tc0[3];
|
|
+
|
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
|
+
|
|
+ daedalus_recipe_dispatch_h264_deblock_luma_h(g_dctx, pix, (size_t)stride,
|
|
+ 1, &meta);
|
|
+}
|
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.695937103 +0200
|
|
+++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 13:09:33.715541700 +0200
|
|
@@ -31,6 +31,8 @@
|
|
int alpha, int beta, int8_t *tc0);
|
|
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
|
int beta, int8_t *tc0);
|
|
+void ff_h264_h_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta, int8_t *tc0);
|
|
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
|
int beta);
|
|
void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
|
@@ -117,7 +119,7 @@
|
|
|
|
if (have_neon(cpu_flags) && bit_depth == 8) {
|
|
c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus;
|
|
- c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
|
+ c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_daedalus;
|
|
c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
|
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
|
|
|
|
--
|
|
2.47.3
|
|
|