8f9487d355
Substitutes c->v_loop_filter_chroma_intra and c->h_loop_filter_chroma_intra with daedalus wrappers in the bit_depth=8 / chroma_format_idc<=1 (4:2:0) branch. 4:2:2 stays on the in-tree NEON path (the daedalus chroma intra dispatch is 4:2:0-only). The fourier dispatches were exposed in PR #11 (DEFINE_INTRA_DISPATCH macro generates the public daedalus_dispatch_h264_deblock_chroma_*_intra symbols + recipe wrappers). Re-architects the chroma init: v_loop_filter_chroma_intra was previously assigned unconditionally to the NEON variant (which works for both 4:2:0 and 4:2:2). We now assign it INSIDE both branches of the chroma_format_idc conditional — 4:2:0 picks daedalus, 4:2:2 keeps NEON. No regression for 4:2:2 streams. Same NEON-to-NEON via recipe shape as 0010 luma intra. Closes the deblock substitution layer for the 4:2:0 / 8-bit hot path: - 0005 luma_v non-intra ✓ - 0008 luma_h non-intra ✓ - 0009 chroma_v / chroma_h non-intra ✓ - 0010 luma_v / luma_h intra ✓ - 0013 chroma_v / chroma_h intra ✓ All 8 deblock variants for the common 4:2:0 path now route through daedalus. 4:2:2 chroma + the chroma422 mbaff variants stay on in-tree NEON. Verified the patch applies cleanly on top of 0001-0012 against the pinned upstream commit b57fbbe5 on hertz.
121 lines
6.7 KiB
Diff
121 lines
6.7 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: claude-noether <claude-noether@noreply.localhost>
|
|
Date: Sun, 25 May 2026 14:30:00 +0200
|
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 chroma intra deblock (4:2:0) through daedalus-fourier
|
|
|
|
Substitutes c->v_loop_filter_chroma_intra and c->h_loop_filter_chroma_intra
|
|
with daedalus wrappers in the bit_depth=8 / chroma_format_idc<=1 (4:2:0)
|
|
branch. 4:2:2 stays on the in-tree NEON path (the daedalus chroma intra
|
|
dispatch is 4:2:0-only).
|
|
|
|
The daedalus-fourier dispatches were exposed in PR #11 (DEFINE_INTRA_DISPATCH
|
|
macro generates the public daedalus_dispatch_h264_deblock_chroma_*_intra
|
|
symbols + recipe wrappers).
|
|
|
|
Re-architects the chroma init: v_loop_filter_chroma_intra was previously
|
|
assigned unconditionally to the NEON variant (which works for both 4:2:0
|
|
and 4:2:2). We now assign it INSIDE both branches of the chroma_format_idc
|
|
conditional, with the 4:2:0 branch picking daedalus and the 4:2:2 branch
|
|
keeping NEON. No regression for 4:2:2 streams.
|
|
|
|
Same NEON-to-NEON-via-recipe shape as the luma intra substitution in patch 0010.
|
|
|
|
Refs reauktion/daedalus-v4l2#11 — substitution arc chroma intra.
|
|
---
|
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.267156263 +0200
|
|
+++ libavcodec/aarch64/h264_idct_daedalus.c 2026-05-25 14:21:08.287745931 +0200
|
|
@@ -1,5 +1,5 @@
|
|
/*
|
|
- * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
|
+ * H.264 4x4 / 8x8 IDCT + luma v/h (inter+intra) + chroma v/h (inter+intra) deblock + chroma DC Hadamard — daedalus-fourier substitution shims.
|
|
*
|
|
* Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
|
* H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
|
@@ -9,6 +9,8 @@
|
|
* H264DSPContext.h_loop_filter_chroma → daedalus_recipe_dispatch_h264_deblock_chroma_h
|
|
* H264DSPContext.v_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_v_intra
|
|
* H264DSPContext.h_loop_filter_luma_intra → daedalus_recipe_dispatch_h264_deblock_luma_h_intra
|
|
+ * H264DSPContext.v_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_v_intra
|
|
+ * H264DSPContext.h_loop_filter_chroma_intra → daedalus_recipe_dispatch_h264_deblock_chroma_h_intra
|
|
* H264DSPContext.chroma_dc_dequant_idct → daedalus_h264_chroma_dc_hadamard_2x2 + caller-side qmul
|
|
* instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
|
* picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
|
@@ -61,6 +63,10 @@
|
|
int alpha, int beta);
|
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
int alpha, int beta);
|
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta);
|
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta);
|
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
|
|
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
|
@@ -218,3 +224,30 @@
|
|
block[stride*1 + xStride*0] = (int16_t)((int)dc[2] * qmul >> 7);
|
|
block[stride*1 + xStride*1] = (int16_t)((int)dc[3] * qmul >> 7);
|
|
}
|
|
+
|
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta)
|
|
+{
|
|
+ daedalus_h264_deblock_meta meta = {
|
|
+ .dst_off = 0,
|
|
+ .alpha = alpha,
|
|
+ .beta = beta,
|
|
+ };
|
|
+ /* tc0[] unused for intra (bS=4 hardcodes the strength). */
|
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(g_dctx, pix, (size_t)stride,
|
|
+ 1, &meta);
|
|
+}
|
|
+
|
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta)
|
|
+{
|
|
+ daedalus_h264_deblock_meta meta = {
|
|
+ .dst_off = 0,
|
|
+ .alpha = alpha,
|
|
+ .beta = beta,
|
|
+ };
|
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
|
+ daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(g_dctx, pix, (size_t)stride,
|
|
+ 1, &meta);
|
|
+}
|
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.268311057 +0200
|
|
+++ libavcodec/aarch64/h264dsp_init_aarch64.c 2026-05-25 14:21:08.287886563 +0200
|
|
@@ -42,6 +42,10 @@
|
|
void ff_h264_h_loop_filter_luma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
int alpha, int beta);
|
|
void ff_h264_chroma_dc_dequant_idct_daedalus(int16_t *block, int qmul);
|
|
+void ff_h264_v_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta);
|
|
+void ff_h264_h_loop_filter_chroma_intra_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
+ int alpha, int beta);
|
|
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
|
int beta, int8_t *tc0);
|
|
void ff_h264_v_loop_filter_chroma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
|
@@ -133,14 +137,15 @@
|
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_daedalus;
|
|
|
|
c->v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_daedalus;
|
|
- c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
|
|
|
if (chroma_format_idc <= 1) {
|
|
c->chroma_dc_dequant_idct = ff_h264_chroma_dc_dequant_idct_daedalus;
|
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_daedalus;
|
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_daedalus;
|
|
- c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
|
|
+ c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_daedalus;
|
|
c->h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
|
|
} else {
|
|
+ c->v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
|
|
c->h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
|
|
c->h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon;
|
|
c->h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon;
|
|
--
|
|
2.47.3
|
|
|