h264: deblock bS=4 intra variants (luma + chroma, V + H)
Closes the deblock matrix: adds the four bS=4 intra-strength loop
filters used at I-MB edges (and other boundaries where H.264
§8.7.2.1 forces boundary strength to 4). After this PR fourier
covers all 8 standard 8-bit 4:2:0 deblock combinations:
bS<4 bS=4
----- -----
luma_v ✓ (cycle 8 QPU) ✓ (CPU)
luma_h ✓ (CPU, PR #9) ✓ (CPU)
chrm_v ✓ (CPU, PR #10) ✓ (CPU)
chrm_h ✓ (CPU, PR #10) ✓ (CPU)
Scope:
- 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15,
CH_INTRA=16), all → CPU substrate in the recipe table.
- 4 new public dispatch fns + 4 recipe wrappers (defined via two
DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the
boilerplate tight).
- 4 new extern decls for the vendored
ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols.
- C reference: tests/h264_intra_loop_filter_ref.c covers all four
orientations. Algorithm per H.264 §8.7.2.3:
Luma: per-side strong/weak filter selector
strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
Strong updates p0/p1/p2 (and mirror); weak updates p0 only.
Chroma: always weak, only p0/q0 updated.
- daedalus_h264_deblock_meta is REUSED for intra dispatches; the
tc0[] field is ignored (bS=4 hardcodes the strength). Callers
can build a single edge list and route by kernel without an
extra struct.
- Test refactor: an intra_test_spec table + run_intra_test helper
drives all four orientations through one harness, keeping the
new test surface compact (~50 LOC for 4 kernels vs ~200 if each
had its own test_deblock_*_intra fn).
Verified on hertz (Pi 5 / V3D 7.1):
$ ./build/test_api_h264
=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===
...
H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%)
H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%)
H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
...
All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed.
The bit-exact match on first try is meaningful for these kernels:
the strong/weak filter selector + per-side asymmetry would have
surfaced any sign / shift / rounding mistake immediately. The
C reference is now a usable spec checkpoint for the eventual QPU
shader work.
QPU shader follow-up: not in this PR. The intra path's 3-cell
per-side update + strong/weak branch is structurally more complex
than the bS<4 path that already has a V shader (v3d_h264deblock.spv).
Per the prior R-band logic for deblock, intra edges are < 20% of
total deblock work at typical bit-rates, so NEON-only at ~ 10 ns/edge
fits comfortably in the budget.
This commit is contained in:
@@ -133,6 +133,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
@@ -164,6 +168,14 @@ extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -320,6 +332,63 @@ static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta
|
||||
* struct's tc0[] field is unused for intra (the spec hardcodes the
|
||||
* strength). We accept the same meta type so callers can build a
|
||||
* single edge-list and route by kernel — saves an extra struct.
|
||||
*/
|
||||
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_edges; i++) {
|
||||
ff_h264_v_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_edges; i++) {
|
||||
ff_h264_h_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_edges; i++) {
|
||||
ff_h264_v_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_edges; i++) {
|
||||
ff_h264_h_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
|
||||
(ptrdiff_t) dst_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -1270,6 +1339,27 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
|
||||
return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
|
||||
int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||
daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||
{ \
|
||||
daedalus_substrate eff = sub; \
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||||
eff = daedalus_recipe_substrate_for(kernel); \
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
|
||||
return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \
|
||||
}
|
||||
|
||||
DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
|
||||
DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
|
||||
|
||||
#undef DEFINE_INTRA_DISPATCH
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -1381,6 +1471,22 @@ int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
|
||||
dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
#define DEFINE_INTRA_RECIPE(name) \
|
||||
int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||
uint8_t *dst, size_t dst_stride, \
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||
{ \
|
||||
return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
|
||||
dst, dst_stride, n_edges, meta); \
|
||||
}
|
||||
|
||||
DEFINE_INTRA_RECIPE(luma_v_intra)
|
||||
DEFINE_INTRA_RECIPE(luma_h_intra)
|
||||
DEFINE_INTRA_RECIPE(chroma_v_intra)
|
||||
DEFINE_INTRA_RECIPE(chroma_h_intra)
|
||||
|
||||
#undef DEFINE_INTRA_RECIPE
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
|
||||
Reference in New Issue
Block a user