h264: deblock bS=4 intra variants (luma + chroma, V + H)

Closes the deblock matrix: adds the four bS=4 intra-strength loop
filters used at I-MB edges (and other boundaries where H.264
§8.7.2.1 forces boundary strength to 4).  After this PR fourier
covers all 8 standard 8-bit 4:2:0 deblock combinations:

    bS<4   bS=4
    -----  -----
  luma_v  ✓ (cycle 8 QPU)   ✓ (CPU)
  luma_h  ✓ (CPU, PR #9)    ✓ (CPU)
  chrm_v  ✓ (CPU, PR #10)   ✓ (CPU)
  chrm_h  ✓ (CPU, PR #10)   ✓ (CPU)

Scope:
  - 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15,
    CH_INTRA=16), all → CPU substrate in the recipe table.
  - 4 new public dispatch fns + 4 recipe wrappers (defined via two
    DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the
    boilerplate tight).
  - 4 new extern decls for the vendored
    ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols.
  - C reference: tests/h264_intra_loop_filter_ref.c covers all four
    orientations.  Algorithm per H.264 §8.7.2.3:

      Luma: per-side strong/weak filter selector
        strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
        strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
        Strong updates p0/p1/p2 (and mirror); weak updates p0 only.
      Chroma: always weak, only p0/q0 updated.

  - daedalus_h264_deblock_meta is REUSED for intra dispatches; the
    tc0[] field is ignored (bS=4 hardcodes the strength).  Callers
    can build a single edge list and route by kernel without an
    extra struct.

  - Test refactor: an intra_test_spec table + run_intra_test helper
    drives all four orientations through one harness, keeping the
    new test surface compact (~50 LOC for 4 kernels vs ~200 if each
    had its own test_deblock_*_intra fn).

Verified on hertz (Pi 5 / V3D 7.1):

  $ ./build/test_api_h264
  === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
    ...
    H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%)
    H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%)
    H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
    ...

  All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed.

The bit-exact match on first try is meaningful for these kernels:
the strong/weak filter selector + per-side asymmetry would have
surfaced any sign / shift / rounding mistake immediately.  The
C reference is now a usable spec checkpoint for the eventual QPU
shader work.

QPU shader follow-up: not in this PR.  The intra path's 3-cell
per-side update + strong/weak branch is structurally more complex
than the bS<4 path that already has a V shader (v3d_h264deblock.spv).
Per the prior R-band logic for deblock, intra edges are < 20% of
total deblock work at typical bit-rates, so NEON-only at ~ 10 ns/edge
fits comfortably in the budget.
This commit is contained in:
2026-05-25 00:00:46 +02:00
parent ce436bfd96
commit 9b1c106dc5
5 changed files with 423 additions and 0 deletions
+106
View File
@@ -133,6 +133,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
}
return DAEDALUS_SUBSTRATE_CPU;
@@ -164,6 +168,14 @@ extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
@@ -320,6 +332,63 @@ static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
return 0;
}
/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta
* struct's tc0[] field is unused for intra (the spec hardcodes the
* strength). We accept the same meta type so callers can build a
* single edge-list and route by kernel — saves an extra struct.
*/
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
ff_h264_v_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta);
}
return 0;
}
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
ff_h264_h_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta);
}
return 0;
}
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
ff_h264_v_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta);
}
return 0;
}
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
ff_h264_h_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta);
}
return 0;
}
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1270,6 +1339,27 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
{ \
daedalus_substrate eff = sub; \
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
eff = daedalus_recipe_substrate_for(kernel); \
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
eff = DAEDALUS_SUBSTRATE_CPU; \
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \
}
DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
#undef DEFINE_INTRA_DISPATCH
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1381,6 +1471,22 @@ int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
dst, dst_stride, n_edges, meta);
}
#define DEFINE_INTRA_RECIPE(name) \
int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
uint8_t *dst, size_t dst_stride, \
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
{ \
return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
dst, dst_stride, n_edges, meta); \
}
DEFINE_INTRA_RECIPE(luma_v_intra)
DEFINE_INTRA_RECIPE(luma_h_intra)
DEFINE_INTRA_RECIPE(chroma_v_intra)
DEFINE_INTRA_RECIPE(chroma_h_intra)
#undef DEFINE_INTRA_RECIPE
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta)