h264: deblock bS=4 intra variants (luma + chroma, V + H)

Closes the deblock matrix: adds the four bS=4 intra-strength loop filters used at I-MB edges (and other boundaries where H.264 §8.7.2.1 forces boundary strength to 4).

After this PR fourier covers all 8 standard 8-bit 4:2:0 deblock combinations:

              bS<4                  bS=4
              -----                 -----
    luma_v    ✓ (cycle 8 QPU)       ✓ (CPU)
    luma_h    ✓ (CPU, PR #9)        ✓ (CPU)
    chrm_v    ✓ (CPU, PR #10)       ✓ (CPU)
    chrm_h    ✓ (CPU, PR #10)       ✓ (CPU)

Scope:
- 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15, CH_INTRA=16), all → CPU substrate in the recipe table.
- 4 new public dispatch fns + 4 recipe wrappers (defined via two DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the boilerplate tight).
- 4 new extern decls for the vendored ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols.
- C reference: tests/h264_intra_loop_filter_ref.c covers all four orientations. Algorithm per H.264 §8.7.2.4:
    Luma: per-side strong/weak filter selector
      strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
      strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
      Strong updates p0/p1/p2 (and mirror); weak updates p0 only.
    Chroma: always weak, only p0/q0 updated.
- daedalus_h264_deblock_meta is REUSED for intra dispatches; the tc0[] field is ignored (bS=4 hardcodes the strength). Callers can build a single edge list and route by kernel without an extra struct.
- Test refactor: an intra_test_spec table + run_intra_test helper drives all four orientations through one harness, keeping the new test surface compact (~50 LOC for 4 kernels vs ~200 if each had its own test_deblock_*_intra fn).

Verified on hertz (Pi 5 / V3D 7.1):

    $ ./build/test_api_h264
    === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
    ...
    H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%)
    H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%)
    H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
    ...
All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed.

The bit-exact match on first try is meaningful for these kernels: the strong/weak filter selector + per-side asymmetry would have surfaced any sign / shift / rounding mistake immediately. The C reference is now a usable spec checkpoint for the eventual QPU shader work.

QPU shader follow-up: not in this PR. The intra path's 3-cell per-side update + strong/weak branch is structurally more complex than the bS<4 path that already has a V shader (v3d_h264deblock.spv). Per the prior R-band logic for deblock, intra edges are < 20% of total deblock work at typical bit-rates, so NEON-only at ~10 ns/edge fits comfortably in the budget.
2026-05-25 00:00:46 +02:00
parent ce436bfd96
commit 9b1c106dc5
5 changed files with 423 additions and 0 deletions
@@ -133,6 +133,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
@@ -164,6 +168,14 @@ extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
+extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, /* bS=4 luma, v variant */
+                                                    int alpha, int beta); /* no tc0 parameter, unlike the non-intra prototypes */
+extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, /* bS=4 luma, h variant */
+                                                    int alpha, int beta);
+extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, /* bS=4 chroma, v variant */
+                                                      int alpha, int beta);
+extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, /* bS=4 chroma, h variant */
+                                                      int alpha, int beta);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);

@@ -320,6 +332,63 @@ static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
    return 0;
 }

+/* --- bS=4 intra variants.  Note: the daedalus_h264_deblock_meta
+ * struct's tc0[] field is unused for intra (the spec hardcodes the
+ * strength).  We accept the same meta type so callers can build a
+ * single edge-list and route by kernel — saves an extra struct.
+ * Each CPU dispatcher filters the n_edges entries of meta[] in order and always returns 0. */
+static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    (void) ctx; /* ctx is not used by this CPU path */
+    for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge entry */
+        ff_h264_v_loop_filter_luma_intra_neon(dst + meta[i].dst_off, /* edge start = dst + per-edge byte offset */
+                                                (ptrdiff_t) dst_stride,
+                                                meta[i].alpha, meta[i].beta); /* only alpha/beta are forwarded */
+    }
+    return 0; /* the filter is declared void, so there is no status to propagate */
+}
+
+static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx, /* CPU path: bS=4 luma, h variant; ctx unused */
+    uint8_t *dst, size_t dst_stride, /* dst: base pointer; dst_stride: forwarded to the filter as ptrdiff_t */
+    size_t n_edges, const daedalus_h264_deblock_meta *meta) /* meta[0..n_edges): dst_off, alpha, beta read per edge */
+{
+    (void) ctx; /* ctx is not used by this CPU path */
+    for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge entry */
+        ff_h264_h_loop_filter_luma_intra_neon(dst + meta[i].dst_off, /* edge start = dst + per-edge byte offset */
+                                                (ptrdiff_t) dst_stride,
+                                                meta[i].alpha, meta[i].beta); /* only alpha/beta are forwarded; tc0[] is not read */
+    }
+    return 0; /* always 0: the filter is declared void */
+}
+
+static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx, /* CPU path: bS=4 chroma, v variant; ctx unused */
+    uint8_t *dst, size_t dst_stride, /* dst: base pointer; dst_stride: forwarded to the filter as ptrdiff_t */
+    size_t n_edges, const daedalus_h264_deblock_meta *meta) /* meta[0..n_edges): dst_off, alpha, beta read per edge */
+{
+    (void) ctx; /* ctx is not used by this CPU path */
+    for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge entry */
+        ff_h264_v_loop_filter_chroma_intra_neon(dst + meta[i].dst_off, /* edge start = dst + per-edge byte offset */
+                                                  (ptrdiff_t) dst_stride,
+                                                  meta[i].alpha, meta[i].beta); /* only alpha/beta are forwarded; tc0[] is not read */
+    }
+    return 0; /* always 0: the filter is declared void */
+}
+
+static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx, /* CPU path: bS=4 chroma, h variant; ctx unused */
+    uint8_t *dst, size_t dst_stride, /* dst: base pointer; dst_stride: forwarded to the filter as ptrdiff_t */
+    size_t n_edges, const daedalus_h264_deblock_meta *meta) /* meta[0..n_edges): dst_off, alpha, beta read per edge */
+{
+    (void) ctx; /* ctx is not used by this CPU path */
+    for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge entry */
+        ff_h264_h_loop_filter_chroma_intra_neon(dst + meta[i].dst_off, /* edge start = dst + per-edge byte offset */
+                                                  (ptrdiff_t) dst_stride,
+                                                  meta[i].alpha, meta[i].beta); /* only alpha/beta are forwarded; tc0[] is not read */
+    }
+    return 0; /* always 0: the filter is declared void */
+}
+
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1270,6 +1339,27 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }

+#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) /* emits one public fn */  \
+int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx,                \
+    daedalus_substrate sub, uint8_t *dst, size_t dst_stride,                   \
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)                    \
+{ /* pick substrate; -1 if it resolves to QPU, else run cpu_fn */              \
+    daedalus_substrate eff = sub; /* substrate actually used */                \
+    if (eff == DAEDALUS_SUBSTRATE_AUTO) /* AUTO: ask the recipe table */       \
+        eff = daedalus_recipe_substrate_for(kernel);                           \
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))           \
+        eff = DAEDALUS_SUBSTRATE_CPU; /* no QPU on this ctx: use CPU */        \
+    if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; /* no QPU path wired here */ \
+    return cpu_fn(ctx, dst, dst_stride, n_edges, meta); /* CPU path */         \
+}
+
+DEFINE_INTRA_DISPATCH(luma_v_intra,   DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
+DEFINE_INTRA_DISPATCH(luma_h_intra,   DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
+DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
+DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
+
+#undef DEFINE_INTRA_DISPATCH /* generator is only needed for the four definitions above */
+
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1381,6 +1471,22 @@ int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
                                                    dst, dst_stride, n_edges, meta);
 }

+#define DEFINE_INTRA_RECIPE(name) /* emits one recipe wrapper */               \
+int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx,         \
+    uint8_t *dst, size_t dst_stride,                                           \
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)                    \
+{ /* forward with AUTO so the dispatcher picks the substrate */                \
+    return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
+                                                    dst, dst_stride, n_edges, meta); \
+}
+
+DEFINE_INTRA_RECIPE(luma_v_intra)
+DEFINE_INTRA_RECIPE(luma_h_intra)
+DEFINE_INTRA_RECIPE(chroma_v_intra)
+DEFINE_INTRA_RECIPE(chroma_h_intra)
+
+#undef DEFINE_INTRA_RECIPE /* generator is only needed for the four wrappers above */
+
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)