h264: deblock bS=4 intra variants (luma + chroma, V + H)

Closes the deblock matrix: adds the four bS=4 intra-strength loop filters
used at I-MB edges (and other boundaries where H.264 §8.7.2.1 forces
boundary strength to 4).

After this PR fourier covers all 8 standard 8-bit 4:2:0 deblock
combinations:

            bS<4                bS=4
            -----               -----
  luma_v    ✓ (cycle 8 QPU)     ✓ (CPU)
  luma_h    ✓ (CPU, PR #9)      ✓ (CPU)
  chrm_v    ✓ (CPU, PR #10)     ✓ (CPU)
  chrm_h    ✓ (CPU, PR #10)     ✓ (CPU)

Scope:

- 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15, CH_INTRA=16),
  all → CPU substrate in the recipe table.
- 4 new public dispatch fns + 4 recipe wrappers (defined via two
  DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the
  boilerplate tight).
- 4 new extern decls for the vendored
  ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols.
- C reference: tests/h264_intra_loop_filter_ref.c covers all four
  orientations. Algorithm per H.264 §8.7.2.3:

    Luma: per-side strong/weak filter selector
      strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
      strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
      Strong updates p0/p1/p2 (and mirror); weak updates p0 only.
    Chroma: always weak, only p0/q0 updated.

- daedalus_h264_deblock_meta is REUSED for intra dispatches; the tc0[]
  field is ignored (bS=4 hardcodes the strength). Callers can build a
  single edge list and route by kernel without an extra struct.
- Test refactor: an intra_test_spec table + run_intra_test helper drives
  all four orientations through one harness, keeping the new test surface
  compact (~50 LOC for 4 kernels vs ~200 if each had its own
  test_deblock_*_intra fn).

Verified on hertz (Pi 5 / V3D 7.1):

  $ ./build/test_api_h264
  === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
  ...
  H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%)
  H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%)
  H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
  H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
  ...
All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed. The bit-exact match on first try is meaningful for these kernels: the strong/weak filter selector + per-side asymmetry would have surfaced any sign / shift / rounding mistake immediately. The C reference is now a usable spec checkpoint for the eventual QPU shader work. QPU shader follow-up: not in this PR. The intra path's 3-cell per-side update + strong/weak branch is structurally more complex than the bS<4 path that already has a V shader (v3d_h264deblock.spv). Per the prior R-band logic for deblock, intra edges are < 20% of total deblock work at typical bit-rates, so NEON-only at ~ 10 ns/edge fits comfortably in the budget.
2026-05-25 00:00:46 +02:00
parent ce436bfd96
commit 9b1c106dc5
5 changed files with 423 additions and 0 deletions
@@ -22,6 +22,14 @@ extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t strid
                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
                                                     int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
+                                                         int alpha, int beta);
+extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
+                                                         int alpha, int beta);
+extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
+                                                           int alpha, int beta);
+extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
+                                                           int alpha, int beta);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -278,6 +286,79 @@ static int test_deblock_chroma_h(void)
    return diff == 0 ? 0 : 1;
 }

+/* --- bS=4 intra-strength deblock tests ---
+ * Tile geometry per orientation matches the bS<4 variant; only the
+ * dispatch + reference function change.  alpha/beta are non-trivial
+ * (the C ref + NEON both early-return when alpha|beta == 0).
+ *
+ * intra_test_spec describes one orientation for run_intra_test(): the
+ * test buffer holds n_edges tiles of tile_stride * tile_rows bytes each,
+ * and every tile carries exactly one edge, located edge_off bytes from
+ * the start of that tile.  The spec table is filled with positional
+ * initializers, so the field order below must not change.
+ */
+typedef struct {
+    const char *name;  /* label printed in the result and error lines */
+    int n_edges, tile_stride, tile_rows, edge_off;  /* tile layout, see above */
+    void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);  /* C reference filter, called once per edge */
+    int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
+                    size_t n_edges, const daedalus_h264_deblock_meta *meta);  /* path under test: one call for the whole edge list, non-zero = failure */
+} intra_test_spec;
+
+/* Run one bS=4 intra orientation end to end and report bit-exactness.
+ *
+ * Two buffers are filled with identical bytes drawn from xs().  The
+ * reference filter t->ref is applied to one buffer edge by edge; the same
+ * edge list is sent through t->dispatch on the other buffer in a single
+ * call.  Both buffers are then compared byte for byte and a summary line
+ * is printed.
+ *
+ * Returns 0 when every byte matches.  Returns 1 on any mismatch, on
+ * context-creation or allocation failure, or when t->dispatch returns
+ * non-zero.  Every exit taken after the context exists goes through the
+ * cleanup label, so the buffers and the context are released on the
+ * failure paths as well.
+ */
+static int run_intra_test(const intra_test_spec *t)
+{
+    int result = 1;  /* stays 1 unless the comparison finds no difference */
+    int rc = 0;
+    int diff = 0;
+    int tile_bytes = t->tile_stride * t->tile_rows;
+    int total = t->n_edges * tile_bytes;
+    uint8_t *dst = NULL;
+    uint8_t *dst_ref = NULL;
+    daedalus_h264_deblock_meta *meta = NULL;
+
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;  /* nothing acquired yet, plain return is fine */
+
+    dst     = malloc((size_t) total);
+    dst_ref = malloc((size_t) total);
+    meta    = calloc((size_t) t->n_edges, sizeof(*meta));
+    if (!dst || !dst_ref || !meta) goto cleanup;
+
+    /* Identical starting content so any difference comes from the filters. */
+    for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+
+    /* One edge per tile; alpha in 1..64 and beta in 1..16, never zero. */
+    for (int i = 0; i < t->n_edges; i++) {
+        meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
+        meta[i].alpha   = (int)(xs() % 64) + 1;
+        meta[i].beta    = (int)(xs() % 16) + 1;
+        /* tc0[] unused for intra; leave at 0 from calloc. */
+    }
+
+    /* Expected output: reference filter applied edge by edge. */
+    for (int i = 0; i < t->n_edges; i++) {
+        t->ref(dst_ref + meta[i].dst_off,
+               (ptrdiff_t) t->tile_stride,
+               meta[i].alpha, meta[i].beta);
+    }
+
+    /* Output under test: the whole edge list in one dispatch. */
+    rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
+                     (size_t) t->n_edges, meta);
+    if (rc) {
+        fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc);
+        goto cleanup;
+    }
+
+    for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
+           t->name, total - diff, total, 100.0 * (total - diff) / total);
+    result = diff == 0 ? 0 : 1;
+
+cleanup:
+    /* free(NULL) is a no-op, so partially failed allocations are fine. */
+    free(meta); free(dst_ref); free(dst);
+    daedalus_ctx_destroy(ctx);
+    return result;
+}
+
+/* Drive the four bS=4 intra orientations (luma/chroma x V/H) through
+ * run_intra_test().  Every orientation is run even if an earlier one
+ * failed; the return value is the bitwise OR of the individual results.
+ */
+static int test_deblock_intra_all(void)
+{
+    const intra_test_spec cases[] = {
+        {
+            .name        = "luma v intra",
+            .n_edges     = 8,
+            .tile_stride = 16,
+            .tile_rows   = 8,
+            .edge_off    = 4 * 16,
+            .ref         = daedalus_h264_v_loop_filter_luma_intra_ref,
+            .dispatch    = daedalus_recipe_dispatch_h264_deblock_luma_v_intra,
+        },
+        {
+            .name        = "luma h intra",
+            .n_edges     = 8,
+            .tile_stride = 8,
+            .tile_rows   = 16,
+            .edge_off    = 4,
+            .ref         = daedalus_h264_h_loop_filter_luma_intra_ref,
+            .dispatch    = daedalus_recipe_dispatch_h264_deblock_luma_h_intra,
+        },
+        {
+            .name        = "chroma v intra",
+            .n_edges     = 8,
+            .tile_stride = 8,
+            .tile_rows   = 4,
+            .edge_off    = 2 * 8,
+            .ref         = daedalus_h264_v_loop_filter_chroma_intra_ref,
+            .dispatch    = daedalus_recipe_dispatch_h264_deblock_chroma_v_intra,
+        },
+        {
+            .name        = "chroma h intra",
+            .n_edges     = 8,
+            .tile_stride = 4,
+            .tile_rows   = 8,
+            .edge_off    = 2,
+            .ref         = daedalus_h264_h_loop_filter_chroma_intra_ref,
+            .dispatch    = daedalus_recipe_dispatch_h264_deblock_chroma_h_intra,
+        },
+    };
+    const size_t n_cases = sizeof(cases) / sizeof(cases[0]);
+
+    int failed = 0;
+    for (size_t k = 0; k < n_cases; k++) {
+        failed |= run_intra_test(&cases[k]);
+    }
+    return failed;
+}
+
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -336,6 +417,8 @@ int main(void)
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
    printf("  H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
+    printf("  H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));

    int fail = 0;
    fail |= test_idct4();
@@ -344,6 +427,7 @@ int main(void)
    fail |= test_deblock_h();
    fail |= test_deblock_chroma_v();
    fail |= test_deblock_chroma_h();
+    fail |= test_deblock_intra_all();
    fail |= test_qpel_mc20();
    return fail;
 }