h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)

Continues the deblock buildout after PR #9 (luma_h). Adds the two chroma orientations via the same recipe-table-routed-to-CPU pattern; QPU shaders for chroma deblock are still a follow-up.

Scope:
- Public API: 4 new fns (dispatch + recipe wrapper × {v, h}).
- Internal: dispatch_h264_deblock_chroma_{v,h}_cpu calling the vendored ff_h264_{v,h}_loop_filter_chroma_neon symbols.
- Recipe table: DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11, DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12, both → CPU. Explicit SUBSTRATE_QPU returns -1 (no shader yet).
- C reference: tests/h264_chroma_loop_filter_ref.c — covers both orientations. Algorithm per H.264 §8.7.2.4 (bS<4 chroma inter): tC = tc0_seg + 1 (no luma-style ap/aq side bonus); only p0/q0 are updated (chroma never modifies p1/p2/q1/q2).
- Tests: test_deblock_chroma_v (8x4 tile, edge at row 2) + test_deblock_chroma_h (4x8 tile, edge at col 2), 4 segments × 2 cells per segment per spec.

Verified on hertz (Pi 5 / V3D 7.1):

$ ./build/test_api_h264
=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===
  H264_IDCT4 recipe substrate: 2 (1=CPU, 2=QPU)
  H264_IDCT8 recipe substrate: 2
  H264_DEBLOCK_LV recipe substrate: 2
  H264_QPEL_MC20 recipe substrate: 2
  H264_DEBLOCK_LH recipe substrate: 1 (CPU, no QPU H shader yet)
  H264_DEBLOCK_CV recipe substrate: 1 (CPU)
  H264_DEBLOCK_CH recipe substrate: 1 (CPU)
  H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%)
  H.264 IDCT 8x8: 2048/2048 bytes bit-exact (100.0000%)
  H.264 deblock luma v: 2048/2048 bytes bit-exact (100.0000%)
  H.264 deblock luma h: 1024/1024 bytes bit-exact (100.0000%)
  H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%)
  H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%)
  H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)

All 7 kernels bit-exact PASS. Chroma test sizes are smaller (256 bytes per orientation) because the per-MB chroma deblock surface is smaller than luma — accurate to the production geometry.
Why no QPU shader yet (per the established pattern):
- Chroma deblock is ~25% of total deblock work at 4:2:0 (one quarter the pixel count of luma per MB) — modest QPU win even after the shader exists.
- Same R-band considerations as the luma _h follow-up: the V shader transpose isn't mechanical, and the 8-cell tile is small enough that NEON's per-edge cost (~3 ns) is already inside the budget.
- Total bench at 1080p: 8160 MBs × 4 chroma edges × 3 ns = ~100 us. Negligible compared to the IDCT layer's 10 ms (CPU NEON).

Coverage in fourier for the bS<4 8-bit 4:2:0 deblock matrix is now complete: luma_v ✓, luma_h ✓, chroma_v ✓, chroma_h ✓. Remaining deblock work: bS=4 intra variants (luma + chroma, V + H).

What this unblocks downstream:
- daedalus-decoder Stage 4 deblock can now dispatch all four bS<4 edge categories that a typical inter MB needs.
2026-05-24 23:53:09 +02:00
parent f4af24020f
commit a5c47aa51c
5 changed files with 315 additions and 0 deletions
@@ -0,0 +1,110 @@
+/*
+ * Standalone bit-exact C reference for H.264 chroma loop filters
+ * (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
+ * when added).  Covers both orientations:
+ *
+ *   v_loop_filter_chroma: filter applied VERTICALLY across a
+ *     HORIZONTAL edge.  Tile is 8 cols × 4 rows of context
+ *     (rows -2..+1); pix points to row 0 of the bottom block.
+ *   h_loop_filter_chroma: filter applied HORIZONTALLY across a
+ *     VERTICAL edge.  Tile is 4 cols × 8 rows of context
+ *     (cols -2..+1); pix points to col 0 of the right block.
+ *
+ * Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
+ * `ff_h264_h_loop_filter_chroma_neon` (line 430) in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
+ *
+ * Algorithm per H.264 §8.7.2.4 (chroma bS<4 inter):
+ *   - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
+ *   - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
+ *   - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
+ *   - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
+ *   - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
+ *
+ * tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
+ * (matches the 4:2:0 chroma geometry in both orientations — 8 cols
+ * for the V edge, 8 rows for the H edge).
+ *
+ * Signature (matches FFmpeg + the existing luma refs):
+ *   void(uint8_t *pix, ptrdiff_t stride,
+ *        int alpha, int beta, int8_t tc0[4]);
+ *
+ * License: LGPL-2.1-or-later (matches FFmpeg upstream).
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Saturate v into the unsigned 8-bit sample range [0, 255]. */
+static inline int clip_u8(int v)
+{
+    if (v < 0)
+        return 0;
+    if (v > 255)
+        return 255;
+    return v;
+}
+/* Clamp v into [lo, hi]; the lower bound is tested first. */
+static inline int clip3(int v, int lo, int hi)
+{
+    if (v < lo)
+        return lo;
+    if (v > hi)
+        return hi;
+    return v;
+}
+/* Absolute value of x for plain int operands. */
+static inline int abs_i(int x)
+{
+    return x >= 0 ? x : -x;
+}
+
+/* Filters one sample column that straddles a horizontal edge.
+ * The four taps are read at pix[-2*stride] (p1), pix[-stride] (p0),
+ * pix[0] (q0) and pix[+stride] (q1).  Nothing is written unless
+ * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta all hold; when
+ * they do, only p0 and q0 are rewritten, moved by a delta clamped to
+ * +/-(tc0_s + 1).  The >> 3 acts on a sum that may be negative, so an
+ * arithmetic right shift is relied upon.
+ * NOTE(review): FFmpeg's callers hand the chroma filters tc0 already
+ * incremented, and its C template clamps to +/-tc0[i]; confirm the
+ * extra +1 here matches what the NEON routine under test does. */
+static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
+                                int alpha, int beta, int tc0_s)
+{
+    uint8_t *above = pix - stride;      /* address of p0 */
+    const int p1 = above[-stride];
+    const int p0 = above[0];
+    const int q0 = pix[0];
+    const int q1 = pix[stride];
+
+    const int gate_open = abs_i(p0 - q0) < alpha
+                       && abs_i(p1 - p0) < beta
+                       && abs_i(q1 - q0) < beta;
+    if (!gate_open)
+        return;
+
+    const int bound = tc0_s + 1;
+    const int raw   = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
+    const int step  = clip3(raw, -bound, bound);
+
+    above[0] = (uint8_t) clip_u8(p0 + step);
+    pix[0]   = (uint8_t) clip_u8(q0 - step);
+}
+
+/* Filters one sample row that straddles a vertical edge.
+ * The four taps are read at pix[-2] (p1), pix[-1] (p0), pix[0] (q0)
+ * and pix[+1] (q1).  Nothing is written unless |p0-q0| < alpha,
+ * |p1-p0| < beta and |q1-q0| < beta all hold; when they do, only p0
+ * and q0 are rewritten, moved by a delta clamped to +/-(tc0_s + 1).
+ * The >> 3 acts on a sum that may be negative, so an arithmetic right
+ * shift is relied upon.
+ * NOTE(review): FFmpeg's callers hand the chroma filters tc0 already
+ * incremented, and its C template clamps to +/-tc0[i]; confirm the
+ * extra +1 here matches what the NEON routine under test does. */
+static void h264_chroma_cell_h(uint8_t *pix,
+                                int alpha, int beta, int tc0_s)
+{
+    uint8_t *left = pix - 1;            /* address of p0 */
+    const int p1 = left[-1];
+    const int p0 = left[0];
+    const int q0 = pix[0];
+    const int q1 = pix[1];
+
+    const int gate_open = abs_i(p0 - q0) < alpha
+                       && abs_i(p1 - p0) < beta
+                       && abs_i(q1 - q0) < beta;
+    if (!gate_open)
+        return;
+
+    const int bound = tc0_s + 1;
+    const int raw   = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
+    const int step  = clip3(raw, -bound, bound);
+
+    left[0] = (uint8_t) clip_u8(p0 + step);
+    pix[0]  = (uint8_t) clip_u8(q0 - step);
+}
+
+/* C reference for the bS<4 chroma filter across a horizontal edge.
+ * pix addresses the first sample below the edge; the 8 columns
+ * pix[0..7] are visited left to right, two consecutive columns
+ * sharing one tc0 entry.
+ * Returns without touching memory when alpha or beta is zero, or when
+ * all four tc0 entries are negative.  A single negative entry only
+ * skips the two columns it governs. */
+void daedalus_h264_v_loop_filter_chroma_ref(
+    uint8_t *pix, ptrdiff_t stride,
+    int alpha, int beta, int8_t tc0[4])
+{
+    if (!alpha || !beta)
+        return;
+
+    int any_enabled = 0;
+    for (int s = 0; s < 4; s++)
+        any_enabled |= (tc0[s] >= 0);
+    if (!any_enabled)
+        return;
+
+    /* Column c belongs to segment c / 2. */
+    for (int col = 0; col < 8; col++) {
+        const int seg_tc0 = tc0[col >> 1];
+        if (seg_tc0 >= 0)
+            h264_chroma_cell_v(pix + col, stride, alpha, beta, seg_tc0);
+    }
+}
+
+/* C reference for the bS<4 chroma filter across a vertical edge.
+ * pix addresses the first sample right of the edge in the top row;
+ * 8 rows (pix, pix + stride, ...) are visited top to bottom, two
+ * consecutive rows sharing one tc0 entry.
+ * Returns without touching memory when alpha or beta is zero, or when
+ * all four tc0 entries are negative.  A single negative entry only
+ * skips the two rows it governs. */
+void daedalus_h264_h_loop_filter_chroma_ref(
+    uint8_t *pix, ptrdiff_t stride,
+    int alpha, int beta, int8_t tc0[4])
+{
+    if (!alpha || !beta)
+        return;
+
+    int any_enabled = 0;
+    for (int s = 0; s < 4; s++)
+        any_enabled |= (tc0[s] >= 0);
+    if (!any_enabled)
+        return;
+
+    /* Row r belongs to segment r / 2. */
+    for (int row = 0; row < 8; row++) {
+        const int seg_tc0 = tc0[row >> 1];
+        if (seg_tc0 >= 0)
+            h264_chroma_cell_h(pix + row * stride, alpha, beta, seg_tc0);
+    }
+}
@@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                     int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -191,6 +195,89 @@ static int test_deblock_h(void)
    return diff == 0 ? 0 : 1;
 }

+/* Bit-exactness check for the chroma V dispatch path.
+ * Fills two identical buffers with bytes from xs(), applies the C
+ * reference edge by edge to one and the recipe dispatch to the other,
+ * then counts differing bytes.
+ * Returns 0 when every byte matches; 1 on any mismatch, on context
+ * creation failure, or on a non-zero dispatch return code.
+ *
+ * NOTE(review): samples are independent random bytes while beta <= 16,
+ * so the reference's |p1-p0| < beta and |q1-q0| < beta gates seldom
+ * both pass.  Few (possibly zero) cells are actually rewritten, and the
+ * comparison can pass vacuously; consider smoother pixel data. */
+static int test_deblock_chroma_v(void)
+{
+    /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
+     * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
+    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
+           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
+           EDGE_OFF = EDGE_ROW * TILE_STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            /* r == 0 disables the segment (-1); otherwise tc0 is 0..6. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                                 meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
+                                                              N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
+        /* Release the context on the failure path too. */
+        daedalus_ctx_destroy(ctx);
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+/* Bit-exactness check for the chroma H dispatch path.
+ * Fills two identical buffers with bytes from xs(), applies the C
+ * reference edge by edge to one and the recipe dispatch to the other,
+ * then counts differing bytes.
+ * Returns 0 when every byte matches; 1 on any mismatch, on context
+ * creation failure, or on a non-zero dispatch return code.
+ *
+ * NOTE(review): samples are independent random bytes while beta <= 16,
+ * so the reference's |p1-p0| < beta and |q1-q0| < beta gates seldom
+ * both pass.  Few (possibly zero) cells are actually rewritten, and the
+ * comparison can pass vacuously; consider smoother pixel data. */
+static int test_deblock_chroma_h(void)
+{
+    /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
+     * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
+    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
+           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            /* r == 0 disables the segment (-1); otherwise tc0 is 0..6. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                                 meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
+                                                              N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
+        /* Release the context on the failure path too. */
+        daedalus_ctx_destroy(ctx);
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -245,12 +332,18 @@ int main(void)

    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
+    printf("  H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
+    printf("  H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));

    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
    fail |= test_deblock_h();
+    fail |= test_deblock_chroma_v();
+    fail |= test_deblock_chroma_h();
    fail |= test_qpel_mc20();
    return fail;
 }