Merge pull request 'h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)' (#10) from noether/h264-deblock-chroma into main

Reviewed-on: #10
2026-05-24 21:55:57 +00:00
parent f4af24020f a5c47aa51c
commit ce436bfd96
5 changed files with 315 additions and 0 deletions
@@ -520,6 +520,7 @@ add_executable(test_api_h264
    tests/h264_idct8_ref.c
    tests/h264_deblock_ref.c
    tests/h264_h_loop_filter_luma_ref.c
+    tests/h264_chroma_loop_filter_ref.c
    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
@@ -286,6 +286,35 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);

+/* H.264 chroma (4:2:0) loop filters — bS<4 variant.  Chroma uses
+ * the SAME daedalus_h264_deblock_meta struct as luma but on smaller
+ * tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
+ * rows for H (4 segments of 2 rows).  Each segment has its own tc0
+ * strength (tc0[s] applies to both cells in segment s).
+ *
+ * Algorithm difference vs luma: chroma updates only p0 and q0
+ * (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
+ * luma-style ap/aq side-condition bonus).
+ *
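+ * QPU shaders for chroma deblock are not implemented yet, so the recipe
+ * table routes AUTO to CPU NEON.  An explicit SUBSTRATE_QPU request
+ * returns -1 when the context has a QPU; without one it falls back to
+ * the CPU path.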
+ */
+int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
+int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
+int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
+int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta);
+
 /* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
@@ -333,6 +362,8 @@ typedef enum {
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
    DAEDALUS_KERNEL_H264_QPEL_MC20  = 9,
    DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
+    DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
+    DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
 } daedalus_kernel;

 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
@@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
+extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
+                                                int alpha, int beta, int8_t *tc0);
+extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
+                                                int alpha, int beta, int8_t *tc0);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);

@@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
    return 0;
 }

+/* CPU path for the chroma V filter: invokes the NEON routine once per
+ * edge descriptor, addressing each edge at dst + dst_off.  ctx is unused.
+ * Always returns 0. */
+static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const ptrdiff_t stride = (ptrdiff_t) dst_stride;
+
+    (void) ctx;
+    for (size_t e = 0; e < n_edges; e++) {
+        const daedalus_h264_deblock_meta *edge = &meta[e];
+        /* The NEON entry point takes a mutable tc0 pointer, so give it a
+         * scratch copy instead of the caller's const metadata. */
+        int8_t tc0_scratch[4];
+
+        for (int s = 0; s < 4; s++)
+            tc0_scratch[s] = edge->tc0[s];
+        ff_h264_v_loop_filter_chroma_neon(dst + edge->dst_off, stride,
+                                            edge->alpha, edge->beta,
+                                            tc0_scratch);
+    }
+    return 0;
+}
+
+/* CPU path for the chroma H filter: invokes the NEON routine once per
+ * edge descriptor, addressing each edge at dst + dst_off.  ctx is unused.
+ * Always returns 0. */
+static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const ptrdiff_t stride = (ptrdiff_t) dst_stride;
+
+    (void) ctx;
+    for (size_t e = 0; e < n_edges; e++) {
+        const daedalus_h264_deblock_meta *edge = &meta[e];
+        /* The NEON entry point takes a mutable tc0 pointer, so give it a
+         * scratch copy instead of the caller's const metadata. */
+        int8_t tc0_scratch[4];
+
+        for (int s = 0; s < 4; s++)
+            tc0_scratch[s] = edge->tc0[s];
+        ff_h264_h_loop_filter_chroma_neon(dst + edge->dst_off, stride,
+                                            edge->alpha, edge->beta,
+                                            tc0_scratch);
+    }
+    return 0;
+}
+
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }

+/* Dispatch the chroma V loop filter on the requested substrate.
+ * AUTO is resolved through the recipe table.  A QPU selection returns -1
+ * when the context has a QPU (no chroma shader exists yet) and is
+ * downgraded to CPU when it does not.  Every other case runs the CPU
+ * path and returns its result. */
+int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const daedalus_substrate chosen =
+        (sub == DAEDALUS_SUBSTRATE_AUTO)
+            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
+            : sub;
+
+    /* QPU hardware present but no chroma QPU shader to run on it. */
+    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
+        return -1;
+
+    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
+}
+
+/* Dispatch the chroma H loop filter on the requested substrate.
+ * AUTO is resolved through the recipe table.  A QPU selection returns -1
+ * when the context has a QPU (no chroma shader exists yet) and is
+ * downgraded to CPU when it does not.  Every other case runs the CPU
+ * path and returns its result. */
+int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    const daedalus_substrate chosen =
+        (sub == DAEDALUS_SUBSTRATE_AUTO)
+            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
+            : sub;
+
+    /* QPU hardware present but no chroma QPU shader to run on it. */
+    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
+        return -1;
+
+    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
+}
+
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
                                                  dst, dst_stride, n_edges, meta);
 }

+/* Recipe entry point for the chroma V filter: forwards every argument to
+ * daedalus_dispatch_h264_deblock_chroma_v() with DAEDALUS_SUBSTRATE_AUTO
+ * and returns that call's result unchanged. */
+int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                                    dst, dst_stride, n_edges, meta);
+}
+
+/* Recipe entry point for the chroma H filter: forwards every argument to
+ * daedalus_dispatch_h264_deblock_chroma_h() with DAEDALUS_SUBSTRATE_AUTO
+ * and returns that call's result unchanged. */
+int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                                    dst, dst_stride, n_edges, meta);
+}
+
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -0,0 +1,110 @@
+/*
+ * Standalone bit-exact C reference for H.264 chroma loop filters
+ * (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
+ * when added).  Covers both orientations:
+ *
+ *   v_loop_filter_chroma: filter applied VERTICALLY across a
+ *     HORIZONTAL edge.  Tile is 8 cols × 4 rows of context
+ *     (rows -2..+1); pix points to row 0 of the bottom block.
+ *   h_loop_filter_chroma: filter applied HORIZONTALLY across a
+ *     VERTICAL edge.  Tile is 4 cols × 8 rows of context
+ *     (cols -2..+1); pix points to col 0 of the right block.
+ *
+ * Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
+ * `ff_h264_h_loop_filter_chroma_neon` (line 430) in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
+ *
+ * Algorithm per H.264 §8.7.2.3 (filtering for edges with bS < 4, chroma):
+ *   - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
+ *   - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
+ *   - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
+ *   - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
+ *   - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
+ *
+ * tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
+ * (matches the 4:2:0 chroma edge geometry in both orientations —
+ * 8 cols for a V edge, 8 rows for an H edge).
+ *
+ * Signature (matches FFmpeg + the existing luma refs):
+ *   void(uint8_t *pix, ptrdiff_t stride,
+ *        int alpha, int beta, int8_t tc0[4]);
+ *
+ * License: LGPL-2.1-or-later (matches FFmpeg upstream).
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Saturate v to the unsigned 8-bit range [0, 255]. */
+static inline int clip_u8(int v)
+{
+    if (v < 0)
+        return 0;
+    return v > 255 ? 255 : v;
+}
+
+/* Clamp v to [lo, hi]; the lower bound is tested first. */
+static inline int clip3(int v, int lo, int hi)
+{
+    if (v < lo)
+        return lo;
+    if (v > hi)
+        return hi;
+    return v;
+}
+
+/* Absolute value of x. */
+static inline int abs_i(int x) { return x >= 0 ? x : -x; }
+
+/* Filter one chroma sample column across a horizontal edge.
+ * Samples are read at pix[-2*stride] (p1), pix[-stride] (p0), pix[0] (q0)
+ * and pix[+stride] (q1); only p0 and q0 are ever written.
+ * NOTE(review): FFmpeg's C template appears to use the tc0 value as the
+ * chroma clip limit directly (its callers pre-add the +1) — confirm the
+ * NEON routine this reference is compared against expects the same tc0
+ * convention as the "+ 1" applied here. */
+static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
+                                int alpha, int beta, int tc0_s)
+{
+    uint8_t *const p0_ptr = pix - stride;
+    const int p1 = pix[-2 * stride];
+    const int p0 = *p0_ptr;
+    const int q0 = pix[0];
+    const int q1 = pix[stride];
+
+    /* Leave the cell untouched unless all three gradients are below
+     * their thresholds. */
+    const int edge_ok = abs_i(p0 - q0) < alpha &&
+                        abs_i(p1 - p0) < beta &&
+                        abs_i(q1 - q0) < beta;
+    if (!edge_ok)
+        return;
+
+    const int limit = tc0_s + 1;
+    const int raw = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
+    const int step = clip3(raw, -limit, limit);
+
+    *p0_ptr = (uint8_t) clip_u8(p0 + step);
+    pix[0]  = (uint8_t) clip_u8(q0 - step);
+}
+
+/* Filter one chroma sample row across a vertical edge.
+ * Samples are read at pix[-2] (p1), pix[-1] (p0), pix[0] (q0) and
+ * pix[+1] (q1); only p0 and q0 are ever written. */
+static void h264_chroma_cell_h(uint8_t *pix,
+                                int alpha, int beta, int tc0_s)
+{
+    uint8_t *const p0_ptr = pix - 1;
+    const int p1 = pix[-2];
+    const int p0 = *p0_ptr;
+    const int q0 = pix[0];
+    const int q1 = pix[1];
+
+    /* Leave the cell untouched unless all three gradients are below
+     * their thresholds. */
+    const int edge_ok = abs_i(p0 - q0) < alpha &&
+                        abs_i(p1 - p0) < beta &&
+                        abs_i(q1 - q0) < beta;
+    if (!edge_ok)
+        return;
+
+    const int limit = tc0_s + 1;
+    const int raw = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
+    const int step = clip3(raw, -limit, limit);
+
+    *p0_ptr = (uint8_t) clip_u8(p0 + step);
+    pix[0]  = (uint8_t) clip_u8(q0 - step);
+}
+
+/* Reference chroma filter across a horizontal edge, 8 columns wide.
+ * pix addresses the first row below the edge; column c uses tc0[c / 2].
+ * Nothing is filtered when alpha or beta is 0, and any segment whose tc0
+ * is negative is skipped. */
+void daedalus_h264_v_loop_filter_chroma_ref(
+    uint8_t *pix, ptrdiff_t stride,
+    int alpha, int beta, int8_t tc0[4])
+{
+    if (alpha == 0 || beta == 0)
+        return;
+    /* Nothing to do unless at least one segment is enabled. */
+    if (!(tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0))
+        return;
+
+    for (int col = 0; col < 8; col++) {
+        const int seg_tc0 = tc0[col / 2];   /* two columns share a segment */
+
+        if (seg_tc0 >= 0)
+            h264_chroma_cell_v(pix + col, stride, alpha, beta, seg_tc0);
+    }
+}
+
+/* Reference chroma filter across a vertical edge, 8 rows tall.
+ * pix addresses the first column right of the edge; row r uses tc0[r / 2].
+ * Nothing is filtered when alpha or beta is 0, and any segment whose tc0
+ * is negative is skipped. */
+void daedalus_h264_h_loop_filter_chroma_ref(
+    uint8_t *pix, ptrdiff_t stride,
+    int alpha, int beta, int8_t tc0[4])
+{
+    if (alpha == 0 || beta == 0)
+        return;
+    /* Nothing to do unless at least one segment is enabled. */
+    if (!(tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0))
+        return;
+
+    for (int row = 0; row < 8; row++) {
+        const int seg_tc0 = tc0[row / 2];   /* two rows share a segment */
+
+        if (seg_tc0 >= 0)
+            h264_chroma_cell_h(pix + row * stride, alpha, beta, seg_tc0);
+    }
+}
@@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                     int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
+                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -191,6 +195,89 @@ static int test_deblock_h(void)
    return diff == 0 ? 0 : 1;
 }

+/* Bit-exactness check of the chroma V recipe dispatch against the C
+ * reference on 8 independent tiles of pseudo-random data.
+ * Returns 0 when every byte matches, 1 on any failure.  The context is
+ * released on every path after it has been created. */
+static int test_deblock_chroma_v(void)
+{
+    /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
+     * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
+    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
+           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
+           EDGE_OFF = EDGE_ROW * TILE_STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    /* NOTE(review): samples are uniformly random bytes and beta <= 16, so
+     * most cells presumably fail the beta gates and stay untouched;
+     * consider adding smooth tiles so the filter arithmetic is actually
+     * exercised. */
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            /* r == 0 disables the segment (tc0 = -1); otherwise tc0 is 0..6. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    /* Expected output: run the reference on the shadow buffer. */
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                                 meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
+                                                              N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
+        daedalus_ctx_destroy(ctx);  /* do not leak the context on failure */
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+/* Bit-exactness check of the chroma H recipe dispatch against the C
+ * reference on 8 independent tiles of pseudo-random data.
+ * Returns 0 when every byte matches, 1 on any failure.  The context is
+ * released on every path after it has been created. */
+static int test_deblock_chroma_h(void)
+{
+    /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
+     * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
+    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
+           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
+           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_deblock_meta meta[N_EDGES];
+
+    /* NOTE(review): samples are uniformly random bytes and beta <= 16, so
+     * most cells presumably fail the beta gates and stay untouched;
+     * consider adding smooth tiles so the filter arithmetic is actually
+     * exercised. */
+    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N_EDGES; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta  = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            /* r == 0 disables the segment (tc0 = -1); otherwise tc0 is 0..6. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    /* Expected output: run the reference on the shadow buffer. */
+    for (int i = 0; i < N_EDGES; i++) {
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
+        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
+                                                 meta[i].alpha, meta[i].beta, tc0_local);
+    }
+
+    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
+                                                              N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
+        daedalus_ctx_destroy(ctx);  /* do not leak the context on failure */
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -245,12 +332,18 @@ int main(void)

    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
+    printf("  H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
+    printf("  H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));

    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
    fail |= test_deblock_h();
+    fail |= test_deblock_chroma_v();
+    fail |= test_deblock_chroma_h();
    fail |= test_qpel_mc20();
    return fail;
 }