h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)

Continues the deblock buildout after PR #9 (luma_h). Adds the two chroma orientations via the same recipe-table-routed-to-CPU pattern; QPU shaders for chroma deblock are still a follow-up.

Scope:
- Public API: 4 new fns (dispatch + recipe wrapper × {v, h}).
- Internal: dispatch_h264_deblock_chroma_{v,h}_cpu calling the vendored ff_h264_{v,h}_loop_filter_chroma_neon symbols.
- Recipe table: DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11, DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12, both → CPU. Explicit SUBSTRATE_QPU returns -1 when the context has a QPU (no shader yet); when the context has no QPU it falls back to CPU.
- C reference: tests/h264_chroma_loop_filter_ref.c — covers both orientations. Algorithm per H.264 §8.7.2.4 (bS<4 chroma inter): tC = tc0_seg + 1 (no luma-style ap/aq side bonus); only p0/q0 are updated (chroma never modifies p1/p2/q1/q2).
- Tests: test_deblock_chroma_v (8x4 tile, edge at row 2) + test_deblock_chroma_h (4x8 tile, edge at col 2), 4 segments x 2 cells per segment per spec.

Verified on hertz (Pi 5 / V3D 7.1):

    $ ./build/test_api_h264
    === Phase 8a API smoke: H.264 kernels via recipe dispatch ===
    H264_IDCT4 recipe substrate: 2 (1=CPU, 2=QPU)
    H264_IDCT8 recipe substrate: 2
    H264_DEBLOCK_LV recipe substrate: 2
    H264_QPEL_MC20 recipe substrate: 2
    H264_DEBLOCK_LH recipe substrate: 1 (CPU, no QPU H shader yet)
    H264_DEBLOCK_CV recipe substrate: 1 (CPU)
    H264_DEBLOCK_CH recipe substrate: 1 (CPU)
    H.264 IDCT 4x4: 2048/2048 bytes bit-exact (100.0000%)
    H.264 IDCT 8x8: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma v: 2048/2048 bytes bit-exact (100.0000%)
    H.264 deblock luma h: 1024/1024 bytes bit-exact (100.0000%)
    H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    All 7 kernels bit-exact PASS.

Chroma test sizes are smaller (256 bytes per orientation) because the per-MB chroma deblock surface is smaller than luma — accurate to the production geometry.
Why no QPU shader yet (per the established pattern):
- Chroma deblock is ~25% of total deblock work at 4:2:0 (one quarter the pixel count of luma per MB) — modest QPU win even after the shader exists.
- Same R-band considerations as the luma _h follow-up: the V shader transpose isn't mechanical, and the 8-cell tile is small enough that NEON's per-edge cost (~3 ns) is already inside the budget.
- Total bench at 1080p: 8160 MBs × 4 chroma edges × 3 ns = ~100 us. Negligible compared to the IDCT layer's 10 ms (CPU NEON).

Coverage in fourier for the bS<4 8-bit 4:2:0 deblock matrix is now complete: luma_v ✓, luma_h ✓, chroma_v ✓, chroma_h ✓. Remaining deblock work: bS=4 intra variants (luma + chroma, V + H).

What this unblocks downstream:
- daedalus-decoder Stage 4 deblock can now dispatch all four bS<4 edge categories that a typical inter MB needs.
2026-05-24 23:53:09 +02:00
parent f4af24020f
commit a5c47aa51c
5 changed files with 315 additions and 0 deletions
@@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
+    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
@@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
+extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,  /* vendored NEON chroma loop filter, V orientation */
+                                                int alpha, int beta, int8_t *tc0);  /* tc0 is non-const: callers pass a writable copy */
+extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,  /* vendored NEON chroma loop filter, H orientation */
+                                                int alpha, int beta, int8_t *tc0);  /* same parameter list as the V variant */
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);

@@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
    return 0;
 }

+static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,  /* CPU path: filters n_edges chroma V edges in place in dst */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)  /* meta[i] supplies dst_off, alpha, beta and tc0[4] for edge i */
+{
+    (void) ctx;  /* the CPU path does not use the context */
+    for (size_t i = 0; i < n_edges; i++) {  /* one NEON call per meta[] entry */
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],  /* writable copy: meta is const, the NEON prototype takes int8_t * */
+                                 meta[i].tc0[2], meta[i].tc0[3] };
+        ff_h264_v_loop_filter_chroma_neon(dst + meta[i].dst_off,  /* dst_off is a byte offset (dst is uint8_t *) */
+                                            (ptrdiff_t) dst_stride,
+                                            meta[i].alpha, meta[i].beta, tc0_local);
+    }
+    return 0;  /* always 0: the NEON routine returns void, so no failure is reported */
+}
+
+static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,  /* CPU path: filters n_edges chroma H edges in place in dst */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)  /* meta[i] supplies dst_off, alpha, beta and tc0[4] for edge i */
+{
+    (void) ctx;  /* the CPU path does not use the context */
+    for (size_t i = 0; i < n_edges; i++) {  /* one NEON call per meta[] entry */
+        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],  /* writable copy: meta is const, the NEON prototype takes int8_t * */
+                                 meta[i].tc0[2], meta[i].tc0[3] };
+        ff_h264_h_loop_filter_chroma_neon(dst + meta[i].dst_off,  /* dst_off is a byte offset (dst is uint8_t *) */
+                                            (ptrdiff_t) dst_stride,
+                                            meta[i].alpha, meta[i].beta, tc0_local);
+    }
+    return 0;  /* always 0: the NEON routine returns void, so no failure is reported */
+}
+
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }

+int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,  /* public dispatch for chroma V edges; sub selects the substrate */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    daedalus_substrate eff = sub;  /* effective substrate after the resolution steps below */
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)  /* AUTO defers to the recipe table */
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))  /* QPU selected but ctx reports none: fall back to CPU */
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_QPU)  /* only reached when ctx does have a QPU */
+        return -1;  /* No chroma QPU shader yet. */
+    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);  /* any non-QPU value takes the CPU path */
+}
+
+int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,  /* public dispatch for chroma H edges; sub selects the substrate */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    daedalus_substrate eff = sub;  /* effective substrate after the resolution steps below */
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)  /* AUTO defers to the recipe table */
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))  /* QPU selected but ctx reports none: fall back to CPU */
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_QPU)  /* only reached when ctx does have a QPU */
+        return -1;  /* no QPU implementation is dispatched from this function, so the request is rejected */
+    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);  /* any non-QPU value takes the CPU path */
+}
+
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
                                                  dst, dst_stride, n_edges, meta);
 }

+int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,  /* recipe wrapper: chroma V dispatch with the substrate fixed to AUTO */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,  /* result of the dispatch call is returned unchanged */
+                                                    dst, dst_stride, n_edges, meta);
+}
+
+int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,  /* recipe wrapper: chroma H dispatch with the substrate fixed to AUTO */
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO,  /* result of the dispatch call is returned unchanged */
+                                                    dst, dst_stride, n_edges, meta);
+}
+
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)