Merge pull request 'h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)' (#10) from noether/h264-deblock-chroma into main

Reviewed-on: #10
2026-05-24 21:55:57 +00:00
parent f4af24020f a5c47aa51c
commit ce436bfd96
5 changed files with 315 additions and 0 deletions
@@ -520,6 +520,7 @@ add_executable(test_api_h264
    tests/h264_idct8_ref.c
    tests/h264_deblock_ref.c
    tests/h264_h_loop_filter_luma_ref.c
    tests/h264_chroma_loop_filter_ref.c
    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
@@ -286,6 +286,35 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* H.264 chroma (4:2:0) loop filters — bS<4 variant.  Chroma uses
 * the SAME daedalus_h264_deblock_meta struct as luma but on smaller
 * tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
 * rows for H (4 segments of 2 rows).  Each segment has its own tc0
 * strength (tc0[s] applies to both cells in segment s).
 *
 * Algorithm difference vs luma: chroma updates only p0 and q0
 * (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
 * luma-style ap/aq side-condition bonus).
 *
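 * QPU shaders for chroma deblock are not implemented yet; the recipe
 * table routes AUTO to CPU NEON.  An explicit SUBSTRATE_QPU request
 * returns -1 on a QPU-capable context and falls back to CPU NEON
 * when the context has no QPU.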
 */
 /* Chroma V filter, recipe entry point: forwards to
 * daedalus_dispatch_h264_deblock_chroma_v with the AUTO substrate.
 * meta holds n_edges entries; each entry's dst_off is added to dst to
 * locate its edge. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* Chroma V filter with an explicit substrate.  Returns 0 from the CPU
 * path, or -1 when QPU is selected on a context that has a QPU (no
 * chroma shader exists). */
 int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* Chroma H filter, recipe entry point: forwards to
 * daedalus_dispatch_h264_deblock_chroma_h with the AUTO substrate. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* Chroma H filter with an explicit substrate; same return convention
 * as the V variant. */
 int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);
 /* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
@@ -333,6 +362,8 @@ typedef enum {
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
    DAEDALUS_KERNEL_H264_QPEL_MC20  = 9,
    DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
    DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
    DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
 } daedalus_kernel;
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:  return DAEDALUS_SUBSTRATE_CPU;	/* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:  return DAEDALUS_SUBSTRATE_CPU;	/* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    }
    return DAEDALUS_SUBSTRATE_CPU;
@@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
 /* Chroma loop-filter kernels (vertical and horizontal) invoked by the
 * CPU dispatch paths in this file.  Note that tc0 is taken through a
 * non-const pointer, so callers holding const data pass a copy. */
 extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
 extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                                int alpha, int beta, int8_t *tc0);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
@@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
    return 0;
 }
 /*
 * CPU path for the chroma V filter.
 *
 * Runs the NEON kernel once for each of the n_edges entries in meta.
 * An entry supplies the offset of its edge inside dst, the alpha/beta
 * thresholds and four tc0 strengths.  ctx is not used.  Always
 * returns 0.
 */
 static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const ptrdiff_t row_stride = (ptrdiff_t) dst_stride;
    size_t edge = 0;

    (void) ctx;
    while (edge < n_edges) {
        const daedalus_h264_deblock_meta *cur = &meta[edge];
        /* The kernel takes a mutable int8_t pointer while meta is
         * const, so hand it a private copy of the four strengths. */
        int8_t strengths[4];
        for (int seg = 0; seg < 4; seg++) {
            strengths[seg] = cur->tc0[seg];
        }
        ff_h264_v_loop_filter_chroma_neon(dst + cur->dst_off, row_stride,
                                          cur->alpha, cur->beta, strengths);
        edge++;
    }
    return 0;
 }
 /*
 * CPU path for the chroma H filter.
 *
 * Runs the NEON kernel once for each of the n_edges entries in meta.
 * An entry supplies the offset of its edge inside dst, the alpha/beta
 * thresholds and four tc0 strengths.  ctx is not used.  Always
 * returns 0.
 */
 static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const ptrdiff_t row_stride = (ptrdiff_t) dst_stride;
    size_t edge = 0;

    (void) ctx;
    while (edge < n_edges) {
        const daedalus_h264_deblock_meta *cur = &meta[edge];
        /* The kernel takes a mutable int8_t pointer while meta is
         * const, so hand it a private copy of the four strengths. */
        int8_t strengths[4];
        for (int seg = 0; seg < 4; seg++) {
            strengths[seg] = cur->tc0[seg];
        }
        ff_h264_h_loop_filter_chroma_neon(dst + cur->dst_off, row_stride,
                                          cur->alpha, cur->beta, strengths);
        edge++;
    }
    return 0;
 }
 static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 /*
 * Substrate-selecting entry point for the chroma V filter.
 *
 * AUTO is resolved through the recipe table.  A QPU selection on a
 * context that reports a QPU fails with -1 because no chroma QPU
 * shader exists yet.  Every other case, including a QPU selection on
 * a context without a QPU, runs the CPU path and returns its result.
 */
 int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    daedalus_substrate chosen = sub;

    if (chosen == DAEDALUS_SUBSTRATE_AUTO) {
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV);
    }
    /* The context is only queried when QPU was actually selected. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx)) {
        return -1;  /* No chroma QPU shader yet. */
    }
    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 /*
 * Substrate-selecting entry point for the chroma H filter.
 *
 * AUTO is resolved through the recipe table.  A QPU selection on a
 * context that reports a QPU fails with -1 because no chroma QPU
 * shader exists yet.  Every other case, including a QPU selection on
 * a context without a QPU, runs the CPU path and returns its result.
 */
 int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    daedalus_substrate chosen = sub;

    if (chosen == DAEDALUS_SUBSTRATE_AUTO) {
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH);
    }
    /* The context is only queried when QPU was actually selected. */
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx)) {
        return -1;  /* No chroma QPU shader yet. */
    }
    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
 }
 int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
                                                  dst, dst_stride, n_edges, meta);
 }
 /* Recipe-driven chroma V filter: lets the dispatcher pick the
 * substrate by passing AUTO, and returns the dispatcher's status. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const daedalus_substrate by_recipe = DAEDALUS_SUBSTRATE_AUTO;
    const int status = daedalus_dispatch_h264_deblock_chroma_v(
        ctx, by_recipe, dst, dst_stride, n_edges, meta);
    return status;
 }
 /* Recipe-driven chroma H filter: lets the dispatcher pick the
 * substrate by passing AUTO, and returns the dispatcher's status. */
 int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
    const daedalus_substrate by_recipe = DAEDALUS_SUBSTRATE_AUTO;
    const int status = daedalus_dispatch_h264_deblock_chroma_h(
        ctx, by_recipe, dst, dst_stride, n_edges, meta);
    return status;
 }
 int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -0,0 +1,110 @@
 /*
 * Standalone bit-exact C reference for H.264 chroma loop filters
 * (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
 * when added).  Covers both orientations:
 *
 *   v_loop_filter_chroma: filter applied VERTICALLY across a
 *     HORIZONTAL edge.  Tile is 8 cols × 4 rows of context
 *     (rows -2..+1); pix points to row 0 of the bottom block.
 *   h_loop_filter_chroma: filter applied HORIZONTALLY across a
 *     VERTICAL edge.  Tile is 4 cols × 8 rows of context
 *     (cols -2..+1); pix points to col 0 of the right block.
 *
 * Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
 * `ff_h264_h_loop_filter_chroma_neon` (line 430) in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
 *
 * Algorithm per H.264 §8.7.2.3 (filtering for edges with bS<4, chroma):
 *   - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
 *   - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
 *   - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
 *   - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
 *   - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
 *
 * tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
 * (matches the 4:2:0 chroma plane geometry in both orientations —
 * 8 cols for a V edge or 8 rows for an H edge).
 *
 * Signature (matches FFmpeg + the existing luma refs):
 *   void(uint8_t *pix, ptrdiff_t stride,
 *        int alpha, int beta, int8_t tc0[4]);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream).
 */
 #include <stdint.h>
 #include <stddef.h>
 /* Saturate v to the unsigned 8-bit range [0, 255]. */
 static inline int clip_u8(int v)
 {
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
 }
 /* Clamp v into [lo, hi]; the lower bound is checked first. */
 static inline int clip3(int v, int lo, int hi)
 {
    if (v < lo) {
        return lo;
    }
    if (v > hi) {
        return hi;
    }
    return v;
 }
 /* Absolute value of x. */
 static inline int abs_i(int x)
 {
    return x >= 0 ? x : -x;
 }
 /* Filters one column across a horizontal edge.  pix addresses q0;
 * p0 and p1 sit one and two rows above it, q1 one row below.  Only
 * p0 and q0 are ever written, and only when all three threshold
 * tests pass.
 *
 * NOTE(review): the clip bound used here is tc0_s + 1.  Confirm the
 * NEON kernel this reference is compared against applies the same +1
 * to the tc0 values it is handed rather than using them as the bound
 * directly. */
 static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
                                 int alpha, int beta, int tc0_s)
 {
    uint8_t *const above = pix - stride;
    const int p1 = above[-stride];
    const int p0 = above[0];
    const int q0 = pix[0];
    const int q1 = pix[stride];

    const int edge_active = abs_i(p0 - q0) < alpha
                         && abs_i(p1 - p0) < beta
                         && abs_i(q1 - q0) < beta;
    if (!edge_active) {
        return;
    }

    const int bound = tc0_s + 1;
    const int raw = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -bound, bound);
    above[0] = (uint8_t) clip_u8(p0 + delta);
    pix[0]   = (uint8_t) clip_u8(q0 - delta);
 }
 /* Filters one row across a vertical edge.  pix addresses q0; p0 and
 * p1 are the one and two bytes before it, q1 the byte after.  Only
 * p0 and q0 are ever written, and only when all three threshold
 * tests pass.
 *
 * NOTE(review): the clip bound used here is tc0_s + 1.  Confirm the
 * NEON kernel this reference is compared against applies the same +1
 * to the tc0 values it is handed rather than using them as the bound
 * directly. */
 static void h264_chroma_cell_h(uint8_t *pix,
                                 int alpha, int beta, int tc0_s)
 {
    uint8_t *const left = pix - 1;
    const int p1 = left[-1];
    const int p0 = left[0];
    const int q0 = pix[0];
    const int q1 = pix[1];

    const int edge_active = abs_i(p0 - q0) < alpha
                         && abs_i(p1 - p0) < beta
                         && abs_i(q1 - q0) < beta;
    if (!edge_active) {
        return;
    }

    const int bound = tc0_s + 1;
    const int raw = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta = clip3(raw, -bound, bound);
    left[0] = (uint8_t) clip_u8(p0 + delta);
    pix[0]  = (uint8_t) clip_u8(q0 - delta);
 }
 /*
 * Reference chroma filter across a horizontal edge, 8 columns wide.
 *
 * pix addresses the first row below the edge.  Nothing is touched
 * when alpha or beta is zero, or when all four tc0 entries are
 * negative.  Otherwise column c is filtered with tc0[c / 2], and
 * columns whose tc0 entry is negative are left alone.
 */
 void daedalus_h264_v_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
 {
    enum { SEGMENTS = 4, COLS_PER_SEGMENT = 2 };

    if (alpha == 0 || beta == 0) {
        return;
    }

    int any_enabled = 0;
    for (int seg = 0; seg < SEGMENTS; seg++) {
        if (tc0[seg] >= 0) {
            any_enabled = 1;
        }
    }
    if (!any_enabled) {
        return;
    }

    for (int col = 0; col < SEGMENTS * COLS_PER_SEGMENT; col++) {
        const int strength = tc0[col / COLS_PER_SEGMENT];
        if (strength >= 0) {
            h264_chroma_cell_v(pix + col, stride, alpha, beta, strength);
        }
    }
 }
 /*
 * Reference chroma filter across a vertical edge, 8 rows tall.
 *
 * pix addresses the first column right of the edge in row 0.  Nothing
 * is touched when alpha or beta is zero, or when all four tc0 entries
 * are negative.  Otherwise row r is filtered with tc0[r / 2], and
 * rows whose tc0 entry is negative are left alone.
 */
 void daedalus_h264_h_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
 {
    enum { SEGMENTS = 4, ROWS_PER_SEGMENT = 2 };

    if (alpha == 0 || beta == 0) {
        return;
    }

    int any_enabled = 0;
    for (int seg = 0; seg < SEGMENTS; seg++) {
        if (tc0[seg] >= 0) {
            any_enabled = 1;
        }
    }
    if (!any_enabled) {
        return;
    }

    for (int row = 0; row < SEGMENTS * ROWS_PER_SEGMENT; row++) {
        const int strength = tc0[row / ROWS_PER_SEGMENT];
        if (strength >= 0) {
            h264_chroma_cell_h(pix + row * stride, alpha, beta, strength);
        }
    }
 }
@@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
 /* C reference chroma loop filters (vertical and horizontal); the
 * chroma tests in this file compare dispatch output against them. */
 extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
                                                     int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -191,6 +195,89 @@ static int test_deblock_h(void)
    return diff == 0 ? 0 : 1;
 }
 /*
 * Checks that the recipe-dispatched chroma V filter is byte-identical
 * to the C reference on N_EDGES independent tiles.
 *
 * Both buffers start from the same pseudo-random bytes; each tile gets
 * random alpha in 1..64, beta in 1..16 and tc0 values in -1..6.
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 *
 * NOTE(review): the pixels are uniformly random, so the alpha/beta
 * gates may rarely pass and the clipping path may see little
 * coverage — consider adding smooth tiles; confirm.
 */
 static int test_deblock_chroma_v(void)
 {
    /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
     * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];
    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }
    /* Reference pass: the ref takes a mutable tc0, so copy it out. */
    for (int i = 0; i < N_EDGES; i++) {
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                                 meta[i].alpha, meta[i].beta, tc0_local);
    }
    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
                                                              N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
        /* Release the context on the failure path as well. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }
    int diff = 0;
    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
    printf("  H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
 }
 /*
 * Checks that the recipe-dispatched chroma H filter is byte-identical
 * to the C reference on N_EDGES independent tiles.
 *
 * Both buffers start from the same pseudo-random bytes; each tile gets
 * random alpha in 1..64, beta in 1..16 and tc0 values in -1..6.
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 *
 * NOTE(review): the pixels are uniformly random, so the alpha/beta
 * gates may rarely pass and the clipping path may see little
 * coverage — consider adding smooth tiles; confirm.
 */
 static int test_deblock_chroma_h(void)
 {
    /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
     * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];
    for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }
    /* Reference pass: the ref takes a mutable tc0, so copy it out. */
    for (int i = 0; i < N_EDGES; i++) {
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                                 meta[i].alpha, meta[i].beta, tc0_local);
    }
    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
                                                              N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
        /* Release the context on the failure path as well. */
        daedalus_ctx_destroy(ctx);
        return 1;
    }
    int diff = 0;
    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
    printf("  H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
 }
 static int test_qpel_mc20(void)
 {
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -245,12 +332,18 @@ int main(void)
    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
    printf("  H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
    printf("  H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
    fail |= test_deblock_h();
    fail |= test_deblock_chroma_v();
    fail |= test_deblock_chroma_h();
    fail |= test_qpel_mc20();
    return fail;
 }