diff --git a/CMakeLists.txt b/CMakeLists.txt index f5c7f4d..367c414 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -520,6 +520,7 @@ add_executable(test_api_h264 tests/h264_idct8_ref.c tests/h264_deblock_ref.c tests/h264_h_loop_filter_luma_ref.c + tests/h264_chroma_loop_filter_ref.c tests/h264_qpel8_mc20_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) diff --git a/include/daedalus.h b/include/daedalus.h index 1cb8f1f..8fcbebd 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -286,6 +286,35 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta); +/* H.264 chroma (4:2:0) loop filters — bS<4 variant. Chroma uses + * the SAME daedalus_h264_deblock_meta struct as luma but on smaller + * tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8 + * rows for H (4 segments of 2 rows). Each segment has its own tc0 + * strength (tc0[s] applies to both cells in segment s). + * + * Algorithm difference vs luma: chroma updates only p0 and q0 + * (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no + * luma-style ap/aq side-condition bonus). + * + * QPU shaders for chroma deblock are not implemented yet; AUTO routes + * to CPU NEON. SUBSTRATE_QPU returns -1 (or CPU if ctx has no QPU). 
+ */ +int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + /* ------------------------------------------------------------------- * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see @@ -333,6 +362,8 @@ typedef enum { DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8, DAEDALUS_KERNEL_H264_QPEL_MC20 = 9, DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, + DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11, + DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 34ac6d0..699beb3 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */ case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */ case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */ + case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */ + case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */ case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* 
v3d_h264_qpel_mc20.spv */ } return DAEDALUS_SUBSTRATE_CPU; @@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); +extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx, return 0; } +static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; /* the CPU path does not use the context */ + for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge record */ + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], + meta[i].tc0[2], meta[i].tc0[3] }; /* writable copy: meta is const, the NEON entry takes int8_t * */ + ff_h264_v_loop_filter_chroma_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta, tc0_local); + } + return 0; /* always reports success */ +} + +static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; /* the CPU path does not use the context */ + for (size_t i = 0; i < n_edges; i++) { /* one NEON call per edge record */ + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], + meta[i].tc0[2], meta[i].tc0[3] }; /* writable copy: meta is const, the NEON entry takes int8_t * */ + ff_h264_h_loop_filter_chroma_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta, tc0_local); + } + return 0; /* always reports success */ +} + static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta); } +int 
daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV); /* AUTO: use the recipe table's choice */ + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; /* QPU selected but ctx has none: fall back to CPU */ + if (eff == DAEDALUS_SUBSTRATE_QPU) + return -1; /* No chroma QPU shader yet. */ + return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta); +} + +int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH); /* AUTO: use the recipe table's choice */ + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; /* QPU selected but ctx has none: fall back to CPU */ + if (eff == DAEDALUS_SUBSTRATE_QPU) + return -1; /* No chroma QPU shader yet. */ + return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta); +} + int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, dst, dst_stride, n_edges, meta); } +int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO, /* recipe entry point: always AUTO */ + dst, dst_stride, n_edges, meta); +} + +int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO, /* recipe entry point: always AUTO */ + dst, 
dst_stride, n_edges, meta); +} + int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) diff --git a/tests/h264_chroma_loop_filter_ref.c b/tests/h264_chroma_loop_filter_ref.c new file mode 100644 index 0000000..5c46e51 --- /dev/null +++ b/tests/h264_chroma_loop_filter_ref.c @@ -0,0 +1,110 @@ +/* + * Standalone bit-exact C reference for H.264 chroma loop filters + * (bS < 4 variant; the "intra" / bS=4 variant will live in a + * separate file when added). Covers both orientations: + * + * v_loop_filter_chroma: filter applied VERTICALLY across a + * HORIZONTAL edge. Tile is 8 cols × 4 rows of context + * (rows -2..+1); pix points to row 0 of the bottom block. + * h_loop_filter_chroma: filter applied HORIZONTALLY across a + * VERTICAL edge. Tile is 4 cols × 8 rows of context + * (cols -2..+1); pix points to col 0 of the right block. + * + * Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and + * `ff_h264_h_loop_filter_chroma_neon` (line 430) in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S. + * + * Algorithm per H.264 §8.7.2.3 (chroma bS<4 inter): + * - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β. + * - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus). + * - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC). + * - p0' = clip255(p0+δ); q0' = clip255(q0-δ). + * - Chroma NEVER updates p1, p2, q1, q2 (unlike luma). + * + * tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge + * (matches the 4:2:0 chroma geometry in both orientations — 8 cols + * for a V edge or 8 rows for an H edge). + * + * Signature (matches FFmpeg + the existing luma refs): + * void(uint8_t *pix, ptrdiff_t stride, + * int alpha, int beta, int8_t tc0[4]); + * + * License: LGPL-2.1-or-later (matches FFmpeg upstream). + */ +#include <stdint.h> +#include <stddef.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 
255 : v; } +static inline int clip3(int v, int lo, int hi) { /* clamp v into [lo, hi] */ + return v < lo ? lo : v > hi ? hi : v; +} +static inline int abs_i(int x) { return x < 0 ? -x : x; } /* |x| */ + +/* Per-cell chroma filter, vertical-direction access (one column + * across the horizontal edge): reads p1/p0/q0/q1 at pix[-2*stride], + * pix[-1*stride], pix[0], pix[+1*stride]; writes only p0 and q0. */ +static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int tc0_s) +{ + int p1 = pix[-2*stride], p0 = pix[-1*stride]; + int q0 = pix[ 0*stride], q1 = pix[ 1*stride]; + if (abs_i(p0 - q0) >= alpha) return; /* needs |p0-q0| < alpha */ + if (abs_i(p1 - p0) >= beta) return; /* needs |p1-p0| < beta */ + if (abs_i(q1 - q0) >= beta) return; /* needs |q1-q0| < beta */ + int tc = tc0_s + 1; /* NOTE(review): FFmpeg callers appear to pass chroma tc0 already incremented; confirm the NEON path also clips to tc0+1 */ + int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc); /* >> on a negative sum is implementation-defined; assumes arithmetic shift */ + pix[-1*stride] = (uint8_t) clip_u8(p0 + delta); + pix[ 0*stride] = (uint8_t) clip_u8(q0 - delta); +} + +/* Same kernel, horizontal-direction access (one row across the + * vertical edge): reads pix[-2..+1], writes only pix[-1] and pix[0]. */ +static void h264_chroma_cell_h(uint8_t *pix, + int alpha, int beta, int tc0_s) +{ + int p1 = pix[-2], p0 = pix[-1]; + int q0 = pix[ 0], q1 = pix[ 1]; + if (abs_i(p0 - q0) >= alpha) return; /* needs |p0-q0| < alpha */ + if (abs_i(p1 - p0) >= beta) return; /* needs |p1-p0| < beta */ + if (abs_i(q1 - q0) >= beta) return; /* needs |q1-q0| < beta */ + int tc = tc0_s + 1; /* NOTE(review): same open question about the +1 as in h264_chroma_cell_v */ + int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc); /* assumes arithmetic >> for negative sums */ + pix[-1] = (uint8_t) clip_u8(p0 + delta); + pix[ 0] = (uint8_t) clip_u8(q0 - delta); +} + +void daedalus_h264_v_loop_filter_chroma_ref( + uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t tc0[4]) +{ + if (alpha == 0 || beta == 0) return; /* a zero threshold can never be satisfied by the < tests */ + if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) return; /* all four segments disabled */ + + /* 8 cols divided into 4 segments of 2 cols each. 
*/ + for (int s = 0; s < 4; s++) { + int tc0_s = tc0[s]; + if (tc0_s < 0) continue; /* negative tc0 skips this segment */ + for (int c = 0; c < 2; c++) { + int col = s * 2 + c; /* segment s covers columns 2s and 2s+1 */ + h264_chroma_cell_v(pix + col, stride, alpha, beta, tc0_s); + } + } +} + +void daedalus_h264_h_loop_filter_chroma_ref( + uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t tc0[4]) +{ + if (alpha == 0 || beta == 0) return; /* a zero threshold can never be satisfied by the < tests */ + if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) return; /* all four segments disabled */ + + /* 8 rows divided into 4 segments of 2 rows each. */ + for (int s = 0; s < 4; s++) { + int tc0_s = tc0[s]; + if (tc0_s < 0) continue; /* negative tc0 skips this segment */ + for (int r = 0; r < 2; r++) { + int row = s * 2 + r; /* segment s covers rows 2s and 2s+1 */ + h264_chroma_cell_h(pix + row * stride, alpha, beta, tc0_s); + } + } +} diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c index a0a9991..0254bd9 100644 --- a/tests/test_api_h264.c +++ b/tests/test_api_h264.c @@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); +extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t tc0[4]); +extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta, int8_t tc0[4]); extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, @@ -191,6 +195,89 @@ static int test_deblock_h(void) return diff == 0 ? 0 : 1; } +static int test_deblock_chroma_v(void) +{ + /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2 + * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). 
*/ + enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4, + TILE_BYTES = TILE_STRIDE * TILE_ROWS, + TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2, + EDGE_OFF = EDGE_ROW * TILE_STRIDE }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_deblock_meta meta[N_EDGES]; + + for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); /* identical random pixels in both buffers */ + for (int i = 0; i < N_EDGES; i++) { + meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; + meta[i].alpha = (int)(xs() % 64) + 1; /* 1..64 */ + meta[i].beta = (int)(xs() % 16) + 1; /* 1..16 */ + for (int s = 0; s < 4; s++) { + int r = (int)(xs() % 8); + meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); /* -1 (segment skipped) or 0..6 */ + } + } + + for (int i = 0; i < N_EDGES; i++) { + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; + daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, + meta[i].alpha, meta[i].beta, tc0_local); + } + + int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE, + N_EDGES, meta); + if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); /* do not leak ctx on failure */ return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + +static int test_deblock_chroma_h(void) +{ + /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2 + * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). 
*/ + enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8, + TILE_BYTES = TILE_STRIDE * TILE_ROWS, + TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_deblock_meta meta[N_EDGES]; + + for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); /* identical random pixels in both buffers */ + for (int i = 0; i < N_EDGES; i++) { + meta[i].dst_off = i * TILE_BYTES + EDGE_COL; + meta[i].alpha = (int)(xs() % 64) + 1; /* 1..64 */ + meta[i].beta = (int)(xs() % 16) + 1; /* 1..16 */ + for (int s = 0; s < 4; s++) { + int r = (int)(xs() % 8); + meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); /* -1 (segment skipped) or 0..6 */ + } + } + + for (int i = 0; i < N_EDGES; i++) { + int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; + daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, + meta[i].alpha, meta[i].beta, tc0_local); + } + + int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE, + N_EDGES, meta); + if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); /* do not leak ctx on failure */ return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + static int test_qpel_mc20(void) { /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. 
Each tile @@ -245,12 +332,18 @@ int main(void) printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)); + printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)); + printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)); int fail = 0; fail |= test_idct4(); fail |= test_idct8(); fail |= test_deblock(); fail |= test_deblock_h(); + fail |= test_deblock_chroma_v(); + fail |= test_deblock_chroma_h(); fail |= test_qpel_mc20(); return fail; }