From c3301b0c2e6efb852b18558a13fdf8a1f9eb4b43 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 00:47:37 +0200 Subject: [PATCH] h264: qpel mc02 (vertical half-pel, CPU/NEON) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror of cycle 9's mc20 transposed to vertical orientation. Wires up the second qpel half-pel position via the vendored ff_put_h264_qpel8_mc02_neon symbol and closes the "missing vertical sibling" gap that mc20 left open since cycle 9. Scope: - Public API: daedalus_dispatch_h264_qpel_mc02 + recipe wrapper. - Internal: dispatch_h264_qpel_mc02_cpu calling the NEON entry. - Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC02 = 17 → CPU. Explicit SUBSTRATE_QPU returns -1 (no shader yet). - C reference: tests/h264_qpel8_mc02_ref.c — vertical 6-tap transpose of mc20 (reads src[(r±N)*stride + c] instead of src[r*stride + c±N]). - Test: test_qpel_mc02 in test_api_h264, 8 tiles × 16 cols × 16 rows, random input, bit-exact compare against the C ref. Verified on hertz: $ ./build/test_api_h264 ... H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%) H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%) All 12 H.264 kernels in the api_smoke now bit-exact PASS. Why CPU-only: same R-band logic as the deblock _h sibling pattern. mc02 at ~7.6 ns per 8x8 block on NEON (per the cycle 9 baseline measurements) gives ~700 us for 8160 MBs × 4 8x8 luma blocks at 1080p — comfortably inside the 33 ms budget. QPU shader is a fast-follow once the V vs H shader work is consolidated (the transpose for the V shader is not mechanical — different SIMD access pattern than the H shader). 
Coverage matrix update: qpel position put_ status avg_ status ------------- ----------- ----------- mc00 (copy) not wired not wired mc10 (¼-H) not wired not wired mc20 (½-H) ✓ QPU+CPU not wired mc30 (¾-H) not wired not wired mc01 (¼-V) not wired not wired mc02 (½-V) ✓ CPU not wired (this PR) mc03 (¾-V) not wired not wired mc11..mc33 not wired not wired 14 more qpel positions to go for the full put_ matrix (2 of 16 wired). Adding them follows the same template; each is a small contained PR. --- CMakeLists.txt | 1 + include/daedalus.h | 24 ++++++++++++++++++++ src/daedalus_core.c | 38 +++++++++++++++++++++++++++++++ tests/h264_qpel8_mc02_ref.c | 45 +++++++++++++++++++++++++++++++++++++ tests/test_api_h264.c | 43 +++++++++++++++++++++++++++++++++++ 5 files changed, 151 insertions(+) create mode 100644 tests/h264_qpel8_mc02_ref.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ebdac..b48aaa9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -523,6 +523,7 @@ add_executable(test_api_h264 tests/h264_chroma_loop_filter_ref.c tests/h264_intra_loop_filter_ref.c tests/h264_qpel8_mc20_ref.c + tests/h264_qpel8_mc02_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_compile_options(test_api_h264 PRIVATE -O2) diff --git a/include/daedalus.h b/include/daedalus.h index 02944e3..ccb2260 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -392,6 +392,29 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta); +/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20. + * 6-tap filter applied vertically: + * dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c] + * + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c] + * + 16) >> 5) + * + * Same single-stride convention as mc20. 
src + src_off points at + * row 0 col 0 of the OUTPUT block; the filter reads rows -2..+3, so + * the caller must guarantee 2 rows of top context and 3 rows of + * bottom context per block (FFmpeg edge-emulated buffer handles + * frame boundaries; same contract as mc20). + * + * No QPU shader yet; the recipe table routes AUTO to CPU NEON. Explicit + * DAEDALUS_SUBSTRATE_QPU returns -1 if ctx has a QPU, else runs on CPU. + */ +int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + +int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ @@ -412,6 +435,7 @@ typedef enum { DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16, + DAEDALUS_KERNEL_H264_QPEL_MC02 = 17, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 0334581..497d5df 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -138,6 +138,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */ case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU; case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */ + case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */ } return DAEDALUS_SUBSTRATE_CPU; } @@ -178,6 +179,8 @@ extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t 
stri int alpha, int beta); extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); /* -------------------- CPU dispatch implementations -------------- */ @@ -405,6 +408,19 @@ static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx, return 0; } +static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_blocks; i++) { + ff_put_h264_qpel8_mc02_neon(dst + meta[i].dst_off, + src + meta[i].src_off, + (ptrdiff_t) stride); + } + return 0; +} + /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */ typedef struct { @@ -1376,6 +1392,20 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, n_blocks, meta); } +int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_QPU) + return -1; /* No mc02 QPU shader yet — explicit QPU fast-fails. 
*/ + return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta); +} + /* -------------------- Recipe convenience wrappers --------------- */ int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx, @@ -1494,3 +1524,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, src, stride, n_blocks, meta); } + +int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, src, stride, n_blocks, meta); +} diff --git a/tests/h264_qpel8_mc02_ref.c b/tests/h264_qpel8_mc02_ref.c new file mode 100644 index 0000000..16dd2d7 --- /dev/null +++ b/tests/h264_qpel8_mc02_ref.c @@ -0,0 +1,45 @@ +/* + * Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02 + * (vertical half-pel, "put" variant). Mirror of mc20 with rows + * and columns transposed. 6-tap filter applied vertically: + * + * dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c] + * + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c] + * + 16) >> 5 ) + * + * Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S + * line 678, which tail-calls put_h264_qpel8_v_lowpass_neon). + * + * Signature: + * void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + * + * Both dst and src use the SAME stride. src points at row 0 col 0 + * of the output block; the filter reads rows -2..+3 (2 rows of top + * context, 3 rows of bottom context). Caller must guarantee the + * source buffer has those rows available (FFmpeg's edge-emulated + * buffer handles this at the frame boundary; matches the contract + * documented for mc20). + * + * License: LGPL-2.1-or-later. + */ +#include <stddef.h> +#include <stdint.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 
255 : v; } + +void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +{ + for (int r = 0; r < 8; r++) { + for (int c = 0; c < 8; c++) { + int s_m2 = src[(r - 2) * stride + c]; + int s_m1 = src[(r - 1) * stride + c]; + int s_0 = src[(r + 0) * stride + c]; + int s_p1 = src[(r + 1) * stride + c]; + int s_p2 = src[(r + 2) * stride + c]; + int s_p3 = src[(r + 3) * stride + c]; + int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16; + dst[r * stride + c] = (uint8_t) clip_u8(v >> 5); + } + } +} diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c index 4eeb505..2c61fac 100644 --- a/tests/test_api_h264.c +++ b/tests/test_api_h264.c @@ -32,6 +32,8 @@ extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t int alpha, int beta); extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); +extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -399,6 +401,46 @@ static int test_qpel_mc20(void) return diff == 0 ? 0 : 1; } +static int test_qpel_mc02(void) +{ + /* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel + * can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer. + * SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of + * the tile) and rows 8..10 below (rows 11..13). 
*/ + enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, + TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, + SRC_ROW = 3 }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_qpel_meta meta[N]; + + for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); + memset(dst, 0, sizeof(dst)); + memset(dst_ref, 0, sizeof(dst_ref)); + + for (int i = 0; i < N; i++) { + meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE); + meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE); + } + + for (int i = 0; i < N; i++) + daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off, + src + meta[i].src_off, + TILE_STRIDE); + + int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src, + TILE_STRIDE, N, meta); + if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); /* release ctx on the failure path too */ return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + int main(void) { printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); @@ -429,5 +471,6 @@ int main(void) fail |= test_deblock_chroma_h(); fail |= test_deblock_intra_all(); fail |= test_qpel_mc20(); + fail |= test_qpel_mc02(); return fail; }