From 20a4299c5cd603e129981456f156390eba9a074d Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 01:03:14 +0200 Subject: [PATCH] h264: qpel mc22 (2D half-pel, CPU/NEON) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the "j position" 2D half-pel via cascaded H + V 6-tap lowpass with intermediate 16-bit precision per H.264 §8.4.2.2.1. One of the most common qpel positions in real H.264 streams — many encoders emit 1/2-1/2 motion vectors as their best-RD choice. Algorithmically distinct from the 1D mc20/mc02 siblings: - Horizontal 6-tap produces 13 rows of int16 intermediate (no per-stage clip/round — full precision retained). - Vertical 6-tap on the intermediate, then +512 >> 10 (the double-shift compensates for both 6-tap scalings) + clip255. The intermediate-precision requirement means the C reference can't just be "call mc20 then mc02" — that would double-clip and produce the wrong result. The 13-row int16 tmp[] buffer is the central invariant. Scope (same pattern as mc02 PR #15): - Public API: daedalus_dispatch_h264_qpel_mc22 + recipe wrapper. - Internal: dispatch_h264_qpel_mc22_cpu calling ff_put_h264_qpel8_mc22_neon. - Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC22 = 18 → CPU. - C reference: tests/h264_qpel8_mc22_ref.c — explicit tmp[13][8] int16 staging buffer; spec-derived shifts and rounding. - Test: test_qpel_mc22 in test_api_h264, 8 tiles at 16×16 with output positioned at (SRC_ROW=3, SRC_COL=3) so the kernel's [-2 .. +10] read window stays in-tile. Verified on hertz: $ ./build/test_api_h264 | tail -5 H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%) H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%) H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%) H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%) All 13 H.264 kernels in api_smoke now bit-exact PASS. 
mc22 being right first try is meaningful — the +512 >> 10 scaling + int16 intermediate sequence has multiple sign/shift/clip pitfalls and any of them would surface on random inputs immediately. Coverage matrix update: put_ mc20 ✓ (QPU+CPU) put_ mc02 ✓ (CPU) put_ mc22 ✓ (CPU) → 12 single put_ positions still missing (¼/¾ + HV combos with L2 averaging). --- CMakeLists.txt | 1 + include/daedalus.h | 22 ++++++++++++ src/daedalus_core.c | 38 ++++++++++++++++++++ tests/h264_qpel8_mc22_ref.c | 70 +++++++++++++++++++++++++++++++++++++ tests/test_api_h264.c | 43 +++++++++++++++++++++++ 5 files changed, 174 insertions(+) create mode 100644 tests/h264_qpel8_mc22_ref.c diff --git a/CMakeLists.txt b/CMakeLists.txt index b48aaa9..38ced99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -524,6 +524,7 @@ add_executable(test_api_h264 tests/h264_intra_loop_filter_ref.c tests/h264_qpel8_mc20_ref.c tests/h264_qpel8_mc02_ref.c + tests/h264_qpel8_mc22_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_compile_options(test_api_h264 PRIVATE -O2) diff --git a/include/daedalus.h b/include/daedalus.h index ccb2260..d827d3e 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -415,6 +415,27 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta); +/* H.264 luma qpel mc22 (2D half-pel "j" position per spec §8.4.2.2.1). + * Horizontal 6-tap cascaded into vertical 6-tap with intermediate + * 16-bit precision; final +512 >> 10 with clip255. Common position + * in real H.264 streams. + * + * src + src_off points at row 0 col 0 of the OUTPUT block; the + * cascade reads rows -2..+10 (13 rows of context) and cols -2..+10 + * (13 cols of context). Caller must guarantee. + * + * QPU shader not implemented yet (the HV lowpass is the meatiest + * qpel kernel; structurally distinct from the 1D mc20 shader). + * Recipe routes AUTO to CPU NEON. 
Explicit SUBSTRATE_QPU returns -1 if the ctx has a QPU; without one it falls back to CPU. + */ +int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + +int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ @@ -436,6 +457,7 @@ typedef enum { DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16, DAEDALUS_KERNEL_H264_QPEL_MC02 = 17, + DAEDALUS_KERNEL_H264_QPEL_MC22 = 18, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 497d5df..45ecff5 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -139,6 +139,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU; case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */ case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */ + case DAEDALUS_KERNEL_H264_QPEL_MC22: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc22 shader pending (hv lowpass) */ } return DAEDALUS_SUBSTRATE_CPU; } @@ -181,6 +182,8 @@ extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); /* -------------------- CPU dispatch implementations -------------- */ @@ -421,6 +424,19 @@ static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx, return 
0; } +static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + (void) ctx; /* unused on the CPU path */ + for (size_t i = 0; i < n_blocks; i++) { /* one kernel call per meta entry */ + ff_put_h264_qpel8_mc22_neon(dst + meta[i].dst_off, + src + meta[i].src_off, + (ptrdiff_t) stride); /* same stride passed for dst and src */ + } + return 0; /* always reports success */ +} + /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */ typedef struct { @@ -1406,6 +1422,20 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub, return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta); } +int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + daedalus_substrate eff = sub; /* substrate actually used after AUTO/fallback resolution */ + if (eff == DAEDALUS_SUBSTRATE_AUTO) /* AUTO: take the recipe table's choice */ + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) /* QPU requested but ctx has none */ + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_QPU) + return -1; /* No mc22 QPU shader yet — QPU requests fail here, but only when the ctx has a QPU; 
otherwise the check above already downgraded them to CPU. */ + return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta); +} + /* -------------------- Recipe convenience wrappers --------------- */ int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx, @@ -1532,3 +1562,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, src, stride, n_blocks, meta); } + +int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, + uint8_t *dst, const uint8_t *src, size_t stride, + size_t n_blocks, const daedalus_h264_qpel_meta *meta) +{ + return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO, /* AUTO defers to the recipe table */ + dst, src, stride, n_blocks, meta); +} diff --git a/tests/h264_qpel8_mc22_ref.c b/tests/h264_qpel8_mc22_ref.c new file mode 100644 index 0000000..fda59d2 --- /dev/null +++ b/tests/h264_qpel8_mc22_ref.c @@ -0,0 +1,70 @@ +/* + * Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22 + * (2D half-pel, "put" variant). Cascade of horizontal 6-tap then + * vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage + * clip/round), final +512 >> 10 to scale back. + * + * Per H.264 §8.4.2.2.1, "j" position: + * + * tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1] + * - 5*s[r,c+2] + s[r,c+3] (16-bit signed) + * + * dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c] + * + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c] + * + 512) >> 10) + * + * The tmp[] array spans rows r-2 .. r+3 around each output row, so + * we need 13 intermediate rows (rows -2..+10 of the SOURCE + * neighbourhood) for 8 output rows. Caller's src must have 2 rows + * of top context + 3 rows of bottom context AND 2 cols of left + + * 3 cols of right context (FFmpeg's edge-emulated buffer provides + * this at the frame boundary; same contract as mc20). + * + * Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S + * line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon). 
+ * + * Signature: + * void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + * + * Same single-stride convention as mc20/mc02. + * + * License: LGPL-2.1-or-later. + */ +#include <stddef.h> +#include <stdint.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } + +void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +{ + /* 13 intermediate rows × 8 cols (for the 8 output rows + * dst[0..7][0..7], we need H-filtered source rows -2..+10 — the + * array itself is 0-based, so tmp[0..12] corresponds + * to source rows [-2..+10]). */ + int16_t tmp[13][8]; + for (int rr = 0; rr < 13; rr++) { + int src_row = rr - 2; /* maps tmp[0..12] → src rows [-2..+10] */ + const uint8_t *s = src + src_row * stride; + for (int c = 0; c < 8; c++) { + int v = (int) s[c - 2] - 5 * (int) s[c - 1] + + 20 * (int) s[c] + 20 * (int) s[c + 1] + - 5 * (int) s[c + 2] + (int) s[c + 3]; + tmp[rr][c] = (int16_t) v; /* v is in [-2550, 10710], so the cast is lossless */ + } + } + + for (int r = 0; r < 8; r++) { + /* Output row r uses source rows r-2..r+3, stored at tmp[r..r+5]. 
*/ + for (int c = 0; c < 8; c++) { + int v = tmp[r + 0][c] /* src row r-2 (index = row + 2) */ + - 5 * tmp[r + 1][c] /* src row r-1 */ + + 20 * tmp[r + 2][c] /* src row r+0 */ + + 20 * tmp[r + 3][c] /* src row r+1 */ + - 5 * tmp[r + 4][c] /* src row r+2 */ + + tmp[r + 5][c] /* src row r+3 */ + + 512; + dst[r * stride + c] = (uint8_t) clip_u8(v >> 10); + } + } +} diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c index 2c61fac..275a556 100644 --- a/tests/test_api_h264.c +++ b/tests/test_api_h264.c @@ -34,6 +34,8 @@ extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -441,6 +443,46 @@ static int test_qpel_mc02(void) return diff == 0 ? 0 : 1; } +static int test_qpel_mc22(void) +{ + /* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows + * top + 3 rows bottom of context per 8x8 output. Tile is 16x16 + * with output positioned at (SRC_ROW=3, SRC_COL=3) so the read + * range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. 
*/ + enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, + TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, + SRC_ROW = 3, SRC_COL = 3 }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_qpel_meta meta[N]; + + for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); + memset(dst, 0, sizeof(dst)); + memset(dst_ref, 0, sizeof(dst_ref)); + + for (int i = 0; i < N; i++) { + meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); + meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); + } + + for (int i = 0; i < N; i++) + daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off, + src + meta[i].src_off, + TILE_STRIDE); + + int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src, + TILE_STRIDE, N, meta); + if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n", + TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 0 : 1; +} + int main(void) { printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); @@ -472,5 +514,6 @@ int main(void) fail |= test_deblock_intra_all(); fail |= test_qpel_mc20(); fail |= test_qpel_mc02(); + fail |= test_qpel_mc22(); return fail; } -- 2.47.3