From 8fdef27a7d0344a14449aaee28098b6aef0654ed Mon Sep 17 00:00:00 2001
From: claude-noether
Date: Sat, 23 May 2026 03:25:24 +0200
Subject: [PATCH] Phase 8c: H.264 luma qpel mc20 through public API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends daedalus-fourier with daedalus_recipe_dispatch_h264_qpel_mc20 so
libavcodec.so can route H264QpelContext.put_h264_qpel_pixels_tab[1][2]
through the recipe layer instead of calling ff_put_h264_qpel8_mc20_neon
directly.

API additions (header + library):
  - daedalus_h264_qpel_meta { dst_off, src_off }
  - daedalus_dispatch_h264_qpel_mc20(ctx, sub, dst, src, stride,
    n_blocks, meta)
  - daedalus_recipe_dispatch_h264_qpel_mc20(...) (AUTO wrapper)
  - DAEDALUS_KERNEL_H264_QPEL_MC20 = 9 in the recipe-query enum
  - daedalus_recipe_substrate_for() returns DAEDALUS_SUBSTRATE_CPU (the
    CPU NEON path) for cycle 9

The 6-tap horizontal half-pel filter signature matches FFmpeg's
H264QpelContext convention exactly: dst and src share a single stride
and src already points at output column 0 (filter reads cols -2..+3).
The API is single-stride so that the marfrit-packages FFmpeg shim is a
straight pointer pass with no buffer rearrangement.

Verdict per docs/k9_h264qpel_mc20.md: CPU NEON. Per-block 7.6 ns gives
135x margin over 30 fps 1080p; QPU dispatch floor at ~250 ns makes any
V3D shader strictly worse. Recipe table reflects that — the
recipe_dispatch entry is a one-line forward to the CPU path.

CMakeLists changes:
  - h264qpel_neon.S added to the daedalus_core static lib (only the
    bench targets owned it before; now the public API needs it too)
  - tests/h264_qpel8_mc20_ref.c added to the test_api_h264 target

Phase 8a/8b smoke gains a 4th case (test_qpel_mc20): 1024/1024 bytes
bit-exact via daedalus_recipe_dispatch_h264_qpel_mc20.

Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
---
 CMakeLists.txt        |  2 ++
 include/daedalus.h    | 34 ++++++++++++++++++++++++++++++++
 src/daedalus_core.c   | 35 +++++++++++++++++++++++++++++++++
 tests/test_api_h264.c | 45 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 116 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd38c3e..293cc6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -365,6 +365,7 @@ add_library(daedalus_core STATIC
     ${FFC_MC_SOURCES}
     ${FFASM_H264IDCT_SOURCES}
     ${FFASM_H264DSP_SOURCES}
+    ${FFASM_H264QPEL_SOURCES}
     ${DAV1D_CDEF_ASM_SOURCES}
     ${DAV1D_CDEF_C_SOURCES}
 )
@@ -458,6 +459,7 @@ add_executable(test_api_h264
     tests/h264_idct4_ref.c
     tests/h264_idct8_ref.c
     tests/h264_deblock_ref.c
+    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
diff --git a/include/daedalus.h b/include/daedalus.h
index 6a26866..ee434f9 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -263,6 +263,39 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
         uint8_t *dst, size_t dst_stride,
         size_t n_edges, const daedalus_h264_deblock_meta *meta);
 
+/* -------------------------------------------------------------------
+ * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
+ * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
+ * docs/k9_h264qpel_mc20.md for the R-band rationale).
+ *
+ * Per H.264 §8.4.2.2.1, horizontal half-pel luma 6-tap filter:
+ *   dst[r,c] = clip255((s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
+ *                       + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
+ *                       + 16) >> 5)
+ *
+ * Single-stride: dst and src share `stride`; this matches FFmpeg's
+ * H264QpelContext.put_h264_qpel_pixels_tab[][] convention and the
+ * vendored ff_put_h264_qpel8_mc20_neon signature.
+ *
+ * `src + src_off` points at the leftmost OUTPUT column (col 0); the
+ * filter reads cols -2..+3, so the caller must guarantee src has at
+ * least 2 pixels of left context and 3 pixels of right context per
+ * row. (FFmpeg already maintains an edge-emulated buffer for the
+ * frame boundary; this matches that contract.)
+ * ----------------------------------------------------------------- */
+typedef struct {
+    uint32_t dst_off; /* byte offset into dst (block top-left) */
+    uint32_t src_off; /* byte offset into src (col 0, row 0) */
+} daedalus_h264_qpel_meta;
+
+int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
+        uint8_t *dst, const uint8_t *src, size_t stride,
+        size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
+        uint8_t *dst, const uint8_t *src, size_t stride,
+        size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
 /* -------------------------------------------------------------------
  * Recipe query — what does the API recommend for each kernel?
  * ----------------------------------------------------------------- */
@@ -275,6 +308,7 @@ typedef enum {
     DAEDALUS_KERNEL_H264_IDCT4 = 6,
     DAEDALUS_KERNEL_H264_IDCT8 = 7,
     DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
+    DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
 } daedalus_kernel;
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 0cdec1e..fd7d73b 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -93,6 +93,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
     case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
     case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU;
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -117,6 +118,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
 extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta, int8_t *tc0);
+extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride);
 
 /* -------------------- CPU dispatch implementations -------------- */
 
@@ -226,6 +229,22 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
     return 0;
 }
 
+static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
+        uint8_t *dst, const uint8_t *src, size_t stride,
+        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    (void) ctx;
+    /* FFmpeg's NEON entry uses a single stride for both dst and src
+     * (H264QpelContext convention). Caller already guarantees this
+     * via the public API contract documented in daedalus.h. */
+    for (size_t i = 0; i < n_blocks; i++) {
+        ff_put_h264_qpel8_mc20_neon(dst + meta[i].dst_off,
+                                    src + meta[i].src_off,
+                                    (ptrdiff_t) stride);
+    }
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
 
 typedef struct {
@@ -811,6 +830,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
     return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }
 
+int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
+        uint8_t *dst, const uint8_t *src, size_t stride,
+        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_QPEL_MC20, dispatch_h264_qpel_mc20_cpu,
+                   dst, src, stride, n_blocks, meta);
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */
 
 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -881,3 +908,11 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
     return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
         dst, dst_stride, n_edges, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
+        uint8_t *dst, const uint8_t *src, size_t stride,
+        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                            dst, src, stride, n_blocks, meta);
+}
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index b9d29a2..412e3a7 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -18,6 +18,8 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
+                                             ptrdiff_t stride);
 
 static uint64_t xs_state = 0xa11264ULL;
 static inline uint64_t xs(void) {
@@ -143,6 +145,46 @@ static int test_deblock(void)
     return diff == 0 ? 0 : 1;
 }
 
+static int test_qpel_mc20(void)
+{
+    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
+     * holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the
+     * cycle-9 bench convention so the same C reference and NEON .S can
+     * be compared.
+     * NOTE(review): SRC_PAD zeroed tail bytes are slack in case the NEON
+     * kernel loads a full 16 bytes per row from src - 2 — confirm. */
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_COL = 3, SRC_PAD = 16 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL + SRC_PAD] = {0}, dst[TOTAL] = {0}, dst_ref[TOTAL] = {0};
+    daedalus_h264_qpel_meta meta[N];
+
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
+                                         src + meta[i].src_off,
+                                         TILE_STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
+                                                     TILE_STRIDE, N, meta);
+    daedalus_ctx_destroy(ctx); /* released before any return: no leak on rc != 0 */
+    if (rc) { fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    return diff == 0 ? 0 : 1;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -152,10 +194,13 @@ int main(void)
            (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
     printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
            (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
+    printf(" H264_QPEL_MC20 recipe substrate: %d\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
 
     int fail = 0;
     fail |= test_idct4();
     fail |= test_idct8();
     fail |= test_deblock();
+    fail |= test_qpel_mc20();
     return fail;
 }