Merge pull request 'Phase 8c: H.264 luma qpel mc20 through public API' (#2) from noether/api-h264-qpel-mc20 into main

Reviewed-on: #2
2026-05-23 01:29:24 +00:00
parent d87239d817 8fdef27a7d
commit 209a4218bc
4 changed files with 116 additions and 0 deletions
@@ -365,6 +365,7 @@ add_library(daedalus_core STATIC
    ${FFC_MC_SOURCES}
    ${FFASM_H264IDCT_SOURCES}
    ${FFASM_H264DSP_SOURCES}
+    ${FFASM_H264QPEL_SOURCES}
    ${DAV1D_CDEF_ASM_SOURCES}
    ${DAV1D_CDEF_C_SOURCES}
 )
@@ -458,6 +459,7 @@ add_executable(test_api_h264
    tests/h264_idct4_ref.c
    tests/h264_idct8_ref.c
    tests/h264_deblock_ref.c
+    tests/h264_qpel8_mc20_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
@@ -263,6 +263,39 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta);

+/* -------------------------------------------------------------------
+ * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
+ * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
+ * docs/k9_h264qpel_mc20.md for the R-band rationale).
+ *
+ * Per H.264 §8.4.2.2.1, horizontal half-pel luma 6-tap filter:
+ *   dst[r,c] = clip255((s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
+ *                       + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
+ *                       + 16) >> 5)
+ *
+ * Single-stride: dst and src share `stride`; this matches FFmpeg's
+ * H264QpelContext.put_h264_qpel_pixels_tab[][] convention and the
+ * vendored ff_put_h264_qpel8_mc20_neon signature.
+ *
+ * `src + src_off` points at the leftmost OUTPUT column (col 0); output
+ * col c reads cols c-2..c+3 (cols -2..+10 across the 8-wide row), so
+ * the caller must guarantee src has at least 2 pixels of left context
+ * and 3 pixels of right context per row. (FFmpeg already maintains an
+ * edge-emulated buffer for the frame boundary; this matches that contract.)
+ * ----------------------------------------------------------------- */
+typedef struct {
+    uint32_t dst_off;        /* byte offset added to `dst` to reach the block's top-left output pixel */
+    uint32_t src_off;        /* byte offset added to `src` to reach output col 0, row 0 of the block  */
+} daedalus_h264_qpel_meta;   /* one entry per block handed to the qpel mc20 dispatch */
+
+int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);  /* forwards to the entry below with DAEDALUS_SUBSTRATE_AUTO */
+
+int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);  /* explicit-substrate entry; `meta` holds n_blocks entries */
+
 /* -------------------------------------------------------------------
 * Recipe query — what does the API recommend for each kernel?
 * ----------------------------------------------------------------- */
@@ -275,6 +308,7 @@ typedef enum {
    DAEDALUS_KERNEL_H264_IDCT4      = 6,
    DAEDALUS_KERNEL_H264_IDCT8      = 7,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
+    DAEDALUS_KERNEL_H264_QPEL_MC20  = 9,
 } daedalus_kernel;

 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
@@ -93,6 +93,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -117,6 +118,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
 extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
+extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);

 /* -------------------- CPU dispatch implementations -------------- */

@@ -226,6 +229,22 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
    return 0;
 }

+static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    /* CPU path: one vendored NEON qpel8 mc20 call per meta entry.  The NEON
+     * entry takes a single stride for dst and src (H264QpelContext
+     * convention), as the public contract in daedalus.h documents. */
+    const ptrdiff_t pitch = (ptrdiff_t) stride;
+    (void) ctx;
+    for (size_t blk = 0; blk != n_blocks; ++blk) {
+        const daedalus_h264_qpel_meta *m = &meta[blk];
+        ff_put_h264_qpel8_mc20_neon(dst + m->dst_off, src + m->src_off, pitch);
+    }
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

 typedef struct {
@@ -811,6 +830,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }

+int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{   /* Public explicit-substrate entry for the H.264 qpel mc20 kernel. */
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_QPEL_MC20, dispatch_h264_qpel_mc20_cpu,
+                   dst, src, stride, n_blocks, meta);  /* NOTE(review): macro defined outside this view; presumably uses ctx/sub and returns the CPU handler's result — confirm */
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */

 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -881,3 +908,11 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
    return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                                  dst, dst_stride, n_edges, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{   /* Recipe wrapper: forward every argument unchanged, substrate = AUTO. */
+    const daedalus_substrate auto_sub = DAEDALUS_SUBSTRATE_AUTO;
+    return daedalus_dispatch_h264_qpel_mc20(ctx, auto_sub, dst, src, stride, n_blocks, meta);
+}
@@ -18,6 +18,8 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
 extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
+                                              ptrdiff_t stride);

 static uint64_t xs_state = 0xa11264ULL;
 static inline uint64_t xs(void) {
@@ -143,6 +145,46 @@ static int test_deblock(void)
    return diff == 0 ? 0 : 1;
 }

+static int test_qpel_mc20(void)
+{
+    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles (rows 0..7);
+     * SRC_COL follows the cycle-9 bench convention so C ref and NEON .S
+     * compare. NOTE(review): if the NEON kernel loads 16 bytes/row from
+     * src-2, the last row of the last tile reads src[TOTAL] — confirm. */
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_COL = 3 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_qpel_meta meta[N];
+
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+    memset(dst, 0, sizeof(dst));
+    memset(dst_ref, 0, sizeof(dst_ref));
+
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
+                                          src + meta[i].src_off,
+                                          TILE_STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
+                                                      TILE_STRIDE, N, meta);
+    daedalus_ctx_destroy(ctx);  /* released on every path, incl. rc != 0 */
+    if (rc) { fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    return diff == 0 ? 0 : 1;
+}
+
 int main(void)
 {
    printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -152,10 +194,13 @@ int main(void)
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
    printf("  H264_DEBLOCK_LV recipe substrate: %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
+    printf("  H264_QPEL_MC20 recipe substrate:  %d\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));

    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
+    fail |= test_qpel_mc20();
    return fail;
 }