Phase 8c: H.264 luma qpel mc20 through public API

Extends daedalus-fourier with daedalus_recipe_dispatch_h264_qpel_mc20 so libavcodec.so can route H264QpelContext.put_h264_qpel_pixels_tab[1][2] through the recipe layer instead of ff_put_h264_qpel8_mc20_neon directly. API additions (header + library): - daedalus_h264_qpel_meta { dst_off, src_off } - daedalus_dispatch_h264_qpel_mc20(ctx, sub, dst, src, stride, n_blocks, meta) - daedalus_recipe_dispatch_h264_qpel_mc20(...) (AUTO wrapper) - DAEDALUS_KERNEL_H264_QPEL_MC20 = 9 in the recipe-query enum - daedalus_recipe_substrate_for() returns CPU NEON for cycle 9 The 6-tap horizontal half-pel filter signature matches FFmpeg's H264QpelContext convention exactly: dst and src share a single stride and src already points at output column 0 (filter reads cols -2..+3). Single-stride API to make the marfrit-packages FFmpeg shim a straight pointer-pass; no buffer rearrangement. Verdict per docs/k9_h264qpel_mc20.md: CPU NEON. Per-block 7.6 ns gives 135x margin over 30 fps 1080p; QPU dispatch floor at ~250 ns makes any V3D shader strictly worse. Recipe table reflects that — the recipe_dispatch entry is a one-line forward to the CPU path. CMakeLists changes: - h264qpel_neon.S added to the daedalus_core static lib (only the bench targets owned it before; now the public API needs it too) - tests/h264_qpel8_mc20_ref.c added to the test_api_h264 target Phase 8a/8b smoke gains a 4th case (test_qpel_mc20): 1024/1024 bytes bit-exact via daedalus_recipe_dispatch_h264_qpel_mc20. Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
2026-05-23 03:25:24 +02:00
parent d87239d817
commit 8fdef27a7d
4 changed files with 116 additions and 0 deletions
@@ -93,6 +93,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_IDCT4:       return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_IDCT8:       return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:  return DAEDALUS_SUBSTRATE_CPU;
+    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_CPU;
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -117,6 +118,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
 extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
+extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);

 /* -------------------- CPU dispatch implementations -------------- */

@@ -226,6 +229,22 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
    return 0;
 }

+static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    (void) ctx;
+    /* FFmpeg's NEON entry uses a single stride for both dst and src
+     * (H264QpelContext convention).  Caller already guarantees this
+     * via the public API contract documented in daedalus.h. */
+    for (size_t i = 0; i < n_blocks; i++) {
+        ff_put_h264_qpel8_mc20_neon(dst + meta[i].dst_off,
+                                     src + meta[i].src_off,
+                                     (ptrdiff_t) stride);
+    }
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

 typedef struct {
@@ -811,6 +830,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }

+int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_QPEL_MC20, dispatch_h264_qpel_mc20_cpu,
+                   dst, src, stride, n_blocks, meta);
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */

 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -881,3 +908,11 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
    return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                                  dst, dst_stride, n_edges, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
+                                             dst, src, stride, n_blocks, meta);
+}