h264: qpel single-axis quarter-pel — mc10/mc30/mc01/mc03 (CPU/NEON)

Closes the 4 single-axis quarter-pel positions in one PR. Each is a half-pel lowpass clipped to u8 followed by L2 rounded-average with an integer-aligned source pixel per H.264 §8.4.2.2.1:

  mc10 ¼-H ("a" pos): clip255(mc20(s)) avg src[r,c]
  mc30 ¾-H ("c" pos): clip255(mc20(s)) avg src[r,c+1]
  mc01 ¼-V ("d" pos): clip255(mc02(s)) avg src[r,c]
  mc03 ¾-V ("n" pos): clip255(mc02(s)) avg src[r+1,c]

The mc10/mc30 pair and mc01/mc03 pair only differ in WHICH integer source pixel they average with — the half-pel computation is the same. Putting them in one PR is justified by that uniformity.

Scope:
- 4 new kernel enums: MC10=19, MC30=20, MC01=21, MC03=22 → CPU.
- 4 NEON externs for the vendored ff_put_h264_qpel8_mc{10,30,01,03}_neon.
- 4 CPU dispatch wrappers via DEFINE_QPEL_CPU_DISPATCH macro (collapses ~50 LOC of repetition).
- 4 public dispatch fns via DEFINE_QPEL_DISPATCH macro.
- 4 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- tests/h264_qpel8_quarter_axis_ref.c covers all four via shared hpel_h() / hpel_v() inlines + per-mode L2 average.
- Test refactor: generic run_quarter_axis_qpel() harness exercises all 4 positions through a single helper (~50 LOC for 4 tests vs ~200 if each was hand-rolled).

Verified on hertz:

  $ ./build/test_api_h264 | tail -8
  H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
  H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
  H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
  H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%)
  H.264 qpel mc10: 2048/2048 bytes bit-exact (100.0000%)
  H.264 qpel mc30: 2048/2048 bytes bit-exact (100.0000%)
  H.264 qpel mc01: 2048/2048 bytes bit-exact (100.0000%)
  H.264 qpel mc03: 2048/2048 bytes bit-exact (100.0000%)

All 4 new positions bit-exact PASS first try.

Coverage matrix update:

  put_   mc00  mc10  mc20  mc30
  mc01    —     ✓     —     ✓
  mc11    —     —     ✓     —    ← this row
  mc21    —     —     —     —
  mc31    —     —     —     —
  mc02    —     —     ✓     —    ← mc02 + mc22 anchor
  mc03    —     —     ✓     —

After this PR: 7 of 16 single-axis + diagonal positions done.
The remaining 9 are mc00 (the integer-pel position, not counted among the 7 above) plus the 8 off-axis quarter-pel combinations (mc11/mc12/mc13/mc21/mc23/mc31/mc32/mc33). Of those 8, the four diagonal positions (mc11/mc13/mc31/mc33) L2-average a horizontal 1D-lowpass output with a vertical 1D-lowpass output, and the other four (mc12/mc21/mc23/mc32) L2-average the 2D (hv) lowpass intermediate with a 1D-lowpass output. Next PR scope.

Why no QPU shaders: same R-band logic as the prior CPU additions. At ~10 ns per 8x8 NEON block, all 16 qpel positions together would land in ~1.3 ms/frame at 1080p worst case — comfortably inside the 33 ms budget. A QPU shader for mc20 already exists (cycle 9 / v3d_h264_qpel_mc20.spv); the other 15 will follow once a clear perf reason emerges.
2026-05-25 01:29:52 +02:00
parent f3d4b15b9a
commit e01f7bc7c6
5 changed files with 264 additions and 0 deletions
@@ -140,6 +140,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC02:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc02 shader pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC22:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc22 shader pending (hv lowpass) */
+    case DAEDALUS_KERNEL_H264_QPEL_MC10:   return DAEDALUS_SUBSTRATE_CPU;	/* ¼-H L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC30:   return DAEDALUS_SUBSTRATE_CPU;	/* ¾-H L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC01:   return DAEDALUS_SUBSTRATE_CPU;	/* ¼-V L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC03:   return DAEDALUS_SUBSTRATE_CPU;	/* ¾-V L2 */
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -184,6 +188,14 @@ extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
 extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
+/* Vendored NEON put_ kernels for the single-axis quarter-pel positions:
+ * mc10 (¼-H), mc30 (¾-H), mc01 (¼-V) and mc03 (¾-V).  All four share one
+ * signature: destination, source, and a single stride argument. */
+extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst,
+                                         const uint8_t *src,
+                                         ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst,
+                                         const uint8_t *src,
+                                         ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst,
+                                         const uint8_t *src,
+                                         ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst,
+                                         const uint8_t *src,
+                                         ptrdiff_t stride);

 /* -------------------- CPU dispatch implementations -------------- */

@@ -437,6 +449,28 @@ static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
    return 0;
 }

+/*
+ * CPU path for the single-axis quarter-pel positions (mc10/mc30/mc01/mc03).
+ *
+ * DEFINE_QPEL_CPU_DISPATCH(pos, kernel_fn) emits
+ * dispatch_h264_qpel_<pos>_cpu(), which walks the n_blocks entries of
+ * meta[] in order and calls kernel_fn once per entry with
+ * dst + dst_off, src + src_off and the stride cast to ptrdiff_t.
+ * ctx is unused and the generated function always returns 0.
+ * One macro stands in for four otherwise identical function bodies.
+ */
+#define DEFINE_QPEL_CPU_DISPATCH(pos, kernel_fn)                               \
+static int dispatch_h264_qpel_ ## pos ## _cpu(daedalus_ctx *ctx,               \
+    uint8_t *dst, const uint8_t *src, size_t stride,                           \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                      \
+{                                                                              \
+    const daedalus_h264_qpel_meta *blk = meta;                                 \
+    const ptrdiff_t pitch = (ptrdiff_t) stride;                                \
+    size_t remaining = n_blocks;                                               \
+    (void) ctx;                                                                \
+    while (remaining-- > 0) {                                                  \
+        kernel_fn(dst + blk->dst_off, src + blk->src_off, pitch);              \
+        blk++;                                                                 \
+    }                                                                          \
+    return 0;                                                                  \
+}
+
+DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
+
+/* Generator is local to this section; keep it out of the rest of the file. */
+#undef DEFINE_QPEL_CPU_DISPATCH
+
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

 typedef struct {
@@ -1436,6 +1470,28 @@ int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
    return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
 }

+/*
+ * Public entry points for the single-axis quarter-pel positions.
+ *
+ * DEFINE_QPEL_DISPATCH(pos, kernel_id) emits
+ * daedalus_dispatch_h264_qpel_<pos>().  Substrate resolution:
+ *   - DAEDALUS_SUBSTRATE_AUTO is replaced by
+ *     daedalus_recipe_substrate_for(kernel_id);
+ *   - a QPU request on a context without a QPU falls back to the CPU path;
+ *   - a QPU request on a context with a QPU returns -1 (no QPU
+ *     implementation is wired up for these positions in this function);
+ *   - every other case runs dispatch_h264_qpel_<pos>_cpu() and returns
+ *     its result.
+ */
+#define DEFINE_QPEL_DISPATCH(pos, kernel_id)                                   \
+int daedalus_dispatch_h264_qpel_ ## pos(daedalus_ctx *ctx,                     \
+    daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride,   \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                      \
+{                                                                              \
+    const daedalus_substrate wanted =                                          \
+        (sub == DAEDALUS_SUBSTRATE_AUTO)                                       \
+            ? daedalus_recipe_substrate_for(kernel_id)                         \
+            : sub;                                                             \
+    if (wanted == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))         \
+        return -1;                                                             \
+    return dispatch_h264_qpel_ ## pos ## _cpu(ctx, dst, src, stride,           \
+                                               n_blocks, meta);                \
+}
+
+DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
+DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
+DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
+DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
+
+/* Generator is local to this section; keep it out of the rest of the file. */
+#undef DEFINE_QPEL_DISPATCH
+
+
 /* -------------------- Recipe convenience wrappers --------------- */

 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -1570,3 +1626,19 @@ int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
    return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                             dst, src, stride, n_blocks, meta);
 }
+
+/*
+ * Recipe convenience wrappers for the single-axis quarter-pel positions.
+ *
+ * DEFINE_QPEL_RECIPE(pos) emits daedalus_recipe_dispatch_h264_qpel_<pos>(),
+ * which forwards all arguments to daedalus_dispatch_h264_qpel_<pos>() with
+ * the substrate fixed to DAEDALUS_SUBSTRATE_AUTO and returns that call's
+ * result unchanged.
+ */
+#define DEFINE_QPEL_RECIPE(pos)                                                \
+int daedalus_recipe_dispatch_h264_qpel_ ## pos(daedalus_ctx *ctx,              \
+    uint8_t *dst, const uint8_t *src, size_t stride,                           \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                      \
+{                                                                              \
+    const daedalus_substrate sub = DAEDALUS_SUBSTRATE_AUTO;                    \
+    return daedalus_dispatch_h264_qpel_ ## pos(ctx, sub, dst, src,             \
+                                                stride, n_blocks, meta);       \
+}
+
+DEFINE_QPEL_RECIPE(mc10)
+DEFINE_QPEL_RECIPE(mc30)
+DEFINE_QPEL_RECIPE(mc01)
+DEFINE_QPEL_RECIPE(mc03)
+
+/* Generator is local to this section; keep it out of the rest of the file. */
+#undef DEFINE_QPEL_RECIPE