h264: qpel mc22 (2D half-pel, CPU/NEON)

Adds the "j position" 2D half-pel via a cascaded H + V 6-tap lowpass with 16-bit intermediate precision, per H.264 §8.4.2.2.1. It is one of the most common qpel positions in real H.264 streams — many encoders emit 1/2-1/2 motion vectors as their best-RD choice.

Algorithmically distinct from the 1D mc20/mc02 siblings:
- Horizontal 6-tap produces 13 rows of int16 intermediate (no per-stage clip/round — full precision retained).
- Vertical 6-tap on the intermediate, then (sum + 512) >> 10 (the double-shift compensates for both 6-tap scalings) + clip255.

The intermediate-precision requirement means the C reference can't just be "call mc20 then mc02" — that would double-clip and produce the wrong result. The 13-row int16 tmp[] buffer is the central invariant.

Scope (same pattern as mc02 PR #15):
- Public API: daedalus_dispatch_h264_qpel_mc22 + recipe wrapper.
- Internal: dispatch_h264_qpel_mc22_cpu calling ff_put_h264_qpel8_mc22_neon.
- Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC22 = 18 → CPU.
- C reference: tests/h264_qpel8_mc22_ref.c — explicit tmp[13][8] int16 staging buffer; spec-derived shifts and rounding.
- Test: test_qpel_mc22 in test_api_h264, 8 tiles at 16×16 with output positioned at (SRC_ROW=3, SRC_COL=3) so the kernel's [-2 .. +10] read window stays in-tile.

Verified on hertz:

    $ ./build/test_api_h264 | tail -5
    H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%)

All 13 H.264 kernels in api_smoke now bit-exact PASS. mc22 being right on the first try is meaningful — the (sum + 512) >> 10 scaling + int16 intermediate sequence has multiple sign/shift/clip pitfalls, and any of them would surface on random inputs immediately.
Coverage matrix update:
- put_ mc20 ✓ (QPU+CPU)
- put_ mc02 ✓ (CPU)
- put_ mc22 ✓ (CPU)
→ 12 single put_ positions still missing (¼/¾ + HV combos with L2 averaging).
2026-05-25 01:03:14 +02:00
parent a2575d5e42
commit 20a4299c5c
5 changed files with 174 additions and 0 deletions
@@ -139,6 +139,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC02:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc02 shader pending */
+    case DAEDALUS_KERNEL_H264_QPEL_MC22:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc22 shader pending (hv lowpass) */
    }
    return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -181,6 +182,8 @@ extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
 extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
                                         ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);  /* mc22 kernel used by the CPU dispatch; implemented outside this file (presumably NEON assembly; confirm) */

 /* -------------------- CPU dispatch implementations -------------- */

@@ -421,6 +424,19 @@ static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
    return 0;
 }

+static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,  /* CPU path for H.264 qpel mc22: runs the kernel once per meta entry */
+    uint8_t *dst, const uint8_t *src, size_t stride,  /* dst/src are base pointers; per-block offsets come from meta */
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)  /* meta must hold at least n_blocks entries (not checked here) */
+{
+    (void) ctx;  /* the context is not used on the CPU path */
+    for (size_t i = 0; i < n_blocks; i++) {  /* n_blocks == 0 performs no kernel calls */
+        ff_put_h264_qpel8_mc22_neon(dst + meta[i].dst_off,  /* byte offset of block i within dst */
+                                     src + meta[i].src_off,  /* byte offset of block i within src */
+                                     (ptrdiff_t) stride);  /* converted to the kernel's signed stride type */
+    }
+    return 0;  /* always 0: the kernel returns void, so no failure is reported */
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

 typedef struct {
@@ -1406,6 +1422,20 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
    return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
 }

+int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,  /* public mc22 dispatch; sub is the requested substrate */
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)  /* returns -1 when QPU is selected and available, else the CPU path's result */
+{
+    daedalus_substrate eff = sub;  /* effective substrate after the resolution steps below */
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)  /* AUTO defers to the recipe table */
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))  /* QPU selected but the ctx reports none: fall back to CPU */
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_QPU)
+        return -1;  /* No mc22 QPU shader yet: a QPU selection on a QPU-capable ctx fails fast. */
+    return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);  /* every remaining substrate value runs the CPU path */
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */

 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -1532,3 +1562,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
    return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                             dst, src, stride, n_blocks, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,  /* recipe wrapper: mc22 dispatch with the substrate fixed to AUTO */
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO,  /* the dispatcher resolves AUTO to a concrete substrate */
+                                             dst, src, stride, n_blocks, meta);  /* remaining arguments are forwarded unchanged; the result is returned as-is */
+}