From 20a4299c5cd603e129981456f156390eba9a074d Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 01:03:14 +0200
Subject: [PATCH] h264: qpel mc22 (2D half-pel, CPU/NEON)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the "j position" 2D half-pel via cascaded H + V 6-tap lowpass
with intermediate 16-bit precision per H.264 §8.4.2.2.1.  One of the
most common qpel positions in real H.264 streams — many encoders
emit 1/2-1/2 motion vectors as their best-RD choice.

Algorithmically distinct from the 1D mc20/mc02 siblings:
  - Horizontal 6-tap produces 13 rows of int16 intermediate (no
    per-stage clip/round — full precision retained).
  - Vertical 6-tap on the intermediate, then +512 >> 10 (the
    double-shift compensates for both 6-tap scalings) + clip255.

The intermediate-precision requirement means the C reference can't
just be "call mc20 then mc02" — that would double-clip and produce
the wrong result.  The 13-row int16 tmp[] buffer is the central
invariant.

Scope (same pattern as mc02 PR #15):
  - Public API: daedalus_dispatch_h264_qpel_mc22 + recipe wrapper.
  - Internal: dispatch_h264_qpel_mc22_cpu calling
    ff_put_h264_qpel8_mc22_neon.
  - Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC22 = 18 → CPU.
  - C reference: tests/h264_qpel8_mc22_ref.c — explicit tmp[13][8]
    int16 staging buffer; spec-derived shifts and rounding.
  - Test: test_qpel_mc22 in test_api_h264, 8 tiles at 16×16 with
    output positioned at (SRC_ROW=3, SRC_COL=3) so the kernel's
    [-2 .. +10] read window stays in-tile.

Verified on hertz:

  $ ./build/test_api_h264 | tail -5
    H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%)

  All 13 H.264 kernels in api_smoke now bit-exact PASS.

mc22 being right first try is meaningful — the +512 >> 10 scaling
+ int16 intermediate sequence has multiple sign/shift/clip pitfalls
and any of them would surface on random inputs immediately.

Coverage matrix update:
  put_ mc20 ✓ (QPU+CPU)  put_ mc02 ✓ (CPU)  put_ mc22 ✓ (CPU)
  → 12 single put_ positions still missing (¼/¾ + HV combos with
  L2 averaging).
---
 CMakeLists.txt              |  1 +
 include/daedalus.h          | 22 ++++++++++++
 src/daedalus_core.c         | 38 ++++++++++++++++++++
 tests/h264_qpel8_mc22_ref.c | 70 +++++++++++++++++++++++++++++++++++++
 tests/test_api_h264.c       | 43 +++++++++++++++++++++++
 5 files changed, 174 insertions(+)
 create mode 100644 tests/h264_qpel8_mc22_ref.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b48aaa9..38ced99 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -524,6 +524,7 @@ add_executable(test_api_h264
     tests/h264_intra_loop_filter_ref.c
     tests/h264_qpel8_mc20_ref.c
     tests/h264_qpel8_mc02_ref.c
+    tests/h264_qpel8_mc22_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
diff --git a/include/daedalus.h b/include/daedalus.h
index ccb2260..d827d3e 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -415,6 +415,27 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
     uint8_t *dst, const uint8_t *src, size_t stride,
     size_t n_blocks, const daedalus_h264_qpel_meta *meta);
 
+/* H.264 luma qpel mc22 (2D half-pel "j" position per spec §8.4.2.2.1).
+ * Horizontal 6-tap cascaded into vertical 6-tap with intermediate
+ * 16-bit precision; final +512 >> 10 with clip255.  Common position
+ * in real H.264 streams.
+ *
+ * src + src_off points at row 0 col 0 of the OUTPUT block; the
+ * cascade's taps cover rows -2..+10 and cols -2..+10 (a 13x13 window
+ * around the 8x8 output).  Caller must guarantee it is readable.
+ *
+ * QPU shader not implemented yet (the HV lowpass is the meatiest qpel
+ * kernel; structurally distinct from the 1D mc20 shader).  AUTO routes
+ * to CPU NEON.  Explicit QPU returns -1, or runs on CPU if ctx has no QPU.
+ */
+int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
 /* -------------------------------------------------------------------
  * Recipe query — what does the API recommend for each kernel?
  * ----------------------------------------------------------------- */
@@ -436,6 +457,7 @@ typedef enum {
     DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
     DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
     DAEDALUS_KERNEL_H264_QPEL_MC02        = 17,
+    DAEDALUS_KERNEL_H264_QPEL_MC22        = 18,
 } daedalus_kernel;
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 497d5df..45ecff5 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -139,6 +139,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
     case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
     case DAEDALUS_KERNEL_H264_QPEL_MC02:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc02 shader pending */
+    case DAEDALUS_KERNEL_H264_QPEL_MC22:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc22 shader pending (hv lowpass) */
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -181,6 +182,8 @@ extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                          ptrdiff_t stride);
 extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
                                          ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
 
 /* -------------------- CPU dispatch implementations -------------- */
 
@@ -421,6 +424,19 @@ static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
     return 0;
 }
 
+/* CPU path for mc22: one 8x8 NEON call per meta[] entry, in order. */
+static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    const ptrdiff_t pitch = (ptrdiff_t) stride;
+    (void) ctx;  /* not used on the CPU path */
+    for (size_t blk = 0; blk != n_blocks; ++blk)
+        ff_put_h264_qpel8_mc22_neon(dst + meta[blk].dst_off,
+                                     src + meta[blk].src_off, pitch);
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
 
 typedef struct {
@@ -1406,6 +1422,20 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
     return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
 }
 
+/* mc22 entry point: AUTO resolves via the recipe table; work runs on CPU. */
+int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    const daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
+        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22)
+        : sub;
+    /* No mc22 QPU shader yet: a QPU request on a QPU-capable ctx fails. */
+    if (target == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
+        return -1;
+    return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */
 
 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -1532,3 +1562,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
     return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                              dst, src, stride, n_blocks, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;  /* recipe wrapper: dispatcher chooses */
+    return daedalus_dispatch_h264_qpel_mc22(ctx, pick, dst, src, stride, n_blocks, meta);
+}
diff --git a/tests/h264_qpel8_mc22_ref.c b/tests/h264_qpel8_mc22_ref.c
new file mode 100644
index 0000000..fda59d2
--- /dev/null
+++ b/tests/h264_qpel8_mc22_ref.c
@@ -0,0 +1,70 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22
+ * (2D half-pel, "put" variant).  Cascade of horizontal 6-tap then
+ * vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage
+ * clip/round), final +512 >> 10 to scale back.
+ *
+ * Per H.264 §8.4.2.2.1, "j" position:
+ *
+ *   tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1]
+ *              - 5*s[r,c+2] + s[r,c+3]               (16-bit signed)
+ *
+ *   dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
+ *                       + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
+ *                       + 512) >> 10)
+ *
+ * The tmp[] array spans rows r-2 .. r+3 around each output row, so
+ * we need 13 intermediate rows (rows -2..+10 of the SOURCE
+ * neighbourhood) for 8 output rows.  Caller's src must have 2 rows
+ * of top context + 3 rows of bottom context AND 2 cols of left +
+ * 3 cols of right context (FFmpeg's edge-emulated buffer provides
+ * this at the frame boundary; same contract as mc20).
+ *
+ * Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
+ * line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon).
+ *
+ * Signature:
+ *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+ *
+ * Same single-stride convention as mc20/mc02.
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
+
+/* Reference "put" mc22: writes the 8x8 block at dst from the 13x13
+ * window src[-2..+10][-2..+10].  dst and src share one row stride;
+ * nothing outside the 8x8 output block is written. */
+void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    /* Horizontal pass, unrounded and unclipped.  tmp[k] holds source
+     * row k-2, so tmp[0..12] covers source rows -2..+10.  Values lie
+     * in [-2550, 10710] for 8-bit input, so int16_t is wide enough. */
+    int16_t tmp[13][8];
+    for (int k = 0; k < 13; k++) {
+        const uint8_t *s = src + (k - 2) * stride;
+        for (int c = 0; c < 8; c++) {
+            tmp[k][c] = (int16_t) (s[c - 2] + s[c + 3]
+                                   - 5 * (s[c - 1] + s[c + 2])
+                                   + 20 * (s[c] + s[c + 1]));
+        }
+    }
+
+    /* Vertical pass: output row r uses source rows r-2..r+3, which
+     * are tmp[r..r+5]; +512 rounds before the divide by 1024. */
+    for (int r = 0; r < 8; r++) {
+        for (int c = 0; c < 8; c++) {
+            int v = tmp[r][c] + tmp[r + 5][c]
+                  - 5 * (tmp[r + 1][c] + tmp[r + 4][c])
+                  + 20 * (tmp[r + 2][c] + tmp[r + 3][c])
+                  + 512;
+            /* A negative sum clips to 0 either way; testing it first
+             * avoids the implementation-defined >> of a negative int. */
+            dst[r * stride + c] = (uint8_t) clip_u8(v < 0 ? 0 : v >> 10);
+        }
+    }
+}
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index 2c61fac..275a556 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -34,6 +34,8 @@ extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
 extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
                                                 ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
+                                                ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t stride);
 
@@ -441,6 +443,46 @@ static int test_qpel_mc02(void)
     return diff == 0 ? 0 : 1;
 }
 
+static int test_qpel_mc22(void)
+{
+    /* mc22: 2D HV lowpass.  Needs 2 cols left + 3 cols right + 2 rows
+     * top + 3 rows bottom of context per 8x8 output.  Tile is 16x16
+     * with output positioned at (SRC_ROW=3, SRC_COL=3) so the tap
+     * range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_ROW = 3, SRC_COL = 3 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_qpel_meta meta[N];
+
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+    memset(dst, 0, sizeof(dst));
+    memset(dst_ref, 0, sizeof(dst_ref));
+
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off,
+                                          src + meta[i].src_off,
+                                          TILE_STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src,
+                                                      TILE_STRIDE, N, meta);
+    daedalus_ctx_destroy(ctx);  /* release before any return: no leak when rc != 0 */
+    if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;  /* whole buffers are compared, so stray writes outside the 8x8 blocks count too */
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    return diff == 0 ? 0 : 1;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -472,5 +514,6 @@ int main(void)
     fail |= test_deblock_intra_all();
     fail |= test_qpel_mc20();
     fail |= test_qpel_mc02();
+    fail |= test_qpel_mc22();
     return fail;
 }
-- 
2.47.3