From c3301b0c2e6efb852b18558a13fdf8a1f9eb4b43 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 00:47:37 +0200
Subject: [PATCH] h264: qpel mc02 (vertical half-pel, CPU/NEON)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror of cycle 9's mc20, transposed to vertical orientation.  Wires
up the second qpel half-pel position via the vendored
ff_put_h264_qpel8_mc02_neon symbol, closing the "missing vertical
sibling" gap that mc20 has left open since cycle 9.

Scope:
  - Public API: daedalus_dispatch_h264_qpel_mc02 + recipe wrapper.
  - Internal: dispatch_h264_qpel_mc02_cpu calling the NEON entry.
  - Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC02 = 17 → CPU.
    Explicit SUBSTRATE_QPU returns -1 (no shader yet).
  - C reference: tests/h264_qpel8_mc02_ref.c — vertical 6-tap
    transpose of mc20 (reads src[(r±N)*stride + c] instead of
    src[r*stride + c±N]).
  - Test: test_qpel_mc02 in test_api_h264, 8 tiles × 16 cols
    × 16 rows, random input, bit-exact compare against the C ref.

Verified on hertz:

  $ ./build/test_api_h264
  ...
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)

  All 12 H.264 kernels in the api_smoke now bit-exact PASS.

Why CPU-only: same R-band logic as the deblock _h sibling pattern.
mc02 at ~7.6 ns per 8x8 block on NEON (per the cycle 9 baseline
measurements) gives ~250 us for 8160 MBs × 4 8x8 luma blocks at
1080p — comfortably inside the 33 ms budget.  QPU shader is a
fast-follow once the V vs H shader work is consolidated (the
transpose for the V shader is not mechanical — different SIMD
access pattern than the H shader).

Coverage matrix update:

  qpel position  put_ status  avg_ status
  -------------  -----------  -----------
  mc00 (copy)    not wired    not wired
  mc10 (¼-H)     not wired    not wired
  mc20 (½-H)    ✓ QPU+CPU     not wired
  mc30 (¾-H)     not wired    not wired
  mc01 (¼-V)     not wired    not wired
  mc02 (½-V)    ✓ CPU         not wired (this PR)
  mc03 (¾-V)     not wired    not wired
  mc11..mc33     not wired    not wired

14 more qpel positions to go for the full put_ matrix (2 of the 16
are wired).  Adding them follows the same template; each is a small
contained PR.
---
 CMakeLists.txt              |  1 +
 include/daedalus.h          | 24 ++++++++++++++++++++
 src/daedalus_core.c         | 38 +++++++++++++++++++++++++++++++
 tests/h264_qpel8_mc02_ref.c | 45 +++++++++++++++++++++++++++++++++++++
 tests/test_api_h264.c       | 43 +++++++++++++++++++++++++++++++++++
 5 files changed, 151 insertions(+)
 create mode 100644 tests/h264_qpel8_mc02_ref.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43ebdac..b48aaa9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -523,6 +523,7 @@ add_executable(test_api_h264
     tests/h264_chroma_loop_filter_ref.c
     tests/h264_intra_loop_filter_ref.c
     tests/h264_qpel8_mc20_ref.c
+    tests/h264_qpel8_mc02_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
diff --git a/include/daedalus.h b/include/daedalus.h
index 02944e3..ccb2260 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -392,6 +392,29 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
     uint8_t *dst, const uint8_t *src, size_t stride,
     size_t n_blocks, const daedalus_h264_qpel_meta *meta);
 
+/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20.
+ * 6-tap filter applied vertically:
+ *   dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
+ *                       + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
+ *                       + 16) >> 5)
+ *
+ * Same single-stride convention as mc20.  src + src_off points at
+ * row 0 col 0 of the OUTPUT block; the filter reads rows -2..+3, so
+ * the caller must guarantee 2 rows of top context and 3 rows of
+ * bottom context per block (FFmpeg edge-emulated buffer handles
+ * frame boundaries; same contract as mc20).
+ *
+ * QPU shader not implemented yet; recipe table routes AUTO to CPU
+ * NEON.  Explicit DAEDALUS_SUBSTRATE_QPU returns -1.
+ */
+int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
 /* -------------------------------------------------------------------
  * Recipe query — what does the API recommend for each kernel?
  * ----------------------------------------------------------------- */
@@ -412,6 +435,7 @@ typedef enum {
     DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
     DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
     DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
+    DAEDALUS_KERNEL_H264_QPEL_MC02        = 17,
 } daedalus_kernel;
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 0334581..497d5df 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -138,6 +138,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
     case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
     case DAEDALUS_KERNEL_H264_QPEL_MC20:   return DAEDALUS_SUBSTRATE_QPU;	/* v3d_h264_qpel_mc20.spv */
+    case DAEDALUS_KERNEL_H264_QPEL_MC02:   return DAEDALUS_SUBSTRATE_CPU;	/* QPU mc02 shader pending */
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -178,6 +179,8 @@ extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stri
                                                       int alpha, int beta);
 extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                          ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
+                                         ptrdiff_t stride);
 
 /* -------------------- CPU dispatch implementations -------------- */
 
@@ -405,6 +408,19 @@ static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
     return 0;
 }
 
+static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    /* One vendored NEON call per meta entry; dst and src share `stride`. */
+    const ptrdiff_t pitch = (ptrdiff_t) stride;
+    (void) ctx;  /* unused on the CPU path */
+    for (size_t blk = 0; blk != n_blocks; blk++)
+        ff_put_h264_qpel8_mc02_neon(dst + meta[blk].dst_off,
+                                     src + meta[blk].src_off, pitch);
+    return 0;
+}
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
 
 typedef struct {
@@ -1376,6 +1392,20 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
                                        n_blocks, meta);
 }
 
+int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02);
+    /* No mc02 QPU shader yet: QPU fast-fails with -1 on every host, with
+     * no silent CPU fallback on QPU-less ones (matches the header doc). */
+    if (eff == DAEDALUS_SUBSTRATE_QPU)
+        return -1;
+    return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
+}
+
 /* -------------------- Recipe convenience wrappers --------------- */
 
 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -1494,3 +1524,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
     return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                              dst, src, stride, n_blocks, meta);
 }
+
+int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+{
+    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;  /* recipe table picks the substrate */
+    return daedalus_dispatch_h264_qpel_mc02(ctx, pick, dst, src, stride, n_blocks, meta);
+}
diff --git a/tests/h264_qpel8_mc02_ref.c b/tests/h264_qpel8_mc02_ref.c
new file mode 100644
index 0000000..16dd2d7
--- /dev/null
+++ b/tests/h264_qpel8_mc02_ref.c
@@ -0,0 +1,45 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02
+ * (vertical half-pel, "put" variant).  Mirror of mc20 with rows
+ * and columns transposed.  6-tap filter applied vertically:
+ *
+ *   dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
+ *                       + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
+ *                       + 16) >> 5 )
+ *
+ * Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
+ * line 678, which tail-calls put_h264_qpel8_v_lowpass_neon).
+ *
+ * Signature:
+ *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+ *
+ * Both dst and src use the SAME stride.  src points at row 0 col 0
+ * of the output block; the filter reads rows -2..+3 (2 rows of top
+ * context, 3 rows of bottom context).  Caller must guarantee the
+ * source buffer has those rows available (FFmpeg's edge-emulated
+ * buffer handles this at the frame boundary; matches the contract
+ * documented for mc20).
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+static inline int clip_u8(int v) { if (v < 0) return 0; return v > 255 ? 255 : v; }
+
+void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    /* 6-tap weights for source rows r-2 .. r+3; they sum to 32, hence
+     * the +16 rounding bias and the >> 5 normalisation below. */
+    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
+    for (int row = 0; row < 8; row++) {
+        const uint8_t *top = src + (row - 2) * stride;  /* first tap row */
+        uint8_t *out = dst + row * stride;
+        for (int col = 0; col < 8; col++) {
+            int acc = 16;
+            for (int k = 0; k < 6; k++) { acc += taps[k] * top[k * stride + col]; }
+            out[col] = (uint8_t) clip_u8(acc >> 5);
+        }
+    }
+}
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index 4eeb505..2c61fac 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -32,6 +32,8 @@ extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t
                                                            int alpha, int beta);
 extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
+extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
+                                                ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t stride);
 
@@ -399,6 +401,46 @@ static int test_qpel_mc20(void)
     return diff == 0 ? 0 : 1;
 }
 
+static int test_qpel_mc02(void)
+{
+    /* Bit-exact check of the mc02 recipe dispatch against the C ref.
+     * 8 tiles of 16 cols × 16 rows; the 8×8 output sits at tile rows
+     * 3..10 (SRC_ROW = 3), so the vertical 6-tap reads tile rows 1..13:
+     * 2 rows of context above and 3 below, all inside the tile. */
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_ROW = 3 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_qpel_meta meta[N];
+
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+    memset(dst, 0, sizeof(dst));
+    memset(dst_ref, 0, sizeof(dst_ref));
+
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
+    }
+
+    for (int i = 0; i < N; i++)
+        daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
+                                          src + meta[i].src_off,
+                                          TILE_STRIDE);
+
+    int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
+                                                      TILE_STRIDE, N, meta);
+    daedalus_ctx_destroy(ctx);  /* released on every path, incl. dispatch failure */
+    if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf("  H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    return diff == 0 ? 0 : 1;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -429,5 +471,6 @@ int main(void)
     fail |= test_deblock_chroma_h();
     fail |= test_deblock_intra_all();
     fail |= test_qpel_mc20();
+    fail |= test_qpel_mc02();
     return fail;
 }