From 0894a4611493c1d2af0c4d20b6029ce848542d49 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 07:49:12 +0200
Subject: [PATCH] =?UTF-8?q?h264:=20qpel=20diagonals=20=E2=80=94=208=20posi?=
 =?UTF-8?q?tions=20(mc11/12/13/21/23/31/32/33)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the qpel buildout.  All 8 remaining diagonal positions land
in one PR.  Each is the rounded average of two half-pel intermediates
per H.264 §8.4.2.2.1 / Figure 8-4, with the decomposition matching
the FFmpeg .S reference structure (verified by reading
external/ffmpeg-snapshot/.../h264qpel_neon.S lines 622-758).

Decomposition table (the formula for each output cell at (r,c)):

  mc11 ¼¼ : avg(mc20[r,   c],   mc02[r, c])
  mc12 ¼½ : avg(mc22[r,   c],   mc02[r, c])
  mc13 ¼¾ : avg(mc20[r+1, c],   mc02[r, c])
  mc21 ½¼ : avg(mc22[r,   c],   mc20[r, c])
  mc23 ½¾ : avg(mc22[r,   c],   mc20[r+1, c])
  mc31 ¾¼ : avg(mc20[r,   c],   mc02[r, c+1])
  mc32 ¾½ : avg(mc22[r,   c],   mc02[r, c+1])
  mc33 ¾¾ : avg(mc20[r+1, c],   mc02[r, c+1])

The [r+1, c] and [r, c+1] offsets capture the position-dependent
shift that the FFmpeg .S encodes by pre-incrementing x1 (src pointer)
before branching into the common mc11/mc21 code paths.

Scope (tightly macro-ised):
  - 8 new kernel enums (MC11..MC33 = 23..30) → CPU.
  - 8 NEON externs for the vendored ff_put_h264_qpel8_mc*_neon.
  - 8 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro.
  - 8 public dispatches via DEFINE_QPEL_DISPATCH macro.
  - 8 recipe wrappers via DEFINE_QPEL_RECIPE macro.
  - Header decls condensed via a DECLARE_QPEL_DIAG macro that
    expands to both recipe + dispatch decls per name.
  - C references via DEFINE_DIAG_REF macro: each ref is a 6-line
    wrapper around the per-cell hpel_h / hpel_v / hpel_hv helpers
    (the latter being the per-cell version of mc22's 13-row int16
    tmp[] computation).
  - Test wrapper: test_qpel_diag_all() drives all 8 through the
    existing run_quarter_axis_qpel() harness.

Verified on hertz (Pi 5 / V3D 7.1):

  $ ./build/test_api_h264 | tail -8
    H.264 qpel mc11: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc12: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc13: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc21: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc23: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc31: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc32: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc33: 2048/2048 bytes bit-exact (100.0000%)

All 8 diagonal positions pass bit-exact on the first try.  This is
meaningful because the position-dependent [r+1, c] / [r, c+1] source
offsets are easy to get wrong in transcription, and any such error
would surface immediately on random inputs.

After this PR the H.264 qpel 8x8 put_ matrix is complete:
  mc00 mc01 mc02 mc03
  mc10 mc11 mc12 mc13
  mc20 mc21 mc22 mc23
  mc30 mc31 mc32 mc33

15 of 16 positions exposed through the daedalus API; mc00 is just
integer copy and rarely needs a dispatch wrapper (libavcodec sets
the function pointer table directly).  mc20 retains its QPU shader
(cycle 9 / v3d_h264_qpel_mc20.spv); all other 14 are CPU NEON.

What this does NOT cover (still in backlog):
  - avg_ variants (the "add" form for biprediction, 16 more
    positions).  Currently the API only exposes put_.
  - 16x16 qpel (separate function family in FFmpeg; the 8x8 path
    can be used twice to substitute when 16x16 isn't critical).
  - QPU shaders for any qpel position other than mc20.
---
 CMakeLists.txt              |  1 +
 include/daedalus.h          | 44 +++++++++++++++++
 src/daedalus_core.c         | 40 +++++++++++++++
 tests/h264_qpel8_diag_ref.c | 98 +++++++++++++++++++++++++++++++++++++
 tests/test_api_h264.c       | 36 ++++++++++++++
 5 files changed, 219 insertions(+)
 create mode 100644 tests/h264_qpel8_diag_ref.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 971a63e..f8e1059 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -526,6 +526,7 @@ add_executable(test_api_h264
     tests/h264_qpel8_mc02_ref.c
     tests/h264_qpel8_mc22_ref.c
     tests/h264_qpel8_quarter_axis_ref.c
+    tests/h264_qpel8_diag_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
diff --git a/include/daedalus.h b/include/daedalus.h
index 46f9fce..09f0193 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -475,6 +475,42 @@ int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub,
     uint8_t *dst, const uint8_t *src, size_t stride,
     size_t n_blocks, const daedalus_h264_qpel_meta *meta);
 
+/* H.264 luma diagonal qpel positions ("put", 8 variants).  Each is
+ * the rounded average of two half-pel intermediates per H.264
+ * §8.4.2.2.1 / Figure 8-4 (decomposition matches the FFmpeg .S
+ * structure; see tests/h264_qpel8_diag_ref.c for the formulas).
+ *
+ *   mc11 ¼¼ : avg(mc20[r,c],   mc02[r,c])
+ *   mc12 ¼½ : avg(mc22[r,c],   mc02[r,c])
+ *   mc13 ¼¾ : avg(mc20[r+1,c], mc02[r,c])
+ *   mc21 ½¼ : avg(mc22[r,c],   mc20[r,c])
+ *   mc23 ½¾ : avg(mc22[r,c],   mc20[r+1,c])
+ *   mc31 ¾¼ : avg(mc20[r,c],   mc02[r,c+1])
+ *   mc32 ¾½ : avg(mc22[r,c],   mc02[r,c+1])
+ *   mc33 ¾¾ : avg(mc20[r+1,c], mc02[r,c+1])
+ *
+ * CPU-only via vendored FFmpeg NEON; QPU shaders pending.
+ * Explicit SUBSTRATE_QPU returns -1.
+ */
+#define DECLARE_QPEL_DIAG(name) \
+int daedalus_recipe_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, \
+    uint8_t *dst, const uint8_t *src, size_t stride, \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta); \
+int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate sub, \
+    uint8_t *dst, const uint8_t *src, size_t stride, \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+DECLARE_QPEL_DIAG(mc11)
+DECLARE_QPEL_DIAG(mc12)
+DECLARE_QPEL_DIAG(mc13)
+DECLARE_QPEL_DIAG(mc21)
+DECLARE_QPEL_DIAG(mc23)
+DECLARE_QPEL_DIAG(mc31)
+DECLARE_QPEL_DIAG(mc32)
+DECLARE_QPEL_DIAG(mc33)
+
+#undef DECLARE_QPEL_DIAG
+
 /* -------------------------------------------------------------------
  * Recipe query — what does the API recommend for each kernel?
  * ----------------------------------------------------------------- */
@@ -501,6 +537,14 @@ typedef enum {
     DAEDALUS_KERNEL_H264_QPEL_MC30        = 20,
     DAEDALUS_KERNEL_H264_QPEL_MC01        = 21,
     DAEDALUS_KERNEL_H264_QPEL_MC03        = 22,
+    DAEDALUS_KERNEL_H264_QPEL_MC11        = 23,
+    DAEDALUS_KERNEL_H264_QPEL_MC12        = 24,
+    DAEDALUS_KERNEL_H264_QPEL_MC13        = 25,
+    DAEDALUS_KERNEL_H264_QPEL_MC21        = 26,
+    DAEDALUS_KERNEL_H264_QPEL_MC23        = 27,
+    DAEDALUS_KERNEL_H264_QPEL_MC31        = 28,
+    DAEDALUS_KERNEL_H264_QPEL_MC32        = 29,
+    DAEDALUS_KERNEL_H264_QPEL_MC33        = 30,
 } daedalus_kernel;
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 69edcc1..bf17585 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -144,6 +144,14 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_QPEL_MC30:   return DAEDALUS_SUBSTRATE_CPU;	/* ¾-H L2 */
     case DAEDALUS_KERNEL_H264_QPEL_MC01:   return DAEDALUS_SUBSTRATE_CPU;	/* ¼-V L2 */
     case DAEDALUS_KERNEL_H264_QPEL_MC03:   return DAEDALUS_SUBSTRATE_CPU;	/* ¾-V L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC11:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¼¼ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC12:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¼½ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC13:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¼¾ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC21:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ½¼ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC23:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ½¾ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC31:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¾¼ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC32:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¾½ */
+    case DAEDALUS_KERNEL_H264_QPEL_MC33:   return DAEDALUS_SUBSTRATE_CPU;	/* diagonal ¾¾ */
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -196,6 +204,14 @@ extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
                                          ptrdiff_t stride);
 extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
                                          ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 
 /* -------------------- CPU dispatch implementations -------------- */
 
@@ -468,6 +484,14 @@ DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
 DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
 DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
 DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc12, ff_put_h264_qpel8_mc12_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)
 
 #undef DEFINE_QPEL_CPU_DISPATCH
 
@@ -1489,6 +1513,14 @@ DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
 DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
 DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
 DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
+DEFINE_QPEL_DISPATCH(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11)
+DEFINE_QPEL_DISPATCH(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12)
+DEFINE_QPEL_DISPATCH(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13)
+DEFINE_QPEL_DISPATCH(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21)
+DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
+DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
+DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
+DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
 
 #undef DEFINE_QPEL_DISPATCH
 
@@ -1640,5 +1672,13 @@ DEFINE_QPEL_RECIPE(mc10)
 DEFINE_QPEL_RECIPE(mc30)
 DEFINE_QPEL_RECIPE(mc01)
 DEFINE_QPEL_RECIPE(mc03)
+DEFINE_QPEL_RECIPE(mc11)
+DEFINE_QPEL_RECIPE(mc12)
+DEFINE_QPEL_RECIPE(mc13)
+DEFINE_QPEL_RECIPE(mc21)
+DEFINE_QPEL_RECIPE(mc23)
+DEFINE_QPEL_RECIPE(mc31)
+DEFINE_QPEL_RECIPE(mc32)
+DEFINE_QPEL_RECIPE(mc33)
 
 #undef DEFINE_QPEL_RECIPE
diff --git a/tests/h264_qpel8_diag_ref.c b/tests/h264_qpel8_diag_ref.c
new file mode 100644
index 0000000..06c6243
--- /dev/null
+++ b/tests/h264_qpel8_diag_ref.c
@@ -0,0 +1,98 @@
+/*
+ * Standalone bit-exact C references for the 8 diagonal H.264 luma
+ * qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33).
+ * Each is the rounded average of two half-pel intermediates per
+ * H.264 §8.4.2.2.1 / Figure 8-4, decomposed to match the FFmpeg .S
+ * reference structure (see comments in mc{11,12,21,...}_neon in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S).
+ *
+ * Position decompositions (verified against the .S):
+ *   mc11 (e, ¼¼): avg(mc20[r,c],   mc02[r,c])
+ *   mc12 (i, ¼½): avg(mc22[r,c],   mc02[r,c])
+ *   mc13 (p, ¼¾): avg(mc20[r+1,c], mc02[r,c])
+ *   mc21 (f, ½¼): avg(mc22[r,c],   mc20[r,c])
+ *   mc23 (q, ½¾): avg(mc22[r,c],   mc20[r+1,c])
+ *   mc31 (g, ¾¼): avg(mc20[r,c],   mc02[r,c+1])
+ *   mc32 (k, ¾½): avg(mc22[r,c],   mc02[r,c+1])
+ *   mc33 (r, ¾¾): avg(mc20[r+1,c], mc02[r,c+1])
+ *
+ * (The mc20[r,c] notation means "the mc20-style horizontal half-pel
+ * result at source-relative integer position (r, c)"; analogously
+ * for mc02 and mc22.)
+ *
+ * Single-stride convention; same edge-context contract as the simpler
+ * variants (the cells "[r+1,c]" etc. demand one extra row/col of
+ * source context beyond what mc20/mc02 alone would need).
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
+
+/* Single-cell helpers — same arithmetic as the dedicated mc20/mc02
+ * refs but computed point-by-point so the diagonal refs can mix them
+ * cheaply.  Each returns a u8 (already clipped). */
+static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    int v = (int) s[r*stride + c-2] - 5 * (int) s[r*stride + c-1]
+          + 20 * (int) s[r*stride + c]   + 20 * (int) s[r*stride + c+1]
+          - 5 * (int) s[r*stride + c+2]  + (int) s[r*stride + c+3]
+          + 16;
+    return (uint8_t) clip_u8(v >> 5);
+}
+static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    int v = (int) s[(r-2)*stride + c] - 5 * (int) s[(r-1)*stride + c]
+          + 20 * (int) s[r*stride + c] + 20 * (int) s[(r+1)*stride + c]
+          - 5 * (int) s[(r+2)*stride + c] + (int) s[(r+3)*stride + c]
+          + 16;
+    return (uint8_t) clip_u8(v >> 5);
+}
+
+/* hpel_hv — 2D half-pel at (r, c) per the H.264 §8.4.2.2.1 "j"
+ * cascade.  First computes 6 horizontal intermediates, one per row
+ * r-2..r+3: each is the unrounded 6-tap h-lowpass over columns
+ * c-2..c+3 of that row (range fits int16; held in int here).  Then
+ * runs the 6-tap v-lowpass over those with the +512 >> 10 final
+ * scale.  Same as the mc22 ref, just expressed point-by-point. */
+static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    int t[6];   /* tmp at rows r-2..r+3 of the same col c */
+    for (int i = 0; i < 6; i++) {
+        int rr = r - 2 + i;
+        t[i] = (int) s[rr*stride + c-2] - 5 * (int) s[rr*stride + c-1]
+             + 20 * (int) s[rr*stride + c]   + 20 * (int) s[rr*stride + c+1]
+             - 5 * (int) s[rr*stride + c+2]  + (int) s[rr*stride + c+3];
+    }
+    int v = t[0] - 5 * t[1] + 20 * t[2] + 20 * t[3] - 5 * t[4] + t[5] + 512;
+    return (uint8_t) clip_u8(v >> 10);
+}
+
+/* Rounded average ((a + b + 1) >> 1).  Both inputs are u8, so the
+ * result never exceeds 255 and needs no further clip. */
+static inline uint8_t avg2(uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); }
+
+#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR)                                  \
+void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,                    \
+    const uint8_t *src, ptrdiff_t stride)                                      \
+{                                                                              \
+    for (int r = 0; r < 8; r++)                                                \
+        for (int c = 0; c < 8; c++) {                                          \
+            uint8_t a = (A_EXPR);                                              \
+            uint8_t b = (B_EXPR);                                              \
+            dst[r*stride + c] = avg2(a, b);                                    \
+        }                                                                      \
+}
+
+DEFINE_DIAG_REF(mc11, hpel_h(src,   r, c, stride), hpel_v(src, r,   c, stride))
+DEFINE_DIAG_REF(mc12, hpel_hv(src,  r, c, stride), hpel_v(src, r,   c, stride))
+DEFINE_DIAG_REF(mc13, hpel_h(src, r+1, c, stride), hpel_v(src, r,   c, stride))
+DEFINE_DIAG_REF(mc21, hpel_hv(src,  r, c, stride), hpel_h(src, r,   c, stride))
+DEFINE_DIAG_REF(mc23, hpel_hv(src,  r, c, stride), hpel_h(src, r+1, c, stride))
+DEFINE_DIAG_REF(mc31, hpel_h(src,   r, c, stride), hpel_v(src, r, c+1, stride))
+DEFINE_DIAG_REF(mc32, hpel_hv(src,  r, c, stride), hpel_v(src, r, c+1, stride))
+DEFINE_DIAG_REF(mc33, hpel_h(src, r+1, c, stride), hpel_v(src, r, c+1, stride))
+
+#undef DEFINE_DIAG_REF
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index 370f34b..c67c7e6 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -44,6 +44,14 @@ extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
                                                 ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
                                                 ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t stride);
 
@@ -548,6 +556,33 @@ static int test_qpel_quarter_axis_all(void)
     return fail;
 }
 
+static int test_qpel_diag_all(void)
+{
+    /* Diagonal positions need TWO half-pel intermediates per output;
+     * some of them read at (r+1,c) or (r,c+1) so the test geometry
+     * needs an extra row + col of context.  run_quarter_axis_qpel
+     * already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile)
+     * — reusing that harness is fine. */
+    int fail = 0;
+    fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc11);
+    fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc12);
+    fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc13);
+    fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc21);
+    fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc23);
+    fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc31);
+    fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc32);
+    fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref,
+                                          daedalus_recipe_dispatch_h264_qpel_mc33);
+    return fail;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -581,5 +616,6 @@ int main(void)
     fail |= test_qpel_mc02();
     fail |= test_qpel_mc22();
     fail |= test_qpel_quarter_axis_all();
+    fail |= test_qpel_diag_all();
     return fail;
 }