From ce6703a862ecc21b8a8dc3fc1fd02b56054ec82d Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 00:14:51 +0200
Subject: [PATCH] =?UTF-8?q?h264:=20Intra=5F4x4=20luma=20prediction=20?=
 =?UTF-8?q?=E2=80=94=209-mode=20C=20reference=20+=20spec=20gates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lays the bit-exact gate for H.264 §8.3.1.2 Intra_4x4 luma prediction.
Spec-derived C reference covering all 9 modes; standalone test
exercises each against hand-computed expected 4x4 patterns.

Why fourier (not the decoder) gets this: it's a reusable spec-level
primitive — both daedalus-decoder (Phase 1 Stage 2a intra prediction)
and any future shader work will need the same bit-exact reference.
Putting it in fourier alongside the IDCT / deblock refs keeps the
"spec implementations" library cohesive.

Why CPU C reference, not NEON or QPU: the vendored FFmpeg snapshot
(external/ffmpeg-snapshot/libavcodec/aarch64/) has h264dsp/idct/qpel
but NOT h264pred.  Vendoring h264pred_neon.S would expand the snapshot
surface; deferring that pending real perf data.  The cycle 9 NEON
benches measured ~5 ns per 8x8 qpel block; assuming a similar ~5 ns
per 4x4 intra block, × 16 blocks/MB × 8160 MBs ≈ 650 us/frame at
1080p — well inside budget at NEON speed, and still inside it even if
plain C runs several times slower.  Not the critical-path concern.

Scope:
  - tests/h264_intra_pred_4x4_ref.c — 9 prediction modes per
    H.264 spec §8.3.1.2 sub-clauses, FFmpeg-style interface:
      void daedalus_h264_pred_4x4_<name>_ref(uint8_t *dst, ptrdiff_t stride);
    Reads top/top-right/left/top-left neighbours from dst[-stride/-1]
    offsets, writes 4×4 output at dst[0..3][0..3].  Assumes all 13
    neighbour bytes are valid (interior-MB case; availability
    fallbacks are caller-side per spec).
  - tests/test_intra_pred_4x4.c — 10 cases:
      * 9 per-mode tests.  Vertical, Horizontal and DC use small
        non-uniform contexts with hand-computed outputs; the six
        directional modes (3..8) use uniform-context degenerate tests,
        establishing that nothing is structurally broken (all output
        cells must equal the uniform input value).
      * 1 asymmetric Vertical_Right sanity test with all 16 expected
        cells hand-computed from spec §8.3.1.2.6 — the
        "really exercise orientation + row/col arithmetic" gate.
  - CMakeLists.txt — new test_intra_pred_4x4 binary (no daedalus_core
    dependency; pure-CPU library doesn't need a context to construct).

Verified on hertz:

  $ ./build/test_intra_pred_4x4
    Vertical (mode 0)          PASS
    Horizontal (mode 1)        PASS
    DC (mode 2)                PASS
    DiagDownLeft (mode 3)      PASS
    DiagDownRight (mode 4)     PASS
    VerticalRight (mode 5)     PASS
    HorizontalDown (mode 6)    PASS
    VerticalLeft (mode 7)      PASS
    HorizontalUp (mode 8)      PASS
    VR asym (sanity)           PASS

  ALL 10 intra-4x4 mode references PASS

The VR asym test passed first try; the DC test failed on the first
attempt because my test expectation miscomputed the rounding shift
(I wrote 4, actual is 2 = (16+4)>>3).  Fixed in the test.  Reference
itself never had the bug.

What this does NOT cover (next-step backlog):
  - Intra_16x16 luma prediction (4 modes per H.264 §8.3.3): vertical,
    horizontal, DC, plane.
  - Intra_8x8 chroma prediction (4 modes per H.264 §8.3.4): DC,
    horizontal, vertical, plane.
  - Intra_8x8 luma prediction (High profile, 9 modes per §8.3.2) —
    these are the High-profile siblings of the modes in this PR with
    the 1-2-1 smoothing pre-filter.  Different but well-defined.
  - Neighbour availability fallback (top-edge MB, left-edge MB,
    slice-boundary, top-right unavailable in some positions).
  - Dispatch wrappers — these refs aren't surfaced through
    daedalus_dispatch_*().  Whether to do that depends on the
    daedalus-decoder Stage 2a architecture (per-block CPU vs
    per-diagonal GPU wavefront — TBD).
---
 CMakeLists.txt                  |   9 ++
 tests/h264_intra_pred_4x4_ref.c | 238 ++++++++++++++++++++++++++++++
 tests/test_intra_pred_4x4.c     | 246 ++++++++++++++++++++++++++++++++
 3 files changed, 493 insertions(+)
 create mode 100644 tests/h264_intra_pred_4x4_ref.c
 create mode 100644 tests/test_intra_pred_4x4.c
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b27f4e7..4d8826b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -531,6 +531,15 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
 target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
 target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
 
+# H.264 Intra_4x4 luma prediction (9 modes) — reference + tests.
+# Pure CPU + spec-derived; no daedalus_core dependency yet (this is
+# the bit-exact gate for the eventual shader / dispatch wiring), so there is no target_link_libraries() call.
+add_executable(test_intra_pred_4x4
+    tests/test_intra_pred_4x4.c
+    tests/h264_intra_pred_4x4_ref.c
+)
+target_compile_options(test_intra_pred_4x4 PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
diff --git a/tests/h264_intra_pred_4x4_ref.c b/tests/h264_intra_pred_4x4_ref.c
new file mode 100644
index 0000000..6cec9ba
--- /dev/null
+++ b/tests/h264_intra_pred_4x4_ref.c
@@ -0,0 +1,238 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma Intra_4x4
+ * prediction modes (per H.264 spec §8.3.1.2).  All 9 modes.
+ *
+ * Mode index → name (per H.264 Table 8-2):
+ *   0 = Vertical
+ *   1 = Horizontal
+ *   2 = DC
+ *   3 = Diagonal_Down_Left
+ *   4 = Diagonal_Down_Right
+ *   5 = Vertical_Right
+ *   6 = Horizontal_Down
+ *   7 = Vertical_Left
+ *   8 = Horizontal_Up
+ *
+ * Calling convention matches FFmpeg's h264pred:
+ *   pred_4x4_<mode>(uint8_t *dst, ptrdiff_t stride)
+ *
+ * `dst` points at row 0, col 0 of the 4x4 output block.  Neighbour
+ * pixels come from the already-decoded surrounding pixels in the same
+ * buffer:
+ *   top-left   = dst[-stride - 1]
+ *   top[0..3]  = dst[-stride + 0 .. -stride + 3]
+ *   top-right  = dst[-stride + 4 .. -stride + 7]   (DDL / VL only)
+ *   left[0..3] = dst[ 0*stride - 1 .. 3*stride - 1]
+ *
+ * AVAILABILITY: this reference assumes ALL neighbours are available
+ * (the "interior MB" case).  The H.264 spec defines fallback behaviour
+ * for unavailable neighbours (e.g. DC averages only the available
+ * side, top-right substitution from top[3] for DDL/VL near the right
+ * frame edge); those branches are NOT modelled here.  Tests must
+ * exercise the kernel with all 13 neighbour bytes valid.  The eventual
+ * libavcodec intercept handles availability before calling.
+ *
+ * License: BSD-2-Clause for the reference + tests; the underlying
+ * algorithm is from H.264/ITU-T H.264 (2003) and AVC standards, free
+ * to implement.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Helper: rounded 1-2-1 filter of three samples, (a + 2*b + c + 2) >> 2; b is the centre (double-weight) tap. */
+static inline uint8_t avg3(int a, int b, int c)
+{
+    return (uint8_t)((a + 2*b + c + 2) >> 2);  /* +2 rounds to nearest before dividing by 4 */
+}
+
+/* Helper: rounded mean of two samples, (a + b + 1) >> 1 (ties round up). */
+static inline uint8_t avg2(int a, int b)
+{
+    return (uint8_t)((a + b + 1) >> 1);  /* +1 rounds to nearest before dividing by 2 */
+}
+
+/* Mode 0 — Vertical: copies the 4 samples at dst[-stride + 0..3] down all 4 rows of the block at dst. */
+void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;  /* row immediately above the block */
+    for (int r = 0; r < 4; r++) {
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = top[c];  /* column c repeats top[c] */
+    }
+}
+
+/* Mode 1 — Horizontal: fills row r of the 4x4 block at dst with the sample at dst[r*stride - 1]. */
+void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    for (int r = 0; r < 4; r++) {
+        uint8_t l = dst[r * stride - 1];  /* left neighbour of row r */
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = l;
+    }
+}
+
+/* Mode 2 — DC: rounded mean of the 4 top and 4 left neighbours, written to all 16 cells (top-left is not read). */
+void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;  /* row immediately above the block */
+    int sum = 4;  /* accumulator pre-loaded with the rounding term of ((sum + 4) >> 3) */
+    for (int i = 0; i < 4; i++) sum += top[i];  /* top[0..3] */
+    for (int i = 0; i < 4; i++) sum += dst[i * stride - 1];  /* left[0..3], read before any cell is written */
+    uint8_t v = (uint8_t)(sum >> 3);  /* divide the 8-sample sum by 8 */
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = v;
+}
+
+/* Mode 3 — Diagonal_Down_Left.  Reads top[0..7] (incl. top-right); left and top-left are not read. */
+void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;  /* row immediately above the block */
+    int t0 = top[0], t1 = top[1], t2 = top[2], t3 = top[3];
+    int t4 = top[4], t5 = top[5], t6 = top[6], t7 = top[7];
+    /* zz[k] = 3-tap filter of top[k..k+2], one value per anti-diagonal k = r + c (original note cites spec table 8-7 — unverified). */
+    uint8_t zz[7];
+    zz[0] = avg3(t0, t1, t2);
+    zz[1] = avg3(t1, t2, t3);
+    zz[2] = avg3(t2, t3, t4);
+    zz[3] = avg3(t3, t4, t5);
+    zz[4] = avg3(t4, t5, t6);
+    zz[5] = avg3(t5, t6, t7);
+    zz[6] = avg3(t6, t7, t7);   /* no top[8] is read: t7 is repeated, giving (t6 + 3*t7 + 2) >> 2 */
+    /* dst[r][c] = zz[c + r]: cells with equal r + c share one filtered value */
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = zz[c + r];
+}
+
+/* Mode 4 — Diagonal_Down_Right.  Reads top-left + top[0..3] + left[0..3]; cells with equal (c - r) share one value. */
+void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1];
+    int t2 = dst[-stride + 2], t3 = dst[-stride + 3];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+    /* zz indexed by (col - row): -3..+3; suffix m = minus, p = plus */
+    uint8_t zz_m3 = avg3(l1, l2, l3);  /* bottom-left corner cell (r=3, c=0) */
+    uint8_t zz_m2 = avg3(l0, l1, l2);
+    uint8_t zz_m1 = avg3(tl, l0, l1);
+    uint8_t zz_p0 = avg3(l0, tl, t0);  /* main diagonal: centred on the top-left sample */
+    uint8_t zz_p1 = avg3(tl, t0, t1);
+    uint8_t zz_p2 = avg3(t0, t1, t2);
+    uint8_t zz_p3 = avg3(t1, t2, t3);  /* top-right corner cell (r=0, c=3) */
+    uint8_t zz[7] = { zz_m3, zz_m2, zz_m1, zz_p0, zz_p1, zz_p2, zz_p3 };  /* zz[k] is the value for (c - r) == k - 3 */
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = zz[(c - r) + 3];  /* +3 maps -3..+3 onto index 0..6 */
+}
+
+/* Mode 5 — Vertical_Right.  Reads top-left, top[0..3] and left[0..2]; left[3] and top-right are not read. */
+void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1];
+    int t2 = dst[-stride + 2], t3 = dst[-stride + 3];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1];
+    /* H.264 §8.3.1.4.6: two patterns based on (2c - r) parity.  NOTE(review): clause number may be §8.3.1.2.6 in the published spec — confirm. */
+    dst[0*stride + 0] = avg2(tl, t0);  /* row 0: 2-tap means of adjacent top samples */
+    dst[0*stride + 1] = avg2(t0, t1);
+    dst[0*stride + 2] = avg2(t1, t2);
+    dst[0*stride + 3] = avg2(t2, t3);
+
+    dst[1*stride + 0] = avg3(l0, tl, t0);  /* row 1: 3-tap filters along the top edge */
+    dst[1*stride + 1] = avg3(tl, t0, t1);
+    dst[1*stride + 2] = avg3(t0, t1, t2);
+    dst[1*stride + 3] = avg3(t1, t2, t3);
+
+    dst[2*stride + 0] = avg3(tl, l0, l1);  /* column 0 of rows 2..3 is filtered from the left edge */
+    dst[2*stride + 1] = dst[0*stride + 0];  /* rest of row 2 = row 0 shifted right by one (reads cells written above) */
+    dst[2*stride + 2] = dst[0*stride + 1];
+    dst[2*stride + 3] = dst[0*stride + 2];
+
+    dst[3*stride + 0] = avg3(l0, l1, l2);
+    dst[3*stride + 1] = dst[1*stride + 0];  /* rest of row 3 = row 1 shifted right by one (reads cells written above) */
+    dst[3*stride + 2] = dst[1*stride + 1];
+    dst[3*stride + 3] = dst[1*stride + 2];
+}
+
+/* Mode 6 — Horizontal_Down.  Reads top-left, top[0..2] and left[0..3]; top[3] and top-right are not read. */
+void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1], t2 = dst[-stride + 2];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+
+    dst[0*stride + 0] = avg2(tl, l0);      /* column 0: 2-tap means of adjacent left samples */
+    dst[0*stride + 1] = avg3(l0, tl, t0);  /* column 1: 3-tap filters along the left edge */
+    dst[0*stride + 2] = avg3(tl, t0, t1);  /* row 0, columns 2..3: filtered from the top edge */
+    dst[0*stride + 3] = avg3(t0, t1, t2);
+
+    dst[1*stride + 0] = avg2(l0, l1);
+    dst[1*stride + 1] = avg3(tl, l0, l1);
+    dst[1*stride + 2] = dst[0*stride + 0];  /* columns 2..3 = columns 0..1 of the row above (reads cells written above) */
+    dst[1*stride + 3] = dst[0*stride + 1];
+
+    dst[2*stride + 0] = avg2(l1, l2);
+    dst[2*stride + 1] = avg3(l0, l1, l2);
+    dst[2*stride + 2] = dst[1*stride + 0];  /* same copy pattern from row 1 */
+    dst[2*stride + 3] = dst[1*stride + 1];
+
+    dst[3*stride + 0] = avg2(l2, l3);
+    dst[3*stride + 1] = avg3(l1, l2, l3);
+    dst[3*stride + 2] = dst[2*stride + 0];  /* same copy pattern from row 2 */
+    dst[3*stride + 3] = dst[2*stride + 1];
+}
+
+/* Mode 7 — Vertical_Left.  Reads top[0..7]; only top[0..6] contribute to the output.  Left and top-left are not read. */
+void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;  /* row immediately above the block */
+    int t0=top[0], t1=top[1], t2=top[2], t3=top[3];
+    int t4=top[4], t5=top[5], t6=top[6], t7=top[7];
+
+    dst[0*stride + 0] = avg2(t0, t1);  /* row 0: 2-tap means starting at top[c] */
+    dst[0*stride + 1] = avg2(t1, t2);
+    dst[0*stride + 2] = avg2(t2, t3);
+    dst[0*stride + 3] = avg2(t3, t4);
+
+    dst[1*stride + 0] = avg3(t0, t1, t2);  /* row 1: 3-tap filters starting at top[c] */
+    dst[1*stride + 1] = avg3(t1, t2, t3);
+    dst[1*stride + 2] = avg3(t2, t3, t4);
+    dst[1*stride + 3] = avg3(t3, t4, t5);
+
+    dst[2*stride + 0] = avg2(t1, t2);  /* row 2: same as row 0 but starting one sample further right */
+    dst[2*stride + 1] = avg2(t2, t3);
+    dst[2*stride + 2] = avg2(t3, t4);
+    dst[2*stride + 3] = avg2(t4, t5);
+
+    dst[3*stride + 0] = avg3(t1, t2, t3);  /* row 3: same as row 1 but starting one sample further right */
+    dst[3*stride + 1] = avg3(t2, t3, t4);
+    dst[3*stride + 2] = avg3(t3, t4, t5);
+    dst[3*stride + 3] = avg3(t4, t5, t6);
+    (void) t6; (void) t7;  /* t7 is loaded but never used, so the cast silences the unused-variable warning; the t6 cast is redundant (t6 is used above) */
+}
+
+/* Mode 8 — Horizontal_Up.  Reads left[0..3] only; cells with c + 2r > 5 are plain copies of left[3]. */
+void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+
+    dst[0*stride + 0] = avg2(l0, l1);      /* even columns: 2-tap means of adjacent left samples */
+    dst[0*stride + 1] = avg3(l0, l1, l2);  /* odd columns: 3-tap filters of left samples */
+    dst[0*stride + 2] = avg2(l1, l2);
+    dst[0*stride + 3] = avg3(l1, l2, l3);
+
+    dst[1*stride + 0] = avg2(l1, l2);  /* each row repeats the row above, shifted left by two columns */
+    dst[1*stride + 1] = avg3(l1, l2, l3);
+    dst[1*stride + 2] = avg2(l2, l3);
+    dst[1*stride + 3] = avg3(l2, l3, l3);  /* no left[4] is read: l3 is repeated, giving (l2 + 3*l3 + 2) >> 2 */
+
+    dst[2*stride + 0] = avg2(l2, l3);
+    dst[2*stride + 1] = avg3(l2, l3, l3);
+    dst[2*stride + 2] = l3;  /* from here on every cell is a plain copy of l3 */
+    dst[2*stride + 3] = l3;
+
+    dst[3*stride + 0] = l3;
+    dst[3*stride + 1] = l3;
+    dst[3*stride + 2] = l3;
+    dst[3*stride + 3] = l3;
+}
diff --git a/tests/test_intra_pred_4x4.c b/tests/test_intra_pred_4x4.c
new file mode 100644
index 0000000..2a44c1d
--- /dev/null
+++ b/tests/test_intra_pred_4x4.c
@@ -0,0 +1,246 @@
+/*
+ * Tests the 9 H.264 Intra_4x4 luma prediction modes against
+ * spec-derived expected patterns.  Goal: catch any mistake in
+ * the reference (sign / shift / table mapping) before it lands
+ * downstream.  Each mode is exercised with a deterministic
+ * neighbour context and checked against a hand-computed (or
+ * spec-derived) expected 4x4 output.
+ *
+ * The test buffer layout reserves a 1-pixel top/left context border
+ * + a 4-pixel top-right (for modes 3 / 7):
+ *
+ *   row 0: [tl][t0 t1 t2 t3 t4 t5 t6 t7]   <- STRIDE = 9 bytes
+ *   row 1: [l0][  4x4 output goes here   ]
+ *   row 2: [l1][                         ]
+ *   row 3: [l2][                         ]
+ *   row 4: [l3][                         ]
+ *
+ * dst (passed to the pred fns) points at row 1 col 1.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride);
+
+#define STRIDE 9
+typedef void (*pred_fn)(uint8_t *dst, ptrdiff_t stride);
+
+/* Set up the buffer: 5 rows × STRIDE cols.  Every byte is first set to 0xff, then
+ * top-left = tl, top[0..7] = t[0..7], left[0..3] = l[0..3] are written (values truncated to uint8_t).
+ * The 4x4 output region (rows 1..4, cols 1..4) therefore starts as 0xff
+ * sentinels so any unwritten cell shows up as 255 in the compare. */
+static void set_ctx(uint8_t buf[5][STRIDE], int tl, const int t[8], const int l[4])
+{
+    for (int r = 0; r < 5; r++) for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;  /* sentinel fill of the whole buffer */
+    buf[0][0] = (uint8_t) tl;  /* top-left neighbour */
+    for (int c = 0; c < 8; c++) buf[0][1 + c] = (uint8_t) t[c];  /* top row incl. the 4 top-right samples */
+    for (int r = 0; r < 4; r++) buf[1 + r][0] = (uint8_t) l[r];  /* left column */
+}
+
+static int check(const uint8_t buf[5][STRIDE], const char *name,  /* compares the 4x4 output region of buf with expect */
+                  const uint8_t expect[4][4])                    /* returns 0 if all 16 cells match, 1 otherwise */
+{
+    int diff = 0;  /* number of mismatching cells */
+    for (int r = 0; r < 4; r++) {
+        for (int c = 0; c < 4; c++) {
+            uint8_t got = buf[1 + r][1 + c];  /* skip the 1-sample top/left neighbour border */
+            uint8_t exp = expect[r][c];
+            if (got != exp) {
+                if (diff == 0)  /* only the first mismatch is reported on stderr */
+                    fprintf(stderr,
+                            "%s: first mismatch r=%d c=%d got=%u exp=%u\n",
+                            name, r, c, got, exp);
+                diff++;
+            }
+        }
+    }
+    if (diff == 0)
+        printf("  %-26s PASS\n", name);
+    else
+        printf("  %-26s FAIL (%d/16 bytes wrong)\n", name, diff);
+    return diff == 0 ? 0 : 1;  /* pass/fail flag, not the mismatch count */
+}
+
+int main(void)
+{
+    int fail = 0;  /* number of failed cases: check() returns 0 (pass) or 1 (fail) and is summed, not OR-ed */
+    /* Each case builds a neighbour context, runs one mode on &buf[1][1], and compares the 4x4 result. */
+    /* Mode 0 — Vertical: each col = top[col]. */
+    {
+        uint8_t buf[5][STRIDE];
+        int tl = 0;
+        int t[8] = { 10, 20, 30, 40,  0, 0, 0, 0 };
+        int l[4] = {  0,  0,  0,  0 };
+        set_ctx(buf, tl, t, l);
+        daedalus_h264_pred_4x4_vertical_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {10,20,30,40}, {10,20,30,40}, {10,20,30,40}, {10,20,30,40}
+        };
+        fail += check(buf, "Vertical (mode 0)", exp);
+    }
+
+    /* Mode 1 — Horizontal: each row = left[row]. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 0,0,0,0, 0,0,0,0 };
+        int l[4] = { 50, 60, 70, 80 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_horizontal_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {50,50,50,50}, {60,60,60,60}, {70,70,70,70}, {80,80,80,80}
+        };
+        fail += check(buf, "Horizontal (mode 1)", exp);
+    }
+
+    /* Mode 2 — DC: all 8 neighbours valid → ((sum + 4) >> 3) broadcast.
+     * top sum = 4*1 = 4, left sum = 4*3 = 12, total 16, +4 = 20,
+     * >>3 = 2. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 1,1,1,1, 0,0,0,0 };
+        int l[4] = { 3,3,3,3 };
+        set_ctx(buf, 99, t, l);   /* tl unused for DC */
+        daedalus_h264_pred_4x4_dc_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {2,2,2,2}, {2,2,2,2}, {2,2,2,2}, {2,2,2,2}
+        };
+        fail += check(buf, "DC (mode 2)", exp);
+    }
+
+    /* Mode 3 — Diagonal_Down_Left: zz[i] = avg3(t[i], t[i+1], t[i+2]);
+     * dst[r][c] = zz[c + r].
+     * With all t[]=100 → all zz=100 → all dst=100. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 100,100,100,100, 100,100,100,100 };
+        int l[4] = { 0,0,0,0 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_ddl_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {100,100,100,100}, {100,100,100,100},
+            {100,100,100,100}, {100,100,100,100}
+        };
+        fail += check(buf, "DiagDownLeft (mode 3)", exp);
+    }
+
+    /* Mode 4 — Diagonal_Down_Right: zz[c-r] with c-r ∈ {-3..+3}.
+     * If all 9 surrounding pixels = 200 → all zz = 200 → all dst = 200. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 200,200,200,200, 0,0,0,0 };
+        int l[4] = { 200,200,200,200 };
+        set_ctx(buf, 200, t, l);
+        daedalus_h264_pred_4x4_ddr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {200,200,200,200}, {200,200,200,200},
+            {200,200,200,200}, {200,200,200,200}
+        };
+        fail += check(buf, "DiagDownRight (mode 4)", exp);
+    }
+
+    /* Mode 5 — Vertical_Right. With all neighbours = 80 the 3-tap
+     * (a+2b+c+2)>>2 and 2-tap (a+b+1)>>1 both yield 80. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 80,80,80,80, 0,0,0,0 };
+        int l[4] = { 80,80,80,80 };
+        set_ctx(buf, 80, t, l);
+        daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {80,80,80,80}, {80,80,80,80}, {80,80,80,80}, {80,80,80,80}
+        };
+        fail += check(buf, "VerticalRight (mode 5)", exp);
+    }
+
+    /* Mode 6 — Horizontal_Down.  Same uniform-context degenerate case. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 120,120,120,120, 0,0,0,0 };
+        int l[4] = { 120,120,120,120 };
+        set_ctx(buf, 120, t, l);
+        daedalus_h264_pred_4x4_hd_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {120,120,120,120}, {120,120,120,120},
+            {120,120,120,120}, {120,120,120,120}
+        };
+        fail += check(buf, "HorizontalDown (mode 6)", exp);
+    }
+
+    /* Mode 7 — Vertical_Left.  Uniform context. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 64,64,64,64, 64,64,64,64 };
+        int l[4] = { 0,0,0,0 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_vl_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {64,64,64,64}, {64,64,64,64}, {64,64,64,64}, {64,64,64,64}
+        };
+        fail += check(buf, "VerticalLeft (mode 7)", exp);
+    }
+
+    /* Mode 8 — Horizontal_Up.  Uniform context. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 0,0,0,0, 0,0,0,0 };
+        int l[4] = { 200,200,200,200 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_hu_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {200,200,200,200}, {200,200,200,200},
+            {200,200,200,200}, {200,200,200,200}
+        };
+        fail += check(buf, "HorizontalUp (mode 8)", exp);
+    }
+
+    /* Asymmetric Vertical_Right test: detects orientation /
+     * row-vs-col confusion.  Top=10,20,30,40, Left=50,60,70,
+     * top-left=5.  Spec-derived expected output computed by hand
+     * from §8.3.1.2.6.
+     *
+     *   d[0][0] = (tl+t0+1)>>1 = (5+10+1)>>1 = 8
+     *   d[0][1] = (t0+t1+1)>>1 = (10+20+1)>>1 = 15
+     *   d[0][2] = (t1+t2+1)>>1 = (20+30+1)>>1 = 25
+     *   d[0][3] = (t2+t3+1)>>1 = (30+40+1)>>1 = 35
+     *   d[1][0] = avg3(l0,tl,t0) = (50+2*5+10+2)>>2 = 72/4 = 18
+     *   d[1][1] = avg3(tl,t0,t1) = (5+20+20+2)>>2 = 47/4 = 11
+     *   d[1][2] = avg3(t0,t1,t2) = (10+40+30+2)>>2 = 82/4 = 20
+     *   d[1][3] = avg3(t1,t2,t3) = (20+60+40+2)>>2 = 122/4 = 30
+     *   d[2][0] = avg3(tl,l0,l1) = (5+100+60+2)>>2 = 167/4 = 41
+     *   d[2][1] = d[0][0] = 8
+     *   d[2][2] = d[0][1] = 15
+     *   d[2][3] = d[0][2] = 25
+     *   d[3][0] = avg3(l0,l1,l2) = (50+120+70+2)>>2 = 242/4 = 60
+     *   d[3][1] = d[1][0] = 18
+     *   d[3][2] = d[1][1] = 11
+     *   d[3][3] = d[1][2] = 20
+     */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 10,20,30,40, 0,0,0,0 };
+        int l[4] = { 50,60,70,0 };
+        set_ctx(buf, 5, t, l);
+        daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            { 8,15,25,35},
+            {18,11,20,30},
+            {41, 8,15,25},
+            {60,18,11,20},
+        };
+        fail += check(buf, "VR asym (sanity)", exp);
+    }
+
+    if (fail == 0) printf("\nALL %d intra-4x4 mode references PASS\n", 10);
+    else           fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? 1 : 0;
+}
-- 
2.47.3