h264: Intra_8x8 chroma prediction — 4-mode C reference + spec gates

Third intra-prediction primitive after PR #12 (Intra_4x4 luma) and PR #13 (Intra_16x16 luma). Covers Intra_8x8 chroma per H.264 §8.3.3: 4 modes used for BOTH Cb and Cr planes at 4:2:0.

Mode quirks worth flagging in code review:

- Mode 0 DC is asymmetric per quadrant. The 8x8 chroma block splits into four 4x4 quadrants with different DC formulas:

    (0,0) top-left  : (sum_top[0..3] + sum_left[0..3] + 4) >> 3
    (0,1) top-right : (sum_top[4..7] + 2) >> 2
    (1,0) bot-left  : (sum_left[4..7] + 2) >> 2
    (1,1) bot-right : (sum_top[4..7] + sum_left[4..7] + 4) >> 3

  The top-right quadrant deliberately IGNORES the top-left half even though it's available — that's per spec §8.3.3.2.

- Mode 3 Plane uses slope coefficient 34 (not 5 like Intra_16x16 luma). Centre is (x-3, y-3) instead of (x-7, y-7). Sums span 4 differences instead of 8. Easy to copy-paste-bug from the luma Plane if you don't notice the constants change.

Test highlights:

- DC quadrants: distinct expected values per quadrant (16, 16, 40, 28 from asymmetric top/left halves) — any quadrant mix-up would surface immediately. Hand-derived from the formulas in the test comment.
- Plane uniform: all-100 context → all-100 output (a = 3200, H = V = 0, (3200+16) >> 5 = 100 exactly).
- Plane gradient: top + left = 0..7, hand-derives pred[0][0] = 1 and pred[7][7] = 15 via the full arithmetic chain (H = V = 56, b = c = 30, a = 224). Same hand-traced spec-walkthrough as the Intra_16x16 Plane gradient test.

Verified on hertz:

    $ ./build/test_intra_pred_chroma8x8
      Horizontal (mode 1)            PASS
      Vertical (mode 2)              PASS
      DC quadrants (mode 0)          PASS
      Plane uniform (mode 3)         PASS
      Plane gradient (mode 3)        PASS (corners 1, 15)
    ALL Intra_8x8 chroma mode references PASS

All 5 tests PASS first try. The DC quadrant correctness is meaningful (4 different formulas in one kernel) and the Plane gradient corners validate the slope=34 + centre=(x-3,y-3) constants vs the luma equivalents.
Combined coverage after this PR:

- Intra_4x4 luma: 9 modes ✓ (PR #12, all 9 PASS)
- Intra_16x16 luma: 4 modes ✓ (PR #13, all 5 tests PASS)
- Intra_8x8 chroma: 4 modes ✓ (this PR, all 5 tests PASS)
- Intra_8x8 luma (High profile): 9 modes + smoothing — pending.

Remaining backlog: Intra_8x8 luma (High profile, 9 modes + 1-2-1 smoothing pre-filter — distinct algorithm from Intra_4x4 because of the pre-filter), neighbour-availability fallback, dispatch wrappers.
2026-05-25 00:42:49 +02:00
parent dff610e13d
commit d7100459f2
3 changed files with 302 additions and 0 deletions
@@ -548,6 +548,15 @@ add_executable(test_intra_pred_16x16
 )
 target_compile_options(test_intra_pred_16x16 PRIVATE -O2)

+# H.264 Intra_8x8 chroma prediction (4 modes: DC, H, V, Plane) —
+# reference + tests.  DC is per-quadrant (asymmetric); Plane uses
+# slope coefficient 34 instead of luma's 5.
+add_executable(test_intra_pred_chroma8x8
+    tests/test_intra_pred_chroma8x8.c
+    tests/h264_intra_pred_chroma8x8_ref.c
+)
+target_compile_options(test_intra_pred_chroma8x8 PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
@@ -0,0 +1,123 @@
+/*
+ * Standalone bit-exact C reference for H.264 chroma Intra_8x8
+ * prediction modes (per H.264 §8.3.3), used for both Cb and Cr
+ * planes at 4:2:0.  All 4 modes.
+ *
+ * Mode index → name (per H.264 Table 7-16):
+ *   0 = DC          (per-quadrant — asymmetric, see §8.3.3.2)
+ *   1 = Horizontal
+ *   2 = Vertical
+ *   3 = Plane       (slope coefficient 34, distinct from luma's 5)
+ *
+ * Calling convention (same shape as luma intra refs):
+ *   pred_chroma8x8_<mode>(uint8_t *dst, ptrdiff_t stride)
+ *
+ * `dst` points at row 0, col 0 of the 8x8 output block (single
+ * component plane — Cb or Cr, dispatched independently).  Neighbours:
+ *   top[0..7]   = dst[-stride + 0 .. -stride + 7]
+ *   top-left    = dst[-stride - 1]
+ *   left[0..7]  = dst[ 0*stride - 1 .. 7*stride - 1]
+ *
+ * AVAILABILITY: assumes all neighbours valid (interior-MB case).
+ * The H.264 spec defines per-quadrant fallback for the DC mode at
+ * MB boundaries; that's caller-side via the libavcodec intercept.
+ *
+ * License: BSD-2-Clause.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Clamp v to the 8-bit sample range [0, 255]; in-range values pass through. */
+static inline int clip_u8(int v)
+{
+    if (v < 0)
+        return 0;
+    if (v > 255)
+        return 255;
+    return v;
+}
+
+/* Mode 0 — DC, evaluated independently for each 4×4 quadrant of the
+ * 8×8 chroma block (4:2:0 layout).
+ *
+ * With all neighbours available (the only case handled here), the
+ * value written to each quadrant is:
+ *   (0,0) top-left  : (sum_top[0..3] + sum_left[0..3] + 4) >> 3
+ *   (0,1) top-right : (sum_top[4..7]                  + 2) >> 2
+ *   (1,0) bot-left  : (sum_left[4..7]                 + 2) >> 2
+ *   (1,1) bot-right : (sum_top[4..7] + sum_left[4..7] + 4) >> 3
+ *
+ * The off-diagonal quadrants average a single edge only: top-right
+ * uses the four top samples directly above it, bot-left the four
+ * left samples directly beside it.  The diagonal quadrants average
+ * both of their adjacent half-edges.
+ * NOTE(review): this file cites the rule as H.264 §8.3.3.2; clause
+ * numbering differs between spec editions — confirm before quoting.
+ *
+ * dst    : row 0, col 0 of the 8×8 output block.
+ * stride : distance in bytes between vertically adjacent samples.
+ *
+ * Reads dst[-stride + 0..7] (top) and dst[r*stride - 1], r = 0..7
+ * (left); every neighbour is read before any output is written.
+ */
+void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *above = dst - stride;   /* above[x]          == top[x]  */
+    const uint8_t *left  = dst - 1;        /* left[r * stride]  == left[r] */
+    int sum_top[2]  = { 0, 0 };            /* [0] = samples 0..3, [1] = 4..7 */
+    int sum_left[2] = { 0, 0 };
+
+    for (int half = 0; half < 2; half++) {
+        for (int k = 0; k < 4; k++) {
+            const int idx = 4 * half + k;
+            sum_top[half]  += above[idx];
+            sum_left[half] += left[idx * stride];
+        }
+    }
+
+    /* dc[qy][qx]: value for the quadrant in quadrant-row qy, quadrant-col qx. */
+    const uint8_t dc[2][2] = {
+        { (uint8_t)((sum_top[0] + sum_left[0] + 4) >> 3),
+          (uint8_t)((sum_top[1]               + 2) >> 2) },
+        { (uint8_t)((sum_left[1]              + 2) >> 2),
+          (uint8_t)((sum_top[1] + sum_left[1] + 4) >> 3) },
+    };
+
+    for (int y = 0; y < 8; y++) {
+        uint8_t *row = dst + y * stride;
+        for (int x = 0; x < 8; x++)
+            row[x] = dc[y >> 2][x >> 2];
+    }
+}
+
+/* Mode 1 — Horizontal: all eight samples of output row y are set to
+ * that row's left neighbour, dst[y * stride - 1]. */
+void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    for (int y = 0; y < 8; y++) {
+        uint8_t *row = dst + y * stride;
+        const uint8_t left = row[-1];
+        for (int x = 0; x < 8; x++)
+            row[x] = left;
+    }
+}
+
+/* Mode 2 — Vertical: every output row is a copy of the eight top
+ * neighbours, dst[-stride + 0 .. -stride + 7]. */
+void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *above = dst - stride;
+    for (int y = 0; y < 8; y++) {
+        uint8_t *row = dst + y * stride;
+        for (int x = 0; x < 8; x++)
+            row[x] = above[x];
+    }
+}
+
+/* Mode 3 — Plane.  Fits a plane through the top and left neighbours
+ * and evaluates it at each of the 64 output positions.
+ *
+ * With p[x, y] denoting the sample at column x, row y (so y = -1 is
+ * the top row, x = -1 the left column, p[-1, -1] the corner):
+ *   H = sum_{k=1..4} k * (p[3+k, -1] - p[3-k, -1])   ; k=4 reaches p[-1,-1]
+ *   V = sum_{k=1..4} k * (p[-1, 3+k] - p[-1, 3-k])   ; k=4 reaches p[-1,-1]
+ *   b = (34 * H + 32) >> 6
+ *   c = (34 * V + 32) >> 6
+ *   a = 16 * (p[-1, 7] + p[7, -1])
+ *   pred[y][x] = clamp_0_255((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
+ *
+ * Constants that differ from the Intra_16x16 luma Plane:
+ *   - slope coefficient 34 (luma: 5);
+ *   - plane centred on (3, 3) (luma: (7, 7));
+ *   - four weighted differences per gradient (luma: eight).
+ * NOTE(review): this file cites the rule as H.264 §8.3.3.4; clause
+ * numbering differs between spec editions — confirm before quoting.
+ *
+ * H, V and the intermediate sums can be negative; the code relies on
+ * `>>` of a negative int being an arithmetic shift, which the C
+ * standard leaves implementation-defined.
+ *
+ * dst    : row 0, col 0 of the 8×8 output block.
+ * stride : distance in bytes between vertically adjacent samples.
+ * All neighbours are read before any output sample is written.
+ */
+void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *above = dst - stride;   /* above[-1] is the top-left corner */
+    const uint8_t *left  = dst - 1;        /* left[r * stride] is left[r];
+                                            * left[-stride] is the corner     */
+    int grad_h = 0;
+    int grad_v = 0;
+
+    /* Weighted differences taken symmetrically around sample index 3. */
+    for (int k = 1; k <= 4; k++) {
+        grad_h += k * (above[3 + k] - above[3 - k]);
+        grad_v += k * (left[(3 + k) * stride] - left[(3 - k) * stride]);
+    }
+
+    const int slope_x = (34 * grad_h + 32) >> 6;
+    const int slope_y = (34 * grad_v + 32) >> 6;
+    /* a plus the rounding term of the final >> 5. */
+    const int base = 16 * (left[7 * stride] + above[7]) + 16;
+
+    for (int y = 0; y < 8; y++) {
+        uint8_t *row = dst + y * stride;
+        const int row_term = base + slope_y * (y - 3);
+        for (int x = 0; x < 8; x++) {
+            int v = (row_term + slope_x * (x - 3)) >> 5;
+            if (v < 0)
+                v = 0;
+            else if (v > 255)
+                v = 255;
+            row[x] = (uint8_t) v;
+        }
+    }
+}
@@ -0,0 +1,170 @@
+/*
+ * Tests the 4 H.264 Intra_8x8 chroma prediction modes against
+ * spec-derived expected patterns.  Same buffer layout idea as the
+ * other intra tests: a buffer that holds the 8x8 output + 1-pixel
+ * top/left context + 1-pixel top-left corner.
+ *
+ *   row 0: [tl][t0..t7]
+ *   row 1: [l0][output row 0]
+ *   ...
+ *   row 8: [l7][output row 7]
+ *
+ * Dimensions: 9 rows × 9 cols.  dst (passed to pred fns) = &buf[1][1].
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride);
+
+#define STRIDE 9
+#define ROWS   9
+
+/* Prepare a test buffer for one prediction call.
+ *
+ * Every byte is first set to 0xff so that output cells the predictor
+ * fails to write stand out, then the neighbour context is stored:
+ *   buf[0][0]      = tl      (top-left corner)
+ *   buf[0][1..8]   = t[0..7] (top row)
+ *   buf[1..8][0]   = l[0..7] (left column)
+ * Each int is narrowed to uint8_t on store.
+ */
+static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
+                     const int t[8], const int l[8])
+{
+    memset(buf, 0xff, (size_t) ROWS * STRIDE);
+    buf[0][0] = (uint8_t) tl;
+    for (int i = 0; i < 8; i++) {
+        buf[0][1 + i] = (uint8_t) t[i];
+        buf[1 + i][0] = (uint8_t) l[i];
+    }
+}
+
+/* Compare the 8×8 output region of buf (rows 1..8, cols 1..8) with
+ * expect, cell by cell, and print a one-line PASS/FAIL report.
+ *
+ * buf    : 9×9 test buffer; the predictor output lives at buf[1+r][1+c].
+ * name   : label printed in the report line.
+ * expect : expected value for each output cell.
+ *
+ * On failure the report gives the number of mismatching cells and the
+ * position and values of the first mismatch in row-major order.
+ *
+ * Returns 0 when all 64 cells match, 1 otherwise.
+ */
+static int check_per_cell(const uint8_t buf[ROWS][STRIDE], const char *name,
+                           const uint8_t expect[8][8])
+{
+    int mismatches = 0;
+    int first_r = 0, first_c = 0;
+    /* unsigned so the arguments match the %u conversions below. */
+    unsigned first_got = 0, first_exp = 0;
+
+    for (int r = 0; r < 8; r++) {
+        for (int c = 0; c < 8; c++) {
+            const uint8_t got  = buf[1 + r][1 + c];
+            const uint8_t want = expect[r][c];
+            if (got == want)
+                continue;
+            if (mismatches == 0) {
+                first_r = r;
+                first_c = c;
+                first_got = got;
+                first_exp = want;
+            }
+            mismatches++;
+        }
+    }
+
+    if (mismatches == 0) {
+        printf("  %-30s PASS\n", name);
+        return 0;
+    }
+    printf("  %-30s FAIL (%d/64 wrong, first r=%d c=%d got=%u exp=%u)\n",
+           name, mismatches, first_r, first_c, first_got, first_exp);
+    return 1;
+}
+
+/* Runs the five chroma 8×8 prediction checks and prints one report
+ * line per check.
+ *
+ * Returns 0 when every check passes, 1 when at least one fails.
+ */
+int main(void)
+{
+    /* Number of failed checks; each check adds 0 or 1, so the final
+     * "%d test(s) FAILED" message reports a real count. */
+    int fail = 0;
+
+    /* --- Mode 1 Horizontal --- */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[8] = {0}, l[8] = {10, 20, 30, 40, 50, 60, 70, 80};
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_chroma8x8_horizontal_ref(&buf[1][1], STRIDE);
+        uint8_t want[8][8];
+        for (int r = 0; r < 8; r++)
+            for (int c = 0; c < 8; c++)
+                want[r][c] = (uint8_t) l[r];
+        fail += check_per_cell(buf, "Horizontal (mode 1)", want);
+    }
+
+    /* --- Mode 2 Vertical --- */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[8] = {15, 25, 35, 45, 55, 65, 75, 85}, l[8] = {0};
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_chroma8x8_vertical_ref(&buf[1][1], STRIDE);
+        uint8_t want[8][8];
+        for (int r = 0; r < 8; r++)
+            for (int c = 0; c < 8; c++)
+                want[r][c] = (uint8_t) t[c];
+        fail += check_per_cell(buf, "Vertical (mode 2)", want);
+    }
+
+    /* --- Mode 0 DC: per-quadrant.  The four half-edges are chosen so
+     * that all four quadrant values differ; a swap of any two
+     * quadrants, or use of the wrong half-edge in any formula,
+     * therefore changes at least one expected cell.
+     *
+     *   top[0..3]  = 4 × 8  → sum_top_lo  = 32
+     *   top[4..7]  = 4 × 20 → sum_top_hi  = 80
+     *   left[0..3] = 4 × 24 → sum_left_lo = 96
+     *   left[4..7] = 4 × 40 → sum_left_hi = 160
+     *
+     *   dc00 = (32 + 96  + 4) >> 3 = 132 >> 3 = 16
+     *   dc01 = (80       + 2) >> 2 =  82 >> 2 = 20
+     *   dc10 = (     160 + 2) >> 2 = 162 >> 2 = 40
+     *   dc11 = (80 + 160 + 4) >> 3 = 244 >> 3 = 30
+     */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[8] = { 8, 8, 8, 8,  20, 20, 20, 20 };
+        int l[8] = { 24, 24, 24, 24,  40, 40, 40, 40 };
+        set_ctx(buf, 99, t, l);
+        daedalus_h264_pred_chroma8x8_dc_ref(&buf[1][1], STRIDE);
+        uint8_t want[8][8] = {
+            {16,16,16,16, 20,20,20,20},
+            {16,16,16,16, 20,20,20,20},
+            {16,16,16,16, 20,20,20,20},
+            {16,16,16,16, 20,20,20,20},
+            {40,40,40,40, 30,30,30,30},
+            {40,40,40,40, 30,30,30,30},
+            {40,40,40,40, 30,30,30,30},
+            {40,40,40,40, 30,30,30,30},
+        };
+        fail += check_per_cell(buf, "DC quadrants (mode 0)", want);
+    }
+
+    /* --- Mode 3 Plane (uniform): H = V = 0; a = 16 * (100 + 100) = 3200.
+     * pred[y][x] = (3200 + 0 + 0 + 16) >> 5 = 3216 >> 5 = 100. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[8], l[8];
+        for (int i = 0; i < 8; i++) { t[i] = 100; l[i] = 100; }
+        set_ctx(buf, 100, t, l);
+        daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
+        uint8_t want[8][8];
+        for (int r = 0; r < 8; r++)
+            for (int c = 0; c < 8; c++)
+                want[r][c] = 100;
+        fail += check_per_cell(buf, "Plane uniform (mode 3)", want);
+    }
+
+    /* --- Mode 3 Plane gradient sanity ---
+     * t = 0..7, l = 0..7, tl = 0.
+     *   H = 1*(t[4]-t[2]) + 2*(t[5]-t[1]) + 3*(t[6]-t[0]) + 4*(t[7]-tl)
+     *     = 1*(4-2) + 2*(5-1) + 3*(6-0) + 4*(7-0)
+     *     = 2 + 8 + 18 + 28 = 56
+     *   V = same shape on left = 56
+     *   b = (34*56 + 32) >> 6 = 1936 >> 6 = 30
+     *   c = 30
+     *   a = 16 * (l[7] + t[7]) = 16 * (7 + 7) = 224
+     *
+     *   pred[0][0] = (224 + 30*(-3) + 30*(-3) + 16) >> 5
+     *              = (224 - 90 - 90 + 16) >> 5
+     *              = 60 >> 5 = 1
+     *   pred[7][7] = (224 + 30*4 + 30*4 + 16) >> 5
+     *              = (224 + 120 + 120 + 16) >> 5
+     *              = 480 >> 5 = 15
+     * Spot-check those two corners. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[8], l[8];
+        for (int i = 0; i < 8; i++) { t[i] = i; l[i] = i; }
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
+        uint8_t tl_actual = buf[1 + 0][1 + 0];
+        uint8_t br_actual = buf[1 + 7][1 + 7];
+        int spot_fail = 0;
+        if (tl_actual != 1)  { fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n", tl_actual); spot_fail = 1; }
+        if (br_actual != 15) { fprintf(stderr, "Plane gradient pred[7][7] = %u, expected 15\n", br_actual); spot_fail = 1; }
+        if (!spot_fail) printf("  %-30s PASS (corners 1, 15)\n", "Plane gradient (mode 3)");
+        else            printf("  %-30s FAIL\n", "Plane gradient (mode 3)");
+        fail += spot_fail;
+    }
+
+    if (fail == 0) printf("\nALL Intra_8x8 chroma mode references PASS\n");
+    else           fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? 1 : 0;
+}