Merge pull request 'h264: Intra_8x8 luma (High profile) — pre-filter + 3 modes (V/H/DC)' (#21) from noether/h264-intra-pred-8x8-luma into main

Reviewed-on: #21
2026-05-25 07:51:51 +00:00
parent 1ee8b1c0ab 8bc6d27ea7
commit 18ca708f87
3 changed files with 271 additions and 0 deletions
@@ -563,6 +563,15 @@ add_executable(test_intra_pred_chroma8x8
 )
 target_compile_options(test_intra_pred_chroma8x8 PRIVATE -O2)

+# H.264 Intra_8x8 luma prediction test (High profile: 9 modes + 1-2-1
+# reference-sample pre-filter).  Built from the test driver and the C
+# reference only; covers the pre-filter + V/H/DC, directional modes follow.
+add_executable(test_intra_pred_8x8_luma
+    tests/test_intra_pred_8x8_luma.c
+    tests/h264_intra_pred_8x8_luma_ref.c
+)
+target_compile_options(test_intra_pred_8x8_luma PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
@@ -0,0 +1,123 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma Intra_8x8
+ * prediction modes (per H.264 spec §8.3.2.2).  High-profile-only
+ * MB type — Baseline/Main/Extended profiles don't see Intra_8x8.
+ *
+ * Distinct from Intra_4x4 in two ways:
+ *
+ *   1. REFERENCE SAMPLE FILTERING (§8.3.2.2.1).  The 25 raw
+ *      neighbour samples are pre-filtered with a 1-2-1 smoothing
+ *      filter BEFORE prediction.  The filtering has spec-defined
+ *      boundary handling at the corners and the right-edge of the
+ *      top-row extension.
+ *
+ *   2. SCALE.  All 9 prediction modes operate at 8x8 with the
+ *      filtered samples (Intra_4x4 operates at 4x4 with the raw
+ *      samples).
+ *
+ * This PR implements the filter + the 3 simple modes (Vertical,
+ * Horizontal, DC).  The 6 directional modes (DDL, DDR, VR, HD, VL,
+ * HU at 8x8) follow in a separate PR — same template, different
+ * formulas per spec sections §8.3.2.2.5..§8.3.2.2.10.
+ *
+ * Calling convention (FFmpeg-style):
+ *   pred_8x8_<mode>_ref(uint8_t *dst, ptrdiff_t stride)
+ *
+ * `dst` points at row 0 col 0 of the 8x8 output block.  Reads from
+ *   top[0..15]  = dst[-stride + 0..15]
+ *   top-left    = dst[-stride - 1]
+ *   left[0..7]  = dst[ 0*stride - 1 .. 7*stride - 1]
+ *
+ * AVAILABILITY: assumes all neighbours valid (interior-MB case).
+ *
+ * License: BSD-2-Clause.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+static inline int clip_u8(int v) { if (v < 0) return 0; return v < 255 ? v : 255; } /* clamp to [0, 255] */
+
+/* Reference sample filtering for Intra_8x8 (H.264 §8.3.2.2.1), in the
+ * form that applies when every neighbour is available.
+ *
+ * Raw inputs, all read relative to `dst` (row 0, col 0 of the 8x8 block):
+ *   tl          = dst[-stride - 1]
+ *   top[0..15]  = dst[-stride + 0..15]
+ *   left[0..7]  = dst[0*stride - 1 .. 7*stride - 1]
+ *
+ * Each output is a 1-2-1 smoothing of a raw sample and its two
+ * neighbours along the L-shaped border, with rounding:
+ *   filt[0]             = (top[0]    + 2*tl      + left[0]   + 2) >> 2
+ *   filt[1]             = (tl        + 2*top[0]  + top[1]    + 2) >> 2
+ *   filt[1+i],  i=1..14 = (top[i-1]  + 2*top[i]  + top[i+1]  + 2) >> 2
+ *   filt[17]            = (tl        + 2*left[0] + left[1]   + 2) >> 2
+ *   filt[17+j], j=1..6  = (left[j-1] + 2*left[j] + left[j+1] + 2) >> 2
+ * The last sample of each run has no outer neighbour, so it is weighted 3x:
+ *   filt[16]            = (top[14]   + 3*top[15] + 2) >> 2
+ *   filt[24]            = (left[6]   + 3*left[7] + 2) >> 2
+ *
+ * `filt` is a caller-provided 25-element array:
+ *   filt[0] = top-left, filt[1..16] = top[0..15], filt[17..24] = left[0..7].
+ */
+static void filter_refs(const uint8_t *dst, ptrdiff_t stride,
+                         uint8_t filt[25])
+{
+    /* Widen the 25 raw neighbours to int before doing any arithmetic. */
+    const uint8_t *above = dst - stride;
+    const int corner = above[-1];
+    int top[16], left[8];
+    for (int x = 0; x < 16; x++) top[x] = above[x];
+    for (int y = 0; y < 8; y++)  left[y] = dst[y * stride - 1];
+    uint8_t *ft = filt + 1;    /* filtered top[0..15] */
+    uint8_t *fl = filt + 17;   /* filtered left[0..7] */
+
+    filt[0] = (uint8_t)((top[0] + 2 * corner + left[0] + 2) >> 2);
+    ft[0] = (uint8_t)((corner + 2 * top[0] + top[1] + 2) >> 2);
+    for (int x = 1; x < 15; x++)
+        ft[x] = (uint8_t)((top[x - 1] + 2 * top[x] + top[x + 1] + 2) >> 2);
+    ft[15] = (uint8_t)((top[14] + 3 * top[15] + 2) >> 2);
+
+    fl[0] = (uint8_t)((corner + 2 * left[0] + left[1] + 2) >> 2);
+    for (int y = 1; y < 7; y++)
+        fl[y] = (uint8_t)((left[y - 1] + 2 * left[y] + left[y + 1] + 2) >> 2);
+    fl[7] = (uint8_t)((left[6] + 3 * left[7] + 2) >> 2);
+}
+
+/* Spec-style accessors; each expands against a local array named `filt`. */
+#define FT(i)  filt[1 + (i)]    /* filtered top[i],  i in 0..15  */
+#define FL(j)  filt[17 + (j)]   /* filtered left[j], j in 0..7   */
+#define FTL    filt[0]          /* filtered top-left (unused by V/H/DC) */
+
+/* Mode 0 Vertical (§8.3.2.2.2): each output row is filtered top[0..7]. */
+void daedalus_h264_pred_8x8l_vertical_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    uint8_t filt[25];
+    filter_refs(dst, stride, filt);
+    for (int y = 0; y < 8; y++)
+        memcpy(dst + y * stride, &FT(0), 8);
+}
+
+/* Mode 1 Horizontal (§8.3.2.2.3): output row y is filled with filtered left[y]. */
+void daedalus_h264_pred_8x8l_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    uint8_t filt[25];
+    filter_refs(dst, stride, filt);
+    for (int y = 0; y < 8; y++)
+        memset(dst + y * stride, FL(y), 8);
+}
+
+/* Mode 2 DC (§8.3.2.2.4), all-neighbours-available form: every sample is
+ * (sum of filtered top[0..7] + sum of filtered left[0..7] + 8) >> 4.
+ * 16 samples are averaged, so the rounding term is 8 (4x4 DC uses +4). */
+void daedalus_h264_pred_8x8l_dc_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    uint8_t filt[25];
+    filter_refs(dst, stride, filt);
+    int acc = 0;
+    for (int k = 0; k < 8; k++)
+        acc += FT(k) + FL(k);
+    const uint8_t dc = (uint8_t)((acc + 8) >> 4);
+    for (int y = 0; y < 8; y++)
+        memset(dst + y * stride, dc, 8);
+}
@@ -0,0 +1,139 @@
+/*
+ * Tests the H.264 Intra_8x8 luma prediction modes against spec-derived
+ * expectations.  Buffer layout is 9 rows × 17 cols.  The extra cols hold
+ * the top-right extension: the reference filter reads all 16 top samples
+ * on every call, and the DDL/VL follow-up will predict from them directly:
+ *
+ *   row 0: [tl][t0..t15]                                — 17 bytes
+ *   row 1: [l0][output row 0  ..]                       — 17 bytes
+ *   ...
+ *   row 8: [l7][output row 7  ..]
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void daedalus_h264_pred_8x8l_vertical_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_dc_ref(uint8_t *dst, ptrdiff_t stride);
+
+#define STRIDE 17
+#define ROWS   9
+
+/* Fill buf with 0xff, then install top-left, top[0..15] and left[0..7]. */
+static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
+                     const int t[16], const int l[8])
+{
+    memset(buf, 0xff, (size_t)ROWS * STRIDE);
+    buf[0][0] = (uint8_t) tl;
+    for (int i = 0; i < 16; i++) buf[0][i + 1] = (uint8_t) t[i];
+    for (int j = 0; j < 8; j++)  buf[j + 1][0] = (uint8_t) l[j];
+}
+
+/* Print PASS/FAIL for `name`: all 64 output samples must equal expect_val.  Returns 0/1. */
+static int check_uniform(const uint8_t buf[ROWS][STRIDE], const char *name,
+                          uint8_t expect_val)
+{
+    int bad = 0;
+    for (int i = 0; i < 64; i++)
+        bad += (buf[1 + i / 8][1 + i % 8] != expect_val);
+    if (bad) printf("  %-30s FAIL (%d/64 wrong, expected %u)\n", name, bad, expect_val);
+    else     printf("  %-30s PASS\n", name);
+    return bad != 0;
+}
+
+/* Runs five cases (V, H and DC on uniform neighbours; V and H on gradients),
+ * prints one PASS/FAIL line per case, and exits 0 only if all of them pass. */
+int main(void)
+{
+    int fail = 0;   /* number of failed cases (summed, so the final count is real) */
+
+    /* Mode 0 Vertical with uniform top → uniform output.
+     * Filtered top[c] = (a + 2*a + a + 2) >> 2 = a for uniform a. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[16], l[8];
+        for (int i = 0; i < 16; i++) t[i] = 50;
+        for (int j = 0; j < 8; j++)  l[j] = 0;
+        set_ctx(buf, 50, t, l);
+        daedalus_h264_pred_8x8l_vertical_ref(&buf[1][1], STRIDE);
+        fail += check_uniform(buf, "Vertical (mode 0, uniform top)", 50);
+    }
+
+    /* Mode 1 Horizontal with uniform left → uniform output. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[16] = {0}, l[8];
+        for (int j = 0; j < 8; j++) l[j] = 70;
+        set_ctx(buf, 70, t, l);
+        daedalus_h264_pred_8x8l_horizontal_ref(&buf[1][1], STRIDE);
+        fail += check_uniform(buf, "Horizontal (mode 1, uniform left)", 70);
+    }
+
+    /* Mode 2 DC with all-uniform neighbours → uniform output.
+     * Filtering leaves uniform samples unchanged, so
+     * sum = 8*a + 8*a = 16a and (16a + 8) >> 4 = a. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[16], l[8];
+        for (int i = 0; i < 16; i++) t[i] = 33;
+        for (int j = 0; j < 8; j++)  l[j] = 33;
+        set_ctx(buf, 33, t, l);
+        daedalus_h264_pred_8x8l_dc_ref(&buf[1][1], STRIDE);
+        fail += check_uniform(buf, "DC (mode 2, uniform)", 33);
+    }
+
+    /* Mode 0 Vertical with NON-uniform top: gradient t[i] = i, tl = 0.
+     * Interior samples, c in 1..14:
+     *   filt[top c] = (t[c-1] + 2*t[c] + t[c+1] + 2) >> 2
+     *               = ((c-1) + 2c + (c+1) + 2) >> 2
+     *               = (4c + 2) >> 2 = c
+     *   (4c is a multiple of 4 and the +2 rounding term is shifted out).
+     * Edge samples:
+     *   filt[top 0]  = (tl + 2*t[0] + t[1] + 2) >> 2 = (0 + 0 + 1 + 2) >> 2 = 0
+     *   filt[top 15] = (t[14] + 3*t[15] + 2) >> 2 = (14 + 45 + 2) >> 2 = 15
+     * The filtered top-left is not used by Vertical mode.
+     *
+     * So the filtered top equals the raw gradient, and Vertical output
+     * col c = filt[top c] = c for c = 0..7, in all 8 rows. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[16], l[8] = {0};
+        for (int i = 0; i < 16; i++) t[i] = i;
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_8x8l_vertical_ref(&buf[1][1], STRIDE);
+        int diff = 0;
+        for (int r = 0; r < 8; r++)
+            for (int c = 0; c < 8; c++)
+                if (buf[1+r][1+c] != c) diff++;
+        if (diff == 0) printf("  %-30s PASS (filtered gradient)\n", "Vertical (mode 0, gradient)");
+        else           printf("  %-30s FAIL (%d/64 wrong)\n", "Vertical (mode 0, gradient)", diff);
+        fail += (diff == 0) ? 0 : 1;
+    }
+
+    /* Mode 1 Horizontal gradient: left = 0..7, tl = 0.  Filtered left:
+     * filt[left 0] = (tl + 2*l[0] + l[1] + 2) >> 2 = (0 + 0 + 1 + 2) >> 2 = 0
+     * filt[left j] for j=1..6 = (l[j-1] + 2*l[j] + l[j+1] + 2) >> 2 = j
+     *   (same arithmetic as the top gradient)
+     * filt[left 7] = (l[6] + 3*l[7] + 2) >> 2 = (6 + 21 + 2) >> 2 = 7
+     * So Horizontal output row r = r for r = 0..7, in all 8 columns. */
+    {
+        uint8_t buf[ROWS][STRIDE];
+        int t[16] = {0}, l[8];
+        for (int j = 0; j < 8; j++) l[j] = j;
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_8x8l_horizontal_ref(&buf[1][1], STRIDE);
+        int diff = 0;
+        for (int r = 0; r < 8; r++)
+            for (int c = 0; c < 8; c++)
+                if (buf[1+r][1+c] != r) diff++;
+        if (diff == 0) printf("  %-30s PASS (filtered gradient)\n", "Horizontal (mode 1, gradient)");
+        else           printf("  %-30s FAIL (%d/64 wrong)\n", "Horizontal (mode 1, gradient)", diff);
+        fail += (diff == 0) ? 0 : 1;
+    }
+
+    if (fail == 0) printf("\nALL Intra_8x8 luma PASS (3 modes — V, H, DC)\n");
+    else           fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? 1 : 0;
+}