From c43ee84d8e58a7250473f1e5557f614ec45fe4e5 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 00:35:24 +0200 Subject: [PATCH] =?UTF-8?q?h264:=20Intra=5F16x16=20luma=20prediction=20?= =?UTF-8?q?=E2=80=94=204-mode=20C=20reference=20+=20spec=20gates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second piece of the intra-prediction primitive set after PR #12 (Intra_4x4 luma 9 modes). Covers the Intra_16x16 luma MB type per H.264 §8.3.2: 4 modes (Vertical, Horizontal, DC, Plane). Scope: - tests/h264_intra_pred_16x16_ref.c — 4 spec-derived modes. Same FFmpeg-style interface as the 4x4 sibling: void daedalus_h264_pred_16x16_<mode>_ref(uint8_t *dst, ptrdiff_t stride); Assumes all neighbours valid (interior-MB case). The Plane mode is the algorithmically heaviest of the four — spec §8.3.2.4 has two slope sums (H, V) over the asymmetric top/left contexts, a clipped linear (planar) evaluation per cell, and a top-left-corner participant at i=7 / j=7. Implementation follows the spec straightforwardly with `clip_u8` on the final saturating cast. - tests/test_intra_pred_16x16.c — 5 test cases: * V, H, DC: standard contexts (gradient top / gradient left / small uniform pair). * Plane (uniform): all neighbours = 100 → H = V = 0 → output = (16*200 + 16) >> 5 = 100. Verifies the orientation-free portion of the formula. * Plane (gradient): top + left both 0..15, spec-derived corner expectations pred[0][0] = 1 and pred[15][15] = 31. The arithmetic chain (H = V = 400 → b = c = 31, a = 480) is fully hand-traced in the test comment so the expected values are auditable. - CMakeLists.txt — new test_intra_pred_16x16 binary; pure-CPU library, no daedalus_core dependency (same separation as the 4x4 ref). 
Verified on hertz: $ ./build/test_intra_pred_16x16 Vertical (mode 0) PASS Horizontal (mode 1) PASS DC (mode 2) PASS Plane (mode 3, uniform) PASS Plane (mode 3, gradient) PASS (corners 1, 31) ALL Intra_16x16 mode references PASS Plane mode being right first try is meaningful — H/V sums, b/c slope shifts, and the a-baseline arithmetic have many sign / index error opportunities. The gradient test would have caught most of them; it didn't. (Its top and left contexts are identical and only two diagonal corners are checked, so it cannot detect an H/V or b/c swap.) What this does NOT cover (still in the intra-pred backlog): - Intra_8x8 chroma (4 modes per H.264 §8.3.3). - Intra_8x8 luma (High profile, 9 modes per §8.3.2.1 + the 1-2-1 smoothing pre-filter — distinct algorithm from Intra_4x4). - Neighbour-availability fallback for boundary MBs. - Dispatch wrappers (same architectural question as before — wait for decoder Stage 2a strategy decision). --- CMakeLists.txt | 8 ++ tests/h264_intra_pred_16x16_ref.c | 106 +++++++++++++++++++ tests/test_intra_pred_16x16.c | 167 ++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100644 tests/h264_intra_pred_16x16_ref.c create mode 100644 tests/test_intra_pred_16x16.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d8826b..a69ac98 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -540,6 +540,14 @@ add_executable(test_intra_pred_4x4 ) target_compile_options(test_intra_pred_4x4 PRIVATE -O2) +# H.264 Intra_16x16 luma prediction (4 modes: V, H, DC, Plane) — +# reference + tests. Same spec-gate role as the 4x4 sibling. 
/*
 * tests/h264_intra_pred_16x16_ref.c
 *
 * Standalone bit-exact C reference for the four H.264 luma Intra_16x16
 * prediction modes:
 *     0 = Vertical
 *     1 = Horizontal
 *     2 = DC
 *     3 = Plane
 *
 * Build wiring (from the CMakeLists.txt hunk of the same patch): this
 * file and tests/test_intra_pred_16x16.c form the executable
 * test_intra_pred_16x16, compiled with -O2 and without linking
 * daedalus_core.
 *
 * Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
 *     void daedalus_h264_pred_16x16_<mode>_ref(uint8_t *dst, ptrdiff_t stride);
 *
 * `dst` points at row 0, col 0 of the 16x16 output block. Neighbours
 * are read relative to it:
 *     top[0..15]  = dst[-stride + 0 .. -stride + 15]
 *     top-left    = dst[-stride - 1]
 *     left[0..15] = dst[0*stride - 1 .. 15*stride - 1]
 *
 * AVAILABILITY: every function assumes all neighbours are valid
 * (interior-MB case); none of them checks. Callers must resolve
 * boundary-MB fallbacks before calling.
 *
 * NOTE(review): the original patch cites H.264 §8.3.2 / §8.3.2.4 and
 * Table 7-15 for these modes. Recent editions of the spec appear to
 * number the Intra_16x16 process 8.3.3 (plane: 8.3.3.4) — confirm the
 * citation against the edition in use.
 *
 * License: BSD-2-Clause.
 */
#include <stddef.h>
#include <stdint.h>

/* Saturate v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/*
 * Floor-rounding right shift (v / 2^n rounded toward -infinity).
 * `>>` on a negative signed int is implementation-defined in C; the
 * plane formulas need the arithmetic-shift result, so compute it
 * explicitly. Identical to a plain `>>` wherever the compiler shifts
 * arithmetically.
 */
static inline int floor_shr(int v, int n)
{
    return v >= 0 ? v >> n : -((-v + (1 << n) - 1) >> n);
}

/* Mode 0 — Vertical: every output row is a copy of the 16 top samples. */
void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top = dst - stride;

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++) {
            row[x] = top[x];
        }
    }
}

/* Mode 1 — Horizontal: every output row is filled with its left sample. */
void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left = row[-1];
        for (int x = 0; x < 16; x++) {
            row[x] = left;
        }
    }
}

/*
 * Mode 2 — DC: the whole block is set to
 *     (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5
 * i.e. the rounded mean of the 32 neighbouring samples.
 */
void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top = dst - stride;
    int sum = 16; /* rounding term for the >> 5 over 32 samples */

    for (int i = 0; i < 16; i++) {
        sum += top[i] + dst[i * stride - 1];
    }

    /* sum <= 32*255 + 16, so the shifted value always fits in 8 bits. */
    const uint8_t dc = (uint8_t)(sum >> 5);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++) {
            row[x] = dc;
        }
    }
}

/*
 * Mode 3 — Plane. With top[-1] == left[-1] == the top-left corner:
 *     H = sum_{k=0..7} (k+1) * (top[8+k]  - top[6-k])
 *     V = sum_{k=0..7} (k+1) * (left[8+k] - left[6-k])
 *     b = (5*H + 32) >> 6
 *     c = (5*V + 32) >> 6
 *     a = 16 * (left[15] + top[15])
 *     pred[y][x] = clip_u8((a + b*(x-7) + c*(y-7) + 16) >> 5)
 *
 * pred[y][x] is pred[row][col]. H measures the right-vs-left asymmetry
 * of the top row, V the bottom-vs-top asymmetry of the left column.
 *
 * The top-left corner DOES take part in both sums: at k = 7 the index
 * 6 - k is -1, which addresses dst[-stride - 1] from either the top row
 * or the left column. No special case is needed for it.
 *
 * All shifts round toward -infinity (see floor_shr), because H, V and
 * the per-pixel sum can be negative.
 */
void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top = dst - stride; /* top[x]           = p[x, -1] */
    const uint8_t *left = dst - 1;     /* left[y * stride] = p[-1, y] */
    int H = 0;
    int V = 0;

    for (int k = 0; k < 8; k++) {
        const int w = k + 1;
        H += w * (top[8 + k] - top[6 - k]);
        V += w * (left[(8 + k) * stride] - left[(6 - k) * stride]);
    }

    const int b = floor_shr(5 * H + 32, 6);
    const int c = floor_shr(5 * V + 32, 6);
    const int a = 16 * (left[15 * stride] + top[15]);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        /* Row-invariant part of the plane, including the +16 rounding. */
        const int base = a + c * (y - 7) + 16;
        for (int x = 0; x < 16; x++) {
            row[x] = (uint8_t)clip_u8(floor_shr(base + b * (x - 7), 5));
        }
    }
}
/*
 * tests/test_intra_pred_16x16.c
 *
 * Tests the 4 H.264 Intra_16x16 luma prediction modes against
 * hand-derived expected patterns. Each case uses a buffer that holds
 * the 16x16 output plus a 1-pixel top/left context and the 1-pixel
 * top-left corner:
 *
 *     row 0:  [tl ][t0..t15]
 *     row 1:  [l0 ][output row 0]
 *     row 2:  [l1 ][output row 1]
 *     ...
 *     row 16: [l15][output row 15]
 *
 * Buffer dimensions: 17 rows x 17 cols, 289 bytes in total.
 * dst (passed to the pred fns) points at row 1, col 1.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
/* NOTE(review): the original file had a fourth #include whose header
 * name was lost in extraction; <string.h> is a guess — nothing below
 * needs it. Confirm against the real patch. */
#include <string.h>

extern void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride);
extern void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
extern void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride);
extern void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride);

#define STRIDE 17
#define ROWS   17

/*
 * Fill `buf` with the sentinel 0xff, then install the neighbour
 * context: corner `tl`, top row `t[0..15]`, left column `l[0..15]`.
 * The 16x16 output area keeps the sentinel until a predictor writes it.
 */
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
                    const int t[16], const int l[16])
{
    for (int r = 0; r < ROWS; r++) {
        for (int c = 0; c < STRIDE; c++) {
            buf[r][c] = 0xff;
        }
    }
    buf[0][0] = (uint8_t)tl;
    for (int i = 0; i < 16; i++) {
        buf[0][1 + i] = (uint8_t)t[i];
        buf[1 + i][0] = (uint8_t)l[i];
    }
}

/*
 * Compare the 16x16 output area of `buf` with expect_at(r, c, cookie)
 * for every cell, print one PASS/FAIL line labelled `name`, and
 * return 0 on success or 1 on any mismatch. On failure the first
 * mismatching cell (row-major order) is reported.
 */
static int check(const uint8_t buf[ROWS][STRIDE], const char *name,
                 uint8_t (*expect_at)(int r, int c, void *), void *cookie)
{
    int diff = 0;
    int first_r = 0, first_c = 0;
    unsigned first_got = 0, first_exp = 0; /* unsigned to match %u below */

    for (int r = 0; r < 16; r++) {
        for (int c = 0; c < 16; c++) {
            const uint8_t got = buf[1 + r][1 + c];
            const uint8_t exp = expect_at(r, c, cookie);
            if (got == exp) {
                continue;
            }
            if (diff == 0) {
                first_r = r;
                first_c = c;
                first_got = got;
                first_exp = exp;
            }
            diff++;
        }
    }

    if (diff == 0) {
        printf(" %-30s PASS\n", name);
        return 0;
    }
    printf(" %-30s FAIL (%d/256 wrong, first r=%d c=%d got=%u exp=%u)\n",
           name, diff, first_r, first_c, first_got, first_exp);
    return 1;
}

/* Expectation helpers for each mode. */

/* Every cell equals the single uint8_t that `cookie` points to. */
static uint8_t expect_uniform(int r, int c, void *cookie)
{
    (void)r;
    (void)c;
    return *(uint8_t *)cookie;
}

/* Cell (r, c) equals t[c]: the value depends on the column only. */
struct vertical_ctx { const int *t; };
static uint8_t expect_vertical(int r, int c, void *cookie)
{
    (void)r;
    return (uint8_t)((struct vertical_ctx *)cookie)->t[c];
}

/* Cell (r, c) equals l[r]: the value depends on the row only. */
struct horizontal_ctx { const int *l; };
static uint8_t expect_horizontal(int r, int c, void *cookie)
{
    (void)c;
    return (uint8_t)((struct horizontal_ctx *)cookie)->l[r];
}

/*
 * Run every case, print one line per case, and return 0 if all pass,
 * 1 otherwise. `fail` counts failed cases so the final message reports
 * the real number.
 */
int main(void)
{
    int fail = 0;

    /* --- Mode 0 Vertical: each col = top[col] --- */
    {
        uint8_t buf[ROWS][STRIDE];
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 10 + i; l[i] = 0; }
        set_ctx(buf, 0, t, l);
        daedalus_h264_pred_16x16_vertical_ref(&buf[1][1], STRIDE);
        struct vertical_ctx vc = { t };
        fail += check(buf, "Vertical (mode 0)", expect_vertical, &vc);
    }

    /* --- Mode 1 Horizontal: each row = left[row] --- */
    {
        uint8_t buf[ROWS][STRIDE];
        int t[16] = {0}, l[16];
        for (int i = 0; i < 16; i++) l[i] = 50 + i;
        set_ctx(buf, 0, t, l);
        daedalus_h264_pred_16x16_horizontal_ref(&buf[1][1], STRIDE);
        struct horizontal_ctx hc = { l };
        fail += check(buf, "Horizontal (mode 1)", expect_horizontal, &hc);
    }

    /* --- Mode 2 DC: ((sum + 16) >> 5) ---
     * All top = 2, all left = 6: sum = 32 + 96 = 128, +16 = 144,
     * 144 >> 5 = 4. The corner (99) must not influence the result. */
    {
        uint8_t buf[ROWS][STRIDE];
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 2; l[i] = 6; }
        set_ctx(buf, 99, t, l);
        daedalus_h264_pred_16x16_dc_ref(&buf[1][1], STRIDE);
        uint8_t exp_val = 4;
        fail += check(buf, "DC (mode 2)", expect_uniform, &exp_val);
    }

    /* --- Mode 3 Plane: uniform neighbours -> uniform output ---
     * H = V = 0 when all neighbours (corner included) are equal, and
     * a = 16*(p + p) = 32p, so
     *   pred[y][x] = (32p + 16) >> 5 = p.
     * Verifies the orientation-free portion of the formula. */
    {
        uint8_t buf[ROWS][STRIDE];
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 100; l[i] = 100; }
        set_ctx(buf, 100, t, l); /* uniform tl too, so H and V are zero */
        daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
        uint8_t exp_val = 100;
        fail += check(buf, "Plane (mode 3, uniform)", expect_uniform, &exp_val);
    }

    /* --- Mode 3 Plane: gradient sanity ---
     * Top row = 0..15, left col = 0..15, tl = 0.
     *   H = 1*(8-6) + 2*(9-5) + 3*(10-4) + 4*(11-3) + 5*(12-2)
     *     + 6*(13-1) + 7*(14-0) + 8*(15-tl)
     *     = 2 + 8 + 18 + 32 + 50 + 72 + 98 + 120 = 400
     *   V = same shape on the left col = 400
     *   b = c = (5*400 + 32) >> 6 = 2032 >> 6 = 31
     *   a = 16 * (l[15] + t[15]) = 16 * 30 = 480
     *   pred[0][0]   = (480 - 217 - 217 + 16) >> 5 =  62 >> 5 = 1
     *   pred[15][15] = (480 + 248 + 248 + 16) >> 5 = 992 >> 5 = 31
     * Only those two corners are spot-checked. Because top == left,
     * this case cannot tell H from V; the next case covers that. */
    {
        uint8_t buf[ROWS][STRIDE];
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = i; l[i] = i; }
        set_ctx(buf, 0, t, l);
        daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
        uint8_t tl_actual = buf[1 + 0][1 + 0];
        uint8_t br_actual = buf[1 + 15][1 + 15];
        int spot_fail = 0;
        if (tl_actual != 1) {
            fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n",
                    (unsigned)tl_actual);
            spot_fail = 1;
        }
        if (br_actual != 31) {
            fprintf(stderr, "Plane gradient pred[15][15] = %u, expected 31\n",
                    (unsigned)br_actual);
            spot_fail = 1;
        }
        if (!spot_fail) printf(" %-30s PASS (corners 1, 31)\n", "Plane (mode 3, gradient)");
        else            printf(" %-30s FAIL\n", "Plane (mode 3, gradient)");
        fail += spot_fail;
    }

    /* --- Mode 3 Plane: top ramp only (orientation check) ---
     * Top row = 0..15, left col = 0, tl = 0.
     *   H = 400 (as above), V = 0  ->  b = 31, c = 32 >> 6 = 0
     *   a = 16 * (0 + 15) = 240
     *   pred[y][x] = (256 + 31*(x-7)) >> 5, independent of y:
     *     x: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
     *        1 2 3 4 5 6 7 8 8 9 10 11 12 13 14 15
     * An implementation that swaps H/V or b/c produces a row ramp
     * instead of a column ramp and fails here. */
    {
        static const int col_exp[16] = {
            1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15
        };
        uint8_t buf[ROWS][STRIDE];
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = i; l[i] = 0; }
        set_ctx(buf, 0, t, l);
        daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
        struct vertical_ctx vc = { col_exp };
        fail += check(buf, "Plane (mode 3, top ramp)", expect_vertical, &vc);
    }

    if (fail == 0) printf("\nALL Intra_16x16 mode references PASS\n");
    else           fprintf(stderr, "\n%d test(s) FAILED\n", fail);
    return fail ? 1 : 0;
}