diff --git a/CMakeLists.txt b/CMakeLists.txt
index b27f4e7..4d8826b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -531,6 +531,15 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
 target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
 target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
 
+# H.264 Intra_4x4 luma prediction (9 modes) — reference + tests.
+# Pure CPU + spec-derived; no daedalus_core dependency yet (this is
+# the bit-exact gate for the eventual shader / dispatch wiring).
+add_executable(test_intra_pred_4x4
+    tests/test_intra_pred_4x4.c
+    tests/h264_intra_pred_4x4_ref.c
+)
+target_compile_options(test_intra_pred_4x4 PRIVATE -O2)
+
 add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
 target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
 target_compile_options(bench_pool_overhead PRIVATE -O2)
diff --git a/tests/h264_intra_pred_4x4_ref.c b/tests/h264_intra_pred_4x4_ref.c
new file mode 100644
index 0000000..6cec9ba
--- /dev/null
+++ b/tests/h264_intra_pred_4x4_ref.c
@@ -0,0 +1,238 @@
+/*
+ * Standalone bit-exact C reference for H.264 luma Intra_4x4
+ * prediction modes (per H.264 spec §8.3.1.2). All 9 modes.
+ *
+ * Mode index → name (per H.264 Table 8-2):
+ *   0 = Vertical
+ *   1 = Horizontal
+ *   2 = DC
+ *   3 = Diagonal_Down_Left
+ *   4 = Diagonal_Down_Right
+ *   5 = Vertical_Right
+ *   6 = Horizontal_Down
+ *   7 = Vertical_Left
+ *   8 = Horizontal_Up
+ *
+ * Calling convention matches FFmpeg's h264pred:
+ *   pred_4x4_<mode>(uint8_t *dst, ptrdiff_t stride)
+ *
+ * `dst` points at row 0, col 0 of the 4x4 output block. Neighbour
+ * pixels come from the already-decoded surrounding pixels in the same
+ * buffer:
+ *   top-left   = dst[-stride - 1]
+ *   top[0..3]  = dst[-stride + 0 .. -stride + 3]
+ *   top-right  = dst[-stride + 4 .. -stride + 7]   (DDL / VL only)
+ *   left[0..3] = dst[ 0*stride - 1 .. 3*stride - 1]
+ *
+ * AVAILABILITY: this reference assumes ALL neighbours are available
+ * (the "interior MB" case). The H.264 spec defines fallback behaviour
+ * for unavailable neighbours (e.g. DC averages only the available
+ * side, top-right substitution from top[3] for DDL/VL near the right
+ * frame edge); those branches are NOT modelled here. Tests must
+ * exercise the kernel with all 13 neighbour bytes valid. The eventual
+ * libavcodec intercept handles availability before calling.
+ *
+ * License: BSD-2-Clause for the reference + tests; the underlying
+ * algorithm is from H.264/ITU-T H.264 (2003) and AVC standards, free
+ * to implement.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Helper: 3-tap weighted average ((a + 2*b + c + 2) >> 2). */
+static inline uint8_t avg3(int a, int b, int c)
+{
+    return (uint8_t)((a + 2*b + c + 2) >> 2);
+}
+
+/* Helper: 2-tap mean ((a + b + 1) >> 1). */
+static inline uint8_t avg2(int a, int b)
+{
+    return (uint8_t)((a + b + 1) >> 1);
+}
+
+/* Mode 0 — Vertical: each col = top[col]. */
+void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;
+    for (int r = 0; r < 4; r++) {
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = top[c];
+    }
+}
+
+/* Mode 1 — Horizontal: each row = left[row]. */
+void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    for (int r = 0; r < 4; r++) {
+        uint8_t l = dst[r * stride - 1];
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = l;
+    }
+}
+
+/* Mode 2 — DC: mean of top 4 + left 4, broadcast. */
+void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;
+    int sum = 4;  /* pre-loaded rounding term of (top + left + 4) >> 3 */
+    for (int i = 0; i < 4; i++) sum += top[i];
+    for (int i = 0; i < 4; i++) sum += dst[i * stride - 1];
+    uint8_t v = (uint8_t)(sum >> 3);
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = v;
+}
+
+/* Mode 3 — Diagonal_Down_Left. Uses top[0..7] (incl. top-right). */
+void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;
+    int t0 = top[0], t1 = top[1], t2 = top[2], t3 = top[3];
+    int t4 = top[4], t5 = top[5], t6 = top[6], t7 = top[7];
+    /* zz[0..6] = top row filtered with the 3-tap; H.264 §8.3.1.2.4. */
+    uint8_t zz[7];
+    zz[0] = avg3(t0, t1, t2);
+    zz[1] = avg3(t1, t2, t3);
+    zz[2] = avg3(t2, t3, t4);
+    zz[3] = avg3(t3, t4, t5);
+    zz[4] = avg3(t4, t5, t6);
+    zz[5] = avg3(t5, t6, t7);
+    zz[6] = avg3(t6, t7, t7);   /* spec: t7 doubled at the boundary */
+    /* dst[r][c] = zz[c + r] */
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = zz[c + r];
+}
+
+/* Mode 4 — Diagonal_Down_Right. Uses top-left + top[0..3] + left[0..3]. */
+void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1];
+    int t2 = dst[-stride + 2], t3 = dst[-stride + 3];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+    /* zz indexed by (col - row): -3..+3 */
+    uint8_t zz_m3 = avg3(l1, l2, l3);
+    uint8_t zz_m2 = avg3(l0, l1, l2);
+    uint8_t zz_m1 = avg3(tl, l0, l1);
+    uint8_t zz_p0 = avg3(l0, tl, t0);
+    uint8_t zz_p1 = avg3(tl, t0, t1);
+    uint8_t zz_p2 = avg3(t0, t1, t2);
+    uint8_t zz_p3 = avg3(t1, t2, t3);
+    uint8_t zz[7] = { zz_m3, zz_m2, zz_m1, zz_p0, zz_p1, zz_p2, zz_p3 };
+    for (int r = 0; r < 4; r++)
+        for (int c = 0; c < 4; c++) dst[r * stride + c] = zz[(c - r) + 3];
+}
+
+/* Mode 5 — Vertical_Right. */
+void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1];
+    int t2 = dst[-stride + 2], t3 = dst[-stride + 3];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1];
+    /* H.264 §8.3.1.2.6: two patterns based on (2c - r) parity. */
+    dst[0*stride + 0] = avg2(tl, t0);
+    dst[0*stride + 1] = avg2(t0, t1);
+    dst[0*stride + 2] = avg2(t1, t2);
+    dst[0*stride + 3] = avg2(t2, t3);
+
+    dst[1*stride + 0] = avg3(l0, tl, t0);
+    dst[1*stride + 1] = avg3(tl, t0, t1);
+    dst[1*stride + 2] = avg3(t0, t1, t2);
+    dst[1*stride + 3] = avg3(t1, t2, t3);
+
+    dst[2*stride + 0] = avg3(tl, l0, l1);
+    dst[2*stride + 1] = dst[0*stride + 0];
+    dst[2*stride + 2] = dst[0*stride + 1];
+    dst[2*stride + 3] = dst[0*stride + 2];
+
+    dst[3*stride + 0] = avg3(l0, l1, l2);
+    dst[3*stride + 1] = dst[1*stride + 0];
+    dst[3*stride + 2] = dst[1*stride + 1];
+    dst[3*stride + 3] = dst[1*stride + 2];
+}
+
+/* Mode 6 — Horizontal_Down. */
+void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int tl = dst[-stride - 1];
+    int t0 = dst[-stride + 0], t1 = dst[-stride + 1], t2 = dst[-stride + 2];
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+
+    dst[0*stride + 0] = avg2(tl, l0);
+    dst[0*stride + 1] = avg3(l0, tl, t0);
+    dst[0*stride + 2] = avg3(tl, t0, t1);
+    dst[0*stride + 3] = avg3(t0, t1, t2);
+
+    dst[1*stride + 0] = avg2(l0, l1);
+    dst[1*stride + 1] = avg3(tl, l0, l1);
+    dst[1*stride + 2] = dst[0*stride + 0];
+    dst[1*stride + 3] = dst[0*stride + 1];
+
+    dst[2*stride + 0] = avg2(l1, l2);
+    dst[2*stride + 1] = avg3(l0, l1, l2);
+    dst[2*stride + 2] = dst[1*stride + 0];
+    dst[2*stride + 3] = dst[1*stride + 1];
+
+    dst[3*stride + 0] = avg2(l2, l3);
+    dst[3*stride + 1] = avg3(l1, l2, l3);
+    dst[3*stride + 2] = dst[2*stride + 0];
+    dst[3*stride + 3] = dst[2*stride + 1];
+}
+
+/* Mode 7 — Vertical_Left. Uses top[0..7]. */
+void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    const uint8_t *top = dst - stride;
+    int t0=top[0], t1=top[1], t2=top[2], t3=top[3];
+    int t4=top[4], t5=top[5], t6=top[6], t7=top[7];
+
+    dst[0*stride + 0] = avg2(t0, t1);
+    dst[0*stride + 1] = avg2(t1, t2);
+    dst[0*stride + 2] = avg2(t2, t3);
+    dst[0*stride + 3] = avg2(t3, t4);
+
+    dst[1*stride + 0] = avg3(t0, t1, t2);
+    dst[1*stride + 1] = avg3(t1, t2, t3);
+    dst[1*stride + 2] = avg3(t2, t3, t4);
+    dst[1*stride + 3] = avg3(t3, t4, t5);
+
+    dst[2*stride + 0] = avg2(t1, t2);
+    dst[2*stride + 1] = avg2(t2, t3);
+    dst[2*stride + 2] = avg2(t3, t4);
+    dst[2*stride + 3] = avg2(t4, t5);
+
+    dst[3*stride + 0] = avg3(t1, t2, t3);
+    dst[3*stride + 1] = avg3(t2, t3, t4);
+    dst[3*stride + 2] = avg3(t3, t4, t5);
+    dst[3*stride + 3] = avg3(t4, t5, t6);
+    (void) t7;  /* top[7] is loaded but never referenced by 4x4 VL */
+}
+
+/* Mode 8 — Horizontal_Up. Uses left[0..3] only. */
+void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride)
+{
+    int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1];
+    int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1];
+
+    dst[0*stride + 0] = avg2(l0, l1);
+    dst[0*stride + 1] = avg3(l0, l1, l2);
+    dst[0*stride + 2] = avg2(l1, l2);
+    dst[0*stride + 3] = avg3(l1, l2, l3);
+
+    dst[1*stride + 0] = avg2(l1, l2);
+    dst[1*stride + 1] = avg3(l1, l2, l3);
+    dst[1*stride + 2] = avg2(l2, l3);
+    dst[1*stride + 3] = avg3(l2, l3, l3);
+
+    dst[2*stride + 0] = avg2(l2, l3);
+    dst[2*stride + 1] = avg3(l2, l3, l3);
+    dst[2*stride + 2] = l3;
+    dst[2*stride + 3] = l3;
+
+    dst[3*stride + 0] = l3;
+    dst[3*stride + 1] = l3;
+    dst[3*stride + 2] = l3;
+    dst[3*stride + 3] = l3;
+}
diff --git a/tests/test_intra_pred_4x4.c b/tests/test_intra_pred_4x4.c
new file mode 100644
index 0000000..2a44c1d
--- /dev/null
+++ b/tests/test_intra_pred_4x4.c
@@ -0,0 +1,246 @@
+/*
+ * Tests the 9 H.264 Intra_4x4 luma prediction modes against
+ * spec-derived expected patterns. Goal: catch any mistake in
+ * the reference (sign / shift / table mapping) before it lands
+ * downstream. Each mode is exercised with a deterministic
+ * neighbour context and checked against a hand-computed (or
+ * spec-derived) expected 4x4 output.
+ *
+ * The test buffer layout reserves a 1-pixel top/left context border
+ * + a 4-pixel top-right (for modes 3 / 7):
+ *
+ *   row 0: [tl][t0 t1 t2 t3 t4 t5 t6 t7]   <- STRIDE = 9 bytes
+ *   row 1: [l0][ 4x4 output goes here ]
+ *   row 2: [l1][                      ]
+ *   row 3: [l2][                      ]
+ *   row 4: [l3][                      ]
+ *
+ * dst (passed to the pred fns) points at row 1 col 1.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+extern void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride);
+
+#define STRIDE 9
+typedef void (*pred_fn)(uint8_t *dst, ptrdiff_t stride);
+
+/* Set up the buffer: 5 rows × STRIDE cols.
+ * top-left = tl, top[0..7] = t[0..7], left[0..3] = l[0..3].
+ * The 4x4 output region (rows 1..4, cols 1..4) is filled with 0xff
+ * sentinels so any unwritten cell shows up as 255 in the compare. */
+static void set_ctx(uint8_t buf[5][STRIDE], int tl, const int t[8], const int l[4])
+{
+    for (int r = 0; r < 5; r++) for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
+    buf[0][0] = (uint8_t) tl;
+    for (int c = 0; c < 8; c++) buf[0][1 + c] = (uint8_t) t[c];
+    for (int r = 0; r < 4; r++) buf[1 + r][0] = (uint8_t) l[r];
+}
+
+static int check(const uint8_t buf[5][STRIDE], const char *name,
+                 const uint8_t expect[4][4])
+{
+    int diff = 0;
+    for (int r = 0; r < 4; r++) {
+        for (int c = 0; c < 4; c++) {
+            uint8_t got = buf[1 + r][1 + c];
+            uint8_t exp = expect[r][c];
+            if (got != exp) {
+                if (diff == 0)
+                    fprintf(stderr,
+                            "%s: first mismatch r=%d c=%d got=%u exp=%u\n",
+                            name, r, c, got, exp);
+                diff++;
+            }
+        }
+    }
+    if (diff == 0)
+        printf("  %-26s PASS\n", name);
+    else
+        printf("  %-26s FAIL (%d/16 bytes wrong)\n", name, diff);
+    return diff == 0 ? 0 : 1;
+}
+
+int main(void)
+{
+    int fail = 0;  /* number of failed checks; check() returns 0 or 1 */
+
+    /* Mode 0 — Vertical: each col = top[col]. */
+    {
+        uint8_t buf[5][STRIDE];
+        int tl = 0;
+        int t[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };
+        int l[4] = { 0, 0, 0, 0 };
+        set_ctx(buf, tl, t, l);
+        daedalus_h264_pred_4x4_vertical_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {10,20,30,40}, {10,20,30,40}, {10,20,30,40}, {10,20,30,40}
+        };
+        fail += check(buf, "Vertical (mode 0)", exp);
+    }
+
+    /* Mode 1 — Horizontal: each row = left[row]. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 0,0,0,0, 0,0,0,0 };
+        int l[4] = { 50, 60, 70, 80 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_horizontal_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {50,50,50,50}, {60,60,60,60}, {70,70,70,70}, {80,80,80,80}
+        };
+        fail += check(buf, "Horizontal (mode 1)", exp);
+    }
+
+    /* Mode 2 — DC: all 8 neighbours valid → ((sum + 4) >> 3) broadcast.
+     * top sum = 4*1 = 4, left sum = 4*3 = 12, total 16, +4 = 20,
+     * >>3 = 2. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 1,1,1,1, 0,0,0,0 };
+        int l[4] = { 3,3,3,3 };
+        set_ctx(buf, 99, t, l);  /* tl unused for DC */
+        daedalus_h264_pred_4x4_dc_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {2,2,2,2}, {2,2,2,2}, {2,2,2,2}, {2,2,2,2}
+        };
+        fail += check(buf, "DC (mode 2)", exp);
+    }
+
+    /* Mode 3 — Diagonal_Down_Left: zz[i] = avg3(t[i], t[i+1], t[i+2]);
+     * dst[r][c] = zz[c + r].
+     * With all t[]=100 → all zz=100 → all dst=100. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 100,100,100,100, 100,100,100,100 };
+        int l[4] = { 0,0,0,0 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_ddl_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {100,100,100,100}, {100,100,100,100},
+            {100,100,100,100}, {100,100,100,100}
+        };
+        fail += check(buf, "DiagDownLeft (mode 3)", exp);
+    }
+
+    /* Mode 4 — Diagonal_Down_Right: zz[c-r] with c-r ∈ {-3..+3}.
+     * If all 9 surrounding pixels = 200 → all zz = 200 → all dst = 200. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 200,200,200,200, 0,0,0,0 };
+        int l[4] = { 200,200,200,200 };
+        set_ctx(buf, 200, t, l);
+        daedalus_h264_pred_4x4_ddr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {200,200,200,200}, {200,200,200,200},
+            {200,200,200,200}, {200,200,200,200}
+        };
+        fail += check(buf, "DiagDownRight (mode 4)", exp);
+    }
+
+    /* Mode 5 — Vertical_Right. With all neighbours = 80 the 3-tap
+     * (a+2b+c+2)>>2 and 2-tap (a+b+1)>>1 both yield 80. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 80,80,80,80, 0,0,0,0 };
+        int l[4] = { 80,80,80,80 };
+        set_ctx(buf, 80, t, l);
+        daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {80,80,80,80}, {80,80,80,80}, {80,80,80,80}, {80,80,80,80}
+        };
+        fail += check(buf, "VerticalRight (mode 5)", exp);
+    }
+
+    /* Mode 6 — Horizontal_Down. Same uniform-context degenerate case. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 120,120,120,120, 0,0,0,0 };
+        int l[4] = { 120,120,120,120 };
+        set_ctx(buf, 120, t, l);
+        daedalus_h264_pred_4x4_hd_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {120,120,120,120}, {120,120,120,120},
+            {120,120,120,120}, {120,120,120,120}
+        };
+        fail += check(buf, "HorizontalDown (mode 6)", exp);
+    }
+
+    /* Mode 7 — Vertical_Left. Uniform context. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 64,64,64,64, 64,64,64,64 };
+        int l[4] = { 0,0,0,0 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_vl_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {64,64,64,64}, {64,64,64,64}, {64,64,64,64}, {64,64,64,64}
+        };
+        fail += check(buf, "VerticalLeft (mode 7)", exp);
+    }
+
+    /* Mode 8 — Horizontal_Up. Uniform context. */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 0,0,0,0, 0,0,0,0 };
+        int l[4] = { 200,200,200,200 };
+        set_ctx(buf, 0, t, l);
+        daedalus_h264_pred_4x4_hu_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            {200,200,200,200}, {200,200,200,200},
+            {200,200,200,200}, {200,200,200,200}
+        };
+        fail += check(buf, "HorizontalUp (mode 8)", exp);
+    }
+
+    /* Asymmetric Vertical_Right test: detects orientation /
+     * row-vs-col confusion. Top=10,20,30,40, Left=50,60,70,
+     * top-left=5. Spec-derived expected output computed by hand
+     * from §8.3.1.2.6.
+     *
+     *   d[0][0] = (tl+t0+1)>>1      = (5+10+1)>>1        = 8
+     *   d[0][1] = (t0+t1+1)>>1      = (10+20+1)>>1       = 15
+     *   d[0][2] = (t1+t2+1)>>1      = (20+30+1)>>1       = 25
+     *   d[0][3] = (t2+t3+1)>>1      = (30+40+1)>>1       = 35
+     *   d[1][0] = avg3(l0,tl,t0)    = (50+2*5+10+2)>>2   = 72/4  = 18
+     *   d[1][1] = avg3(tl,t0,t1)    = (5+20+20+2)>>2     = 47/4  = 11
+     *   d[1][2] = avg3(t0,t1,t2)    = (10+40+30+2)>>2    = 82/4  = 20
+     *   d[1][3] = avg3(t1,t2,t3)    = (20+60+40+2)>>2    = 122/4 = 30
+     *   d[2][0] = avg3(tl,l0,l1)    = (5+100+60+2)>>2    = 167/4 = 41
+     *   d[2][1] = d[0][0]           = 8
+     *   d[2][2] = d[0][1]           = 15
+     *   d[2][3] = d[0][2]           = 25
+     *   d[3][0] = avg3(l0,l1,l2)    = (50+120+70+2)>>2   = 242/4 = 60
+     *   d[3][1] = d[1][0]           = 18
+     *   d[3][2] = d[1][1]           = 11
+     *   d[3][3] = d[1][2]           = 20
+     */
+    {
+        uint8_t buf[5][STRIDE];
+        int t[8] = { 10,20,30,40, 0,0,0,0 };
+        int l[4] = { 50,60,70,0 };
+        set_ctx(buf, 5, t, l);
+        daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE);
+        uint8_t exp[4][4] = {
+            { 8,15,25,35},
+            {18,11,20,30},
+            {41, 8,15,25},
+            {60,18,11,20},
+        };
+        fail += check(buf, "VR asym (sanity)", exp);
+    }
+
+    if (fail == 0) printf("\nALL %d intra-4x4 mode references PASS\n", 10);
+    else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
+    return fail ? EXIT_FAILURE : EXIT_SUCCESS;
+}