diff --git a/CMakeLists.txt b/CMakeLists.txt index cb07a13..1f10f1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -392,6 +392,7 @@ endif() add_library(daedalus_core STATIC src/daedalus_core.c src/h264_chroma_dc.c + src/h264_intra_pred_4x4.c src/v3d_runner.c ${FFASM_SOURCES} ${FFASM_LPF_SOURCES} @@ -538,13 +539,12 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c) target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core) target_compile_options(test_api_opportunistic_qpu PRIVATE -O2) -# H.264 Intra_4x4 luma prediction (9 modes) — reference + tests. -# Pure CPU + spec-derived; no daedalus_core dependency yet (this is -# the bit-exact gate for the eventual shader / dispatch wiring). -add_executable(test_intra_pred_4x4 - tests/test_intra_pred_4x4.c - tests/h264_intra_pred_4x4_ref.c -) +# H.264 Intra_4x4 luma prediction (9 modes) — public src primitives. +# The bodies now live in src/h264_intra_pred_4x4.c (linked into +# daedalus_core for use by libavcodec.so substitution-arc consumers). +# This test exercises the public symbols. +add_executable(test_intra_pred_4x4 tests/test_intra_pred_4x4.c) +target_link_libraries(test_intra_pred_4x4 PRIVATE daedalus_core) target_compile_options(test_intra_pred_4x4 PRIVATE -O2) # H.264 Intra_16x16 luma prediction (4 modes: V, H, DC, Plane) — diff --git a/include/daedalus.h b/include/daedalus.h index 0a565d2..cd76602 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -559,6 +559,33 @@ DECLARE_QPEL_AVG(avg_mc33) * ----------------------------------------------------------------- */ void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]); +/* ------------------------------------------------------------------- + * H.264 Intra_4x4 luma prediction (per H.264 §8.3.1.2). 9 modes. + * + * Pure CPU primitives — each is a small straightforward fill of a + * 4x4 output block from neighbour pixels in the same buffer. No + * substrate-dispatch wrapper (the work is too small to amortise). 
+ * + * FFmpeg-style interface: `dst` at row 0 col 0 of the 4x4 output. + * Reads top-left at dst[-stride-1], top at dst[-stride..-stride+7] + * (top-right for DDL/VL), and left at dst[r*stride - 1] for r=0..3. + * Caller must ensure all 13 neighbour bytes are valid (interior-MB + * assumption — H.264 availability fallback handled at caller). + * + * Bit-exact validated against tests/test_intra_pred_4x4.c (10-case + * spec-derived test suite including the asymmetric Vertical_Right + * 16-cell hand-derived case; see fourier PR #12). + * ----------------------------------------------------------------- */ +void daedalus_h264_pred_4x4_vertical (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_dc (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_ddl (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_ddr (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_vr (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_hd (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_vl (uint8_t *dst, ptrdiff_t stride); +void daedalus_h264_pred_4x4_hu (uint8_t *dst, ptrdiff_t stride); + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ diff --git a/tests/h264_intra_pred_4x4_ref.c b/src/h264_intra_pred_4x4.c similarity index 92% rename from tests/h264_intra_pred_4x4_ref.c rename to src/h264_intra_pred_4x4.c index 6cec9ba..bb1db7d 100644 --- a/tests/h264_intra_pred_4x4_ref.c +++ b/src/h264_intra_pred_4x4.c @@ -52,7 +52,7 @@ static inline uint8_t avg2(int a, int b) } /* Mode 0 — Vertical: each col = top[col]. 
*/ -void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_vertical(uint8_t *dst, ptrdiff_t stride) { const uint8_t *top = dst - stride; for (int r = 0; r < 4; r++) { @@ -61,7 +61,7 @@ void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 1 — Horizontal: each row = left[row]. */ -void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride) { for (int r = 0; r < 4; r++) { uint8_t l = dst[r * stride - 1]; @@ -70,7 +70,7 @@ void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 2 — DC: mean of top 4 + left 4, broadcast. */ -void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_dc(uint8_t *dst, ptrdiff_t stride) { const uint8_t *top = dst - stride; int sum = 4; /* rounding for ((sum + 4) >> 3) */ @@ -82,7 +82,7 @@ void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 3 — Diagonal_Down_Left. Uses top[0..7] (incl. top-right). */ -void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_ddl(uint8_t *dst, ptrdiff_t stride) { const uint8_t *top = dst - stride; int t0 = top[0], t1 = top[1], t2 = top[2], t3 = top[3]; @@ -102,7 +102,7 @@ void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 4 — Diagonal_Down_Right. Uses top-left + top[0..3] + left[0..3]. */ -void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_ddr(uint8_t *dst, ptrdiff_t stride) { int tl = dst[-stride - 1]; int t0 = dst[-stride + 0], t1 = dst[-stride + 1]; @@ -123,7 +123,7 @@ void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 5 — Vertical_Right. 
*/ -void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_vr(uint8_t *dst, ptrdiff_t stride) { int tl = dst[-stride - 1]; int t0 = dst[-stride + 0], t1 = dst[-stride + 1]; @@ -153,7 +153,7 @@ void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 6 — Horizontal_Down. */ -void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_hd(uint8_t *dst, ptrdiff_t stride) { int tl = dst[-stride - 1]; int t0 = dst[-stride + 0], t1 = dst[-stride + 1], t2 = dst[-stride + 2]; @@ -182,7 +182,7 @@ void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 7 — Vertical_Left. Uses top[0..7]. */ -void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_vl(uint8_t *dst, ptrdiff_t stride) { const uint8_t *top = dst - stride; int t0=top[0], t1=top[1], t2=top[2], t3=top[3]; @@ -211,7 +211,7 @@ void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride) } /* Mode 8 — Horizontal_Up. Uses left[0..3] only. 
*/ -void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride) +void daedalus_h264_pred_4x4_hu(uint8_t *dst, ptrdiff_t stride) { int l0 = dst[ 0*stride - 1], l1 = dst[ 1*stride - 1]; int l2 = dst[ 2*stride - 1], l3 = dst[ 3*stride - 1]; diff --git a/tests/test_intra_pred_4x4.c b/tests/test_intra_pred_4x4.c index 2a44c1d..07afe10 100644 --- a/tests/test_intra_pred_4x4.c +++ b/tests/test_intra_pred_4x4.c @@ -22,15 +22,15 @@ #include #include -extern void daedalus_h264_pred_4x4_vertical_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_horizontal_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_ddl_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_ddr_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_vr_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_hd_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_vl_ref(uint8_t *dst, ptrdiff_t stride); -extern void daedalus_h264_pred_4x4_hu_ref(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_vertical(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_dc(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_ddl(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_ddr(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_vr(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_hd(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_vl(uint8_t *dst, ptrdiff_t stride); +extern void daedalus_h264_pred_4x4_hu(uint8_t *dst, ptrdiff_t stride); #define STRIDE 9 typedef void (*pred_fn)(uint8_t *dst, ptrdiff_t stride); @@ -82,7 +82,7 @@ int main(void) int t[8] = { 10, 20, 30, 40, 0, 0, 0, 0 }; int l[4] = { 0, 0, 0, 0 }; 
set_ctx(buf, tl, t, l); - daedalus_h264_pred_4x4_vertical_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_vertical(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {10,20,30,40}, {10,20,30,40}, {10,20,30,40}, {10,20,30,40} }; @@ -95,7 +95,7 @@ int main(void) int t[8] = { 0,0,0,0, 0,0,0,0 }; int l[4] = { 50, 60, 70, 80 }; set_ctx(buf, 0, t, l); - daedalus_h264_pred_4x4_horizontal_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_horizontal(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {50,50,50,50}, {60,60,60,60}, {70,70,70,70}, {80,80,80,80} }; @@ -110,7 +110,7 @@ int main(void) int t[8] = { 1,1,1,1, 0,0,0,0 }; int l[4] = { 3,3,3,3 }; set_ctx(buf, 99, t, l); /* tl unused for DC */ - daedalus_h264_pred_4x4_dc_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_dc(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {2,2,2,2}, {2,2,2,2}, {2,2,2,2}, {2,2,2,2} }; @@ -125,7 +125,7 @@ int main(void) int t[8] = { 100,100,100,100, 100,100,100,100 }; int l[4] = { 0,0,0,0 }; set_ctx(buf, 0, t, l); - daedalus_h264_pred_4x4_ddl_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_ddl(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {100,100,100,100}, {100,100,100,100}, {100,100,100,100}, {100,100,100,100} @@ -140,7 +140,7 @@ int main(void) int t[8] = { 200,200,200,200, 0,0,0,0 }; int l[4] = { 200,200,200,200 }; set_ctx(buf, 200, t, l); - daedalus_h264_pred_4x4_ddr_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_ddr(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {200,200,200,200}, {200,200,200,200}, {200,200,200,200}, {200,200,200,200} @@ -155,7 +155,7 @@ int main(void) int t[8] = { 80,80,80,80, 0,0,0,0 }; int l[4] = { 80,80,80,80 }; set_ctx(buf, 80, t, l); - daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_vr(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {80,80,80,80}, {80,80,80,80}, {80,80,80,80}, {80,80,80,80} }; @@ -168,7 +168,7 @@ int main(void) int t[8] = { 120,120,120,120, 0,0,0,0 }; int l[4] = { 120,120,120,120 }; set_ctx(buf, 120, t, l); - 
daedalus_h264_pred_4x4_hd_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_hd(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {120,120,120,120}, {120,120,120,120}, {120,120,120,120}, {120,120,120,120} @@ -182,7 +182,7 @@ int main(void) int t[8] = { 64,64,64,64, 64,64,64,64 }; int l[4] = { 0,0,0,0 }; set_ctx(buf, 0, t, l); - daedalus_h264_pred_4x4_vl_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_vl(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {64,64,64,64}, {64,64,64,64}, {64,64,64,64}, {64,64,64,64} }; @@ -195,7 +195,7 @@ int main(void) int t[8] = { 0,0,0,0, 0,0,0,0 }; int l[4] = { 200,200,200,200 }; set_ctx(buf, 0, t, l); - daedalus_h264_pred_4x4_hu_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_hu(&buf[1][1], STRIDE); uint8_t exp[4][4] = { {200,200,200,200}, {200,200,200,200}, {200,200,200,200}, {200,200,200,200} @@ -230,7 +230,7 @@ int main(void) int t[8] = { 10,20,30,40, 0,0,0,0 }; int l[4] = { 50,60,70,0 }; set_ctx(buf, 5, t, l); - daedalus_h264_pred_4x4_vr_ref(&buf[1][1], STRIDE); + daedalus_h264_pred_4x4_vr(&buf[1][1], STRIDE); uint8_t exp[4][4] = { { 8,15,25,35}, {18,11,20,30},