From 1f07f3cd705f90ed3413e498914ee83926950cc7 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 13:32:01 +0200 Subject: [PATCH] h264: expose chroma DC 2x2 Hadamard as public API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #23 added the Hadamard as a test-only spec reference; this PR promotes it to a public symbol in src/ so consumers (the eventual marfrit-packages substitution-arc patch 0011) can link against it. New: void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]); — operates in-place on 4 int16, no QP-dependent scaling (caller composes that themselves per §8.5.11.2). The src/ implementation is byte-for-byte identical to the test-only ref in tests/h264_chroma_dc_hadamard_ref.c (kept as a separate spec-validation copy). A new "public API parity" test case verifies the two produce identical output for a non-trivial input. Pure CPU primitive — no substrate-dispatch wrapper because the work is 4 adds + 4 subs; the substrate machinery would cost more than the kernel itself. Verified on hertz: $ ./build/test_chroma_dc_hadamard all-uniform 5 PASS col gradient [0,10,0,10] PASS row gradient [0,0,10,10] PASS anti-diagonal [10,0,0,10] PASS asymmetric [1,2,3,4] PASS sign-alternating [-5,5,-5,5] PASS double-Hadamard = 4*orig PASS public API parity vs _ref PASS ALL chroma DC Hadamard tests PASS $ nm -g build/libdaedalus_core.a | grep chroma_dc_hadamard 0000000000000000 T daedalus_h264_chroma_dc_hadamard_2x2 Unblocks marfrit-packages 0011 (substituting H264DSPContext.chroma_dc_dequant_idct, which composes the Hadamard + qmul scaling). 
--- CMakeLists.txt | 4 ++++ include/daedalus.h | 15 +++++++++++++++ src/h264_chroma_dc.c | 34 +++++++++++++++++++++++++++++++++ tests/test_chroma_dc_hadamard.c | 18 +++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 src/h264_chroma_dc.c diff --git a/CMakeLists.txt b/CMakeLists.txt index b186a40..cb07a13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -391,6 +391,7 @@ endif() add_library(daedalus_core STATIC src/daedalus_core.c + src/h264_chroma_dc.c src/v3d_runner.c ${FFASM_SOURCES} ${FFASM_LPF_SOURCES} @@ -577,6 +578,9 @@ add_executable(test_chroma_dc_hadamard tests/test_chroma_dc_hadamard.c tests/h264_chroma_dc_hadamard_ref.c ) +# Links daedalus_core to pull in the public daedalus_h264_chroma_dc_hadamard_2x2 +# symbol (for the public-API parity test added in this PR). +target_link_libraries(test_chroma_dc_hadamard PRIVATE daedalus_core) target_compile_options(test_chroma_dc_hadamard PRIVATE -O2) # H.264 primitives latency benchmark (NEON CPU baseline). diff --git a/include/daedalus.h b/include/daedalus.h index a224628..0a565d2 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -544,6 +544,21 @@ DECLARE_QPEL_AVG(avg_mc33) #undef DECLARE_QPEL_AVG +/* ------------------------------------------------------------------- + * H.264 chroma DC 2x2 Hadamard pre-pass (per H.264 §8.5.11.1). + * + * Operates in-place on 4 int16 (the DC coefficients of an MB's + * chroma 4x4 AC blocks). Pure CPU primitive — no substrate + * dispatch wrapper because the work is 4 adds + 4 subs. Callers + * compose with QP-dependent scaling themselves; the scale shape + * varies by slice/PPS chroma_qp offset context. + * + * Bit-exact validated against tests/h264_chroma_dc_hadamard_ref.c + * (7-case spec-derived test suite including the H·H = 4·I algebraic + * invariant; see PR #23). 
+ * ----------------------------------------------------------------- */ +void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]); + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ diff --git a/src/h264_chroma_dc.c b/src/h264_chroma_dc.c new file mode 100644 index 0000000..d99110b --- /dev/null +++ b/src/h264_chroma_dc.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * H.264 chroma DC 2x2 Hadamard pre-pass (public, in-tree CPU). + * + * The 4 DC coefficients of an MB's chroma 4x4 AC blocks go through + * this 2x2 Hadamard before quant-scaling and re-injection into the + * AC blocks' [0,0] coefficient. Algorithm per H.264 §8.5.11.1. + * + * Pure CPU primitive — there's no substrate-dispatch wrapper because + * the work is 4 adds + 4 subs. Callers compose with QP-dependent + * scaling themselves (the scale shape varies by slice/PPS chroma_qp + * offset context and shouldn't be baked into the kernel). + * + * Bit-exact validated against tests/h264_chroma_dc_hadamard_ref.c + * (7-case spec-derived test suite including the H·H = 4·I algebraic + * invariant; see PR #23). Same algorithm; this is the public + * src-tree copy. 
+ */ +#include "daedalus.h" + +#include <stdint.h> + +void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]) +{ + int t0 = c[0] + c[1]; + int t1 = c[0] - c[1]; + int t2 = c[2] + c[3]; + int t3 = c[2] - c[3]; + + c[0] = (int16_t)(t0 + t2); /* f[0,0] = sum of all 4 */ + c[1] = (int16_t)(t1 + t3); /* f[0,1] = col-difference */ + c[2] = (int16_t)(t0 - t2); /* f[1,0] = row-difference */ + c[3] = (int16_t)(t1 - t3); /* f[1,1] = anti-diagonal */ +} diff --git a/tests/test_chroma_dc_hadamard.c b/tests/test_chroma_dc_hadamard.c index 4a9b4b2..2a8ec05 100644 --- a/tests/test_chroma_dc_hadamard.c +++ b/tests/test_chroma_dc_hadamard.c @@ -12,6 +12,7 @@ #include extern void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4]); +extern void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]); /* public API */ static int check(const char *name, int16_t in[4], int16_t expect[4]) { @@ -112,6 +113,23 @@ int main(void) fail |= local_fail; } + /* Test 8: public API parity. The public symbol must produce + * byte-identical output to the test-only ref for the same input. + * If the src/ Hadamard ever drifts from the spec, this catches it. */ + { + int16_t input[4] = { 7, -11, 23, -42 }; + int16_t a[4], b[4]; + memcpy(a, input, sizeof(a)); + memcpy(b, input, sizeof(b)); + daedalus_h264_chroma_dc_hadamard_2x2_ref(a); + daedalus_h264_chroma_dc_hadamard_2x2(b); + int local_fail = 0; + for (int i = 0; i < 4; i++) if (a[i] != b[i]) local_fail = 1; + printf(" %-32s %s\n", "public API parity vs _ref", + local_fail ? "FAIL" : "PASS"); + fail |= local_fail; + } + if (fail == 0) printf("\nALL chroma DC Hadamard tests PASS\n"); else fprintf(stderr, "\n%d test(s) FAILED\n", fail); return fail ? 1 : 0; -- 2.47.3