/*
 * Standalone bit-exact C reference for the H.264 chroma DC 2x2
 * Hadamard transform (per H.264 §8.5.11.1).
 *
 * In 4:2:0 chroma, the four DC coefficients (one from each chroma
 * 4x4 AC block within an MB) are arranged into a 2x2 block:
 *
 *     c[0,0] c[0,1]      block (0,0) DC   block (0,1) DC
 *     c[1,0] c[1,1]      block (1,0) DC   block (1,1) DC
 *
 * The 2x2 Hadamard transform:
 *
 *     f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
 *     f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
 *     f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
 *     f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
 *
 * Equivalently expressed as 2-stage butterflies (row then col), which
 * the NEON impl uses for SIMD friendliness — we present that form
 * here too so the QPU/NEON ports are 1:1.
 *
 * Output f[] replaces the input c[].  The QP-dependent scaling per
 * §8.5.11.2 happens AFTER this primitive — the intercept patch
 * composes Hadamard + LevelScale + shift itself, since the scaling
 * shape depends on QP and on whether we're in the chroma_qp_offset
 * adjustment regime.
 *
 * Input/output layout:
 *     c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
 *
 * License: BSD-2-Clause.  Algorithm is in the H.264 spec.
 */

#include <stdint.h>

/**
 * In-place 2x2 Hadamard transform of four chroma DC coefficients.
 *
 * @param c  Four coefficients in row-major order
 *           [c[0,0], c[0,1], c[1,0], c[1,1]].  All four elements are
 *           read first and then overwritten with
 *           [f[0,0], f[0,1], f[1,0], f[1,1]].
 *
 * The butterflies are evaluated in int, so the intermediate sums do not
 * wrap at 16 bits.  Each result is then narrowed back to int16_t with an
 * explicit cast; a result outside the int16_t range is converted in an
 * implementation-defined way, so callers that need exact values must
 * supply inputs whose four-term sums fit in int16_t.
 */
void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
{
    /* Stage 1: butterfly along rows.
     *   t0 = c[0,0] + c[0,1] = c[0] + c[1]
     *   t1 = c[0,0] - c[0,1] = c[0] - c[1]
     *   t2 = c[1,0] + c[1,1] = c[2] + c[3]
     *   t3 = c[1,0] - c[1,1] = c[2] - c[3]
     *
     * All inputs are consumed here, before any element of c[] is
     * written, which is what makes the in-place update safe. */
    int t0 = c[0] + c[1];
    int t1 = c[0] - c[1];
    int t2 = c[2] + c[3];
    int t3 = c[2] - c[3];

    /* Stage 2: butterfly along cols. */
    c[0] = (int16_t)(t0 + t2);  /* f[0,0] = (c0+c1) + (c2+c3) = sum of all 4 */
    c[1] = (int16_t)(t1 + t3);  /* f[0,1] = (c0-c1) + (c2-c3) */
    c[2] = (int16_t)(t0 - t2);  /* f[1,0] = (c0+c1) - (c2+c3) */
    c[3] = (int16_t)(t1 - t3);  /* f[1,1] = (c0-c1) - (c2-c3) */
}