/*
 * Standalone bit-exact C reference for the H.264 chroma DC 2x2
 * Hadamard transform (per H.264 §8.5.11.1).
 *
 * In 4:2:0 chroma, the four DC coefficients (one from each chroma
 * 4x4 AC block within an MB) are arranged into a 2x2 block:
 *
 *     c[0,0]  c[0,1]      block (0,0) DC   block (0,1) DC
 *     c[1,0]  c[1,1]      block (1,0) DC   block (1,1) DC
 *
 * The 2x2 Hadamard transform:
 *
 *     f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
 *     f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
 *     f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
 *     f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
 *
 * Equivalently expressed as 2-stage butterflies (row then col), which
 * the NEON impl uses for SIMD friendliness — we present that form
 * here too so the QPU/NEON ports are 1:1.
 *
 * Output f[] replaces the input c[].  The QP-dependent scaling per
 * §8.5.11.2 happens AFTER this primitive — the intercept patch
 * composes Hadamard + LevelScale + shift itself, since the scaling
 * shape depends on QP and on whether we're in the chroma_qp_offset
 * adjustment regime.
 *
 * Input/output layout:
 *   c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
 *
 * License: BSD-2-Clause.  Algorithm is in the H.264 spec.
 */
#include <stdint.h>

/*
 * In-place 2x2 Hadamard transform of four chroma DC coefficients.
 *
 * c: four values in row-major order, [c[0,0], c[0,1], c[1,0], c[1,1]].
 *    On return they are overwritten with [f[0,0], f[0,1], f[1,0], f[1,1]].
 *
 * All arithmetic is carried out in int; each result is then narrowed back
 * to int16_t with a plain cast.  No saturation or range checking is done
 * here, and no scaling is applied.
 */
void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
{
    /* Snapshot the inputs so the in-place stores below cannot disturb them. */
    const int top_left  = c[0];
    const int top_right = c[1];
    const int bot_left  = c[2];
    const int bot_right = c[3];

    /* Horizontal pass: sum and difference within each row. */
    const int top_sum  = top_left + top_right;
    const int top_diff = top_left - top_right;
    const int bot_sum  = bot_left + bot_right;
    const int bot_diff = bot_left - bot_right;

    /* Vertical pass: combine the two rows' sums, then their differences. */
    c[0] = (int16_t)(top_sum  + bot_sum);    /* f[0,0]: + + + + */
    c[1] = (int16_t)(top_diff + bot_diff);   /* f[0,1]: + - + - */
    c[2] = (int16_t)(top_sum  - bot_sum);    /* f[1,0]: + + - - */
    c[3] = (int16_t)(top_diff - bot_diff);   /* f[1,1]: + - - + */
}