/*
 * Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
 * transform + add (8-bit pixels), transcribed from the spec
 * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
 * (vendored under external/ffmpeg-snapshot/ at commit f46e514).
 *
 * Provided as a self-contained translation unit so the harness
 * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
 * expansion. Cross-checked against the vendored reference at
 * runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
 *
 * License: LGPL-2.1-or-later (matches the upstream reference).
 *
 * Spec source: VP9 specification §8.7 — Inverse transform process.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Q14 trig constants — VP9 spec table 8.7.1.4.
 *
 * Naming: COSPI_k_64 holds cos(k * pi / 64) scaled by 2^14 and rounded
 * to the nearest integer. The per-line notes give the same value with
 * the angle in reduced form, using sin(x) == cos(pi/2 - x) where the
 * sine reading is the more natural one for the rotation it is used in. */
#define COSPI_16_64 11585  /* cos(pi/4)  * 2^14 */
#define COSPI_24_64  6270  /* cos(3pi/8) * 2^14 */
#define COSPI_8_64  15137  /* sin(3pi/8) * 2^14 */
#define COSPI_28_64  3196  /* cos(7pi/16)* 2^14 */
#define COSPI_4_64  16069  /* sin(7pi/16)* 2^14 */
#define COSPI_20_64  9102  /* cos(5pi/16)* 2^14 */
#define COSPI_12_64 13623  /* sin(5pi/16)* 2^14 */

/* Q14 round-shift: add half an LSB (2^13), then shift right by 14.
 * The bias is added in 64 bits so a full-width product of two Q14
 * operands cannot overflow here; the shifted result is then narrowed
 * to int32_t. Negative inputs rely on >> being an arithmetic shift. */
static inline int32_t qround14(int64_t x)
{
    const int64_t half = (int64_t)1 << 13;
    const int64_t biased = x + half;

    return (int32_t) (biased >> 14);
}

/* Saturate x to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0)
        return 0;
    if (x > 255)
        return 255;
    return (uint8_t) x;
}

/* 1-D 8-point inverse DCT. Matches idct8_1d in
 * libavcodec/vp9dsp_template.c (with the stride collapsed to indexed
 * access; same arithmetic for every input that does not overflow).
 *
 * in  - eight input coefficients.
 * out - receives the eight transformed values. Every element of `in`
 *       is read before any element of `out` is written.
 *
 * Every sum or difference that feeds a multiply is widened to int64_t
 * *before* the add/subtract. Casting the already-computed int32_t sum
 * would widen only the result, leaving the addition itself free to
 * overflow. */
static void idct8_1d(const int32_t in[8], int32_t out[8])
{
    /* Stage 1: rotations. Even-indexed inputs feed t0a..t3a,
     * odd-indexed inputs feed t4a..t7a. */
    int32_t t0a = qround14(((int64_t)in[0] + in[4]) * COSPI_16_64);
    int32_t t1a = qround14(((int64_t)in[0] - in[4]) * COSPI_16_64);
    int32_t t2a = qround14((int64_t)in[2] * COSPI_24_64 - (int64_t)in[6] * COSPI_8_64);
    int32_t t3a = qround14((int64_t)in[2] * COSPI_8_64  + (int64_t)in[6] * COSPI_24_64);
    int32_t t4a = qround14((int64_t)in[1] * COSPI_28_64 - (int64_t)in[7] * COSPI_4_64);
    int32_t t5a = qround14((int64_t)in[5] * COSPI_12_64 - (int64_t)in[3] * COSPI_20_64);
    int32_t t6a = qround14((int64_t)in[5] * COSPI_20_64 + (int64_t)in[3] * COSPI_12_64);
    int32_t t7a = qround14((int64_t)in[1] * COSPI_4_64  + (int64_t)in[7] * COSPI_28_64);

    /* Stage 2: butterflies within the even and odd halves. */
    int32_t t0 = t0a + t3a, t1 = t1a + t2a;
    int32_t t2 = t1a - t2a, t3 = t0a - t3a;
    int32_t t4 = t4a + t5a;
    int32_t t5p = t4a - t5a;
    int32_t t7 = t7a + t6a;
    int32_t t6p = t7a - t6a;

    /* Stage 3: cos(pi/4) rotation of the two middle odd terms. */
    int32_t t5 = qround14(((int64_t)t6p - t5p) * COSPI_16_64);
    int32_t t6 = qround14(((int64_t)t6p + t5p) * COSPI_16_64);

    /* Stage 4: final butterflies; outputs k and 7-k share operands. */
    out[0] = t0 + t7; out[1] = t1 + t6;
    out[2] = t2 + t5; out[3] = t3 + t4;
    out[4] = t3 - t4; out[5] = t2 - t5;
    out[6] = t1 - t6; out[7] = t0 - t7;
}

/* Public reference entry point: 8x8 inverse DCT in both dimensions,
 * with the result added to the destination pixels.
 *
 * dst    - top-left pixel of the 8x8 area to update in place; each
 *          sum is saturated to [0, 255].
 * stride - byte distance between vertically adjacent pixels of dst.
 * block  - 64 input coefficients. Cleared before returning: all 64
 *          entries on the full path, block[0] only on the eob == 1
 *          path (the only entry that path reads).
 * eob    - 1 selects the DC-only shortcut; any other value runs the
 *          full two-pass transform.
 *
 * Signature matches ff_vp9_idct_idct_8x8_add_neon; clearing the
 * coefficients matches FFmpeg behaviour. */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
                                        int16_t *block, int eob)
{
    if (eob == 1) {
        /* DC-only shortcut: scale the lone coefficient by cos(pi/4)
         * in Q14 twice (once per 1-D pass), then add the rounded
         * (+16) >> 5 value to every pixel. */
        int32_t once = qround14((int64_t)block[0] * COSPI_16_64);
        int32_t twice = qround14(once * (int64_t) COSPI_16_64);
        int32_t delta = (twice + 16) >> 5;

        block[0] = 0;
        for (int y = 0; y < 8; y++) {
            uint8_t *row = dst + y * stride;
            for (int x = 0; x < 8; x++)
                row[x] = clip_u8(row[x] + delta);
        }
        return;
    }

    int32_t tmp[64];
    int32_t vec_in[8];
    int32_t vec_out[8];

    /* First pass: gather block column i (elements i, i + 8, ...) and
     * store its transform as row i of tmp. This mirrors FFmpeg's
     * idct_idct_8x8_add_c, which passes `tmp + i*8` as the output
     * base — the transpose is implicit in that offset pattern. */
    for (int i = 0; i < 8; i++) {
        const int16_t *src = block + i;
        for (int k = 0; k < 8; k++)
            vec_in[k] = src[k * 8];
        idct8_1d(vec_in, tmp + i * 8);
    }
    memset(block, 0, 64 * sizeof(*block));

    /* Second pass: column i of tmp is transformed and added into
     * column i of dst (FFmpeg's `dst[j*stride] = ...; dst++`). */
    for (int i = 0; i < 8; i++) {
        uint8_t *column = dst + i;

        for (int k = 0; k < 8; k++)
            vec_in[k] = tmp[k * 8 + i];
        idct8_1d(vec_in, vec_out);
        for (int k = 0; k < 8; k++) {
            int32_t residual = (vec_out[k] + 16) >> 5;
            column[k * stride] = clip_u8(column[k * stride] + residual);
        }
    }
}