/*
 * Standalone bit-exact C reference for H.264 8x8 inverse integer
 * transform + add. Algorithm per H.264 spec §8.5.13.2 (8x8 IT).
 *
 * Mirrors FFmpeg `ff_h264_idct8_add_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S
 * line 267. Block is COLUMN-MAJOR (per cycle 6 Phase 9 lesson):
 * block[c*8 + r] = coefficient at (row=r, col=c).
 *
 * Signature:
 *   void(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 *
 * Zeroes block after transform (per FFmpeg convention).
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Saturate v into the range 0..255; values already in range pass through. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/*
 * One-dimensional 8-point H.264 inverse-transform butterfly
 * (H.264 §8.5.13.2).
 *
 *   d - the eight input values (read only).
 *   g - receives the eight output values.
 *
 * The work is done in three layers of adds, subtracts and right shifts.
 * Inputs 0/2/4/6 (the "even" half) and inputs 1/3/5/7 (the "odd" half)
 * are processed separately in the first two layers and are only mixed
 * in the final layer. Every input is consumed before any output is
 * stored.
 *
 * Several right shifts act on values that may be negative, so the
 * results depend on the compiler treating >> on signed ints as an
 * arithmetic shift.
 */
static inline void h264_idct8_butterfly(const int d[8], int g[8])
{
    /* Layer 1, even half. */
    const int ev_sum   = d[0] + d[4];
    const int ev_diff  = d[0] - d[4];
    const int ev_rot_a = (d[2] >> 1) - d[6];
    const int ev_rot_b = d[2] + (d[6] >> 1);

    /* Layer 1, odd half. */
    const int od_1 = -d[3] + d[5] - d[7] - (d[7] >> 1);
    const int od_3 = d[1] + d[7] - d[3] - (d[3] >> 1);
    const int od_5 = -d[1] + d[7] + d[5] + (d[5] >> 1);
    const int od_7 = d[3] + d[5] + d[1] + (d[1] >> 1);

    /* Layer 2, even half: plain sum/difference pairs. */
    const int mix_0 = ev_sum + ev_rot_b;
    const int mix_2 = ev_diff + ev_rot_a;
    const int mix_4 = ev_diff - ev_rot_a;
    const int mix_6 = ev_sum - ev_rot_b;

    /* Layer 2, odd half: each term pairs with a quarter of its partner. */
    const int mix_1 = od_1 + (od_7 >> 2);
    const int mix_3 = od_3 + (od_5 >> 2);
    const int mix_5 = (od_3 >> 2) - od_5;
    const int mix_7 = od_7 - (od_1 >> 2);

    /* Layer 3: outputs k and 7-k share the same pair of operands. */
    g[0] = mix_0 + mix_7;
    g[7] = mix_0 - mix_7;

    g[1] = mix_2 + mix_5;
    g[6] = mix_2 - mix_5;

    g[2] = mix_4 + mix_3;
    g[5] = mix_4 - mix_3;

    g[3] = mix_6 + mix_1;
    g[4] = mix_6 - mix_1;
}

/*
 * Reference 8x8 inverse transform followed by reconstruction.
 *
 *   dst    - top-left pixel of the 8x8 area to update; the pixel at
 *            (row r, column c) is dst[r * stride + c].
 *   block  - 64 coefficients stored column-major: the coefficient for
 *            (row r, column c) is block[c * 8 + r]. All 64 entries are
 *            set to zero before returning.
 *   stride - offset, in bytes, from one pixel row of dst to the next.
 *
 * Steps: run the 1D butterfly along every row, then along every column
 * of that result, scale each value with (x + 32) >> 6, add it to the
 * existing pixel and saturate to 0..255.
 */
void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    enum { N = 8 };

    int horiz[N][N];    /* after the row pass, indexed [row][col]    */
    int resid[N][N];    /* after the column pass, indexed [row][col] */
    int in[N];
    int out[N];
    int y, x, k;

    /* Row pass: gather row y across the column-major block and
     * transform it straight into horiz[y]. */
    for (y = 0; y < N; y++) {
        for (k = 0; k < N; k++) {
            in[k] = block[k * N + y];
        }
        h264_idct8_butterfly(in, horiz[y]);
    }

    /* Column pass: transform column x of the row-pass result and
     * scatter it back into column x of resid. */
    for (x = 0; x < N; x++) {
        for (k = 0; k < N; k++) {
            in[k] = horiz[k][x];
        }
        h264_idct8_butterfly(in, out);
        for (k = 0; k < N; k++) {
            resid[k][x] = out[k];
        }
    }

    /* Reconstruction, row by row: round-to-nearest scale by 1/64,
     * add to the current pixel, saturate to 8 bits. */
    for (y = 0; y < N; y++) {
        uint8_t *line = dst + y * stride;

        for (x = 0; x < N; x++) {
            const int delta = (resid[y][x] + 32) >> 6;
            line[x] = (uint8_t) clip_u8(line[x] + delta);
        }
    }

    /* Leave the coefficient block cleared (FFmpeg convention). */
    memset(block, 0, N * N * sizeof(block[0]));
}