/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * test_idct_bitexact — phase1 stage1 bit-exact gate for the frame-
 * scaled luma IDCT 4×4 dispatch.
 *
 * Generates a frame of random coefficients, runs daedalus_decoder
 * (with predicted=0 by the scaffold's flush_frame contract), and
 * compares every output byte against an inline C reference that
 * mirrors the H.264 §8.5.12.1 1D butterfly.
 *
 * Why "bit-exact": the GPU shader and the C reference apply the same
 * integer arithmetic.  Any rounding / sign / overflow disagreement is
 * a bug.  Pass = every output byte matches.
 *
 * Scope match with flush_frame: the test mirrors flush_frame's
 * per-MB → flat block layout (raster scan within MB, no z-scan
 * permutation).  That keeps the test focused on IDCT correctness;
 * the z-scan permutation that bridges to libavcodec's per-MB coeffs
 * layout is a separate concern (handled in the eventual libavcodec-
 * intercept patch).
 *
 * Covers BOTH luma (Y plane, 16 blocks/MB) and chroma (UV plane,
 * 4 Cb + 4 Cr blocks/MB, NV12-interleaved).  Random coeffs for all
 * three components; reference IDCT applied per block.  The chroma
 * compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
 *
 * Not in scope (covered by other tests / future PRs):
 *   - IDCT 8×8 (Phase 1 follow-on)
 *   - Chroma DC / Intra16x16 DC Hadamard pre-pass
 *   - bit-exactness against real H.264 streams (test-vector PR)
 *   - non-zero predicted pixels (intra prediction lands in Stage 2a)
 */

#include "daedalus_decoder.h"

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Deterministic PRNG for coefficient generation: Marsaglia's plain
 * xorshift64 with the (13, 7, 17) shift triple.  Note this is NOT the
 * "xorshift64*" variant — there is no output multiply.
 *
 * xs64_state holds the generator state; the caller seeds it directly.
 * An all-zero state is a fixed point of the shift/xor steps (the
 * generator would emit 0 forever), so xs64() swaps in an arbitrary
 * non-zero constant when it finds the state at zero.  Every non-zero
 * seed yields exactly the same sequence as before.
 *
 * Returns the new state, which is also stored back in xs64_state. */
static uint64_t xs64_state;
static uint64_t xs64(void)
{
    uint64_t x = xs64_state;
    if (x == 0)
        x = 0x9e3779b97f4a7c15ULL;   /* escape the degenerate zero state */
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    xs64_state = x;
    return x;
}

/* Inline C reference — H.264 §8.5.12.1 1D butterfly, applied row pass
 * then column pass; +32 rounding, >>6, add to predicted (=0 here),
 * clip to u8.  Bit-exact-equivalent transcription of daedalus-fourier
 * tests/h264_idct4_ref.c (LGPL-2.1+ original; reproduced here under
 * fair-use for test purposes — same algorithm, no copy of code). */
/* Saturate v into the unsigned 8-bit range [0, 255]. */
static int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}

/* One 4-point inverse-transform butterfly.
 *
 *   d   : the four input samples.
 *   out : receives the four transformed samples.
 *
 * The even pair (d[0], d[2]) is combined by sum/difference; the odd
 * pair (d[1], d[3]) is combined with half-weight terms formed by
 * arithmetic right shift.  All four intermediates are computed before
 * anything is stored to out. */
static void h264_idct4_butterfly(const int d[4], int out[4])
{
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_minor = (d[1] >> 1) - d[3];
    const int odd_major = d[1] + (d[3] >> 1);

    out[0] = even_sum  + odd_major;
    out[1] = even_diff + odd_minor;
    out[2] = even_diff - odd_minor;
    out[3] = even_sum  - odd_major;
}

/* Reference 4×4 inverse transform + add.
 *
 *   dst    : top-left pixel of the 4×4 destination; its current
 *            contents are used as the prediction and are overwritten
 *            with the clipped sum.
 *   stride : distance in bytes between vertically adjacent dst rows.
 *   block  : 16 coefficients, COLUMN-MAJOR — block[col*4 + row] is the
 *            coefficient at (row, col).  Not modified.
 *
 * Runs the 1D butterfly along every row, then along every column of
 * that result, rounds each value with (+32) >> 6, adds it to the
 * destination pixel and saturates to 8 bits. */
static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
{
    int row_pass[4][4];
    int residual[4][4];
    int in[4];
    int res[4];

    /* Horizontal pass: for each row, gather its four coefficients out
     * of the column-major block and transform them in one go. */
    for (int row = 0; row < 4; row++) {
        for (int col = 0; col < 4; col++)
            in[col] = block[col * 4 + row];
        h264_idct4_butterfly(in, row_pass[row]);
    }

    /* Vertical pass: transform each column of the horizontal result. */
    for (int col = 0; col < 4; col++) {
        for (int row = 0; row < 4; row++)
            in[row] = row_pass[row][col];
        h264_idct4_butterfly(in, res);
        for (int row = 0; row < 4; row++)
            residual[row][col] = res[row];
    }

    /* Round, add to the prediction already in dst, saturate. */
    for (int row = 0; row < 4; row++) {
        uint8_t *px = dst + row * stride;
        for (int col = 0; col < 4; col++) {
            int rounded = (residual[row][col] + 32) >> 6;
            px[col] = (uint8_t) clip_u8(px[col] + rounded);
        }
    }
}

int main(int argc, char **argv)
{
    /* Smaller than 1080p to keep the test snappy; still N_MBs >= 64 so
     * the dispatch covers multiple workgroups (16 blocks/WG → ≥4 WGs). */
    int width  = argc > 1 ? atoi(argv[1]) : 320;
    int height = argc > 2 ? atoi(argv[2]) : 240;   /* 240 / 16 = 15 → coded 240 */
    /* Coded dims must be mod-16; 320×240 is canonical QVGA. */

    uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xfeedface5a5a5a5aULL;
    xs64_state = seed;

    int mb_w = width  / 16;
    int mb_h = height / 16;
    int n_mbs = mb_w * mb_h;
    printf("test_idct_bitexact: %dx%d (%d MBs), seed=0x%lx\n",
           width, height, n_mbs, (unsigned long) seed);

    daedalus_decoder *dec = daedalus_decoder_create(width, height);
    if (!dec) {
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        return 0;
    }

    /* Build the per-MB inputs.  Each MB gets 16 luma 4×4 blocks of
     * random coeffs in [-512, 511] — same range as the daedalus-fourier
     * cycle-6 M1 gate uses. */
    int16_t (*per_mb_coeffs)[384] = malloc((size_t) n_mbs * sizeof(*per_mb_coeffs));
    if (!per_mb_coeffs) { fprintf(stderr, "alloc fail\n"); return 1; }

    for (int mb = 0; mb < n_mbs; mb++) {
        for (int i = 0; i < 384; i++) {
            /* Random coeffs in [-512, 511] for all of luma + Cb + Cr.
             * Same range as the daedalus-fourier cycle-6 M1 gate. */
            per_mb_coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
        }
    }

    /* Append in raster order. */
    struct daedalus_decoder_mb_input mb = {0};
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int idx = my * mb_w + mx;
            mb.mb_x = (uint16_t) mx;
            mb.mb_y = (uint16_t) my;
            mb.coeffs = per_mb_coeffs[idx];
            if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                fprintf(stderr, "append (%d,%d) failed\n", mx, my);
                return 1;
            }
        }
    }

    /* Flush — exercise BOTH the luma path (out_y) and the chroma path
     * (out_uv set to non-NULL so flush_frame runs the chroma dispatch
     * + NV12 interleave). */
    size_t y_size  = (size_t) width * height;
    size_t uv_size = (size_t) width * height / 2;
    uint8_t *gpu_y  = calloc(1, y_size);
    uint8_t *gpu_uv = calloc(1, uv_size);
    if (!gpu_y || !gpu_uv) return 1;
    int frc = daedalus_decoder_flush_frame(dec, gpu_y, (size_t) width,
                                            gpu_uv, (size_t) width);
    if (frc != 0) {
        fprintf(stderr, "flush_frame rc=%d\n", frc);
        return 1;
    }

    /* Compute the reference output: same per-MB → flat raster block
     * layout as flush_frame uses. */
    uint8_t *ref_y = calloc(1, y_size);
    if (!ref_y) return 1;
    /* Need a destructively-mutable copy because the reference IDCT
     * doesn't actually mutate, but the GPU's IDCT shader does zero
     * the coeffs.  Our reference doesn't zero; that's fine because we
     * use a fresh copy per block. */
    int16_t block_scratch[16];
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;
            for (int sb_y = 0; sb_y < 4; sb_y++) {
                for (int sb_x = 0; sb_x < 4; sb_x++) {
                    int block_in_mb = sb_y * 4 + sb_x;
                    memcpy(block_scratch,
                           &per_mb_coeffs[mb_idx][block_in_mb * 16],
                           16 * sizeof(int16_t));
                    size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
                    size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
                    ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
                                  width, block_scratch);
                }
            }
        }
    }

    /* Build the chroma reference: separate planar Cb and Cr (W/2 by
     * H/2), each block IDCT'd into its plane.  Chroma per-MB layout
     * matches flush_frame: 4 Cb blocks then 4 Cr blocks, raster order
     * within each component (sb_y * 2 + sb_x). */
    size_t chroma_w = (size_t) width  / 2;
    size_t chroma_h = (size_t) height / 2;
    size_t chroma_plane_size = chroma_w * chroma_h;
    uint8_t *ref_cb = calloc(1, chroma_plane_size);
    uint8_t *ref_cr = calloc(1, chroma_plane_size);
    if (!ref_cb || !ref_cr) return 1;
    for (int my = 0; my < mb_h; my++) {
        for (int mx = 0; mx < mb_w; mx++) {
            int mb_idx = my * mb_w + mx;
            for (int comp = 0; comp < 2; comp++) {
                uint8_t *plane = (comp == 0) ? ref_cb : ref_cr;
                size_t coeff_base = 256u + (size_t) comp * 64u;
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        int block_in_comp = sb_y * 2 + sb_x;
                        memcpy(block_scratch,
                               &per_mb_coeffs[mb_idx][coeff_base +
                                                       (size_t) block_in_comp * 16],
                               16 * sizeof(int16_t));
                        size_t px_y = (size_t) my * 8 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mx * 8 + (size_t) sb_x * 4;
                        ref_idct4_add(&plane[px_y * chroma_w + px_x],
                                      (ptrdiff_t) chroma_w, block_scratch);
                    }
                }
            }
        }
    }

    /* Y compare. */
    size_t y_diffs = 0, y_first_diff = 0;
    for (size_t i = 0; i < y_size; i++) {
        if (gpu_y[i] != ref_y[i]) {
            if (y_diffs == 0) y_first_diff = i;
            y_diffs++;
        }
    }
    printf("Y bytes total:  %zu\n", y_size);
    printf("Y bytes diff:   %zu (%.4f%%)\n", y_diffs, 100.0 * y_diffs / y_size);
    if (y_diffs) {
        printf("Y first diff at offset %zu: gpu=%u ref=%u\n",
               y_first_diff, gpu_y[y_first_diff], ref_y[y_first_diff]);
    }

    /* UV compare — deinterleave NV12 back into Cb/Cr and compare. */
    size_t cb_diffs = 0, cr_diffs = 0;
    size_t cb_first = 0, cr_first = 0;
    for (size_t r = 0; r < chroma_h; r++) {
        const uint8_t *gpu_row = gpu_uv + r * (size_t) width;
        const uint8_t *cb_row  = ref_cb + r * chroma_w;
        const uint8_t *cr_row  = ref_cr + r * chroma_w;
        for (size_t c = 0; c < chroma_w; c++) {
            uint8_t gpu_cb = gpu_row[c * 2 + 0];
            uint8_t gpu_cr = gpu_row[c * 2 + 1];
            if (gpu_cb != cb_row[c]) {
                if (cb_diffs == 0) cb_first = r * chroma_w + c;
                cb_diffs++;
            }
            if (gpu_cr != cr_row[c]) {
                if (cr_diffs == 0) cr_first = r * chroma_w + c;
                cr_diffs++;
            }
        }
    }
    printf("Cb bytes total: %zu  diff: %zu (%.4f%%)\n",
           chroma_plane_size, cb_diffs,
           100.0 * cb_diffs / chroma_plane_size);
    printf("Cr bytes total: %zu  diff: %zu (%.4f%%)\n",
           chroma_plane_size, cr_diffs,
           100.0 * cr_diffs / chroma_plane_size);
    if (cb_diffs) {
        size_t r = cb_first / chroma_w, c = cb_first % chroma_w;
        printf("Cb first diff at (%zu,%zu): gpu=%u ref=%u\n",
               r, c, gpu_uv[r * (size_t) width + c * 2 + 0], ref_cb[cb_first]);
    }
    if (cr_diffs) {
        size_t r = cr_first / chroma_w, c = cr_first % chroma_w;
        printf("Cr first diff at (%zu,%zu): gpu=%u ref=%u\n",
               r, c, gpu_uv[r * (size_t) width + c * 2 + 1], ref_cr[cr_first]);
    }

    free(ref_cr);
    free(ref_cb);
    free(ref_y);
    free(gpu_uv);
    free(gpu_y);
    free(per_mb_coeffs);
    daedalus_decoder_destroy(dec);

    if (y_diffs == 0 && cb_diffs == 0 && cr_diffs == 0) {
        printf("BIT-EXACT PASS (Y + Cb + Cr)\n");
        return 0;
    }
    fprintf(stderr, "BIT-EXACT FAIL\n");
    return 1;
}