/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API implementation.
 *
 * Scaffold only.  Most functions return success with no GPU work
 * performed; the bodies will fill in across Phases 1-4 per DESIGN.md
 * §8.  This file exists so the API surface compiles, links, and can
 * be smoke-tested end-to-end (ctx create / append / flush / destroy)
 * before any shader work begins.
 */

#include "internal.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>

/* Normally supplied via -D from CMakeLists; the value below is only the
 * fallback used when the macro was not defined on the command line. */
#if !defined(DAEDALUS_DECODER_VERSION)
#  define DAEDALUS_DECODER_VERSION "0.0.1+scaffold"
#endif

/* Report the library version string selected at build time. */
const char *daedalus_decoder_version(void)
{
    const char *version = DAEDALUS_DECODER_VERSION;

    return version;
}

/*
 * Allocate a decoder for frames of `width` x `height` pixels.
 *
 * Both dimensions must be positive multiples of 16 (whole macroblocks).
 * The new decoder defaults to NV12 output and AUTO substrate selection,
 * and owns a daedalus-fourier context plus per-frame macroblock
 * descriptor and coefficient storage (384 int16 per macroblock).
 *
 * Returns NULL when the dimensions are invalid, when the macroblock
 * count would not fit in an int, when the daedalus-fourier context
 * cannot be created, or when an allocation fails.  Release the result
 * with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height)
{
    if (width <= 0 || height <= 0)
        return NULL;
    if ((width & 15) || (height & 15))
        return NULL;  /* must be multiple of 16 */

    const int mb_width  = width >> 4;
    const int mb_height = height >> 4;

    /* n_mbs is computed as an int product; signed overflow is undefined
     * behaviour, so refuse sizes that cannot be represented.  Both
     * factors are >= 1 here, which makes the division safe. */
    if (mb_width > INT_MAX / mb_height)
        return NULL;

    daedalus_decoder *dec = calloc(1, sizeof(*dec));
    if (!dec)
        return NULL;

    dec->width      = width;
    dec->height     = height;
    dec->mb_width   = mb_width;
    dec->mb_height  = mb_height;
    dec->n_mbs      = mb_width * mb_height;
    dec->output_fmt = DAEDALUS_DECODER_OUTPUT_NV12;
    dec->substrate  = DAEDALUS_DECODER_SUBSTRATE_AUTO;

    /* daedalus-fourier ctx — required.  Phase 1 needs the QPU; if
     * Vulkan init fails the decoder is unusable.  Caller can check
     * via daedalus_decoder_has_qpu(). */
    dec->dctx = daedalus_ctx_create();
    if (!dec->dctx) {
        free(dec);
        return NULL;
    }

    /* Element count and element size are handed to calloc separately so
     * that calloc itself can reject a byte count that does not fit in
     * size_t, rather than receiving an already-wrapped product. */
    dec->mb_descs = calloc((size_t) dec->n_mbs, sizeof(*dec->mb_descs));
    dec->coeffs   = calloc((size_t) dec->n_mbs, 384 * sizeof(int16_t));
    if (!dec->mb_descs || !dec->coeffs) {
        /* destroy() tolerates the NULL member and releases the rest. */
        daedalus_decoder_destroy(dec);
        return NULL;
    }

    return dec;
}

/*
 * Release a decoder and everything it owns.  NULL is accepted and
 * ignored.  The coefficient and descriptor arrays are freed, the
 * daedalus-fourier context is destroyed if one is attached, and
 * finally the decoder object itself is freed.
 */
void daedalus_decoder_destroy(daedalus_decoder *dec)
{
    if (dec != NULL) {
        free(dec->coeffs);
        free(dec->mb_descs);

        if (dec->dctx) {
            daedalus_ctx_destroy(dec->dctx);
        }

        free(dec);
    }
}

/*
 * Select the output pixel format for subsequent frames.
 *
 * Only NV12 and RGBA are accepted, and the format may only be changed
 * between frames, i.e. while no macroblocks have been appended.
 * Returns 0 on success, -1 if `dec` is NULL, a frame is in progress,
 * or `fmt` is not a recognised format.
 */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                        daedalus_decoder_output_format fmt)
{
    /* mid-frame change forbidden */
    if (!dec || dec->mbs_appended != 0)
        return -1;

    const int recognised = (fmt == DAEDALUS_DECODER_OUTPUT_NV12) ||
                           (fmt == DAEDALUS_DECODER_OUTPUT_RGBA);
    if (!recognised)
        return -1;

    dec->output_fmt = fmt;
    return 0;
}

/*
 * Select the compute substrate (AUTO, CPU or QPU) for subsequent frames.
 *
 * As with the output format, the setting can only be changed while no
 * macroblocks have been appended.  Returns 0 on success, -1 if `dec`
 * is NULL, a frame is in progress, or `sub` is not a known substrate.
 */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
                                    daedalus_decoder_substrate sub)
{
    if (!dec || dec->mbs_appended != 0)
        return -1;

    switch (sub) {
    case DAEDALUS_DECODER_SUBSTRATE_AUTO:
    case DAEDALUS_DECODER_SUBSTRATE_CPU:
    case DAEDALUS_DECODER_SUBSTRATE_QPU:
        dec->substrate = sub;
        return 0;
    default:
        return -1;
    }
}

/* Translate the decoder's public substrate enum into daedalus-fourier's.
 * The two enums are kept separate for ABI isolation (same ordering by
 * intent), so the translation is spelled out value by value.  AUTO and
 * any unrecognised value map to DAEDALUS_SUBSTRATE_AUTO. */
static daedalus_substrate map_substrate(daedalus_decoder_substrate s)
{
    if (s == DAEDALUS_DECODER_SUBSTRATE_CPU)
        return DAEDALUS_SUBSTRATE_CPU;

    if (s == DAEDALUS_DECODER_SUBSTRATE_QPU)
        return DAEDALUS_SUBSTRATE_QPU;

    return DAEDALUS_SUBSTRATE_AUTO;
}

/*
 * Queue one macroblock for the frame currently being assembled.
 *
 * The macroblock's descriptor fields and its 384 int16 coefficients are
 * copied into the decoder's per-frame storage, so the caller's `mb`
 * and `mb->coeffs` need not outlive the call.
 *
 * Macroblocks must arrive in raster order: the slot computed from
 * (mb_x, mb_y) has to equal the number of macroblocks appended so far.
 *
 * Returns 0 on success, -1 if an argument is NULL, the coordinates are
 * outside the frame, or the macroblock is out of order.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                                const struct daedalus_decoder_mb_input *mb)
{
    if (!dec)
        return -1;
    if (!mb || !mb->coeffs)
        return -1;
    if (mb->mb_x >= dec->mb_width)
        return -1;
    if (mb->mb_y >= dec->mb_height)
        return -1;

    /* Raster-order check — Phase 1's intra wavefront requires it.
     * Caller is libavcodec's slice loop which produces raster order
     * naturally, so this should never fire in practice. */
    const int slot = mb->mb_y * dec->mb_width + mb->mb_x;
    if (slot != dec->mbs_appended)
        return -1;

    struct daedalus_decoder_mb_desc *desc = &dec->mb_descs[slot];

    /* Position, type and quantiser. */
    desc->mb_x     = mb->mb_x;
    desc->mb_y     = mb->mb_y;
    desc->mb_type  = mb->mb_type;
    desc->mb_qp_y  = mb->mb_qp_y;
    desc->mb_qp_uv = mb->mb_qp_uv;
    desc->cbp      = mb->cbp;

    /* Intra prediction modes. */
    memcpy(desc->intra_4x4_modes, mb->intra_4x4_modes, 16);
    desc->intra_16x16_mode  = mb->intra_16x16_mode;
    desc->intra_chroma_mode = mb->intra_chroma_mode;

    /* Inter partitioning, reference indices and motion vectors. */
    desc->partition_mode = mb->partition_mode;
    memcpy(desc->ref_idx_l0, mb->ref_idx_l0, 4);
    memcpy(desc->ref_idx_l1, mb->ref_idx_l1, 4);
    memcpy(desc->mv_l0, mb->mv_l0, sizeof(desc->mv_l0));
    memcpy(desc->mv_l1, mb->mv_l1, sizeof(desc->mv_l1));

    /* Deblocking parameters and transform size flag. */
    desc->deblock_disable  = mb->deblock_disable;
    desc->deblock_alpha_c0 = mb->deblock_alpha_c0;
    desc->deblock_beta     = mb->deblock_beta;
    desc->transform_8x8    = mb->transform_8x8;

    /* Each macroblock owns a fixed 384-coefficient slice of dec->coeffs. */
    memcpy(dec->coeffs + (size_t) slot * 384,
           mb->coeffs,
           384 * sizeof(int16_t));

    dec->mbs_appended += 1;
    return 0;
}

/* Phase 1 stage 1 — frame-scaled IDCT dispatch (luma 4x4 / 8x8 + chroma 4x4).
 *
 * Brings up the GPU substrate by calling daedalus-fourier's
 * `daedalus_dispatch_h264_idct4` / `daedalus_dispatch_h264_idct8` at
 * frame batch granularity, in contrast to the substitution-arc shim
 * that called the IDCT with n_blocks = 1 per call.  At most three
 * dispatch calls per frame (luma 4x4, luma 8x8, chroma; an empty luma
 * partition is skipped) instead of millions of per-block dispatches.
 *
 * Parameters:
 *   dec       — decoder holding a complete frame (every MB appended).
 *   out_y     — required luma destination: `height` rows of `width`
 *               bytes, consecutive rows `y_stride` bytes apart.
 *   y_stride  — luma row pitch in bytes; must be >= width.
 *   out_uv    — optional NV12 chroma destination: `height/2` rows of
 *               `width` bytes (Cb,Cr interleaved), rows `uv_stride`
 *               bytes apart.  NULL skips chroma entirely.
 *   uv_stride — chroma row pitch in bytes; must be >= width when
 *               out_uv is non-NULL, ignored otherwise.
 *
 * Returns:
 *    0  on success.
 *   -1  if dec or out_y is NULL, the frame is incomplete, a stride is
 *       smaller than a row, or a scratch allocation fails.
 *   -3  if a daedalus-fourier dispatch reports failure.
 *
 * Rejections by the up-front argument checks leave the appended frame
 * untouched so the call can be retried.  Once processing has started
 * the append counter is reset to 0 whether or not the frame succeeded.
 * If the chroma stage fails, out_y has already been written.
 *
 * What's done in this stage:
 *   - Luma: MBs are partitioned by their transform_8x8 flag.  4x4 MBs
 *     contribute 16 blocks of 16 coeffs, 8x8 MBs contribute 4 blocks of
 *     64 coeffs, both taken from the MB's first 256 int16; each
 *     non-empty partition is dispatched into a frame-sized
 *     zero-initialised Y scratch plane.
 *   - Chroma: build an interleaved Cb+Cr meta[] (n_blocks = N_MBs × 8,
 *     4 Cb + 4 Cr per MB); flat-pack coeffs from each MB's next 128
 *     int16 (64 Cb + 64 Cr); dispatch into a planar Cb||Cr scratch
 *     buffer (W*H/4 each, concatenated W*H/2 total); CPU-interleave
 *     into the caller's NV12 UV plane post-dispatch.
 *   - All dispatches use predicted=0 (the scratch buffers are
 *     calloc'd); the shader does clip255(predicted + idct(coeffs)).
 *
 * What's NOT done yet (follow-on Phase 1 sub-PRs):
 *   - Intra prediction (Stage 2a wavefront): predicted is forced to 0,
 *     so output pixels are residual-only and not a valid frame decode.
 *     Sufficient for Vulkan round-trip validation, not for bit-exact
 *     against FFmpeg.
 *   - Motion compensation (Stage 2b): inter MBs not handled.
 *   - Chroma DC / luma Intra16x16 DC Hadamard pre-pass (currently we
 *     treat all chroma blocks as plain 4×4 AC IDCT; real decode needs
 *     the chroma DC 2×2 Hadamard contribution folded in).
 *   - Deblock (Stage 4).
 *   - dmabuf export — still memcpy-out to caller-provided planes.
 *   - Stage 5 RGBA opt-in.
 *   - GPU-side NV12 interleave — currently a CPU copy loop after
 *     the chroma dispatch.  Trivial cost (~1 MB / frame at 1080p)
 *     vs the IDCT itself, but worth folding into a Stage-5 pass
 *     later for full-GPU residency.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride)
{
    if (!dec)
        return -1;
    if (dec->mbs_appended != dec->n_mbs)
        return -1;  /* incomplete frame */
    if (!out_y)
        return -1;

    /* Each destination row receives `width` bytes (an NV12 UV row is
     * width/2 Cb,Cr pairs).  A smaller pitch would make consecutive row
     * copies overlap and scramble the caller's planes. */
    if (y_stride < (size_t) dec->width)
        return -1;
    if (out_uv && uv_stride < (size_t) dec->width)
        return -1;

    int rc = 0;
    const daedalus_substrate sub = map_substrate(dec->substrate);

    /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */

    /* Two partitions of the per-MB luma section based on each MB's
     * transform_8x8 flag:
     *
     *   transform_8x8 == 0  →  16 4x4 blocks contribute to the 4x4
     *                          dispatch (16 coeffs each).
     *   transform_8x8 == 1  →  4 8x8 blocks contribute to the 8x8
     *                          dispatch (64 coeffs each).
     *
     * Both partitions can be non-empty in the same frame (FFmpeg sets
     * transform_8x8_size_flag per MB), so we allocate worst-case for
     * each and track actual counts.
     */
    const size_t y_stride_int = (size_t) dec->width;
    const size_t y_size = y_stride_int * (size_t) dec->height;
    const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
    const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
    size_t bi4 = 0, bi8 = 0;

    /* Chroma scratch is only allocated when out_uv is supplied; the
     * pointers live at function scope so one cleanup path frees
     * everything (free(NULL) is a no-op). */
    int16_t                  *chroma_coeffs = NULL;
    daedalus_h264_block_meta *chroma_meta   = NULL;
    uint8_t                  *scratch_uv    = NULL;

    uint8_t                  *scratch_y = calloc(1, y_size);
    int16_t                  *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
    int16_t                  *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
    daedalus_h264_block_meta *meta4   = malloc(worst_4x4 * sizeof(*meta4));
    daedalus_h264_block_meta *meta8   = malloc(worst_8x8 * sizeof(*meta8));

    if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
        rc = -1;
        goto cleanup;
    }

    /* Walk MBs in raster order, append each MB's luma blocks to the
     * partition selected by its transform_8x8 flag.
     *
     * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
     * follows the z-scan from spec §6.4.3 / fig 6-10.  We're using
     * flat raster on the input side too (sb_y outer, sb_x inner) for
     * Phase 1 self-consistency; the z-scan permutation is the
     * libavcodec-intercept patch's responsibility.
     */
    for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
            int mb_idx = mb_y * dec->mb_width + mb_x;
            const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
            const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];

            if (d->transform_8x8) {
                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
                for (int sb_y = 0; sb_y < 2; sb_y++) {
                    for (int sb_x = 0; sb_x < 2; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
                        meta8[bi8].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 2 + sb_x;
                        memcpy(&coeffs8[bi8 * 64],
                               &mb_coeffs[block_in_mb * 64],
                               64 * sizeof(int16_t));
                        bi8++;
                    }
                }
            } else {
                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
                for (int sb_y = 0; sb_y < 4; sb_y++) {
                    for (int sb_x = 0; sb_x < 4; sb_x++) {
                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
                        meta4[bi4].dst_off = (uint32_t)
                            (px_y * y_stride_int + px_x);
                        int block_in_mb = sb_y * 4 + sb_x;
                        memcpy(&coeffs4[bi4 * 16],
                               &mb_coeffs[block_in_mb * 16],
                               16 * sizeof(int16_t));
                        bi4++;
                    }
                }
            }
        }
    }
    /* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */

    /* ---- One Vulkan submit + wait per non-empty luma partition.
     * AUTO substrate picks QPU per the post-decree recipe table; falls
     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
     * Skipping the dispatch when the partition is empty avoids the
     * shader-pool warm-up cost on the common case (a typical Baseline
     * stream is all-4x4 → 8x8 dispatch is no-op). */
    if (bi4 > 0) {
        int dr = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                               scratch_y, y_stride_int,
                                               coeffs4, bi4, meta4);
        if (dr != 0) { rc = -3; goto cleanup; }
    }
    if (bi8 > 0) {
        int dr = daedalus_dispatch_h264_idct8(dec->dctx, sub,
                                               scratch_y, y_stride_int,
                                               coeffs8, bi8, meta8);
        if (dr != 0) { rc = -3; goto cleanup; }
    }

    /* ---- Copy Y out to caller's plane at the requested stride. ---- */
    for (int r = 0; r < dec->height; r++)
        memcpy(out_y + (size_t) r * y_stride,
               &scratch_y[(size_t) r * y_stride_int],
               (size_t) dec->width);

    /* ---- Build frame-scaled chroma 4×4 dispatch ---- */
    /*
     * 4:2:0 layout — chroma planes are (W/2) by (H/2), one Cb + one
     * Cr per pixel pair.  H.264 per-MB chroma is two 8×8 components,
     * each split into 4 4×4 blocks, so 8 chroma 4×4 blocks per MB.
     *
     * We dispatch BOTH components in a single shader call against a
     * planar scratch buffer:
     *     scratch_uv[0 .. cb_plane_size)        — Cb plane (W/2 × H/2)
     *     scratch_uv[cb_plane_size .. 2*size)   — Cr plane (W/2 × H/2)
     *
     * meta[i].dst_off is a flat offset into the scratch buffer (the
     * shader treats dst+dst_off as a contiguous 4×4 with row pitch =
     * stride), so Cr blocks just add cb_plane_size to their offset.
     * Stride is W/2 (the chroma row width); this works because Cb and
     * Cr planes share the same row pitch.
     *
     * Post-dispatch we interleave the two planes into NV12 UV layout
     * on the CPU.  Doing this on the GPU is a Stage-5 follow-up
     * (would need a small "copy + interleave" shader); the CPU copy
     * loop is ~1 MB/frame at 1080p so it's not on the critical path.
     */
    if (out_uv) {
        const size_t n_chroma_blocks_per_mb = 8;  /* 4 Cb + 4 Cr */
        const size_t n_chroma_blocks =
            (size_t) dec->n_mbs * n_chroma_blocks_per_mb;
        const size_t chroma_w = (size_t) dec->width  / 2;
        const size_t chroma_h = (size_t) dec->height / 2;
        const size_t cb_plane_size = chroma_w * chroma_h;
        const size_t uv_scratch_size = 2 * cb_plane_size;
        size_t cbi = 0;

        scratch_uv    = calloc(1, uv_scratch_size);
        chroma_coeffs = malloc(n_chroma_blocks * 16 * sizeof(int16_t));
        chroma_meta   = malloc(n_chroma_blocks * sizeof(*chroma_meta));
        if (!scratch_uv || !chroma_coeffs || !chroma_meta) {
            rc = -1;
            goto cleanup;
        }

        for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
            for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
                int mb_idx = mb_y * dec->mb_width + mb_x;
                const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
                /* Per-MB coeff layout (set by append_mb):
                 *   [  0 .. 256) — 16 luma 4×4 blocks
                 *   [256 .. 320) — 4 Cb 4×4 blocks (raster sb_y*2+sb_x)
                 *   [320 .. 384) — 4 Cr 4×4 blocks (raster sb_y*2+sb_x)
                 */
                for (int comp = 0; comp < 2; comp++) {           /* 0=Cb 1=Cr */
                    size_t plane_base = (size_t) comp * cb_plane_size;
                    size_t coeff_base = 256u + (size_t) comp * 64u;
                    for (int sb_y = 0; sb_y < 2; sb_y++) {
                        for (int sb_x = 0; sb_x < 2; sb_x++) {
                            size_t px_y = (size_t) mb_y * 8 + (size_t) sb_y * 4;
                            size_t px_x = (size_t) mb_x * 8 + (size_t) sb_x * 4;
                            chroma_meta[cbi].dst_off = (uint32_t)
                                (plane_base + px_y * chroma_w + px_x);

                            int block_in_comp = sb_y * 2 + sb_x;
                            memcpy(&chroma_coeffs[cbi * 16],
                                   &mb_coeffs[coeff_base + (size_t) block_in_comp * 16],
                                   16 * sizeof(int16_t));
                            cbi++;
                        }
                    }
                }
            }
        }
        /* assert cbi == n_chroma_blocks; loop math guarantees it */

        int cr_rc = daedalus_dispatch_h264_idct4(dec->dctx, sub,
                                                  scratch_uv, chroma_w,
                                                  chroma_coeffs,
                                                  n_chroma_blocks,
                                                  chroma_meta);
        if (cr_rc != 0) {
            rc = -3;
            goto cleanup;
        }

        /* CPU NV12 interleave: out_uv[r][2c+0] = Cb[r][c], [2c+1] = Cr. */
        const uint8_t *cb_plane = scratch_uv;
        const uint8_t *cr_plane = scratch_uv + cb_plane_size;
        for (size_t r = 0; r < chroma_h; r++) {
            uint8_t *dst_row = out_uv + r * uv_stride;
            const uint8_t *cb_row = cb_plane + r * chroma_w;
            const uint8_t *cr_row = cr_plane + r * chroma_w;
            for (size_t c = 0; c < chroma_w; c++) {
                dst_row[c * 2 + 0] = cb_row[c];
                dst_row[c * 2 + 1] = cr_row[c];
            }
        }
    }

cleanup:
    /* Single exit for every path past the argument checks: release all
     * scratch buffers and start the next frame from an empty MB list. */
    free(chroma_meta);
    free(chroma_coeffs);
    free(scratch_uv);
    free(meta8);
    free(meta4);
    free(coeffs8);
    free(coeffs4);
    free(scratch_y);
    dec->mbs_appended = 0;
    return rc;
}

/*
 * Placeholder for dmabuf export.  Not implemented yet: both arguments
 * are ignored and the call always fails with -1.
 */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane)
{
    /* TODO Phase 1: vkGetMemoryFdKHR on the DPB slot's VkImage memory. */
    (void) plane;
    (void) dec;

    return -1;
}

/*
 * Report whether the decoder's daedalus-fourier context has a QPU.
 * Returns 0 when `dec` is NULL or has no context attached; otherwise
 * forwards the result of daedalus_ctx_has_qpu().
 */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec)
{
    const int has_ctx = dec && dec->dctx;

    return has_ctx ? daedalus_ctx_has_qpu(dec->dctx) : 0;
}