/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * daedalus-decoder — public C API.
 *
 * Frame-level GPU H.264 decoder targeting V3D7 (Raspberry Pi 5).  Built
 * on daedalus-fourier's V3D compute primitives at frame granularity —
 * one Vulkan submit per frame, one fence wait per frame, encoded
 * bitstream in (via libavcodec's per-MB intercept), NV12 frame out.
 *
 * Per the 2026-05-24 Phase 1 design decisions:
 *   - libavcodec intercept is at macroblock-level (substitution-arc
 *     evolution): the caller is expected to drive the per-MB CABAC /
 *     CAVLC entropy decode and feed each macroblock's descriptor +
 *     coefficients via daedalus_decoder_append_mb().  flush_frame()
 *     builds the per-frame VkCommandBuffer and submits.
 *   - DPB is Vulkan-native VkImage with VK_KHR_external_memory_dma_buf
 *     export.  The caller can obtain the output frame's dmabuf fd
 *     via daedalus_decoder_export_dmabuf().
 *   - Daemon integration shape: this library is statically linked into
 *     daedalus_v4l2_daemon.  No IPC.
 *
 * STATUS: scaffold.  No GPU pipeline implemented yet; all functions
 * are stubs that compile but do not decode anything.  See DESIGN.md
 * for the architecture.
 *
 * ABI: pre-0.1 — every signature here may change.  Don't rely on
 * stability yet.
 */
#ifndef DAEDALUS_DECODER_H
#define DAEDALUS_DECODER_H

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* -------------------------------------------------------------------
 * Opaque decoder context.  One per concurrent stream.
 *
 * The struct is only forward-declared in this header; callers hold a
 * pointer obtained from daedalus_decoder_create() and hand it back to
 * daedalus_decoder_destroy() when finished with it.
 * ----------------------------------------------------------------- */
typedef struct daedalus_decoder daedalus_decoder;

/* -------------------------------------------------------------------
 * Per-edge deblock metadata.  One entry per filter-edge; the caller
 * derives these from H.264 §8.7.2.1 boundary-strength rules.
 *
 * Coordinate convention:
 *   mb_x / mb_y  — the MB whose top-left this edge sits on (the "right"
 *                  side for vertical edges, "bottom" side for horizontal
 *                  edges, in H.264 spec's q-side convention).
 *   edge_idx     — 0..3 within the MB:
 *                    luma:   edge 0 = MB boundary, edges 1..3 = internal
 *                            at cols/rows 4, 8, 12.
 *                    chroma: edge 0 = MB boundary, edge 1 = internal at
 *                            col/row 4.  edge_idx > 1 invalid for chroma.
 *                  Edges at frame boundaries (top row of MBs for H edges;
 *                  left column for V edges) MUST be bS=0 — the kernel
 *                  reads p3 at four samples beyond the edge.
 *   orient       — 0 = vertical edge (filtered horizontally across), 1 = horizontal.
 *   plane        — 0 = luma, 1 = chroma Cb, 2 = chroma Cr.  Cb and Cr
 *                  always share the same filter parameters per H.264
 *                  spec, but are listed separately so the caller can
 *                  omit one or the other if needed.
 *   bS           — 0 = skip this edge (no GPU work), 1..3 = bS<4 path
 *                  (uses tc0), 4 = bS=4 "intra" path (ignores tc0).
 *   alpha, beta  — H.264 §8.7.2.2 threshold values (alpha' / beta' of
 *                  Table 8-16), both 0..255.
 *   tc0[4]       — per-4-cell segment strength along the edge (values
 *                  come from H.264 Table 8-17; luma has 4 segments,
 *                  chroma has 4 also, with 2 cells each).
 *                  IGNORED when bS == 4.
 * ----------------------------------------------------------------- */
struct daedalus_decoder_edge {
    uint16_t mb_x;       /* q-side MB column (macroblock units) */
    uint16_t mb_y;       /* q-side MB row (macroblock units) */
    uint8_t  edge_idx;   /* 0 = MB boundary; luma 1..3 / chroma 1 = internal edge */
    uint8_t  orient;     /* 0 = vertical edge, 1 = horizontal edge */
    uint8_t  plane;      /* 0 = luma, 1 = chroma Cb, 2 = chroma Cr */
    uint8_t  bS;         /* 0 = skip, 1..3 = bS<4 path (uses tc0), 4 = intra path */
    uint8_t  alpha;      /* filter threshold, 0..255 */
    uint8_t  beta;       /* filter threshold, 0..255 */
    int8_t   tc0[4];     /* one value per segment along the edge; ignored when bS == 4 */
};

/* -------------------------------------------------------------------
 * Per-macroblock input.  Mirrors §3 of DESIGN.md.  The caller's
 * libavcodec intercept populates this from the H264SliceContext
 * fields after ff_h264_decode_mb_cabac/cavlc returns and before
 * ff_h264_hl_decode_mb is supposed to run (we replace the latter).
 * ----------------------------------------------------------------- */
struct daedalus_decoder_mb_input {
    /* Frame coordinates (macroblock units). */
    uint16_t mb_x;
    uint16_t mb_y;

    /* Type + quantisation.
     * NOTE(review): the mb_type table list below omits the I-slice
     * table (7-11) even though intra types are referenced further
     * down — confirm which enum values the caller is expected to pass. */
    uint8_t  mb_type;            /* H.264 spec table 7-13/7-14/7-17/7-18 enum */
    uint8_t  mb_qp_y;            /* luma QP for this MB */
    uint8_t  mb_qp_uv;           /* chroma QP for this MB */
    uint8_t  cbp;                /* coded block pattern, 0..47 */

    /* Intra prediction (used iff mb_type == I_NxN or I_16x16). */
    uint8_t  intra_4x4_modes[16];
    uint8_t  intra_16x16_mode;
    uint8_t  intra_chroma_mode;

    /* Inter motion / partitions (used iff P_* or B_*). */
    uint8_t  partition_mode;     /* P_16x16 / P_16x8 / P_8x16 / P_8x8 / etc. */
    int8_t   ref_idx_l0[4];      /* per partition; -1 = not used */
    int8_t   ref_idx_l1[4];      /* B only */
    int16_t  mv_l0[4][2];        /* qpel precision (1/4 sample); (x, y) */
    int16_t  mv_l1[4][2];        /* presumably B only, same layout as mv_l0 — confirm */

    /* Deblocking filter parameters.
     * NOTE(review): deblock_alpha_c0 / deblock_beta look like the
     * signed slice-level filter offsets — confirm units and range. */
    uint8_t  deblock_disable;    /* 0 = enabled */
    int8_t   deblock_alpha_c0;
    int8_t   deblock_beta;

    /* High-profile 8x8 transform selector.
     *   0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
     *       (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
     *       always 4x4.
     *   1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
     *       each, raster sb_y*2+sb_x).  Set per H.264's
     *       transform_size_8x8_flag.  Chroma remains 4x4 (4:2:0).
     */
    uint8_t  transform_8x8;

    /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
     * column-major within each 4x4 or 8x8 block (matches FFmpeg
     * convention).  Caller-owned; copied during append. */
    const int16_t *coeffs;       /* points at exactly 384 int16_t */

    /* Reconstructed predicted samples for this MB, planar order:
     *   [  0 .. 256) — 16×16 luma, ROW-MAJOR raster (row 0 cols 0..15,
     *                  row 1 cols 0..15, ..., row 15 cols 0..15)
     *   [256 .. 320) — 8×8 Cb, ROW-MAJOR raster
     *   [320 .. 384) — 8×8 Cr, ROW-MAJOR raster
     *
     * The caller (libavcodec's CPU intra-prediction kernels for Phase 1
     * I-frames; MC fallback for Phase 2 P-frames before GPU MC lands)
     * populates this from neighbour samples per H.264 §8.3 / §8.4.
     * `flush_frame()`'s reconstruction step is `clip255(predicted +
     * idct(coeffs))` — the IDCT shader reads dst, adds the inverse
     * transform, writes clipped — so a non-zero `predicted` here makes
     * the output pixel a valid H.264 reconstruction; zero means
     * residual-only (used by IDCT-isolation tests).
     *
     * NULL is legal and means "all-zero predicted samples" for this MB
     * (the per-frame predicted buffer is zeroed at flush time so a NULL
     * is indistinguishable from explicit zeros). */
    const uint8_t *predicted;    /* NULL or exactly 384 uint8_t */

    /* Per-MB deblock edges — caller-derived per H.264 §8.7.2.  Typical
     * count: 4 V-luma + 4 H-luma + 2 V-Cb + 2 H-Cb + 2 V-Cr + 2 H-Cr
     * = 16 edges per MB (omit zero-bS edges if preferred — frame
     * boundaries MUST be bS=0 since the kernels read p3 at four
     * samples beyond the edge).  daedalus_decoder routes each entry
     * to the appropriate luma/chroma × V/H × bS=4/<4 dispatch in
     * flush_frame and pays a single Vulkan submit per non-empty
     * (luma/chroma × direction × bS-band) partition (≤8 deblock
     * submits / frame total) per the Q1 architecture decision
     * (one-submit-per-kernel for now; cmdbuf-builder deferred to
     * Stage 4).
     *
     * NULL or n_edges == 0 → no deblock on this MB. */
    const struct daedalus_decoder_edge *edges;
    /* Number of entries in edges[]; the uint8_t type caps it at 255. */
    uint8_t                              n_edges;
};

/* -------------------------------------------------------------------
 * Output frame format selector.
 *
 * Chosen with daedalus_decoder_set_output_format(); decides which of
 * the caller-provided buffers daedalus_decoder_flush_frame() fills.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_OUTPUT_NV12 = 0,   /* default; Stage 4 final.  Y plane + interleaved UV plane */
    DAEDALUS_DECODER_OUTPUT_RGBA = 1,   /* Stage 5 opt-in.  Single plane written through out_y */
} daedalus_decoder_output_format;

/* -------------------------------------------------------------------
 * Substrate selector.  Determines which backend daedalus-fourier
 * dispatches the per-frame compute through.
 *
 * AUTO is the only sensible choice for production — it picks per the
 * recipe table baked into daedalus-fourier (post 2026-05-23 decree:
 * QPU when a V3D shader exists, CPU NEON otherwise).  The explicit
 * options exist for testing:
 *
 *   - CPU forces the dispatch onto the NEON path even when V3D7 is
 *     available.  Lets the bit-exact ctests run on hosts without a
 *     working Vulkan/V3D stack (CI runners, dev x86 boxes via
 *     cross-build), and lets us cross-check the V3D shader output
 *     against the NEON reference path on hosts that DO have V3D.
 *   - QPU is the dual — force QPU even on a CPU-preferred kernel.
 *     Useful for benchmarking specific QPU paths in isolation.
 *
 * A non-AUTO selection on a host that can't satisfy it
 * (DAEDALUS_DECODER_SUBSTRATE_QPU on an x86 dev box) propagates a
 * dispatch failure back through flush_frame as -3.
 * ----------------------------------------------------------------- */
typedef enum {
    DAEDALUS_DECODER_SUBSTRATE_AUTO = 0,   /* default; backend chosen per the daedalus-fourier recipe table */
    DAEDALUS_DECODER_SUBSTRATE_CPU  = 1,   /* testing: force the NEON path even when V3D7 is available */
    DAEDALUS_DECODER_SUBSTRATE_QPU  = 2,   /* testing: force the QPU path even on a CPU-preferred kernel */
} daedalus_decoder_substrate;

/* -------------------------------------------------------------------
 * Lifecycle
 * ----------------------------------------------------------------- */

/* Create a decoder context for the given **coded** frame dimensions.
 *
 * width, height: pixels of the H.264 coded picture, NOT the displayed
 * picture.  Both must be multiples of 16 (macroblock granularity).
 * For displayed 1080p (1920×1080), the coded frame is 1920×1088 with
 * the SPS's `frame_cropping_*` offsets cropping the bottom 8 rows.
 * The caller is responsible for translating from SPS dims + crop
 * rectangle to the values passed here; we decode the coded frame.
 *
 * Returns NULL on bad dimensions or allocation failure.  Returns a
 * usable context with daedalus_decoder_has_qpu() == 0 when Vulkan
 * init fails — callers that need GPU work should check has_qpu
 * before relying on it.
 *
 * A non-NULL result is released with daedalus_decoder_destroy().
 */
daedalus_decoder *daedalus_decoder_create(int width, int height);

/* Free all resources held by a context returned from
 * daedalus_decoder_create().  Safe with NULL. */
void daedalus_decoder_destroy(daedalus_decoder *dec);

/* Switch output format BEFORE the first append_mb call of a frame.
 * Default is NV12.
 *
 * dec: decoder context.
 * fmt: one of the daedalus_decoder_output_format values.
 *
 * Returns 0 on success, -1 if called mid-frame (caller must flush
 * first). */
int daedalus_decoder_set_output_format(daedalus_decoder *dec,
                                        daedalus_decoder_output_format fmt);

/* Override the dispatch substrate for subsequent flush_frame calls.
 * Default is AUTO.  Same mid-frame-change restriction as
 * set_output_format.
 *
 * dec: decoder context.
 * sub: one of the daedalus_decoder_substrate values.
 *
 * NOTE(review): the return contract is not spelled out here;
 * presumably 0 on success and -1 when called mid-frame, mirroring
 * set_output_format — confirm against the implementation. */
int daedalus_decoder_set_substrate(daedalus_decoder *dec,
                                    daedalus_decoder_substrate sub);

/* -------------------------------------------------------------------
 * Per-frame submission
 * ----------------------------------------------------------------- */

/* Append one macroblock's data to the current frame's descriptor SSBO
 * + coefficient SSBO.  No GPU dispatch yet — just CPU-side writes.
 *
 * dec: decoder context.
 * mb:  caller-owned description of the macroblock; the coefficient
 *      data it points at is copied during the call.
 *
 * Must be called in raster order (mb_y * mb_width + mb_x) for the
 * intra-prediction wavefront to work correctly in Phase 1.
 *
 * Returns 0 on success, negative on bounds violation or OOM.
 */
int daedalus_decoder_append_mb(daedalus_decoder *dec,
                                const struct daedalus_decoder_mb_input *mb);

/* End-of-frame flush: builds the per-frame VkCommandBuffer with all
 * pipeline stages, submits once, waits on a single fence, copies the
 * NV12 (or RGBA when opted in) output into the caller-provided
 * planes.
 *
 * NOTE(review): the edges documentation in struct
 * daedalus_decoder_mb_input describes one submit per deblock kernel
 * partition "for now"; the single-submit wording above appears to be
 * the target design — confirm which applies today.
 *
 * For NV12:
 *   out_y / y_stride: Y plane (W*H bytes minimum, at the given stride)
 *   out_uv / uv_stride: interleaved UV plane (W*(H/2) bytes minimum)
 *
 * For RGBA: out_y receives 4*W*H bytes at y_stride; out_uv ignored.
 *
 * W and H are the coded dimensions the context was created with.
 * NOTE(review): the byte counts above hold for tightly packed rows;
 * with a stride larger than the row width the buffers presumably
 * need stride * rows bytes — confirm.
 *
 * Returns 0 on success, negative on Vulkan failure or undecodable
 * frame.  After return, the decoder is ready for the next frame's
 * append calls.
 */
int daedalus_decoder_flush_frame(daedalus_decoder *dec,
                                  uint8_t *out_y,  size_t y_stride,
                                  uint8_t *out_uv, size_t uv_stride);

/* Export the most-recently-decoded frame as a dma_buf fd.  The fd is
 * owned by the caller and must be closed when done.  Lets V4L2
 * consumers (daedalus_v4l2_daemon, libva-v4l2-request-fourier) attach
 * the GPU-decoded surface directly to a CAPTURE plane without a CPU
 * round-trip.
 *
 * dec:   decoder context.
 * plane: selects which plane of the frame to export.
 *        NOTE(review): the index convention is not documented here
 *        (presumably 0 = Y, 1 = UV for NV12) — confirm.
 *
 * Returns the dmabuf fd on success, -1 on failure.  Must be called
 * AFTER flush_frame returns for the relevant frame.
 */
int daedalus_decoder_export_dmabuf(daedalus_decoder *dec, int plane);

/* -------------------------------------------------------------------
 * Diagnostics
 * ----------------------------------------------------------------- */

/* daedalus-decoder build version (semver string, e.g. "0.0.1+g0a1b2c3"). */
const char *daedalus_decoder_version(void);

/* Whether the underlying daedalus-fourier context picked up a working
 * V3D7 Vulkan instance.  Returns 0 if Vulkan init failed and the
 * decoder is operating in stub / failure mode.  Callers that need GPU
 * work should check this after daedalus_decoder_create(), which still
 * returns a usable context when Vulkan init fails. */
int daedalus_decoder_has_qpu(const daedalus_decoder *dec);

#ifdef __cplusplus
}
#endif

#endif /* DAEDALUS_DECODER_H */