/* daedalus-fourier/include/daedalus.h */

/*
 * daedalus-fourier — public C API.
 *
 * Stable surface for the integration layer (Phase 8 V4L2 shim,
 * libva-v4l2-request-fourier consumer, or any future skin) to
 * dispatch per-kernel work to the right substrate per the
 * cycle 1-5 deployment recipe.
 *
 * Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md):
 *
 *   VP9 IDCT 8x8       → V3D QPU  (R=0.92 GREEN; M4 +7.2 %)
 *   VP9 LPF wd=4 inner → V3D QPU  (R=0.41 ORANGE; M4 +6.9 %)
 *   VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %)
 *   VP9 LPF wd=8 inner → V3D QPU  (R=0.34 ORANGE; M4 +4.1 %)
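 *   AV1 CDEF 8x8 luma  → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s)
 *
 * The H.264 kernels added in cycles 6-9 are not listed above; their
 * recipe substrate is given in the per-kernel comments below and can
 * be queried with daedalus_recipe_substrate_for().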
 *
 * The API accepts an explicit substrate for every kernel — the
 * integration layer can override the recipe at runtime if it
 * has scheduler knowledge the kernel-level R-band measurement
 * didn't capture. Kernels whose QPU shader is not implemented yet
 * return -1 for an explicit DAEDALUS_SUBSTRATE_QPU (see the
 * per-kernel notes below). The recommended path is to use
 * `daedalus_recipe_dispatch_*` which picks the recipe substrate
 * automatically.
 *
 * License: BSD-2-Clause. This header is part of the library API
 * boundary; the implementation links against vendored
 * LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot.
 *
 * Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A
 * context is single-threaded; use one per worker thread if you
 * need parallelism on the QPU side. NEON-side dispatch is
 * stateless and re-entrant.
 *
 * ABI: pre-1.0 — no stability guarantees yet. The function names
 * and signatures will become ABI-stable at v1.0; until then the
 * integration layer should rebuild against the headers it links
 * with.
 */
#ifndef DAEDALUS_FOURIER_H
#define DAEDALUS_FOURIER_H

#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* -------------------------------------------------------------------
 * Substrate selection
 *
 * Names the execution substrate a kernel runs on.  Most callers
 * should NOT pick one: the `daedalus_recipe_dispatch_*` family
 * below chooses the substrate per the cycles-1-5 verdict.  An
 * explicit choice is meant for benchmarking, debugging, and future
 * runtime-aware schedulers.
 * ----------------------------------------------------------------- */
typedef enum {
    /* Follow the recipe table. */
    DAEDALUS_SUBSTRATE_AUTO = 0,
    /* Force ARM NEON. */
    DAEDALUS_SUBSTRATE_CPU = 1,
    /* Force V3D compute. */
    DAEDALUS_SUBSTRATE_QPU = 2,
} daedalus_substrate;

/* -------------------------------------------------------------------
 * Context lifecycle
 * ----------------------------------------------------------------- */

/* Context handle; only forward-declared in this header. */
typedef struct daedalus_ctx daedalus_ctx;

/* Allocate a context and try to initialise the V3D Vulkan device.
 * A failed V3D init is acceptable: the context then works as a
 * NEON-only fallback.  NULL is returned on allocation failure. */
daedalus_ctx *
daedalus_ctx_create(void);

/* Like daedalus_ctx_create() but without the V3D init step.  Meant
 * for callers that know they want CPU only and want the context to
 * be created quickly. */
daedalus_ctx *
daedalus_ctx_create_no_qpu(void);

/* Tell whether QPU dispatch is available on `ctx`: 1 if it is, 0 if
 * the context is NEON-only.  Lets the integration layer
 * short-circuit QPU dispatch attempts. */
int
daedalus_ctx_has_qpu(const daedalus_ctx *ctx);

/* Tear down a context.
 * NOTE(review): whether NULL is accepted is not stated in this
 * header — confirm against the implementation. */
void
daedalus_ctx_destroy(daedalus_ctx *ctx);

/* -------------------------------------------------------------------
 * VP9 IDCT 8x8 add — cycle 1 (QPU by recipe)
 *
 * Each of the n_blocks blocks consumes 64 int16 coefficients.  The
 * 8x8 inverse DCT is computed and added into the destination:
 *   dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)).
 *
 * `meta` holds one (dst_byte_offset, block_x, block_y) entry per
 * block; dst_byte_offset is a byte offset into dst.
 *
 * The dispatch functions return 0 on success and a negative,
 * errno-like value on failure.
 * ----------------------------------------------------------------- */
typedef struct {
    /* Byte offset into dst. */
    uint32_t dst_off;
    /* Used only by the QPU path, for placement. */
    uint32_t block_x;
    uint32_t block_y;
    /* Padding word. */
    uint32_t _pad;
} daedalus_idct8_meta;

/* VP9 IDCT 8x8 add on the substrate the recipe picks.  `coeffs`
 * supplies 64 int16 per block and `meta` one entry per block. */
int daedalus_recipe_dispatch_vp9_idct8(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    const int16_t *coeffs,
    size_t n_blocks,
    const daedalus_idct8_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_vp9_idct8(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    const int16_t *coeffs,
    size_t n_blocks,
    const daedalus_idct8_meta *meta);

/* -------------------------------------------------------------------
 * VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe)
 *
 * Loop filter at a horizontal edge crossing pixel column 4 of an
 * 8x8 block.  Every edge carries its own thresholds (E, I, H).
 * ----------------------------------------------------------------- */
typedef struct {
    /* Byte offset into dst, at col 4 of the edge. */
    uint32_t dst_off;
    /* Per-edge thresholds.
     * NOTE(review): presumably the edge / interior / high-edge-variance
     * limits of the VP9 loop filter — confirm against the kernels. */
    int32_t  E;
    int32_t  I;
    int32_t  H;
} daedalus_lpf_meta;

/* VP9 loop filter, wd=4, on the recipe substrate. */
int daedalus_recipe_dispatch_vp9_lpf4(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_lpf_meta *meta);

/* VP9 loop filter, wd=8, on the recipe substrate. */
int daedalus_recipe_dispatch_vp9_lpf8(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_lpf_meta *meta);

/* VP9 loop filter, wd=4, with the substrate given by `sub`. */
int daedalus_dispatch_vp9_lpf4(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_lpf_meta *meta);

/* VP9 loop filter, wd=8, with the substrate given by `sub`. */
int daedalus_dispatch_vp9_lpf8(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_lpf_meta *meta);

/* -------------------------------------------------------------------
 * VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe)
 *
 * 8-tap horizontal filter at a subpel-fractional position; the
 * filter row is selected by mx.  The CPU path is the
 * high-performance default.  A QPU path is available, but the
 * recipe never recommends it.
 * ----------------------------------------------------------------- */
typedef struct {
    uint32_t dst_off;
    /* Raw offset, no pre-advance: the shader handles the -3
     * internally. */
    uint32_t src_off;
    /* Selects the filter row. */
    int32_t  mx;
    /* Padding word. */
    uint32_t _pad;
} daedalus_mc_meta;

/* VP9 8-tap horizontal MC on the recipe substrate (CPU by recipe). */
int daedalus_recipe_dispatch_vp9_mc_8h(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    const uint8_t *src,
    size_t src_stride,
    size_t n_blocks,
    const daedalus_mc_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_vp9_mc_8h(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    const uint8_t *src,
    size_t src_stride,
    size_t n_blocks,
    const daedalus_mc_meta *meta);

/* -------------------------------------------------------------------
 * AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic)
 *
 * `tmp` holds n_blocks * 192 uint16 values in the padded-buffer
 * layout that dav1d's NEON expects: stride 16, with padding of
 * 2 rows on top, 2 columns left, 2 columns right and 2 rows at the
 * bottom.  The caller populates tmp with source pixels (if all
 * edges are valid) or with INT16_MIN sentinels at the boundary (if
 * the edge is filtered out).
 * ----------------------------------------------------------------- */
typedef struct {
    uint32_t dst_off;
    /* Offset to the block origin in tmp[]
     * (= padded_origin + 2*16+2). */
    uint32_t tmp_off_u16;
    /* 1..7 */
    int32_t  pri_strength;
    /* 1..4 */
    int32_t  sec_strength;
    /* 0..7 */
    int32_t  dir;
    /* 1..6 */
    int32_t  damping;
} daedalus_cdef_meta;

/* AV1 CDEF 8x8 luma on the recipe substrate (CPU by recipe).
 * `tmp` is the caller-populated padded buffer, n_blocks * 192
 * uint16 in total. */
int daedalus_recipe_dispatch_cdef_8x8(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    const uint16_t *tmp,
    size_t n_blocks,
    const daedalus_cdef_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_cdef_8x8(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    const uint16_t *tmp,
    size_t n_blocks,
    const daedalus_cdef_meta *meta);

/* -------------------------------------------------------------------
 * H.264 IDCT 4x4 + add — cycle 6 (CPU by recipe; QPU unused)
 *
 * Integer 4x4 inverse transform per H.264 §8.5.12.1.  A block is
 * stored COLUMN-major: block[c*4 + r] is the coefficient at
 * (row r, col c).  Following the FFmpeg convention, the block is
 * destructively zeroed after the transform.
 *
 * `coeffs` is an array of n_blocks * 16 int16.  `dst_off` is the
 * per-block byte offset into dst.
 * ----------------------------------------------------------------- */
typedef struct {
    /* Byte offset into dst for this block. */
    uint32_t dst_off;
    /* Padding words. */
    uint32_t _pad0;
    uint32_t _pad1;
    uint32_t _pad2;
} daedalus_h264_block_meta;

/* H.264 IDCT 4x4 + add on the recipe substrate (CPU by recipe).
 * `coeffs` is not const: it is destructively zeroed. */
int daedalus_recipe_dispatch_h264_idct4(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    int16_t *coeffs,
    size_t n_blocks,
    const daedalus_h264_block_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_h264_idct4(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    int16_t *coeffs,
    size_t n_blocks,
    const daedalus_h264_block_meta *meta);

/* H.264 IDCT 8x8 + add — cycle 7 (CPU by recipe).
 * Integer 8x8 inverse transform per H.264 §8.5.13.2.
 * `coeffs` is an array of n_blocks * 64 int16, stored column-major
 * within each block.
 * NOTE(review): `coeffs` is non-const like the 4x4 variant, so it is
 * presumably zeroed as well — confirm against the implementation.
 */
int daedalus_recipe_dispatch_h264_idct8(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    int16_t *coeffs,
    size_t n_blocks,
    const daedalus_h264_block_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_h264_idct8(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    int16_t *coeffs,
    size_t n_blocks,
    const daedalus_h264_block_meta *meta);

/* -------------------------------------------------------------------
 * H.264 luma "v_loop_filter" — cycle 8 (CPU primary; QPU opportunistic)
 *
 * The filter runs VERTICALLY across a HORIZONTAL edge that is 16
 * columns wide; pix points to row 0 of the bottom block.  This is
 * the non-intra (bS < 4) variant.
 *
 * A tile is 16 cols × 8 rows of context (rows -4..+3 around the
 * edge).  dst_off addresses row 0 col 0 of the bottom block.
 *
 * Constraint: dst_off >= 4 * dst_stride, because the kernel reads
 * p3 at -4*stride.  The caller must ensure this.
 * ----------------------------------------------------------------- */
typedef struct {
    uint32_t dst_off;
    /* 0..63 typical, table-derived. */
    int32_t  alpha;
    /* 0..63 typical. */
    int32_t  beta;
    /* Per-segment filter strength; -1 means skip. */
    int8_t   tc0[4];
} daedalus_h264_deblock_meta;

/* H.264 luma v_loop_filter (bS < 4) on the recipe substrate.  The
 * caller must keep every dst_off >= 4 * dst_stride. */
int daedalus_recipe_dispatch_h264_deblock_luma_v(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_h264_deblock_luma_v(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* H.264 luma "h_loop_filter" — sibling of _v.  The filter runs
 * HORIZONTALLY across a VERTICAL edge that is 16 rows tall; pix
 * points to row 0 of the right block, col 0 being the leftmost
 * output column.  Same non-intra (bS < 4) variant.
 *
 * A tile is 8 cols × 16 rows of context (cols -4..+3 around the
 * edge).  dst_off addresses row 0 col 0 of the RIGHT block.
 *
 * Constraint: (dst_off % dst_stride) >= 4, because the kernel reads
 * p3 at pix[-4].  The caller must ensure this.
 *
 * No QPU shader exists for the H variant yet, so the recipe table
 * routes AUTO to CPU NEON.  An explicit DAEDALUS_SUBSTRATE_QPU on
 * the _h dispatch returns -1 rather than silently degrading.
 */
int daedalus_recipe_dispatch_h264_deblock_luma_h(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_luma_h(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* H.264 chroma (4:2:0) loop filters — bS<4 variant.  Chroma shares
 * the daedalus_h264_deblock_meta struct with luma but works on
 * smaller tiles: 8 cols × 4 rows for V (4 segments of 2 cols) and
 * 4 cols × 8 rows for H (4 segments of 2 rows).  Every segment has
 * its own tc0 strength; tc0[s] applies to both cells of segment s.
 *
 * Algorithm difference vs luma: chroma updates only p0 and q0
 * (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly, without
 * the luma-style ap/aq side-condition bonus.
 *
 * No QPU shaders exist for chroma deblock yet, so the recipe table
 * routes AUTO to CPU NEON.  Explicit SUBSTRATE_QPU returns -1.
 */
int daedalus_recipe_dispatch_h264_deblock_chroma_v(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_chroma_v(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* H.264 chroma (4:2:0) bS<4 loop filter, H orientation: tiles of
 * 4 cols × 8 rows (4 segments of 2 rows).  AUTO goes to CPU NEON;
 * explicit SUBSTRATE_QPU returns -1. */
int daedalus_recipe_dispatch_h264_deblock_chroma_h(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_chroma_h(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* H.264 bS=4 "intra" loop filters — used at I-MB and inter
 * macroblock boundaries where boundary strength is forced to 4 per
 * H.264 §8.7.2.1.  The algorithm differs from bS<4: per-side strong
 * vs weak filter decided by quad-tree condition (luma only), while
 * chroma is always weak.  There is no tc0 — the tc0[] field of
 * daedalus_h264_deblock_meta is IGNORED for intra dispatches, so
 * callers can leave it uninitialised or share a single edge list
 * across both intra and non-intra kernels.
 *
 * alpha, beta and dst_off use the same meta layout as the bS<4
 * dispatches, and the tile geometry per orientation is identical to
 * the bS<4 sibling (16-col / 16-row luma; 8-col / 8-row chroma).
 *
 * No QPU shader is implemented for any of the four; the recipe
 * routes AUTO to CPU NEON.  Explicit SUBSTRATE_QPU returns -1
 * (fast fail).
 */
int daedalus_recipe_dispatch_h264_deblock_luma_v_intra(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_luma_v_intra(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* bS=4 (intra) luma filter, H orientation.  meta->tc0[] is ignored;
 * explicit SUBSTRATE_QPU returns -1. */
int daedalus_recipe_dispatch_h264_deblock_luma_h_intra(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_luma_h_intra(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* bS=4 (intra) chroma filter, V orientation; chroma is always weak.
 * meta->tc0[] is ignored; explicit SUBSTRATE_QPU returns -1. */
int daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_chroma_v_intra(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* bS=4 (intra) chroma filter, H orientation; chroma is always weak.
 * meta->tc0[] is ignored; explicit SUBSTRATE_QPU returns -1. */
int daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(
    daedalus_ctx *ctx,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

int daedalus_dispatch_h264_deblock_chroma_h_intra(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    size_t dst_stride,
    size_t n_edges,
    const daedalus_h264_deblock_meta *meta);

/* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
 * docs/k9_h264qpel_mc20.md for the R-band rationale).
 *
 * Horizontal half-pel luma 6-tap filter of H.264 §8.4.2.2.1:
 *   dst[r,c] = clip255((s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
 *                       + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
 *                       + 16) >> 5)
 *
 * Single-stride: dst and src share `stride`, matching FFmpeg's
 * H264QpelContext.put_h264_qpel_pixels_tab[][] convention and the
 * signature of the vendored ff_put_h264_qpel8_mc20_neon.
 *
 * `src + src_off` addresses the leftmost OUTPUT column (col 0).
 * Because the filter reads cols -2..+3, the caller must guarantee
 * at least 2 pixels of left context and 3 pixels of right context
 * on every row.  (FFmpeg already maintains an edge-emulated buffer
 * for the frame boundary; this matches that contract.)
 * ----------------------------------------------------------------- */
typedef struct {
    /* Byte offset into dst (block top-left). */
    uint32_t dst_off;
    /* Byte offset into src (col 0, row 0). */
    uint32_t src_off;
} daedalus_h264_qpel_meta;

/* H.264 luma qpel mc20 on the recipe substrate (CPU by recipe).
 * dst and src share the single `stride`. */
int daedalus_recipe_dispatch_h264_qpel_mc20(
    daedalus_ctx *ctx,
    uint8_t *dst,
    const uint8_t *src,
    size_t stride,
    size_t n_blocks,
    const daedalus_h264_qpel_meta *meta);

/* Same kernel, with the substrate selected explicitly through `sub`. */
int daedalus_dispatch_h264_qpel_mc20(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    const uint8_t *src,
    size_t stride,
    size_t n_blocks,
    const daedalus_h264_qpel_meta *meta);

/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20.
 * The 6-tap filter is applied vertically:
 *   dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
 *                       + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
 *                       + 16) >> 5)
 *
 * The single-stride convention is the same as for mc20.
 * src + src_off addresses row 0 col 0 of the OUTPUT block.  Because
 * the filter reads rows -2..+3, the caller must guarantee 2 rows of
 * top context and 3 rows of bottom context per block (FFmpeg's
 * edge-emulated buffer handles frame boundaries; same contract as
 * mc20).
 *
 * No QPU shader is implemented yet, so the recipe table routes AUTO
 * to CPU NEON.  Explicit DAEDALUS_SUBSTRATE_QPU returns -1.
 */
int daedalus_recipe_dispatch_h264_qpel_mc02(
    daedalus_ctx *ctx,
    uint8_t *dst,
    const uint8_t *src,
    size_t stride,
    size_t n_blocks,
    const daedalus_h264_qpel_meta *meta);

int daedalus_dispatch_h264_qpel_mc02(
    daedalus_ctx *ctx,
    daedalus_substrate sub,
    uint8_t *dst,
    const uint8_t *src,
    size_t stride,
    size_t n_blocks,
    const daedalus_h264_qpel_meta *meta);

/* -------------------------------------------------------------------
 * Recipe query — what does the API recommend for each kernel?
 *
 * Kernel identifiers.  Values 1..9 coincide with the cycle numbers
 * used in the section comments of this header; 10..17 are the
 * remaining H.264 deblock and qpel variants.
 * ----------------------------------------------------------------- */
typedef enum {
    /* VP9 / AV1 */
    DAEDALUS_KERNEL_VP9_IDCT8             = 1,
    DAEDALUS_KERNEL_VP9_LPF4_INNER        = 2,
    DAEDALUS_KERNEL_VP9_MC_8H             = 3,
    DAEDALUS_KERNEL_VP9_LPF8_INNER        = 4,
    DAEDALUS_KERNEL_AV1_CDEF_8X8          = 5,
    /* H.264 */
    DAEDALUS_KERNEL_H264_IDCT4            = 6,
    DAEDALUS_KERNEL_H264_IDCT8            = 7,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV       = 8,
    DAEDALUS_KERNEL_H264_QPEL_MC20        = 9,
    DAEDALUS_KERNEL_H264_DEBLOCK_LH       = 10,
    DAEDALUS_KERNEL_H264_DEBLOCK_CV       = 11,
    DAEDALUS_KERNEL_H264_DEBLOCK_CH       = 12,
    DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA = 13,
    DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
    DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
    DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
    DAEDALUS_KERNEL_H264_QPEL_MC02        = 17,
} daedalus_kernel;

/* Report the substrate the recipe recommends for kernel `k`. */
daedalus_substrate
daedalus_recipe_substrate_for(daedalus_kernel k);

#ifdef __cplusplus
}
#endif
#endif  /* DAEDALUS_FOURIER_H */