Files
daedalus-fourier/include/daedalus.h
T
marfrit 1085c5699c Phase 8: wire IDCT QPU dispatch through public API
daedalus_ctx now owns a v3d_runner when V3D is available. The
public API's dispatch_vp9_idct8 routes QPU calls through a
new dispatch_idct8_qpu helper that: (1) lazy-creates the cycle 1
v4 pipeline on first use, (2) allocates 3 host-visible SSBOs
per call (coeffs/dst/meta), (3) memcpy host->GPU, (4) dispatch
with the v4 32-blocks-per-WG geometry, (5) memcpy GPU->host.

Per-call alloc is intentional for Phase 8 correctness-first
scope; buffer-pool perf optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that
know they want CPU only.

test_api_idct extended to a 3-mode matrix: CPU forced, QPU
forced, AUTO recipe. All three deliver 4096/4096 bit-exact
on hertz with V3D 7.1.7.0:

  recipe substrate for VP9_IDCT8: 2 (QPU)
  [CPU] 4096/4096 bit-exact
  [QPU] 4096/4096 bit-exact (real QPU dispatch through the API)
  [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4
and cycle 4 LPF wd=8 (the other two recipe-QPU kernels).
Cycle 3 MC and cycle 5 CDEF only need the dispatch hook
(recipe routes to CPU; QPU stays opportunistic via explicit
override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:55:55 +00:00

215 lines
8.0 KiB
C

/*
* daedalus-fourier — public C API.
*
* Stable surface for the integration layer (Phase 8 V4L2 shim,
* libva-v4l2-request-fourier consumer, or any future skin) to
* dispatch per-kernel work to the right substrate per the
* cycle 1-5 deployment recipe.
*
* Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md):
*
* VP9 IDCT 8x8 → V3D QPU (R=0.92 GREEN; M4 +7.2 %)
* VP9 LPF wd=4 inner → V3D QPU (R=0.41 ORANGE; M4 +6.9 %)
* VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %)
* VP9 LPF wd=8 inner → V3D QPU (R=0.34 ORANGE; M4 +4.1 %)
* AV1 CDEF 8x8 luma → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s)
*
* The API exposes BOTH substrates for every kernel — the
* integration layer can override the recipe at runtime if it
* has scheduler knowledge the kernel-level R-band measurement
* didn't capture. The recommended path is to use
* `daedalus_recipe_dispatch_*` which picks the recipe substrate
* automatically.
*
* License: BSD-2-Clause. This header is part of the library API
* boundary; the implementation links against vendored
* LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot.
*
* Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A
* context is single-threaded; use one per worker thread if you
* need parallelism on the QPU side. NEON-side dispatch is
* stateless and re-entrant.
*
* ABI: pre-1.0 — no stability guarantees yet. The function names
* and signatures will become ABI-stable at v1.0; until then the
* integration layer should rebuild against the headers it links
* with.
*/
#ifndef DAEDALUS_FOURIER_H
#define DAEDALUS_FOURIER_H
#include <stdint.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/* -------------------------------------------------------------------
* Substrate selection
*
* Most callers should NOT specify a substrate — use the
* `daedalus_recipe_dispatch_*` family below, which picks the
* substrate per the cycles-1-5 verdict. Explicit substrate
* selection is for benchmarking, debugging, and future
* runtime-aware schedulers.
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_SUBSTRATE_AUTO = 0, /* per recipe table */
DAEDALUS_SUBSTRATE_CPU = 1, /* force ARM NEON */
DAEDALUS_SUBSTRATE_QPU = 2, /* force V3D compute */
} daedalus_substrate;
/* -------------------------------------------------------------------
* Context lifecycle
* ----------------------------------------------------------------- */
typedef struct daedalus_ctx daedalus_ctx;
/* Create a context. Initialises V3D Vulkan device if available;
* NEON-only fallback OK if V3D init fails. Returns NULL on alloc
* failure. */
daedalus_ctx *daedalus_ctx_create(void);
/* Same but skip V3D init — for callers that know they want CPU
* only and want a fast-creating context. */
daedalus_ctx *daedalus_ctx_create_no_qpu(void);
/* Returns 1 if QPU dispatch is available on this context, 0 if
* NEON-only. Useful for the integration layer to short-circuit
* QPU dispatch attempts. */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx);
void daedalus_ctx_destroy(daedalus_ctx *ctx);
/* -------------------------------------------------------------------
* VP9 IDCT 8x8 add — cycle 1 (QPU by recipe)
*
* For each of n_blocks: take 64 int16 coefficients, perform 8x8
* inverse DCT, add to dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)).
*
* `meta` is an array of (dst_byte_offset, block_x, block_y) for
* each block, where dst_byte_offset is byte offset into dst.
*
* Returns 0 on success, negative errno-like on failure.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off; /* byte offset into dst */
uint32_t block_x; /* used only by QPU path for placement */
uint32_t block_y;
uint32_t _pad;
} daedalus_idct8_meta;
int daedalus_recipe_dispatch_vp9_idct8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta);
int daedalus_dispatch_vp9_idct8(
daedalus_ctx *ctx,
daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta);
/* -------------------------------------------------------------------
* VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe)
*
* Loop filter at horizontal edge crossing pixel column 4 of an
* 8x8 block. Per-edge thresholds (E, I, H).
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off; /* byte offset into dst, at col 4 of edge */
int32_t E, I, H;
} daedalus_lpf_meta;
int daedalus_recipe_dispatch_vp9_lpf4(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_recipe_dispatch_vp9_lpf8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta);
/* -------------------------------------------------------------------
* VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe)
*
* Subpel-fractional 8-tap horizontal filter; mx selects filter
* row. CPU path is the high-performance default; QPU path is
* available but never recommended by the recipe.
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
uint32_t src_off; /* raw, no pre-advance — shader handles -3 internally */
int32_t mx;
uint32_t _pad;
} daedalus_mc_meta;
int daedalus_recipe_dispatch_vp9_mc_8h(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta);
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta);
/* -------------------------------------------------------------------
* AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic)
*
* tmp is an array of n_blocks * 192 uint16, with the padded-buffer
* layout that dav1d's NEON expects (stride 16, padding 2-rows-top +
* 2-cols-left + 2-cols-right + 2-rows-bottom). Caller supplies
* tmp populated with either source pixels (if all edges valid) or
* INT16_MIN sentinels at the boundary (if edge filtered out).
* ----------------------------------------------------------------- */
typedef struct {
uint32_t dst_off;
uint32_t tmp_off_u16; /* offset to block-origin in tmp[] (= padded_origin + 2*16+2) */
int32_t pri_strength; /* 1..7 */
int32_t sec_strength; /* 1..4 */
int32_t dir; /* 0..7 */
int32_t damping; /* 1..6 */
} daedalus_cdef_meta;
int daedalus_recipe_dispatch_cdef_8x8(
daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta);
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta);
/* -------------------------------------------------------------------
* Recipe query — what does the API recommend for each kernel?
* ----------------------------------------------------------------- */
typedef enum {
DAEDALUS_KERNEL_VP9_IDCT8 = 1,
DAEDALUS_KERNEL_VP9_LPF4_INNER = 2,
DAEDALUS_KERNEL_VP9_MC_8H = 3,
DAEDALUS_KERNEL_VP9_LPF8_INNER = 4,
DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5,
} daedalus_kernel;
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
#ifdef __cplusplus
}
#endif
#endif /* DAEDALUS_FOURIER_H */