Phase 8 skeleton: public C API + first end-to-end smoke test

include/daedalus.h: stable C API surface exposing the kernels from
cycles 1-5 (VP9 IDCT 8x8, LPF wd=4, MC 8h, LPF wd=8; AV1 CDEF). Per-kernel
recipe-dispatch helpers default to the cycle 1-5 verdict
substrate (QPU for cycles 1+2+4, CPU for cycles 3+5); explicit
override available for benchmarking and runtime-aware scheduling.

src/daedalus_core.c: NEON-path implementation of all 5 kernels
wrapped behind the public API. QPU path stubbed out (returns -1)
since wiring v3d_runner into daedalus_ctx is the next Phase 8
sub-step; with has_qpu=0 the recipe falls back to CPU cleanly.

tests/test_api_idct.c: 64-block IDCT through the public recipe
dispatch, bit-exact vs C ref. PASS 4096/4096 bytes — proves the
API surface compiles, library links, dispatch routing works, and
NEON fallback delivers correct results.

docs/phase8_scoping.md: architecture options (A=userspace V4L2,
B=kernel V4L2 shim, C=direct libva); picks A for v1 and tracks the
work that is explicitly out of scope.

Next Phase 8 sub-step: wire v3d_runner into daedalus_ctx so
has_qpu=1 and QPU dispatch goes through the API too. After that:
V4L2 ioctl glue, bitstream parser, superblock loop.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 13:54:43 +00:00
parent 5223d3cb3f
commit 760f6a4060
5 changed files with 733 additions and 0 deletions
+252
View File
@@ -0,0 +1,252 @@
/*
* daedalus-fourier core library — Phase 8 skeleton.
*
* Wraps cycles 1-5 kernels behind the public C API in
* include/daedalus.h. Recipe dispatch routes per-kernel to the
* verdict substrate from each cycle's Phase 7 doc.
*
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
* dav1d BSD-2-Clause NEON snapshots.
*/
#include "../include/daedalus.h"
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>
/* -------------------- Context -------------------- */
/*
 * Library context passed to every dispatch entry point in this file.
 * Allocated by daedalus_ctx_create() and released by
 * daedalus_ctx_destroy().
 */
struct daedalus_ctx {
/* Phase 8 skeleton: the context is just a flag. The real
 * implementation would hold the v3d_runner + per-kernel pipeline
 * handles. Non-zero means QPU dispatch is available; the skeleton's
 * daedalus_ctx_create() always stores 0. */
int has_qpu;
};
/*
 * Allocate a new library context.
 *
 * Returns a zero-initialised, heap-allocated context that the caller
 * releases with daedalus_ctx_destroy(), or NULL if allocation fails.
 */
daedalus_ctx *daedalus_ctx_create(void)
{
    daedalus_ctx *fresh = calloc(1, sizeof *fresh);

    if (fresh != NULL) {
        /* Probing the V3D Vulkan device is deferred in Phase 8, so the
         * context is CPU-only for now (NEON paths are always available). */
        fresh->has_qpu = 0;
    }
    return fresh;
}
/*
 * Report whether ctx can dispatch to the QPU.
 *
 * Returns the context's has_qpu flag. A NULL ctx is treated as
 * CPU-only and yields 0.
 */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
{
    if (ctx == NULL) {
        return 0;
    }
    return ctx->has_qpu;
}
/*
 * Release a context obtained from daedalus_ctx_create().
 *
 * The pointer is handed straight to free(), so passing NULL is
 * accepted and does nothing.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
free(ctx);
}
/* -------------------- Recipe query -------------------- */
/*
 * Return the default ("recipe") substrate for kernel k.
 *
 * The two VP9 loop-filter kernels and the VP9 IDCT map to the QPU;
 * VP9 MC 8h and AV1 CDEF map to the CPU. Any value not listed below
 * also falls back to the CPU.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    /* No default label on purpose: the post-switch return covers any
     * value that is not one of the listed kernels. */
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:
        return DAEDALUS_SUBSTRATE_QPU;

    case DAEDALUS_KERNEL_VP9_MC_8H:
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:
        break;
    }
    return DAEDALUS_SUBSTRATE_CPU;
}
/* -------------------- NEON externs (per cycle bench links) ----- */
/*
 * Prototypes for NEON kernels defined outside this file (per the file
 * header: vendored FFmpeg and dav1d NEON snapshots). These declarations
 * must match the external definitions exactly; nothing here checks that.
 */

/* VP9 8x8 IDCT + add. Takes a mutable coefficient block and an eob
 * value; called from dispatch_idct8_cpu(). */
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
/* VP9 loop filter used when dispatch_lpf_cpu() is called with wd_8 == 0. */
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
/* VP9 loop filter used when dispatch_lpf_cpu() is called with wd_8 != 0. */
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
/* VP9 motion-compensation kernel; called from dispatch_mc_8h_cpu()
 * with h = 8 and my = 0. */
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int my);
/* AV1 CDEF filter for 8bpc; called from dispatch_cdef_cpu() with
 * h = 8 and edges = 0. */
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h,
size_t edges);
/* -------------------- CPU dispatch implementations -------------- */
/*
 * CPU (NEON) path for the VP9 8x8 IDCT + add kernel.
 *
 * ctx        - unused on the CPU path.
 * dst        - base of the destination pixel buffer.
 * dst_stride - forwarded to the kernel as its stride argument.
 * coeffs     - n_blocks consecutive groups of 64 int16_t coefficients;
 *              never written through.
 * n_blocks   - number of blocks to process.
 * meta       - one entry per block; meta[i].dst_off is added to dst to
 *              locate block i's output.
 *
 * Always returns 0.
 */
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    enum { IDCT8_COEFFS = 64 };

    /* The kernel takes a mutable block pointer while coeffs is const,
     * so each block is staged through a private copy.
     * The copy is 16-byte aligned because it feeds a NEON kernel: a
     * plain int16_t array only guarantees 2-byte alignment, and FFmpeg
     * callers keep coefficient blocks vector-aligned (the ARM32 kernel
     * is believed to use alignment-hinted loads - TODO confirm against
     * the vendored snapshot). */
    _Alignas(16) int16_t scratch[IDCT8_COEFFS];
    const ptrdiff_t stride = (ptrdiff_t) dst_stride;

    (void) ctx;
    for (size_t i = 0; i < n_blocks; i++) {
        memcpy(scratch, coeffs + i * IDCT8_COEFFS, sizeof scratch);
        /* eob is always passed as the full coefficient count;
         * presumably this selects the kernel's general (non-shortcut)
         * path - TODO confirm. */
        ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off, stride,
                                      scratch, IDCT8_COEFFS);
    }
    return 0;
}
/*
 * CPU (NEON) path shared by the two VP9 loop-filter entry points.
 *
 * ctx        - unused on the CPU path.
 * wd_8       - non-zero selects ff_vp9_loop_filter_h_8_8_neon, zero
 *              selects ff_vp9_loop_filter_h_4_8_neon.
 * dst        - base of the pixel buffer filtered in place.
 * dst_stride - forwarded to the kernel as its stride argument.
 * n_edges    - number of entries in meta.
 * meta       - per-edge dst_off (added to dst) and E/I/H values
 *              forwarded unchanged to the kernel.
 *
 * Always returns 0.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    const ptrdiff_t stride = (ptrdiff_t) dst_stride;
    size_t edge = 0;

    (void) ctx;
    while (edge < n_edges) {
        const daedalus_lpf_meta *m = &meta[edge];
        uint8_t *target = dst + m->dst_off;

        if (wd_8 != 0) {
            ff_vp9_loop_filter_h_8_8_neon(target, stride, m->E, m->I, m->H);
        } else {
            ff_vp9_loop_filter_h_4_8_neon(target, stride, m->E, m->I, m->H);
        }
        edge++;
    }
    return 0;
}
/*
 * CPU (NEON) path for the VP9 MC 8h kernel.
 *
 * ctx        - unused on the CPU path.
 * dst        - base of the destination buffer.
 * dst_stride - forwarded to the kernel as the destination stride.
 * src        - base of the source buffer.
 * src_stride - forwarded to the kernel as the source stride.
 * n_blocks   - number of entries in meta.
 * meta       - per-block dst_off, src_off and mx.
 *
 * Always returns 0.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    enum { MC_ROWS = 8, MC_SRC_LEAD = 3 };
    const ptrdiff_t out_stride = (ptrdiff_t) dst_stride;
    const ptrdiff_t in_stride = (ptrdiff_t) src_stride;

    (void) ctx;
    for (size_t blk = 0; blk < n_blocks; ++blk) {
        const daedalus_mc_meta *m = &meta[blk];
        uint8_t *out = dst + m->dst_off;
        /* The kernel is handed the address 3 bytes past src_off;
         * presumably src_off addresses the start of the filter's left
         * context - TODO confirm against the header. */
        const uint8_t *in = src + m->src_off + MC_SRC_LEAD;

        /* Horizontal-only: the vertical fraction (my) is always 0. */
        ff_vp9_put_regular8_h_neon(out, out_stride, in, in_stride,
                                   MC_ROWS, m->mx, 0);
    }
    return 0;
}
/*
 * CPU (NEON) path for the AV1 CDEF 8x8 kernel.
 *
 * ctx        - unused on the CPU path.
 * dst        - base of the destination buffer.
 * dst_stride - forwarded to the kernel as the destination stride.
 * tmp        - base of the uint16_t working buffer the kernel reads.
 * n_blocks   - number of entries in meta.
 * meta       - per-block dst_off (added to dst), tmp_off_u16 (added
 *              to tmp, in uint16_t elements) and the strength / dir /
 *              damping values forwarded unchanged.
 *
 * Always returns 0.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    enum { CDEF_ROWS = 8 };
    const ptrdiff_t stride = (ptrdiff_t) dst_stride;

    (void) ctx;
    for (size_t blk = 0; blk < n_blocks; ++blk) {
        const daedalus_cdef_meta *m = &meta[blk];
        uint8_t *out = dst + m->dst_off;
        const uint16_t *in = tmp + m->tmp_off_u16;

        /* The edges argument is always passed as 0. */
        dav1d_cdef_filter8_8bpc_neon(out, stride, in,
                                     m->pri_strength, m->sec_strength,
                                     m->dir, m->damping, CDEF_ROWS, 0);
    }
    return 0;
}
/* -------------------- Public dispatch entry points -------------- */
/*
 * ROUTE(kernel_id, cpu_fn, args...) - shared body of a public dispatch
 * entry point. It always returns from the enclosing function.
 *
 * Captures two names from the caller's scope: `ctx` (daedalus_ctx *)
 * and `sub` (the requested daedalus_substrate).
 *
 *   1. DAEDALUS_SUBSTRATE_AUTO resolves to
 *      daedalus_recipe_substrate_for(kernel_id).
 *   2. A QPU choice degrades to CPU when ctx reports no QPU.
 *   3. CPU returns cpu_fn(ctx, args...).
 *   4. Anything else returns -1.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement
 * and `eff` does not leak into the caller's scope.
 */
#define ROUTE(kernel_id, cpu_fn, ...)                                     \
    do {                                                                  \
        daedalus_substrate eff = sub;                                     \
        if (eff == DAEDALUS_SUBSTRATE_AUTO)                               \
            eff = daedalus_recipe_substrate_for(kernel_id);               \
        if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))  \
            eff = DAEDALUS_SUBSTRATE_CPU;                                 \
        if (eff == DAEDALUS_SUBSTRATE_CPU)                                \
            return cpu_fn(ctx, __VA_ARGS__);                              \
        return -1; /* QPU path not yet wired in Phase 8 skeleton */       \
    } while (0)
/*
 * Dispatch a batch of VP9 8x8 IDCT blocks on the requested substrate.
 *
 * sub selects the substrate. DAEDALUS_SUBSTRATE_AUTO resolves via
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8), and a QPU
 * choice falls back to CPU when ctx reports no QPU. The remaining
 * arguments are forwarded unchanged to dispatch_idct8_cpu().
 *
 * Returns the CPU path's result, or -1 when the effective substrate is
 * not CPU (the QPU path is not wired in the Phase 8 skeleton).
 */
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta)
{
/* ROUTE reads `ctx` and `sub` by name and returns from this function. */
ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
dst, dst_stride, coeffs, n_blocks, meta);
}
/*
 * Dispatch a batch of VP9 wd=4 loop-filter edges on the requested
 * substrate.
 *
 * sub selects the substrate. DAEDALUS_SUBSTRATE_AUTO resolves via
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER), and a
 * QPU choice falls back to CPU when ctx reports no QPU.
 *
 * Returns the CPU path's result, or -1 when the effective substrate is
 * not CPU.
 */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate picked = sub;

    if (picked == DAEDALUS_SUBSTRATE_AUTO) {
        picked = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER);
    }

    if (picked == DAEDALUS_SUBSTRATE_QPU) {
        if (daedalus_ctx_has_qpu(ctx)) {
            return -1; /* QPU path not yet wired in Phase 8 skeleton */
        }
        picked = DAEDALUS_SUBSTRATE_CPU;
    }

    if (picked != DAEDALUS_SUBSTRATE_CPU) {
        return -1;
    }
    /* Second argument 0 selects the wd=4 kernel. */
    return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
}
/*
 * Dispatch a batch of VP9 wd=8 loop-filter edges on the requested
 * substrate.
 *
 * sub selects the substrate. DAEDALUS_SUBSTRATE_AUTO resolves via
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER), and a
 * QPU choice falls back to CPU when ctx reports no QPU.
 *
 * Returns the CPU path's result, or -1 when the effective substrate is
 * not CPU.
 */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate picked = sub;

    if (picked == DAEDALUS_SUBSTRATE_AUTO) {
        picked = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER);
    }

    if (picked == DAEDALUS_SUBSTRATE_QPU) {
        if (daedalus_ctx_has_qpu(ctx)) {
            return -1; /* QPU path not yet wired in Phase 8 skeleton */
        }
        picked = DAEDALUS_SUBSTRATE_CPU;
    }

    if (picked != DAEDALUS_SUBSTRATE_CPU) {
        return -1;
    }
    /* Second argument 1 selects the wd=8 kernel. */
    return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
}
/*
 * Dispatch a batch of VP9 MC 8h blocks on the requested substrate.
 *
 * sub selects the substrate. DAEDALUS_SUBSTRATE_AUTO resolves via
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H), and a QPU
 * choice falls back to CPU when ctx reports no QPU. The remaining
 * arguments are forwarded unchanged to dispatch_mc_8h_cpu().
 *
 * Returns the CPU path's result, or -1 when the effective substrate is
 * not CPU (the QPU path is not wired in the Phase 8 skeleton).
 */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta)
{
/* ROUTE reads `ctx` and `sub` by name and returns from this function. */
ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
dst, dst_stride, src, src_stride, n_blocks, meta);
}
/*
 * Dispatch a batch of AV1 CDEF 8x8 blocks on the requested substrate.
 *
 * sub selects the substrate. DAEDALUS_SUBSTRATE_AUTO resolves via
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8), and a
 * QPU choice falls back to CPU when ctx reports no QPU. The remaining
 * arguments are forwarded unchanged to dispatch_cdef_cpu().
 *
 * Returns the CPU path's result, or -1 when the effective substrate is
 * not CPU (the QPU path is not wired in the Phase 8 skeleton).
 */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta)
{
/* ROUTE reads `ctx` and `sub` by name and returns from this function. */
ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
dst, dst_stride, tmp, n_blocks, meta);
}
/* -------------------- Recipe convenience wrappers --------------- */
/*
 * Recipe wrapper for VP9 IDCT 8x8: calls daedalus_dispatch_vp9_idct8()
 * with DAEDALUS_SUBSTRATE_AUTO and forwards every other argument
 * unchanged. Returns that call's result.
 */
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta)
{
return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, coeffs, n_blocks, meta);
}
/*
 * Recipe wrapper for the VP9 wd=4 loop filter: calls
 * daedalus_dispatch_vp9_lpf4() with DAEDALUS_SUBSTRATE_AUTO and
 * forwards every other argument unchanged. Returns that call's result.
 */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta)
{
return daedalus_dispatch_vp9_lpf4(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, n_edges, meta);
}
/*
 * Recipe wrapper for the VP9 wd=8 loop filter: calls
 * daedalus_dispatch_vp9_lpf8() with DAEDALUS_SUBSTRATE_AUTO and
 * forwards every other argument unchanged. Returns that call's result.
 */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_lpf_meta *meta)
{
return daedalus_dispatch_vp9_lpf8(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, n_edges, meta);
}
/*
 * Recipe wrapper for VP9 MC 8h: calls daedalus_dispatch_vp9_mc_8h()
 * with DAEDALUS_SUBSTRATE_AUTO and forwards every other argument
 * unchanged. Returns that call's result.
 */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta)
{
return daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, src, src_stride, n_blocks, meta);
}
/*
 * Recipe wrapper for AV1 CDEF 8x8: calls daedalus_dispatch_cdef_8x8()
 * with DAEDALUS_SUBSTRATE_AUTO and forwards every other argument
 * unchanged. Returns that call's result.
 */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta)
{
return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, tmp, n_blocks, meta);
}