diff --git a/CMakeLists.txt b/CMakeLists.txt index a9d8f38..ff739f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -276,6 +276,32 @@ if (DAEDALUS_BUILD_VULKAN) add_dependencies(bench_v3d_cdef daedalus_shaders) target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan) target_compile_options(bench_v3d_cdef PRIVATE -O2) +endif() + +# ---- Phase 8 — public C API library + smoke test --------------------------- + +add_library(daedalus_core STATIC + src/daedalus_core.c + ${FFASM_SOURCES} + ${FFASM_LPF_SOURCES} + ${FFASM_MC_SOURCES} + ${FFC_MC_SOURCES} + ${DAV1D_CDEF_ASM_SOURCES} + ${DAV1D_CDEF_C_SOURCES} +) +target_include_directories(daedalus_core PUBLIC include) +target_compile_options(daedalus_core PRIVATE -O2) + +add_executable(test_api_idct + tests/test_api_idct.c + tests/vp9_idct8_ref.c +) +target_link_libraries(test_api_idct PRIVATE daedalus_core) +target_compile_options(test_api_idct PRIVATE -O2) + +if (DAEDALUS_BUILD_VULKAN) +# (re-open the conditional so the closing endif() below balances) + # M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON # snapshot so we can run real NEON kernels on pinned CPU cores diff --git a/docs/phase8_scoping.md b/docs/phase8_scoping.md new file mode 100644 index 0000000..2929439 --- /dev/null +++ b/docs/phase8_scoping.md @@ -0,0 +1,142 @@ +--- +phase: 8 +status: scoping (architecture options + tractable-first-step picked) +date_opened: 2026-05-18 +prereqs: cycles 1-5 closed (IDCT, LPF wd=4, MC, LPF wd=8, CDEF) +consumer_target: libva-v4l2-request-fourier → firefox/chromium-fourier +--- + +# Phase 8 — V4L2 deployment scoping + +## What Phase 8 is + +The "deliver the work" phase. Cycles 1-5 produced 5 individually- +measured per-block kernels (3 deployed on QPU, 2 on CPU per the +deployment recipe). Phase 8 makes those kernels add up to a +decoded video at the user's display. 
+ +Per `project_consumer_target.md`, the integration target is +**libva-v4l2-request-fourier**: a V4L2 stateless decoder node +exposing a VP9 (later AV1) contract, bridged via VA-API to +browser-fourier builds. Same plumbing mfritsche already runs for +HEVC/RK3588, different decoder backend. + +## Architecture stack + +``` ++-------------------------------------------------------+ +| firefox-fourier / chromium-fourier (already builds) | ++-------------------------------------------------------+ +| VA-API | ++-------------------------------------------------------+ +| libva-v4l2-request-fourier (already runs for HEVC) | ++-------------------------------------------------------+ +| V4L2 stateless ioctl interface (kernel uAPI) | ++-------------------------------------------------------+ +| daedalus-fourier V4L2 shim (NEW — Phase 8 work) | +| ↳ Parses bitstream control structs (V4L2_CID_STATELESS_VP9_*) +| ↳ Drives per-superblock decode loop +| ↳ Dispatches per-kernel to CPU NEON or V3D QPU (recipe) ++-------------------------------------------------------+ +| daedalus-fourier core library (NEW Phase 8 — wraps | +| ↳ kernels from cycles 1-5) | ++-------------------------------------------------------+ +| V3D 7.1 Mesa userspace + ARM NEON | ++-------------------------------------------------------+ +``` + +## Three architecture options + +### Option A — Userspace V4L2 emulation (recommended for v1) + +Implement a userspace `videodev2`-compatible loopback device +(via `v4l2loopback` or a custom UIO-style approach) that exposes +`/dev/videoNN` with the VP9 stateless contract. libva-v4l2- +request-fourier talks to this normally. + +**Pros**: stays entirely in userspace; no kernel module work; can +iterate quickly; isolation from kernel crash domain. The +daedalus-fourier daemon runs as a regular Linux process, taking +V4L2 ioctls (via the loopback shim) and emitting decoded frames. 
+ +**Cons**: v4l2loopback is loosely maintained; userspace V4L2 has +some semantic quirks (DRM/PRIME buffer sharing is harder than in +a real kernel driver). + +### Option B — Tiny kernel V4L2 shim + +A small kernel module that registers as a V4L2 device, takes the +ioctls, and forwards bitstream blobs + control structs to a +userspace daemon (the actual decoder) over a UNIX socket or +character-device chardev. Daemon decodes and posts frames back. + +**Pros**: a real `/dev/videoNN` with proper VFL_TYPE_VIDEO +semantics. DRM PRIME buffer sharing works correctly. + +**Cons**: kernel module work. Cross-process buffer marshaling +adds latency. Out-of-tree maintenance burden. + +### Option C — Direct libva integration (not recommended) + +Skip V4L2 entirely; implement a libva backend module directly. + +**Pros**: avoids the V4L2 wrapper layer entirely. + +**Cons**: contradicts `project_consumer_target.md` (decision to +use V4L2 path locked in). libva backend maintenance burden is +roughly equivalent to V4L2 shim with no portability gain. + +**Pick A** for v1; revisit if userspace V4L2 semantics block +DRM PRIME / dmabuf for browser zero-copy. + +## What's tractable this session + +Phase 8 in full is **days of work** (V4L2 ioctl glue, bitstream +parser, superblock loop, frame buffer management, dmabuf handling, +end-to-end test against a real VP9 clip). Out of scope for a +single session continuation. + +What IS tractable now: + +1. **Public C API header** (`include/daedalus.h`): declare the + library's stable function surface for the 5 kernels + + substrate selection + init/teardown. Future Phase 8 V4L2 shim + consumes this header. This: + - Locks the API shape so V4L2 work doesn't need to plumb + through internal types. + - Documents which kernels deploy where (recipe encoded in API). + - Forces a clean separation between "kernel work" (cycles 1-5) + and "decoder pipeline" (Phase 8). + +2. 
**A minimal core library** (`src/daedalus_core.c`): + skeleton that compiles and has the right typedefs and dispatch + paths; CPU (NEON) paths call the existing kernel implementations, + while QPU dispatch returns an error until it is wired. + +3. **One integration test** (`tests/test_api_idct.c`): + exercise the public API for ONE kernel end-to-end. Proves the + API can connect to existing benches. + +This commit gives the integration target something concrete to +hook into without prejudging V4L2 architecture (A/B/C). + +## Out of scope for this session + +- v4l2loopback setup (Option A specifics). +- VP9 bitstream parser (huge — borrow from FFmpeg / VP9 reference). +- Superblock-level decode loop. +- Frame buffer / dmabuf integration. +- libva-v4l2-request-fourier modifications (separate sibling repo). + +These are tracked as future phases / issues. + +## Acceptance for this Phase 8 scoping deliverable + +- `include/daedalus.h` exists and is documented. +- `src/daedalus_core.c` skeleton compiles + links into the + existing CMake build. +- One pass-through test (`test_api_idct`) runs and + exercises the public API path for at least one kernel, + producing the same M1 bit-exact result the cycle 1 bench did. +- Recipe table (which kernel runs where) is documented in the + header and the docs/k* phase7 docs cross-reference it. diff --git a/include/daedalus.h b/include/daedalus.h new file mode 100644 index 0000000..38a9ccf --- /dev/null +++ b/include/daedalus.h @@ -0,0 +1,210 @@ +/* + * daedalus-fourier — public C API. + * + * Stable surface for the integration layer (Phase 8 V4L2 shim, + * libva-v4l2-request-fourier consumer, or any future skin) to + * dispatch per-kernel work to the right substrate per the + * cycle 1-5 deployment recipe. 
+ * + * Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md): + * + * VP9 IDCT 8x8 → V3D QPU (R=0.92 GREEN; M4 +7.2 %) + * VP9 LPF wd=4 inner → V3D QPU (R=0.41 ORANGE; M4 +6.9 %) + * VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %) + * VP9 LPF wd=8 inner → V3D QPU (R=0.34 ORANGE; M4 +4.1 %) + * AV1 CDEF 8x8 luma → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s) + * + * The API exposes BOTH substrates for every kernel — the + * integration layer can override the recipe at runtime if it + * has scheduler knowledge the kernel-level R-band measurement + * didn't capture. The recommended path is to use + * `daedalus_recipe_dispatch_*` which picks the recipe substrate + * automatically. + * + * License: BSD-2-Clause. This header is part of the library API + * boundary; the implementation links against vendored + * LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot. + * + * Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A + * context is single-threaded; use one per worker thread if you + * need parallelism on the QPU side. NEON-side dispatch is + * stateless and re-entrant. + * + * ABI: pre-1.0 — no stability guarantees yet. The function names + * and signatures will become ABI-stable at v1.0; until then the + * integration layer should rebuild against the headers it links + * with. + */ +#ifndef DAEDALUS_FOURIER_H +#define DAEDALUS_FOURIER_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* ------------------------------------------------------------------- + * Substrate selection + * + * Most callers should NOT specify a substrate — use the + * `daedalus_recipe_dispatch_*` family below, which picks the + * substrate per the cycles-1-5 verdict. Explicit substrate + * selection is for benchmarking, debugging, and future + * runtime-aware schedulers. 
+ * ----------------------------------------------------------------- */ +typedef enum { + DAEDALUS_SUBSTRATE_AUTO = 0, /* per recipe table */ + DAEDALUS_SUBSTRATE_CPU = 1, /* force ARM NEON */ + DAEDALUS_SUBSTRATE_QPU = 2, /* force V3D compute */ +} daedalus_substrate; + +/* ------------------------------------------------------------------- + * Context lifecycle + * ----------------------------------------------------------------- */ +typedef struct daedalus_ctx daedalus_ctx; + +/* Create a context. Initialises V3D Vulkan device if available; + * NEON-only fallback OK if V3D init fails. Returns NULL on alloc + * failure. */ +daedalus_ctx *daedalus_ctx_create(void); + +/* Returns 1 if QPU dispatch is available on this context, 0 if + * NEON-only. Useful for the integration layer to short-circuit + * QPU dispatch attempts. */ +int daedalus_ctx_has_qpu(const daedalus_ctx *ctx); + +void daedalus_ctx_destroy(daedalus_ctx *ctx); + +/* ------------------------------------------------------------------- + * VP9 IDCT 8x8 add — cycle 1 (QPU by recipe) + * + * For each of n_blocks: take 64 int16 coefficients, perform 8x8 + * inverse DCT, add to dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)). + * + * `meta` is an array of (dst_byte_offset, block_x, block_y) for + * each block, where dst_byte_offset is byte offset into dst. + * + * Returns 0 on success, negative errno-like on failure. 
+ * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; /* byte offset into dst */ + uint32_t block_x; /* used only by QPU path for placement */ + uint32_t block_y; + uint32_t _pad; +} daedalus_idct8_meta; + +int daedalus_recipe_dispatch_vp9_idct8( + daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t n_blocks, + const daedalus_idct8_meta *meta); + +int daedalus_dispatch_vp9_idct8( + daedalus_ctx *ctx, + daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t n_blocks, + const daedalus_idct8_meta *meta); + +/* ------------------------------------------------------------------- + * VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe) + * + * Loop filter at horizontal edge crossing pixel column 4 of an + * 8x8 block. Per-edge thresholds (E, I, H). + * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; /* byte offset into dst, at col 4 of edge */ + int32_t E, I, H; +} daedalus_lpf_meta; + +int daedalus_recipe_dispatch_vp9_lpf4( + daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta); + +int daedalus_recipe_dispatch_vp9_lpf8( + daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta); + +int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta); + +int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta); + +/* ------------------------------------------------------------------- + * VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe) + * + * Subpel-fractional 8-tap horizontal filter; mx selects filter + * row. 
CPU path is the high-performance default; QPU path is + * available but never recommended by the recipe. + * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; + uint32_t src_off; /* raw, no pre-advance — shader handles -3 internally */ + int32_t mx; + uint32_t _pad; +} daedalus_mc_meta; + +int daedalus_recipe_dispatch_vp9_mc_8h( + daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint8_t *src, size_t src_stride, + size_t n_blocks, const daedalus_mc_meta *meta); + +int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const uint8_t *src, size_t src_stride, + size_t n_blocks, const daedalus_mc_meta *meta); + +/* ------------------------------------------------------------------- + * AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic) + * + * tmp is an array of n_blocks * 192 uint16, with the padded-buffer + * layout that dav1d's NEON expects (stride 16, padding 2-rows-top + + * 2-cols-left + 2-cols-right + 2-rows-bottom). Caller supplies + * tmp populated with either source pixels (if all edges valid) or + * INT16_MIN sentinels at the boundary (if edge filtered out). 
+ * ----------------------------------------------------------------- */ +typedef struct { + uint32_t dst_off; + uint32_t tmp_off_u16; /* offset to block-origin in tmp[] (= padded_origin + 2*16+2) */ + int32_t pri_strength; /* 1..7 */ + int32_t sec_strength; /* 1..4 */ + int32_t dir; /* 0..7 */ + int32_t damping; /* 1..6 */ +} daedalus_cdef_meta; + +int daedalus_recipe_dispatch_cdef_8x8( + daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint16_t *tmp, + size_t n_blocks, const daedalus_cdef_meta *meta); + +int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const uint16_t *tmp, + size_t n_blocks, const daedalus_cdef_meta *meta); + +/* ------------------------------------------------------------------- + * Recipe query — what does the API recommend for each kernel? + * ----------------------------------------------------------------- */ +typedef enum { + DAEDALUS_KERNEL_VP9_IDCT8 = 1, + DAEDALUS_KERNEL_VP9_LPF4_INNER = 2, + DAEDALUS_KERNEL_VP9_MC_8H = 3, + DAEDALUS_KERNEL_VP9_LPF8_INNER = 4, + DAEDALUS_KERNEL_AV1_CDEF_8X8 = 5, +} daedalus_kernel; + +daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); + +#ifdef __cplusplus +} +#endif +#endif /* DAEDALUS_FOURIER_H */ diff --git a/src/daedalus_core.c b/src/daedalus_core.c new file mode 100644 index 0000000..fd45298 --- /dev/null +++ b/src/daedalus_core.c @@ -0,0 +1,252 @@ +/* + * daedalus-fourier core library — Phase 8 skeleton. + * + * Wraps cycles 1-5 kernels behind the public C API in + * include/daedalus.h. Recipe dispatch routes per-kernel to the + * verdict substrate from each cycle's Phase 7 doc. + * + * License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ + + * dav1d BSD-2-Clause NEON snapshots. + */ +#include "../include/daedalus.h" + +#include +#include +#include +#include +#include + +/* -------------------- Context -------------------- */ + +struct daedalus_ctx { + /* For Phase 8 skeleton: just a flag. 
Real impl would hold the + * v3d_runner + per-kernel pipeline handles. */ + int has_qpu; +}; + +daedalus_ctx *daedalus_ctx_create(void) +{ + daedalus_ctx *ctx = calloc(1, sizeof(*ctx)); + if (!ctx) return NULL; + /* Phase 8 deferred: real impl probes V3D Vulkan device; for now + * default to CPU-only (NEON paths are always available). */ + ctx->has_qpu = 0; + return ctx; +} + +int daedalus_ctx_has_qpu(const daedalus_ctx *ctx) +{ + return ctx ? ctx->has_qpu : 0; +} + +void daedalus_ctx_destroy(daedalus_ctx *ctx) +{ + free(ctx); +} + +/* -------------------- Recipe query -------------------- */ + +daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) +{ + switch (k) { + case DAEDALUS_KERNEL_VP9_IDCT8: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_VP9_LPF4_INNER: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_VP9_MC_8H: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU; + case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU; + } + return DAEDALUS_SUBSTRATE_CPU; /* defensive default */ +} + +/* -------------------- NEON externs (per cycle bench links) ----- */ + +extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride, + int16_t *block, int eob); +extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride, + int E, int I, int H); +extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride, + int E, int I, int H); +extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int h, int mx, int my); +extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, + int pri_strength, int sec_strength, + int dir, int damping, int h, + size_t edges); + +/* -------------------- CPU dispatch implementations -------------- */ + +static int dispatch_idct8_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t 
n_blocks, + const daedalus_idct8_meta *meta) +{ + (void) ctx; /* ctx is unused on the CPU path */ + int16_t scratch[64]; /* mutable per-block copy: coeffs is const, the kernel takes int16_t * */ + for (size_t i = 0; i < n_blocks; i++) { + memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t)); + ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + scratch, 64); /* eob = 64 is passed for every block */ + } + return 0; +} +/* Loop filter: a nonzero wd_8 selects the h_8_8 NEON kernel, zero selects h_4_8; one call per edge; always returns 0. */ +static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + uint8_t *p = dst + meta[i].dst_off; + if (wd_8) ff_vp9_loop_filter_h_8_8_neon(p, (ptrdiff_t) dst_stride, + meta[i].E, meta[i].I, meta[i].H); + else ff_vp9_loop_filter_h_4_8_neon(p, (ptrdiff_t) dst_stride, + meta[i].E, meta[i].I, meta[i].H); + } + return 0; +} +/* MC 8-tap horizontal: one NEON call per block with h = 8 and my = 0; always returns 0. */ +static int dispatch_mc_8h_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint8_t *src, size_t src_stride, + size_t n_blocks, const daedalus_mc_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_blocks; i++) { + ff_vp9_put_regular8_h_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + src + meta[i].src_off + 3, /* NOTE(review): presumably pre-advances past the NEON kernel's internal -3 so src_off stays raw as the header documents; confirm against the asm */ + (ptrdiff_t) src_stride, + 8, meta[i].mx, 0); + } + return 0; +} +/* CDEF: one NEON call per block; tmp_off_u16 is applied in uint16_t elements; always returns 0. */ +static int dispatch_cdef_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint16_t *tmp, + size_t n_blocks, const daedalus_cdef_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_blocks; i++) { + dav1d_cdef_filter8_8bpc_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + tmp + meta[i].tmp_off_u16, + meta[i].pri_strength, + meta[i].sec_strength, + meta[i].dir, meta[i].damping, 8, 0); /* h = 8, edges = 0; NOTE(review): presumably relies on the caller's sentinel padding in tmp; confirm */ + } + return 0; +} + +/* -------------------- Public dispatch entry points -------------- */ + +#define ROUTE(_kernel, _cpu_fn, ...) 
\ + daedalus_substrate eff = sub; \ + if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \ + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ + eff = DAEDALUS_SUBSTRATE_CPU; \ + if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \ + return -1 /* QPU path not yet wired in Phase 8 skeleton */ +/* ROUTE expands to a whole function body and relies on locals named ctx and sub: resolve AUTO via the recipe, fall back to CPU when the ctx has no QPU, run the CPU helper, otherwise return -1. */ +int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t n_blocks, + const daedalus_idct8_meta *meta) +{ + ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu, + dst, dst_stride, coeffs, n_blocks, meta); +} +/* Same routing steps as ROUTE, written out by hand; passes wd_8 = 0 to the shared loop-filter helper. */ +int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_CPU) + return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta); + return -1; +} +/* Same routing steps as lpf4, passing wd_8 = 1 to the shared loop-filter helper. */ +int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta) +{ + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_CPU) + return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta); + return -1; +} +/* MC 8-tap horizontal entry point; the body is the ROUTE expansion over dispatch_mc_8h_cpu. */ +int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const uint8_t *src, size_t src_stride, + size_t n_blocks, const daedalus_mc_meta *meta) +{ + ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu, + dst, dst_stride, src, src_stride, 
n_blocks, meta); +} + +int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + const uint16_t *tmp, + size_t n_blocks, const daedalus_cdef_meta *meta) +{ + ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu, + dst, dst_stride, tmp, n_blocks, meta); +} + +/* -------------------- Recipe convenience wrappers --------------- */ + +int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t n_blocks, + const daedalus_idct8_meta *meta) +{ + return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, coeffs, n_blocks, meta); +} + +int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta) +{ + return daedalus_dispatch_vp9_lpf4(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, n_edges, meta); +} + +int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_lpf_meta *meta) +{ + return daedalus_dispatch_vp9_lpf8(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, n_edges, meta); +} + +int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint8_t *src, size_t src_stride, + size_t n_blocks, const daedalus_mc_meta *meta) +{ + return daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, src, src_stride, n_blocks, meta); +} + +int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const uint16_t *tmp, + size_t n_blocks, const daedalus_cdef_meta *meta) +{ + return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO, + dst, dst_stride, tmp, n_blocks, meta); +} diff --git a/tests/test_api_idct.c b/tests/test_api_idct.c new file mode 100644 index 0000000..13896b3 --- /dev/null +++ b/tests/test_api_idct.c @@ -0,0 +1,103 @@ +/* + * Phase 8 — first end-to-end test through the public API. 
+ * + * Exercises `daedalus_recipe_dispatch_vp9_idct8` end-to-end: + * 1. Create context. + * 2. Generate random VP9 coefficient blocks + dst pixels. + * 3. Compute reference output via the C ref (tests/vp9_idct8_ref.c). + * 4. Run public API dispatch on a copy of dst. + * 5. Assert bit-exact. + * + * In Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch + * not yet wired through the API). Bit-exact gate against C ref + * still passes because the underlying NEON kernel was the cycle 1 + * reference. + */ +#include +#include +#include +#include +#include + +#include "../include/daedalus.h" + +extern void daedalus_vp9_idct_idct_8x8_add_ref( + uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +#define BLOCKS_W 8 +#define BLOCKS_H 8 +#define N_BLOCKS (BLOCKS_W * BLOCKS_H) +#define DST_STRIDE (BLOCKS_W * 8) +#define DST_BYTES (BLOCKS_H * 8 * DST_STRIDE) + +static uint64_t xs_state = 0xa57edbeef5717ULL; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +int main(void) +{ + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; } + + printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n"); + printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n", + daedalus_ctx_has_qpu(ctx)); + printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8)); + + /* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */ + int16_t coeffs[N_BLOCKS * 64]; + memset(coeffs, 0, sizeof(coeffs)); + for (int i = 0; i < N_BLOCKS; i++) { + /* Sparse non-zero coefs to keep range realistic. 
*/ + int n = 1 + (int)(xs() % 16); + for (int j = 0; j < n; j++) { + int pos = (int)(xs() % 64); + int16_t v = (int16_t)((int)(xs() % 8192) - 4096); + coeffs[i * 64 + pos] = v; + } + } + + uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES]; + for (int i = 0; i < DST_BYTES; i++) + dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff); + + /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset + * by*8*stride + bx*8. */ + daedalus_idct8_meta meta[N_BLOCKS]; + for (int by = 0; by < BLOCKS_H; by++) { + for (int bx = 0; bx < BLOCKS_W; bx++) { + int i = by * BLOCKS_W + bx; + meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8); + meta[i].block_x = (uint32_t) bx; + meta[i].block_y = (uint32_t) by; + meta[i]._pad = 0; + } + } + + /* Compute reference via the C ref (mutates a scratch copy of + * coeffs because the C ref destroys its input). */ + int16_t scratch[64]; + for (int i = 0; i < N_BLOCKS; i++) { + memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t)); + daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off, + DST_STRIDE, scratch, 64); + } + + /* Dispatch through the public API. */ + int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE, + coeffs, N_BLOCKS, meta); + if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; } + + /* Compare. */ + int diffs = 0; + for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++; + printf(" bytes bit-exact: %d / %d (%.4f%%)\n", + DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES); + + daedalus_ctx_destroy(ctx); + return diffs == 0 ? 0 : 1; +}