Phase 8 skeleton: public C API + first end-to-end smoke test
include/daedalus.h: stable C API surface exposing the 5 cycles (VP9 IDCT 8x8, LPF wd=4, MC 8h, LPF wd=8; AV1 CDEF). Per-kernel recipe-dispatch helpers default to the cycle 1-5 verdict substrate (QPU for cycles 1+2+4, CPU for cycles 3+5); explicit override available for benchmarking and runtime-aware scheduling. src/daedalus_core.c: NEON-path implementation of all 5 kernels wrapped behind the public API. QPU path stubbed out (returns -1) since wiring v3d_runner into daedalus_ctx is the next Phase 8 sub-step; with has_qpu=0 the recipe falls back to CPU cleanly. tests/test_api_idct.c: 64-block IDCT through the public recipe dispatch, bit-exact vs C ref. PASS 4096/4096 bytes — proves the API surface compiles, library links, dispatch routing works, and NEON fallback delivers correct results. docs/phase8_scoping.md: architecture options (A=userspace V4L2, B=kernel V4L2 shim, C=direct libva); pick A for v1; explicitly out-of-scope work tracked. Next Phase 8 sub-step: wire v3d_runner into daedalus_ctx so has_qpu=1 and QPU dispatch goes through the API too. After that: V4L2 ioctl glue, bitstream parser, superblock loop. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -276,6 +276,32 @@ if (DAEDALUS_BUILD_VULKAN)
|
|||||||
add_dependencies(bench_v3d_cdef daedalus_shaders)
|
add_dependencies(bench_v3d_cdef daedalus_shaders)
|
||||||
target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan)
|
target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan)
|
||||||
target_compile_options(bench_v3d_cdef PRIVATE -O2)
|
target_compile_options(bench_v3d_cdef PRIVATE -O2)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# ---- Phase 8 — public C API library + smoke test ---------------------------

# Static library behind include/daedalus.h: the dispatcher source plus the
# kernel source lists (the *_SOURCES variables are presumably populated
# earlier in this file — they are not defined in this section).
add_library(daedalus_core STATIC
    src/daedalus_core.c
    ${FFASM_SOURCES}
    ${FFASM_LPF_SOURCES}
    ${FFASM_MC_SOURCES}
    ${FFC_MC_SOURCES}
    ${DAV1D_CDEF_ASM_SOURCES}
    ${DAV1D_CDEF_C_SOURCES}
)
# PUBLIC so that anything linking daedalus_core also gets the include/ path.
target_include_directories(daedalus_core PUBLIC include)
target_compile_options(daedalus_core PRIVATE -O2)

# Smoke test: IDCT through the public API, compared against the C reference.
add_executable(test_api_idct
    tests/test_api_idct.c
    tests/vp9_idct8_ref.c
)
target_link_libraries(test_api_idct PRIVATE daedalus_core)
target_compile_options(test_api_idct PRIVATE -O2)
|
||||||
|
|
||||||
|
if (DAEDALUS_BUILD_VULKAN)
|
||||||
|
# (re-open the conditional so the closing endif() below balances)
|
||||||
|
|
||||||
|
|
||||||
# M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON
|
# M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON
|
||||||
# snapshot so we can run real NEON kernels on pinned CPU cores
|
# snapshot so we can run real NEON kernels on pinned CPU cores
|
||||||
|
|||||||
@@ -0,0 +1,142 @@
|
|||||||
|
---
|
||||||
|
phase: 8
|
||||||
|
status: scoping (architecture options + tractable-first-step picked)
|
||||||
|
date_opened: 2026-05-18
|
||||||
|
prereqs: cycles 1-5 closed (IDCT, LPF wd=4, MC, LPF wd=8, CDEF)
|
||||||
|
consumer_target: libva-v4l2-request-fourier → firefox/chromium-fourier
|
||||||
|
---
|
||||||
|
|
||||||
|
# Phase 8 — V4L2 deployment scoping
|
||||||
|
|
||||||
|
## What Phase 8 is
|
||||||
|
|
||||||
|
The "deliver the work" phase. Cycles 1-5 produced 5 individually-
|
||||||
|
measured per-block kernels (3 deployed on QPU, 2 on CPU per the
|
||||||
|
deployment recipe). Phase 8 makes those kernels add up to a
|
||||||
|
decoded video at the user's display.
|
||||||
|
|
||||||
|
Per `project_consumer_target.md`, the integration target is
|
||||||
|
**libva-v4l2-request-fourier**: a V4L2 stateless decoder node
|
||||||
|
exposing a VP9 (later AV1) contract, bridged via VA-API to
|
||||||
|
browser-fourier builds. Same plumbing mfritsche already runs for
|
||||||
|
HEVC/RK3588, different decoder backend.
|
||||||
|
|
||||||
|
## Architecture stack
|
||||||
|
|
||||||
|
```
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| firefox-fourier / chromium-fourier (already builds) |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| VA-API |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| libva-v4l2-request-fourier (already runs for HEVC) |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| V4L2 stateless ioctl interface (kernel uAPI) |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| daedalus-fourier V4L2 shim (NEW — Phase 8 work) |
|
||||||
|
| ↳ Parses bitstream control structs (V4L2_CID_STATELESS_VP9_*)
|
||||||
|
| ↳ Drives per-superblock decode loop
|
||||||
|
| ↳ Dispatches per-kernel to CPU NEON or V3D QPU (recipe)
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| daedalus-fourier core library (NEW Phase 8 — wraps |
|
||||||
|
| ↳ kernels from cycles 1-5) |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
| V3D 7.1 Mesa userspace + ARM NEON |
|
||||||
|
+-------------------------------------------------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
## Three architecture options
|
||||||
|
|
||||||
|
### Option A — Userspace V4L2 emulation (recommended for v1)
|
||||||
|
|
||||||
|
Implement a userspace `videodev2`-compatible loopback device
|
||||||
|
(via `v4l2loopback` or a custom UIO-style approach) that exposes
|
||||||
|
`/dev/videoNN` with the VP9 stateless contract. libva-v4l2-
|
||||||
|
request-fourier talks to this normally.
|
||||||
|
|
||||||
|
**Pros**: stays entirely in userspace; no kernel module work; can
|
||||||
|
iterate quickly; isolation from kernel crash domain. The
|
||||||
|
daedalus-fourier daemon runs as a regular Linux process, taking
|
||||||
|
V4L2 ioctls (via the loopback shim) and emitting decoded frames.
|
||||||
|
|
||||||
|
**Cons**: v4l2loopback is loosely maintained; userspace V4L2 has
|
||||||
|
some semantic quirks (DRM/PRIME buffer sharing is harder than in
|
||||||
|
a real kernel driver).
|
||||||
|
|
||||||
|
### Option B — Tiny kernel V4L2 shim
|
||||||
|
|
||||||
|
A small kernel module that registers as a V4L2 device, takes the
|
||||||
|
ioctls, and forwards bitstream blobs + control structs to a
|
||||||
|
userspace daemon (the actual decoder) over a UNIX socket or
|
||||||
|
character device. The daemon decodes and posts frames back.
|
||||||
|
|
||||||
|
**Pros**: a real `/dev/videoNN` with proper VFL_TYPE_VIDEO
|
||||||
|
semantics. DRM PRIME buffer sharing works correctly.
|
||||||
|
|
||||||
|
**Cons**: kernel module work. Cross-process buffer marshaling
|
||||||
|
adds latency. Out-of-tree maintenance burden.
|
||||||
|
|
||||||
|
### Option C — Direct libva integration (not recommended)
|
||||||
|
|
||||||
|
Skip V4L2 entirely; implement a libva backend module directly.
|
||||||
|
|
||||||
|
**Pros**: avoids the V4L2 wrapper layer entirely.
|
||||||
|
|
||||||
|
**Cons**: contradicts `project_consumer_target.md` (decision to
|
||||||
|
use V4L2 path locked in). libva backend maintenance burden is
|
||||||
|
roughly equivalent to V4L2 shim with no portability gain.
|
||||||
|
|
||||||
|
**Pick A** for v1; revisit if userspace V4L2 semantics block
|
||||||
|
DRM PRIME / dmabuf for browser zero-copy.
|
||||||
|
|
||||||
|
## What's tractable this session
|
||||||
|
|
||||||
|
Phase 8 in full is **days of work** (V4L2 ioctl glue, bitstream
|
||||||
|
parser, superblock loop, frame buffer management, dmabuf handling,
|
||||||
|
end-to-end test against a real VP9 clip). Out of scope for a
|
||||||
|
single session continuation.
|
||||||
|
|
||||||
|
What IS tractable now:
|
||||||
|
|
||||||
|
1. **Public C API header** (`include/daedalus.h`): declare the
|
||||||
|
library's stable function surface for the 5 kernels +
|
||||||
|
substrate selection + init/teardown. Future Phase 8 V4L2 shim
|
||||||
|
consumes this header. This:
|
||||||
|
- Locks the API shape so V4L2 work doesn't need to plumb
|
||||||
|
through internal types.
|
||||||
|
- Documents which kernels deploy where (recipe encoded in API).
|
||||||
|
- Forces a clean separation between "kernel work" (cycles 1-5)
|
||||||
|
and "decoder pipeline" (Phase 8).
|
||||||
|
|
||||||
|
2. **A minimal core library** (`src/daedalus_core.{h,c}`):
|
||||||
|
skeleton that compiles, has the right typedefs and dispatch
|
||||||
|
tables, but body of each function is `assert(0 && "TODO")`.
|
||||||
|
Builds against existing kernel implementations.
|
||||||
|
|
||||||
|
3. **One integration test** (`tests/test_api_idct.c`):
|
||||||
|
exercise the public API for ONE kernel end-to-end. Proves the
|
||||||
|
API can connect to existing benches.
|
||||||
|
|
||||||
|
This commit gives the integration target something concrete to
|
||||||
|
hook into without prejudging V4L2 architecture (A/B/C).
|
||||||
|
|
||||||
|
## Out of scope for this session
|
||||||
|
|
||||||
|
- v4l2loopback setup (Option A specifics).
|
||||||
|
- VP9 bitstream parser (huge — borrow from FFmpeg / VP9 reference).
|
||||||
|
- Superblock-level decode loop.
|
||||||
|
- Frame buffer / dmabuf integration.
|
||||||
|
- libva-v4l2-request-fourier modifications (separate sibling repo).
|
||||||
|
|
||||||
|
These are tracked as future phases / issues.
|
||||||
|
|
||||||
|
## Acceptance for this Phase 8 scoping deliverable
|
||||||
|
|
||||||
|
- `include/daedalus.h` exists and is documented.
|
||||||
|
- `src/daedalus_core.{h,c}` skeleton compiles + links into the
|
||||||
|
existing CMake build.
|
||||||
|
- One pass-through test (`test_api_idct`) runs and
|
||||||
|
exercises the public API path for at least one kernel,
|
||||||
|
producing the same M1 bit-exact result the cycle 1 bench did.
|
||||||
|
- Recipe table (which kernel runs where) is documented in the
|
||||||
|
header and the docs/k* phase7 docs cross-reference it.
|
||||||
@@ -0,0 +1,210 @@
|
|||||||
|
/*
|
||||||
|
* daedalus-fourier — public C API.
|
||||||
|
*
|
||||||
|
* Stable surface for the integration layer (Phase 8 V4L2 shim,
|
||||||
|
* libva-v4l2-request-fourier consumer, or any future skin) to
|
||||||
|
* dispatch per-kernel work to the right substrate per the
|
||||||
|
* cycle 1-5 deployment recipe.
|
||||||
|
*
|
||||||
|
* Recipe (verdict at end of cycles 1-5, see docs/k*_phase7.md):
|
||||||
|
*
|
||||||
|
* VP9 IDCT 8x8 → V3D QPU (R=0.92 GREEN; M4 +7.2 %)
|
||||||
|
* VP9 LPF wd=4 inner → V3D QPU (R=0.41 ORANGE; M4 +6.9 %)
|
||||||
|
* VP9 MC 8-tap horiz → CPU NEON (R=0.067 RED; M4 -19.5 %)
|
||||||
|
* VP9 LPF wd=8 inner → V3D QPU (R=0.34 ORANGE; M4 +4.1 %)
|
||||||
|
* AV1 CDEF 8x8 luma → CPU NEON (R=0.116 ORANGE; QPU = opportunistic helper at 0.4 Mblock/s)
|
||||||
|
*
|
||||||
|
* The API exposes BOTH substrates for every kernel — the
|
||||||
|
* integration layer can override the recipe at runtime if it
|
||||||
|
* has scheduler knowledge the kernel-level R-band measurement
|
||||||
|
* didn't capture. The recommended path is to use
|
||||||
|
* `daedalus_recipe_dispatch_*` which picks the recipe substrate
|
||||||
|
* automatically.
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause. This header is part of the library API
|
||||||
|
* boundary; the implementation links against vendored
|
||||||
|
* LGPL-2.1+ FFmpeg snapshot and BSD-2-Clause dav1d snapshot.
|
||||||
|
*
|
||||||
|
* Threading: a `daedalus_ctx *` owns Vulkan + V3D state. A
|
||||||
|
* context is single-threaded; use one per worker thread if you
|
||||||
|
* need parallelism on the QPU side. NEON-side dispatch is
|
||||||
|
* stateless and re-entrant.
|
||||||
|
*
|
||||||
|
* ABI: pre-1.0 — no stability guarantees yet. The function names
|
||||||
|
* and signatures will become ABI-stable at v1.0; until then the
|
||||||
|
* integration layer should rebuild against the headers it links
|
||||||
|
* with.
|
||||||
|
*/
|
||||||
|
#ifndef DAEDALUS_FOURIER_H
|
||||||
|
#define DAEDALUS_FOURIER_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* Substrate selection
|
||||||
|
*
|
||||||
|
* Most callers should NOT specify a substrate — use the
|
||||||
|
* `daedalus_recipe_dispatch_*` family below, which picks the
|
||||||
|
* substrate per the cycles-1-5 verdict. Explicit substrate
|
||||||
|
* selection is for benchmarking, debugging, and future
|
||||||
|
* runtime-aware schedulers.
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Execution substrate requested for a dispatch call. */
typedef enum {
    DAEDALUS_SUBSTRATE_AUTO = 0,    /* defer to the recipe table */
    DAEDALUS_SUBSTRATE_CPU  = 1,    /* force the ARM NEON path */
    DAEDALUS_SUBSTRATE_QPU  = 2,    /* force the V3D compute path */
} daedalus_substrate;
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* Context lifecycle
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
typedef struct daedalus_ctx daedalus_ctx;
|
||||||
|
|
||||||
|
/* Create a context. Initialises V3D Vulkan device if available;
|
||||||
|
* NEON-only fallback OK if V3D init fails. Returns NULL on alloc
|
||||||
|
* failure. */
|
||||||
|
daedalus_ctx *daedalus_ctx_create(void);
|
||||||
|
|
||||||
|
/* Returns 1 if QPU dispatch is available on this context, 0 if
|
||||||
|
* NEON-only. Useful for the integration layer to short-circuit
|
||||||
|
* QPU dispatch attempts. */
|
||||||
|
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx);
|
||||||
|
|
||||||
|
void daedalus_ctx_destroy(daedalus_ctx *ctx);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* VP9 IDCT 8x8 add — cycle 1 (QPU by recipe)
|
||||||
|
*
|
||||||
|
* For each of n_blocks: take 64 int16 coefficients, perform 8x8
|
||||||
|
* inverse DCT, add to dst[r,c] = clamp(dst[r,c] + ((q + 16)>>5)).
|
||||||
|
*
|
||||||
|
* `meta` is an array of (dst_byte_offset, block_x, block_y) for
|
||||||
|
* each block, where dst_byte_offset is byte offset into dst.
|
||||||
|
*
|
||||||
|
* Returns 0 on success, negative errno-like on failure.
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Per-block record for the IDCT 8x8 dispatch: four 32-bit fields. */
typedef struct {
    uint32_t dst_off;   /* byte offset into dst */
    uint32_t block_x;   /* used only by QPU path for placement */
    uint32_t block_y;   /* vertical counterpart of block_x */
    uint32_t _pad;      /* padding; the smoke test sets it to 0 */
} daedalus_idct8_meta;
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_vp9_idct8(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_vp9_idct8(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* VP9 LPF wd=4 / wd=8 — cycles 2 and 4 (QPU by recipe)
|
||||||
|
*
|
||||||
|
* Horizontal loop filter across the vertical edge at pixel column 4 of an
|
||||||
|
* 8x8 block. Per-edge thresholds (E, I, H).
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Per-edge record for the loop-filter dispatches (wd=4 and wd=8). */
typedef struct {
    uint32_t dst_off;   /* byte offset into dst, at col 4 of edge */
    int32_t  E;         /* per-edge threshold, passed to the kernel as E */
    int32_t  I;         /* per-edge threshold, passed to the kernel as I */
    int32_t  H;         /* per-edge threshold, passed to the kernel as H */
} daedalus_lpf_meta;
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_vp9_lpf4(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_vp9_lpf8(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_lpf_meta *meta);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* VP9 MC 8-tap horizontal — cycle 3 (CPU by recipe)
|
||||||
|
*
|
||||||
|
* Subpel-fractional 8-tap horizontal filter; mx selects filter
|
||||||
|
* row. CPU path is the high-performance default; QPU path is
|
||||||
|
* available but never recommended by the recipe.
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Per-block record for the 8-tap horizontal motion-compensation dispatch. */
typedef struct {
    uint32_t dst_off;   /* offset added to dst for this block */
    uint32_t src_off;   /* raw, no pre-advance — shader handles -3 internally */
    int32_t  mx;        /* selects the filter row (subpel fraction) */
    uint32_t _pad;      /* padding */
} daedalus_mc_meta;
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_vp9_mc_8h(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const uint8_t *src, size_t src_stride,
|
||||||
|
size_t n_blocks, const daedalus_mc_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const uint8_t *src, size_t src_stride,
|
||||||
|
size_t n_blocks, const daedalus_mc_meta *meta);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* AV1 CDEF 8x8 luma — cycle 5 (CPU by recipe; QPU opportunistic)
|
||||||
|
*
|
||||||
|
* tmp is an array of n_blocks * 192 uint16, with the padded-buffer
|
||||||
|
* layout that dav1d's NEON expects (stride 16, padding 2-rows-top +
|
||||||
|
* 2-cols-left + 2-cols-right + 2-rows-bottom). Caller supplies
|
||||||
|
* tmp populated with either source pixels (if all edges valid) or
|
||||||
|
* INT16_MIN sentinels at the boundary (if edge filtered out).
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Per-block record for the CDEF 8x8 dispatch: six 32-bit fields. */
typedef struct {
    uint32_t dst_off;        /* offset added to dst for this block */
    uint32_t tmp_off_u16;    /* offset to block-origin in tmp[] (= padded_origin + 2*16+2) */
    int32_t  pri_strength;   /* 1..7 */
    int32_t  sec_strength;   /* 1..4 */
    int32_t  dir;            /* 0..7 */
    int32_t  damping;        /* 1..6 */
} daedalus_cdef_meta;
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_cdef_8x8(
|
||||||
|
daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const uint16_t *tmp,
|
||||||
|
size_t n_blocks, const daedalus_cdef_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const uint16_t *tmp,
|
||||||
|
size_t n_blocks, const daedalus_cdef_meta *meta);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
|
||||||
|
* Recipe query — what does the API recommend for each kernel?
|
||||||
|
* ----------------------------------------------------------------- */
|
||||||
|
/* Kernel identifiers accepted by daedalus_recipe_substrate_for(). */
typedef enum {
    DAEDALUS_KERNEL_VP9_IDCT8      = 1,
    DAEDALUS_KERNEL_VP9_LPF4_INNER = 2,
    DAEDALUS_KERNEL_VP9_MC_8H      = 3,
    DAEDALUS_KERNEL_VP9_LPF8_INNER = 4,
    DAEDALUS_KERNEL_AV1_CDEF_8X8   = 5,
} daedalus_kernel;
|
||||||
|
|
||||||
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif /* DAEDALUS_FOURIER_H */
|
||||||
@@ -0,0 +1,252 @@
|
|||||||
|
/*
|
||||||
|
* daedalus-fourier core library — Phase 8 skeleton.
|
||||||
|
*
|
||||||
|
* Wraps cycles 1-5 kernels behind the public C API in
|
||||||
|
* include/daedalus.h. Recipe dispatch routes per-kernel to the
|
||||||
|
* verdict substrate from each cycle's Phase 7 doc.
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
|
||||||
|
* dav1d BSD-2-Clause NEON snapshots.
|
||||||
|
*/
|
||||||
|
#include "../include/daedalus.h"
|
||||||
|
|
||||||
|
#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
|
||||||
|
|
||||||
|
/* -------------------- Context -------------------- */
|
||||||
|
|
||||||
|
/*
 * Library context. In the Phase 8 skeleton it carries only the QPU
 * availability flag; the real implementation would also hold the
 * v3d_runner and the per-kernel pipeline handles.
 */
struct daedalus_ctx {
    int has_qpu;    /* non-zero when QPU dispatch is available */
};
|
||||||
|
|
||||||
|
daedalus_ctx *daedalus_ctx_create(void)
|
||||||
|
{
|
||||||
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||||
|
if (!ctx) return NULL;
|
||||||
|
/* Phase 8 deferred: real impl probes V3D Vulkan device; for now
|
||||||
|
* default to CPU-only (NEON paths are always available). */
|
||||||
|
ctx->has_qpu = 0;
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
||||||
|
{
|
||||||
|
return ctx ? ctx->has_qpu : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Release a context obtained from daedalus_ctx_create().
 *
 * A NULL ctx is accepted, because free(NULL) is a no-op.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    free(ctx);
}
|
||||||
|
|
||||||
|
/* -------------------- Recipe query -------------------- */
|
||||||
|
|
||||||
|
/*
 * Look up the substrate the recipe recommends for kernel k.
 *
 * IDCT 8x8 and both loop filters map to the QPU; MC 8h and CDEF map to
 * the CPU. Any value outside the enum also yields the CPU.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:
        return DAEDALUS_SUBSTRATE_QPU;

    case DAEDALUS_KERNEL_VP9_MC_8H:
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:
        break;
    }
    return DAEDALUS_SUBSTRATE_CPU;    /* recipe CPU kernels + defensive default */
}
|
||||||
|
|
||||||
|
/* -------------------- NEON externs (per cycle bench links) ----- */
|
||||||
|
|
||||||
|
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
|
||||||
|
int16_t *block, int eob);
|
||||||
|
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||||
|
int E, int I, int H);
|
||||||
|
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
|
||||||
|
int E, int I, int H);
|
||||||
|
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
|
const uint8_t *src, ptrdiff_t src_stride,
|
||||||
|
int h, int mx, int my);
|
||||||
|
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
|
const uint16_t *tmp,
|
||||||
|
int pri_strength, int sec_strength,
|
||||||
|
int dir, int damping, int h,
|
||||||
|
size_t edges);
|
||||||
|
|
||||||
|
/* -------------------- CPU dispatch implementations -------------- */
|
||||||
|
|
||||||
|
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta)
|
||||||
|
{
|
||||||
|
(void) ctx;
|
||||||
|
int16_t scratch[64];
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
||||||
|
ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off,
|
||||||
|
(ptrdiff_t) dst_stride,
|
||||||
|
scratch, 64);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * CPU (NEON) path shared by the wd=4 and wd=8 loop-filter dispatches.
 *
 * ctx        unused on the CPU path.
 * wd_8       non-zero selects the h_8_8 kernel, zero the h_4_8 kernel.
 * dst        surface to filter in place; dst_stride is handed to the
 *            kernel as its row stride.
 * meta       n_edges records giving the edge position (dst_off) and
 *            the E / I / H thresholds.
 *
 * Returns 0 on success (including n_edges == 0), or -EINVAL when
 * n_edges > 0 and dst or meta is NULL.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    (void) ctx;

    if (n_edges == 0)
        return 0;
    if (dst == NULL || meta == NULL)
        return -EINVAL;

    const ptrdiff_t stride = (ptrdiff_t) dst_stride;

    for (size_t i = 0; i < n_edges; i++) {
        const daedalus_lpf_meta *edge = &meta[i];
        uint8_t *p = dst + edge->dst_off;

        if (wd_8) {
            ff_vp9_loop_filter_h_8_8_neon(p, stride, edge->E, edge->I, edge->H);
        } else {
            ff_vp9_loop_filter_h_4_8_neon(p, stride, edge->E, edge->I, edge->H);
        }
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * CPU (NEON) path for the VP9 8-tap horizontal motion compensation.
 *
 * For each of n_blocks, calls the regular-8 horizontal kernel with
 * h = 8, mx = meta[i].mx and my = 0.
 *
 * ctx        unused on the CPU path.
 * dst / src  destination and source surfaces; the strides are handed
 *            to the kernel unchanged.
 * meta       n_blocks records giving dst_off, src_off and mx.
 *
 * Returns 0 on success (including n_blocks == 0), or -EINVAL when
 * n_blocks > 0 and dst, src or meta is NULL.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    enum { BLOCK_ROWS = 8, SRC_BIAS = 3 };

    (void) ctx;

    if (n_blocks == 0)
        return 0;
    if (dst == NULL || src == NULL || meta == NULL)
        return -EINVAL;

    for (size_t i = 0; i < n_blocks; i++) {
        /* NOTE(review): src_off is documented as "raw, no pre-advance";
         * the +3 presumably cancels a 3-pixel back-step inside the NEON
         * routine so both substrates read the same taps — confirm
         * against the cycle 3 bench before changing it. */
        ff_vp9_put_regular8_h_neon(dst + meta[i].dst_off,
                                   (ptrdiff_t) dst_stride,
                                   src + meta[i].src_off + SRC_BIAS,
                                   (ptrdiff_t) src_stride,
                                   BLOCK_ROWS, meta[i].mx, 0);
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * CPU (NEON) path for the AV1 CDEF 8x8 filter.
 *
 * For each of n_blocks, calls the 8bpc filter8 kernel with h = 8 and
 * edges = 0, reading from tmp + meta[i].tmp_off_u16 and writing at
 * dst + meta[i].dst_off.
 *
 * ctx        unused on the CPU path.
 * dst        destination surface; dst_stride is handed to the kernel
 *            as its row stride.
 * tmp        caller-prepared padded 16-bit buffer; never modified here.
 * meta       n_blocks records giving offsets, strengths, direction and
 *            damping.
 *
 * Returns 0 on success (including n_blocks == 0), or -EINVAL when
 * n_blocks > 0 and dst, tmp or meta is NULL.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    enum { BLOCK_ROWS = 8 };

    (void) ctx;

    if (n_blocks == 0)
        return 0;
    if (dst == NULL || tmp == NULL || meta == NULL)
        return -EINVAL;

    for (size_t i = 0; i < n_blocks; i++) {
        const daedalus_cdef_meta *blk = &meta[i];

        /* edges is always 0 here. NOTE(review): presumably this relies on
         * the caller having encoded edge availability in tmp's padding —
         * confirm against the kernel's edges contract. */
        dav1d_cdef_filter8_8bpc_neon(dst + blk->dst_off,
                                     (ptrdiff_t) dst_stride,
                                     tmp + blk->tmp_off_u16,
                                     blk->pri_strength,
                                     blk->sec_strength,
                                     blk->dir, blk->damping,
                                     BLOCK_ROWS, 0);
    }
    return 0;
}
|
||||||
|
|
||||||
|
/* -------------------- Public dispatch entry points -------------- */
|
||||||
|
|
||||||
|
#define ROUTE(_kernel, _cpu_fn, ...) \
|
||||||
|
daedalus_substrate eff = sub; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \
|
||||||
|
return -1 /* QPU path not yet wired in Phase 8 skeleton */
|
||||||
|
|
||||||
|
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta)
|
||||||
|
{
|
||||||
|
ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
|
||||||
|
dst, dst_stride, coeffs, n_blocks, meta);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * VP9 loop filter wd=4 with an explicit substrate request.
 *
 * Uses the same ROUTE macro as the other dispatchers instead of a
 * hand-expanded copy of its logic: AUTO follows the recipe, a QPU
 * request falls back to the CPU when the context has no QPU, and a
 * request that still resolves to the QPU returns -1 (not wired yet).
 * On the CPU route the result of dispatch_lpf_cpu() with wd_8 = 0 is
 * returned.
 */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    ROUTE(DAEDALUS_KERNEL_VP9_LPF4_INNER, dispatch_lpf_cpu,
          0, dst, dst_stride, n_edges, meta);
}
|
||||||
|
|
||||||
|
/*
 * VP9 loop filter wd=8 with an explicit substrate request.
 *
 * Uses the same ROUTE macro as the other dispatchers instead of a
 * hand-expanded copy of its logic: AUTO follows the recipe, a QPU
 * request falls back to the CPU when the context has no QPU, and a
 * request that still resolves to the QPU returns -1 (not wired yet).
 * On the CPU route the result of dispatch_lpf_cpu() with wd_8 = 1 is
 * returned.
 */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    ROUTE(DAEDALUS_KERNEL_VP9_LPF8_INNER, dispatch_lpf_cpu,
          1, dst, dst_stride, n_edges, meta);
}
|
||||||
|
|
||||||
|
/*
 * VP9 8-tap horizontal MC with an explicit substrate request.
 *
 * Routing is done by ROUTE: AUTO follows the recipe, a QPU request
 * falls back to the CPU when the context has no QPU, and a request
 * that still resolves to the QPU returns -1 because that path is not
 * wired yet. On the CPU route the result of dispatch_mc_8h_cpu() is
 * returned.
 */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                daedalus_substrate sub,
                                uint8_t *dst,
                                size_t dst_stride,
                                const uint8_t *src,
                                size_t src_stride,
                                size_t n_blocks,
                                const daedalus_mc_meta *meta)
{
    ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
          dst, dst_stride, src, src_stride, n_blocks, meta);
}
|
||||||
|
|
||||||
|
/*
 * AV1 CDEF 8x8 with an explicit substrate request.
 *
 * Routing is done by ROUTE: AUTO follows the recipe, a QPU request
 * falls back to the CPU when the context has no QPU, and a request
 * that still resolves to the QPU returns -1 because that path is not
 * wired yet. On the CPU route the result of dispatch_cdef_cpu() is
 * returned.
 */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx,
                               daedalus_substrate sub,
                               uint8_t *dst,
                               size_t dst_stride,
                               const uint16_t *tmp,
                               size_t n_blocks,
                               const daedalus_cdef_meta *meta)
{
    ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
          dst, dst_stride, tmp, n_blocks, meta);
}
|
||||||
|
|
||||||
|
/* -------------------- Recipe convenience wrappers --------------- */
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta)
|
||||||
|
{
|
||||||
|
return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||||
|
dst, dst_stride, coeffs, n_blocks, meta);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Recipe-driven VP9 loop filter wd=4: forwards every argument to
 * daedalus_dispatch_vp9_lpf4() with the substrate set to AUTO and
 * returns its result.
 */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate per_recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf4(ctx, per_recipe, dst, dst_stride,
                                      n_edges, meta);
}
|
||||||
|
|
||||||
|
/*
 * Recipe-driven VP9 loop filter wd=8: forwards every argument to
 * daedalus_dispatch_vp9_lpf8() with the substrate set to AUTO and
 * returns its result.
 */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate per_recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf8(ctx, per_recipe, dst, dst_stride,
                                      n_edges, meta);
}
|
||||||
|
|
||||||
|
/*
 * Recipe-driven VP9 8-tap horizontal MC: forwards every argument to
 * daedalus_dispatch_vp9_mc_8h() with the substrate set to AUTO and
 * returns its result.
 */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const uint8_t *src, size_t src_stride,
                                       size_t n_blocks, const daedalus_mc_meta *meta)
{
    const daedalus_substrate per_recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_mc_8h(ctx, per_recipe, dst, dst_stride,
                                       src, src_stride, n_blocks, meta);
}
|
||||||
|
|
||||||
|
/*
 * Recipe-driven AV1 CDEF 8x8: forwards every argument to
 * daedalus_dispatch_cdef_8x8() with the substrate set to AUTO and
 * returns its result.
 */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      const uint16_t *tmp,
                                      size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const daedalus_substrate per_recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_cdef_8x8(ctx, per_recipe, dst, dst_stride,
                                      tmp, n_blocks, meta);
}
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
/*
|
||||||
|
* Phase 8 — first end-to-end test through the public API.
|
||||||
|
*
|
||||||
|
* Exercises `daedalus_recipe_dispatch_vp9_idct8` end-to-end:
|
||||||
|
* 1. Create context.
|
||||||
|
* 2. Generate random VP9 coefficient blocks + dst pixels.
|
||||||
|
* 3. Compute reference output via the C ref (tests/vp9_idct8_ref.c).
|
||||||
|
* 4. Run public API dispatch on a copy of dst.
|
||||||
|
* 5. Assert bit-exact.
|
||||||
|
*
|
||||||
|
* In Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch
|
||||||
|
* not yet wired through the API). Bit-exact gate against C ref
|
||||||
|
* still passes because the underlying NEON kernel was the cycle 1
|
||||||
|
* reference.
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "../include/daedalus.h"
|
||||||
|
|
||||||
|
extern void daedalus_vp9_idct_idct_8x8_add_ref(
|
||||||
|
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||||
|
|
||||||
|
#define BLOCKS_W 8
|
||||||
|
#define BLOCKS_H 8
|
||||||
|
#define N_BLOCKS (BLOCKS_W * BLOCKS_H)
|
||||||
|
#define DST_STRIDE (BLOCKS_W * 8)
|
||||||
|
#define DST_BYTES (BLOCKS_H * 8 * DST_STRIDE)
|
||||||
|
|
||||||
|
/* Generator state; the fixed seed keeps the test vectors reproducible. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/*
 * Advance the 64-bit xorshift generator (shifts 13 / 7 / 17) by one
 * step, store the new state and return it.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    xs_state = s;
    return s;
}
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||||
|
|
||||||
|
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
|
||||||
|
printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
|
||||||
|
daedalus_ctx_has_qpu(ctx));
|
||||||
|
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
|
||||||
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
|
||||||
|
|
||||||
|
/* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */
|
||||||
|
int16_t coeffs[N_BLOCKS * 64];
|
||||||
|
memset(coeffs, 0, sizeof(coeffs));
|
||||||
|
for (int i = 0; i < N_BLOCKS; i++) {
|
||||||
|
/* Sparse non-zero coefs to keep range realistic. */
|
||||||
|
int n = 1 + (int)(xs() % 16);
|
||||||
|
for (int j = 0; j < n; j++) {
|
||||||
|
int pos = (int)(xs() % 64);
|
||||||
|
int16_t v = (int16_t)((int)(xs() % 8192) - 4096);
|
||||||
|
coeffs[i * 64 + pos] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
|
||||||
|
for (int i = 0; i < DST_BYTES; i++)
|
||||||
|
dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
|
||||||
|
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
|
||||||
|
* by*8*stride + bx*8. */
|
||||||
|
daedalus_idct8_meta meta[N_BLOCKS];
|
||||||
|
for (int by = 0; by < BLOCKS_H; by++) {
|
||||||
|
for (int bx = 0; bx < BLOCKS_W; bx++) {
|
||||||
|
int i = by * BLOCKS_W + bx;
|
||||||
|
meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8);
|
||||||
|
meta[i].block_x = (uint32_t) bx;
|
||||||
|
meta[i].block_y = (uint32_t) by;
|
||||||
|
meta[i]._pad = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute reference via the C ref (mutates a scratch copy of
|
||||||
|
* coeffs because the C ref destroys its input). */
|
||||||
|
int16_t scratch[64];
|
||||||
|
for (int i = 0; i < N_BLOCKS; i++) {
|
||||||
|
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
||||||
|
daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off,
|
||||||
|
DST_STRIDE, scratch, 64);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Dispatch through the public API. */
|
||||||
|
int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
|
||||||
|
coeffs, N_BLOCKS, meta);
|
||||||
|
if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
|
||||||
|
|
||||||
|
/* Compare. */
|
||||||
|
int diffs = 0;
|
||||||
|
for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
|
||||||
|
printf(" bytes bit-exact: %d / %d (%.4f%%)\n",
|
||||||
|
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
||||||
|
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diffs == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user