760f6a4060
include/daedalus.h: stable C API surface exposing the 5 cycles (VP9 IDCT 8x8, LPF wd=4, MC 8h, LPF wd=8; AV1 CDEF). Per-kernel recipe-dispatch helpers default to the cycle 1-5 verdict substrate (QPU for cycles 1+2+4, CPU for cycles 3+5); explicit override available for benchmarking and runtime-aware scheduling. src/daedalus_core.c: NEON-path implementation of all 5 kernels wrapped behind the public API. QPU path stubbed out (returns -1) since wiring v3d_runner into daedalus_ctx is the next Phase 8 sub-step; with has_qpu=0 the recipe falls back to CPU cleanly. tests/test_api_idct.c: 64-block IDCT through the public recipe dispatch, bit-exact vs C ref. PASS 4096/4096 bytes — proves the API surface compiles, library links, dispatch routing works, and NEON fallback delivers correct results. docs/phase8_scoping.md: architecture options (A=userspace V4L2, B=kernel V4L2 shim, C=direct libva); pick A for v1; explicitly out-of-scope work tracked. Next Phase 8 sub-step: wire v3d_runner into daedalus_ctx so has_qpu=1 and QPU dispatch goes through the API too. After that: V4L2 ioctl glue, bitstream parser, superblock loop. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
104 lines
3.6 KiB
C
104 lines
3.6 KiB
C
/*
 * Phase 8 — first end-to-end test through the public API.
 *
 * Exercises `daedalus_recipe_dispatch_vp9_idct8` end-to-end:
 *   1. Create context.
 *   2. Generate random VP9 coefficient blocks + dst pixels.
 *   3. Compute reference output via the C ref (tests/vp9_idct8_ref.c).
 *   4. Run public API dispatch on a copy of dst.
 *   5. Assert bit-exact.
 *
 * In the Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch
 * not yet wired through the API). Bit-exact gate against C ref
 * still passes because the underlying NEON kernel was the cycle 1
 * reference.
 */
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
|
|
#include "../include/daedalus.h"
|
|
|
|
/*
 * C reference for the VP9 8x8 IDCT-and-add, used as the bit-exact oracle
 * (the file header names tests/vp9_idct8_ref.c as its home).  The caller in
 * this file hands it a scratch copy of each coefficient block because the
 * reference destroys its `block` input.
 */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst,
                                        ptrdiff_t stride,
                                        int16_t *block,
                                        int eob);
|
|
|
|
/*
 * Geometry of the test surface: a BLOCKS_W x BLOCKS_H grid of 8x8 pixel
 * blocks stored in one contiguous byte buffer.
 */
enum {
    BLOCKS_W   = 8,                         /* blocks per row of the grid    */
    BLOCKS_H   = 8,                         /* block rows in the grid        */
    N_BLOCKS   = BLOCKS_W * BLOCKS_H,       /* total number of 8x8 blocks    */
    DST_STRIDE = BLOCKS_W * 8,              /* bytes per pixel row           */
    DST_BYTES  = BLOCKS_H * 8 * DST_STRIDE  /* size of the whole surface     */
};
|
|
|
|
/* Generator state; the constant seed makes every run produce the same data. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/*
 * Advance the 64-bit xorshift generator (shifts 13, 7, 17) by one step.
 * The new state is stored back into xs_state and also returned.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
|
|
|
int main(void)
|
|
{
|
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
|
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
|
|
|
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
|
|
printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
|
|
daedalus_ctx_has_qpu(ctx));
|
|
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
|
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
|
|
|
|
/* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */
|
|
int16_t coeffs[N_BLOCKS * 64];
|
|
memset(coeffs, 0, sizeof(coeffs));
|
|
for (int i = 0; i < N_BLOCKS; i++) {
|
|
/* Sparse non-zero coefs to keep range realistic. */
|
|
int n = 1 + (int)(xs() % 16);
|
|
for (int j = 0; j < n; j++) {
|
|
int pos = (int)(xs() % 64);
|
|
int16_t v = (int16_t)((int)(xs() % 8192) - 4096);
|
|
coeffs[i * 64 + pos] = v;
|
|
}
|
|
}
|
|
|
|
uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
|
|
for (int i = 0; i < DST_BYTES; i++)
|
|
dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
|
|
|
|
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
|
|
* by*8*stride + bx*8. */
|
|
daedalus_idct8_meta meta[N_BLOCKS];
|
|
for (int by = 0; by < BLOCKS_H; by++) {
|
|
for (int bx = 0; bx < BLOCKS_W; bx++) {
|
|
int i = by * BLOCKS_W + bx;
|
|
meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8);
|
|
meta[i].block_x = (uint32_t) bx;
|
|
meta[i].block_y = (uint32_t) by;
|
|
meta[i]._pad = 0;
|
|
}
|
|
}
|
|
|
|
/* Compute reference via the C ref (mutates a scratch copy of
|
|
* coeffs because the C ref destroys its input). */
|
|
int16_t scratch[64];
|
|
for (int i = 0; i < N_BLOCKS; i++) {
|
|
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
|
daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off,
|
|
DST_STRIDE, scratch, 64);
|
|
}
|
|
|
|
/* Dispatch through the public API. */
|
|
int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
|
|
coeffs, N_BLOCKS, meta);
|
|
if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
|
|
|
|
/* Compare. */
|
|
int diffs = 0;
|
|
for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
|
|
printf(" bytes bit-exact: %d / %d (%.4f%%)\n",
|
|
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
|
|
|
daedalus_ctx_destroy(ctx);
|
|
return diffs == 0 ? 0 : 1;
|
|
}
|