/* * Phase 8 — first end-to-end test through the public API. * * Exercises `daedalus_recipe_dispatch_vp9_idct8` end-to-end: * 1. Create context. * 2. Generate random VP9 coefficient blocks + dst pixels. * 3. Compute reference output via the C ref (tests/vp9_idct8_ref.c). * 4. Run public API dispatch on a copy of dst. * 5. Assert bit-exact. * * In Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch * not yet wired through the API). Bit-exact gate against C ref * still passes because the underlying NEON kernel was the cycle 1 * reference. */ #include #include #include #include #include #include "../include/daedalus.h" extern void daedalus_vp9_idct_idct_8x8_add_ref( uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); #define BLOCKS_W 8 #define BLOCKS_H 8 #define N_BLOCKS (BLOCKS_W * BLOCKS_H) #define DST_STRIDE (BLOCKS_W * 8) #define DST_BYTES (BLOCKS_H * 8 * DST_STRIDE) static uint64_t xs_state = 0xa57edbeef5717ULL; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static int run_once(daedalus_substrate force, const int16_t *coeffs, const daedalus_idct8_meta *meta, const uint8_t *dst_initial, const uint8_t *dst_ref, const char *label) { daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; } int has_qpu = daedalus_ctx_has_qpu(ctx); printf(" [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force); if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) { printf(" SKIP — QPU unavailable on this host\n"); daedalus_ctx_destroy(ctx); return 0; } uint8_t dst[DST_BYTES]; memcpy(dst, dst_initial, DST_BYTES); int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE, coeffs, N_BLOCKS, meta); if (rc) { fprintf(stderr, " dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; } int diffs = 0; for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++; printf(" %d / %d bytes bit-exact (%.4f%%)\n", DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES); daedalus_ctx_destroy(ctx); return diffs == 0 ? 0 : 1; } int main(void) { printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n"); printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8)); /* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */ int16_t coeffs[N_BLOCKS * 64]; memset(coeffs, 0, sizeof(coeffs)); for (int i = 0; i < N_BLOCKS; i++) { /* Sparse non-zero coefs to keep range realistic. */ int n = 1 + (int)(xs() % 16); for (int j = 0; j < n; j++) { int pos = (int)(xs() % 64); int16_t v = (int16_t)((int)(xs() % 8192) - 4096); coeffs[i * 64 + pos] = v; } } uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES]; for (int i = 0; i < DST_BYTES; i++) dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff); /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset * by*8*stride + bx*8. */ daedalus_idct8_meta meta[N_BLOCKS]; for (int by = 0; by < BLOCKS_H; by++) { for (int bx = 0; bx < BLOCKS_W; bx++) { int i = by * BLOCKS_W + bx; meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8); meta[i].block_x = (uint32_t) bx; meta[i].block_y = (uint32_t) by; meta[i]._pad = 0; } } /* Compute reference via the C ref (mutates a scratch copy of * coeffs because the C ref destroys its input). */ int16_t scratch[64]; for (int i = 0; i < N_BLOCKS; i++) { memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t)); daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off, DST_STRIDE, scratch, 64); } int fail = 0; fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU"); fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU"); fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO"); return fail; }