/* * Phase 8 — VP9 LPF wd=4 + wd=8 through the public API. * * Exercises both kernels in CPU / QPU / AUTO modes against the * C reference (tests/vp9_lpf_ref.c, vp9_lpf8_ref.c). Bit-exact * gate per cycle 2 and 4 phase 7 docs. */ #include #include #include #include #include #include "../include/daedalus.h" extern void daedalus_vp9_loop_filter_h_4_8_ref( uint8_t *dst, ptrdiff_t stride, int E, int I, int H); extern void daedalus_vp9_loop_filter_h_8_8_ref( uint8_t *dst, ptrdiff_t stride, int E, int I, int H); #define N_EDGES 32 #define EDGE_STRIDE 8 #define EDGE_H 8 #define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */ #define DST_BYTES (N_EDGES * EDGE_BYTES) static uint64_t xs_state = 0xa57edbeef5717ULL; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static void gen_edge_pixels(uint8_t *buf) { int side_a_base = (int)(xs() % 200) + 20; int side_b_base = (int)(xs() % 200) + 20; int noise = (int)(xs() % 30); for (int r = 0; r < EDGE_H; r++) { for (int c = 0; c < 8; c++) { int base = (c < 4) ? side_a_base : side_b_base; int n = ((int)(xs() % (2 * noise + 1))) - noise; int v = base + n; buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } } } static int run_lpf(int wd_8, daedalus_substrate force, const uint8_t *dst_initial, const uint8_t *dst_ref, const daedalus_lpf_meta *meta, const char *label) { daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; int has_qpu = daedalus_ctx_has_qpu(ctx); if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) { printf(" [%s wd=%d] SKIP — QPU unavailable\n", label, wd_8 ? 8 : 4); daedalus_ctx_destroy(ctx); return 0; } uint8_t dst[DST_BYTES]; memcpy(dst, dst_initial, DST_BYTES); int rc = wd_8 ? daedalus_dispatch_vp9_lpf8(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta) : daedalus_dispatch_vp9_lpf4(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta); if (rc) { fprintf(stderr, " rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; } int diffs = 0; for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++; printf(" [%s wd=%d] %d/%d bit-exact (%.4f%%)\n", label, wd_8 ? 8 : 4, DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES); daedalus_ctx_destroy(ctx); return diffs == 0 ? 0 : 1; } static int run_one_kernel(int wd_8) { /* Per-edge layout: edge i occupies bytes [i*64..i*64+63]. Edge * center is at column 4 of row 0 → byte offset i*64 + 4. */ uint8_t initial[DST_BYTES]; uint8_t ref[DST_BYTES]; daedalus_lpf_meta meta[N_EDGES]; for (int i = 0; i < N_EDGES; i++) { gen_edge_pixels(initial + i * EDGE_BYTES); meta[i].dst_off = (uint32_t)(i * EDGE_BYTES + 4); meta[i].E = (int32_t)(xs() % 81); meta[i].I = (int32_t)(xs() % 41); meta[i].H = (int32_t)(xs() % 11); } memcpy(ref, initial, DST_BYTES); for (int i = 0; i < N_EDGES; i++) { if (wd_8) daedalus_vp9_loop_filter_h_8_8_ref( ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H); else daedalus_vp9_loop_filter_h_4_8_ref( ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H); } int fail = 0; fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_CPU, initial, ref, meta, "CPU"); fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_QPU, initial, ref, meta, "QPU"); fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_AUTO, initial, ref, meta, "AUTO"); return fail; } int main(void) { printf("=== Phase 8 API smoke: VP9 LPF wd=4 + wd=8 ===\n"); printf(" recipe for LPF4_INNER: %d (1=CPU, 2=QPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER)); printf(" recipe for LPF8_INNER: %d\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER)); int fail = 0; printf("\nLPF wd=4:\n"); fail |= run_one_kernel(0); printf("\nLPF wd=8:\n"); fail |= run_one_kernel(1); return fail; }