/* * Phase 8b — opportunistic-QPU dispatch paths through public API. * * Verifies that cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264 deblock) * can be force-routed to QPU via daedalus_dispatch_*(QPU, ...) and * produce bit-exact output vs the CPU path (which is the C ref proxy * for each kernel — see per-cycle Phase 7 docs). * * AUTO/recipe path stays on CPU for these kernels — that's the * deployment shape. This test exercises the override-mode path * the integration layer would use for runtime-aware scheduling. */ #include #include #include #include #include #include "../include/daedalus.h" static uint64_t xs_state = 0xab10b81cULL; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static int test_mc(void) { enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE, SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; if (!daedalus_ctx_has_qpu(ctx)) { printf(" VP9 MC: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0; } /* Allocate per-block src tiles (12 rows x 16 cols each). */ uint8_t *src = malloc(SRC_BYTES); uint8_t *dst_cpu = calloc(1, DST_BYTES * N); uint8_t *dst_qpu = calloc(1, DST_BYTES * N); daedalus_mc_meta *meta = calloc(N, sizeof(*meta)); if (!src || !dst_cpu || !dst_qpu || !meta) return 1; for (size_t i = 0; i < SRC_BYTES; i++) src[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N; i++) { meta[i].dst_off = i * 64; /* 8 rows × 8 cols = 64 bytes per block */ meta[i].src_off = i * SRC_STRIDE * SRC_ROWS; /* RAW src offset; shader handles -3 */ meta[i].mx = (int)(xs() & 15); } daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, 8, src, SRC_STRIDE, N, meta); daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, 8, src, SRC_STRIDE, N, meta); int diff = 0; for (int i = 0; i < N * 64; i++) if (dst_cpu[i] != dst_qpu[i]) diff++; printf(" VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n", N * 64 - diff, N * 64, 100.0 * (N * 64 - diff) / (N * 64)); free(src); free(dst_cpu); free(dst_qpu); free(meta); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock(void) { enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE, TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; if (!daedalus_ctx_has_qpu(ctx)) { printf(" H.264 deblock: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0; } uint8_t *master = malloc(TOTAL); uint8_t *dst_cpu = malloc(TOTAL); uint8_t *dst_qpu = malloc(TOTAL); daedalus_h264_deblock_meta *meta = calloc(N, sizeof(*meta)); if (!master || !dst_cpu || !dst_qpu || !meta) return 1; for (int i = 0; i < TOTAL; i++) master[i] = (uint8_t)(xs() & 0xff); memcpy(dst_cpu, master, TOTAL); memcpy(dst_qpu, master, TOTAL); for (int i = 0; i < N; i++) { meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta); daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta); int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst_cpu[i] != dst_qpu[i]) diff++; printf(" H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); free(master); free(dst_cpu); free(dst_qpu); free(meta); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } int main(void) { printf("=== Phase 8b: opportunistic-QPU paths through API ===\n"); int fail = 0; fail |= test_mc(); fail |= test_deblock(); /* CDEF skipped here — tmp construction in C ref differs subtly * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate * for the QPU CDEF path. */ return fail; }