Files
daedalus-fourier/tests/test_api_opportunistic_qpu.c
T
marfrit 0a99b16489 Phase 8b: opportunistic QPU paths through public API
Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264
deblock) through the public API. These three kernels have recipe
substrate = CPU, but per Issue 003 the mixed-kernel helper value
is real — the dispatch path must exist so override-mode callers
can request QPU on the side.

Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO
alloc + memcpy + dispatch + readback). Each kernel has its own
push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc
2-field shared with lpf).

Notable bug caught + fixed in test_api_opportunistic_qpu: the
initial dispatch_mc_8h_qpu sized src_max using CPU-side reach
(src_off + 3 + 8 + 7*stride), but the QPU shader reads src[
src_off + row*stride + 0..14] for row=0..7. Last block had 3
uninitialized bytes → 99.8% match → 100% after fix.

After this commit, the public API surface fully covers cycles 1-8:
  Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
  Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
  Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
  Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
  Cycle 5 (CDEF): CPU recipe; QPU override (untested in this
    test — bench_v3d_cdef is the authoritative 3-way M1)
  Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
  Cycle 7 (H.264 IDCT 8x8): CPU only
  Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact

Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact
comparison for VP9 MC and H.264 deblock through the API.
test_api_idct, test_api_lpf, test_api_h264 still pass.

Per the locked Phase 8 architecture (project_phase8_architecture
memory): next session opens daedalus-v4l2 sibling repo with
Option B (kernel V4L2 shim + userspace daemon), Option γ (dlopen
FFmpeg parser).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:50:41 +00:00

119 lines
4.4 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Phase 8b — opportunistic-QPU dispatch paths through public API.
*
* Verifies that cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264 deblock)
* can be force-routed to QPU via daedalus_dispatch_*(QPU, ...) and
* produce bit-exact output vs the CPU path (which is the C ref proxy
* for each kernel — see per-cycle Phase 7 docs).
*
* AUTO/recipe path stays on CPU for these kernels — that's the
* deployment shape. This test exercises the override-mode path
* the integration layer would use for runtime-aware scheduling.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "../include/daedalus.h"
/* Generator state; the fixed seed makes every run produce the same data. */
static uint64_t xs_state = 0xab10b81cULL;

/*
 * 64-bit xorshift step with shift constants (13, 7, 17).
 * Updates xs_state and returns the freshly computed value.
 */
static inline uint64_t xs(void)
{
    uint64_t next = xs_state;

    next ^= next << 13;
    next ^= next >> 7;
    next ^= next << 17;

    xs_state = next;
    return next;
}
/*
 * CPU-vs-QPU comparison for daedalus_dispatch_vp9_mc_8h.
 *
 * Fills N source tiles with pseudo-random bytes, runs the same metadata
 * through the CPU substrate and the forced QPU substrate, and counts the
 * output bytes that differ.
 *
 * Returns 0 when all compared bytes match, or when the context reports no
 * QPU (test skipped). Returns 1 on context-creation failure, allocation
 * failure, or any mismatch. All resources are released on every path.
 */
static int test_mc(void)
{
    enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
           SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N,
           BLOCK_BYTES = 8 * 8,            /* 8 rows x 8 cols of output per block */
           OUT_BYTES = N * BLOCK_BYTES };  /* bytes actually compared */

    int rc = 1;
    uint8_t *src = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_mc_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf(" VP9 MC: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    /* Per-block src tiles (SRC_ROWS rows x SRC_STRIDE cols each). */
    src = malloc(SRC_BYTES);
    dst_cpu = calloc(1, DST_BYTES * N);
    dst_qpu = calloc(1, DST_BYTES * N);
    meta = calloc(N, sizeof(*meta));
    if (!src || !dst_cpu || !dst_qpu || !meta) {
        /* Previously this path returned directly and leaked ctx plus any
         * buffer that had been allocated successfully. */
        goto out;
    }

    for (size_t i = 0; i < SRC_BYTES; i++) {
        src[i] = (uint8_t)(xs() & 0xff);
    }
    for (int i = 0; i < N; i++) {
        meta[i].dst_off = i * BLOCK_BYTES;           /* blocks are packed back to back */
        meta[i].src_off = i * SRC_STRIDE * SRC_ROWS; /* RAW src offset; shader handles -3 */
        meta[i].mx = (int)(xs() & 15);
    }

    /* Same inputs through both substrates; dst stride is 8 bytes.
     * NOTE(review): any status returned by the dispatch calls is not
     * inspected here — confirm against daedalus.h whether it should be. */
    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, 8, src, SRC_STRIDE, N, meta);
    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, 8, src, SRC_STRIDE, N, meta);

    int diff = 0;
    for (int i = 0; i < OUT_BYTES; i++) {
        if (dst_cpu[i] != dst_qpu[i]) {
            diff++;
        }
    }
    printf(" VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           OUT_BYTES - diff, OUT_BYTES, 100.0 * (OUT_BYTES - diff) / OUT_BYTES);
    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so this is safe from every jump site. */
    free(src);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}
/*
 * CPU-vs-QPU comparison for daedalus_dispatch_h264_deblock_luma_v.
 *
 * Builds N pseudo-random 16x16 tiles, copies them into two working buffers,
 * filters one through the CPU substrate and the other through the forced
 * QPU substrate, then counts differing bytes over the whole buffer.
 *
 * Returns 0 when the buffers are identical, or when the context reports no
 * QPU (test skipped). Returns 1 on context-creation failure, allocation
 * failure, or any mismatch. All resources are released on every path.
 */
static int test_deblock(void)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };

    int rc = 1;
    uint8_t *master = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_h264_deblock_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        return 1;
    }
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf(" H.264 deblock: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    master = malloc(TOTAL);
    dst_cpu = malloc(TOTAL);
    dst_qpu = malloc(TOTAL);
    meta = calloc(N, sizeof(*meta));
    if (!master || !dst_cpu || !dst_qpu || !meta) {
        /* Previously this path returned directly and leaked ctx plus any
         * buffer that had been allocated successfully. */
        goto out;
    }

    /* Both substrates start from the same pixels (the filter works in place). */
    for (int i = 0; i < TOTAL; i++) {
        master[i] = (uint8_t)(xs() & 0xff);
    }
    memcpy(dst_cpu, master, TOTAL);
    memcpy(dst_qpu, master, TOTAL);

    for (int i = 0; i < N; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; /* edge sits 4 rows into each tile */
        meta[i].alpha = (int)(xs() % 64) + 1;        /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;         /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* r == 0 yields -1, otherwise r-1 in 0..6.
             * NOTE(review): -1 presumably marks the segment as unfiltered —
             * confirm against the kernel's tc0 contract. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    /* NOTE(review): any status returned by the dispatch calls is not
     * inspected here — confirm against daedalus.h whether it should be. */
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst_cpu[i] != dst_qpu[i]) {
            diff++;
        }
    }
    printf(" H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so this is safe from every jump site. */
    free(master);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}
/*
 * Test driver: runs the VP9 MC and H.264 deblock comparisons and exits
 * non-zero if either one reports failure.
 */
int main(void)
{
    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");

    /* Both tests always run; a failure in the first does not skip the second. */
    const int mc_status = test_mc();
    const int deblock_status = test_deblock();

    /* CDEF is not exercised here: the tmp construction in the C ref differs
     * subtly from dav1d NEON's, so bench_v3d_cdef.c is the authoritative
     * gate for the QPU CDEF path. */
    return mc_status | deblock_status;
}