Phase 8b: opportunistic QPU paths through public API

Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264
deblock) through the public API. These three kernels have recipe
substrate = CPU, but per Issue 003 the mixed-kernel helper value
is real — the dispatch path must exist so override-mode callers
can request QPU on the side.

Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO
alloc + memcpy + dispatch + readback). Each kernel has its own
push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc
2-field shared with lpf).

Notable bug caught by test_api_opportunistic_qpu and fixed: the
initial dispatch_mc_8h_qpu sized src_max using CPU-side reach
(src_off + 3 + 8 + 7*stride), but the QPU shader reads src[
src_off + row*stride + 0..14] for row=0..7. Last block had 3
uninitialized bytes → 99.8% match → 100% after fix.

After this commit, the public API surface fully covers cycles 1-8:
  Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
  Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
  Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
  Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
  Cycle 5 (CDEF): CPU recipe; QPU override (untested in this
    test — bench_v3d_cdef is the authoritative 3-way M1)
  Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
  Cycle 7 (H.264 IDCT 8x8): CPU only
  Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact

Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact
comparison for VP9 MC and H.264 deblock through the API.
test_api_idct, test_api_lpf, test_api_h264 still pass.

Per the locked Phase 8 architecture (project_phase8_architecture
memory): next session opens daedalus-v4l2 sibling repo with
Option B (kernel V4L2 shim + userspace daemon), Option γ (dlopen
FFmpeg parser).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 14:50:41 +00:00
parent fd55f5ebc1
commit 0a99b16489
3 changed files with 396 additions and 9 deletions
+118
View File
@@ -0,0 +1,118 @@
/*
* Phase 8b — opportunistic-QPU dispatch paths through public API.
*
* Verifies that cycles 3 (VP9 MC) and 8 (H.264 deblock) can be
* force-routed to QPU via daedalus_dispatch_*(QPU, ...) and produce
* bit-exact output vs the CPU path (which is the C ref proxy for each
* kernel — see per-cycle Phase 7 docs). Cycle 5 (AV1 CDEF) also has a
* QPU override path, but it is not exercised by this test.
*
* AUTO/recipe path stays on CPU for these kernels — that's the
* deployment shape. This test exercises the override-mode path
* the integration layer would use for runtime-aware scheduling.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "../include/daedalus.h"
/* Generator state; the fixed seed makes every run produce the same data. */
static uint64_t xs_state = 0xab10b81cULL;

/*
 * Advance the 64-bit shift/xor generator by one step.
 *
 * Applies the three shift-and-xor rounds (<<13, >>7, <<17) to the current
 * state, stores the result as the new state, and returns that same value.
 */
static inline uint64_t xs(void)
{
    uint64_t v = xs_state;

    v = v ^ (v << 13);
    v = v ^ (v >> 7);
    v = v ^ (v << 17);

    xs_state = v;
    return v;
}
/*
 * CPU-vs-QPU bit-exactness check for the VP9 8-tap horizontal MC dispatch.
 *
 * Builds N source tiles of pseudo-random bytes, describes one block per tile
 * in a meta array, runs the same dispatch once on the CPU substrate and once
 * on the QPU substrate, and compares the first N * 64 output bytes.
 *
 * Returns 0 when the outputs match or when the context reports no QPU (the
 * test is skipped); returns 1 on context/allocation failure or any mismatch.
 * All resources are released on every exit path.
 */
static int test_mc(void)
{
    enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
           SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N,
           BLOCK_STRIDE = 8,              /* dst stride handed to the dispatch */
           BLOCK_BYTES = 64,              /* 8 rows x 8 cols per output block */
           OUT_BYTES = N * BLOCK_BYTES }; /* bytes actually compared */
    int rc = 1;
    uint8_t *src = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_mc_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf(" VP9 MC: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    /* Allocate per-block src tiles (12 rows x 16 cols each). The dst buffers
     * are zero-filled and larger than the OUT_BYTES region that is compared. */
    src = malloc(SRC_BYTES);
    dst_cpu = calloc(1, DST_BYTES * N);
    dst_qpu = calloc(1, DST_BYTES * N);
    meta = calloc(N, sizeof(*meta));
    if (!src || !dst_cpu || !dst_qpu || !meta) {
        fprintf(stderr, " VP9 MC: allocation failed\n");
        goto out;
    }

    for (size_t i = 0; i < SRC_BYTES; i++) src[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N; i++) {
        meta[i].dst_off = i * BLOCK_BYTES;           /* 8 rows x 8 cols = 64 bytes per block */
        meta[i].src_off = i * SRC_STRIDE * SRC_ROWS; /* RAW src offset; shader handles -3 */
        meta[i].mx = (int)(xs() & 15);               /* value in 0..15 */
    }

    /* NOTE(review): any status returned by the dispatch calls is not checked
     * here; a failed dispatch is only detected through the byte comparison. */
    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, BLOCK_STRIDE, src, SRC_STRIDE, N, meta);
    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, BLOCK_STRIDE, src, SRC_STRIDE, N, meta);

    int diff = 0;
    for (int i = 0; i < OUT_BYTES; i++) {
        if (dst_cpu[i] != dst_qpu[i]) diff++;
    }
    printf(" VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           OUT_BYTES - diff, OUT_BYTES, 100.0 * (OUT_BYTES - diff) / OUT_BYTES);
    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so partially completed setups are safe here. */
    free(src);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}
/*
 * CPU-vs-QPU bit-exactness check for the H.264 luma vertical deblock dispatch.
 *
 * Fills N tiles (16 rows x 16-byte stride each) with pseudo-random bytes,
 * gives the CPU and QPU runs identical copies of that data, and points each
 * edge descriptor at row 4 of its tile. After both in-place dispatches the
 * complete buffers are compared, so bytes neither path should touch are
 * checked as well.
 *
 * Returns 0 when the buffers match or when the context reports no QPU (the
 * test is skipped); returns 1 on context/allocation failure or any mismatch.
 * All resources are released on every exit path.
 */
static int test_deblock(void)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };
    int rc = 1;
    uint8_t *master = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_h264_deblock_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf(" H.264 deblock: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    master = malloc(TOTAL);
    dst_cpu = malloc(TOTAL);
    dst_qpu = malloc(TOTAL);
    meta = calloc(N, sizeof(*meta));
    if (!master || !dst_cpu || !dst_qpu || !meta) {
        fprintf(stderr, " H.264 deblock: allocation failed\n");
        goto out;
    }

    /* Both substrates start from byte-identical input. */
    for (int i = 0; i < TOTAL; i++) master[i] = (uint8_t)(xs() & 0xff);
    memcpy(dst_cpu, master, TOTAL);
    memcpy(dst_qpu, master, TOTAL);

    for (int i = 0; i < N; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1; /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;  /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* r in 0..7 maps to tc0 in {-1, 0..6}.
             * NOTE(review): -1 presumably marks a segment as unfiltered;
             * confirm against the kernel's tc0 convention. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    /* NOTE(review): any status returned by the dispatch calls is not checked
     * here; a failed dispatch is only detected through the byte comparison. */
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst_cpu[i] != dst_qpu[i]) diff++;
    }
    printf(" H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so partially completed setups are safe here. */
    free(master);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}
/*
 * Test driver: prints a banner, runs the VP9 MC test and then the H.264
 * deblock test, and exits non-zero if either reported a failure.
 */
int main(void)
{
    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");

    /* Both tests always run; one failing does not short-circuit the other. */
    const int mc_status = test_mc();
    const int deblock_status = test_deblock();

    /* CDEF skipped here — tmp construction in C ref differs subtly
     * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate
     * for the QPU CDEF path. */
    return mc_status | deblock_status;
}