0a99b16489
Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264
deblock) through the public API. These three kernels have recipe
substrate = CPU, but per Issue 003 the mixed-kernel helper value
is real — the dispatch path must exist so override-mode callers
can request QPU on the side.
Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO
alloc + memcpy + dispatch + readback). Each kernel has its own
push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc
2-field shared with lpf).
Notable bug caught + fixed in test_api_opportunistic_qpu: the
initial dispatch_mc_8h_qpu sized src_max using CPU-side reach
(src_off + 3 + 8 + 7*stride), but the QPU shader reads src[
src_off + row*stride + 0..14] for row=0..7. Last block had 3
uninitialized bytes → 99.8% match → 100% after fix.
After this commit, the public API surface fully covers cycles 1-8:
Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
Cycle 5 (CDEF): CPU recipe; QPU override (untested in this
test — bench_v3d_cdef is the authoritative 3-way M1)
Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
Cycle 7 (H.264 IDCT 8x8): CPU only
Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact
Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact
comparison for VP9 MC and H.264 deblock through the API.
test_api_idct, test_api_lpf, test_api_h264 still pass.
Per the locked Phase 8 architecture (project_phase8_architecture
memory): next session opens daedalus-v4l2 sibling repo with
Option B (kernel V4L2 shim + userspace daemon), Option γ (dlopen
FFmpeg parser).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
119 lines
4.4 KiB
C
119 lines
4.4 KiB
C
/*
 * Phase 8b — opportunistic-QPU dispatch paths through public API.
 *
 * Cycles 3 (VP9 MC), 5 (AV1 CDEF) and 8 (H.264 deblock) can be
 * force-routed to QPU via daedalus_dispatch_*(QPU, ...). This test
 * verifies that the VP9 MC and H.264 deblock QPU paths produce
 * bit-exact output vs the CPU path (which is the C ref proxy for
 * each kernel — see per-cycle Phase 7 docs). CDEF is not exercised
 * here; see the note in main().
 *
 * AUTO/recipe path stays on CPU for these kernels — that's the
 * deployment shape. This test exercises the override-mode path
 * the integration layer would use for runtime-aware scheduling.
 */
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <stdint.h>
|
||
#include <stddef.h>
|
||
#include <string.h>
|
||
|
||
#include "../include/daedalus.h"
|
||
|
||
/* State of the test-data generator; fixed seed keeps runs reproducible. */
static uint64_t xs_state = 0xab10b81cULL;

/*
 * Advance the xorshift64 generator (shift triple 13, 7, 17) by one step.
 *
 * Updates xs_state in place and returns the new state. Not thread-safe:
 * the state is a single file-scope variable.
 */
static inline uint64_t xs(void)
{
    uint64_t v = xs_state;

    v ^= v << 13;
    v ^= v >> 7;
    v ^= v << 17;

    xs_state = v;
    return v;
}
|
||
|
||
static int test_mc(void)
|
||
{
|
||
enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
|
||
SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
if (!daedalus_ctx_has_qpu(ctx)) {
|
||
printf(" VP9 MC: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0;
|
||
}
|
||
|
||
/* Allocate per-block src tiles (12 rows x 16 cols each). */
|
||
uint8_t *src = malloc(SRC_BYTES);
|
||
uint8_t *dst_cpu = calloc(1, DST_BYTES * N);
|
||
uint8_t *dst_qpu = calloc(1, DST_BYTES * N);
|
||
daedalus_mc_meta *meta = calloc(N, sizeof(*meta));
|
||
if (!src || !dst_cpu || !dst_qpu || !meta) return 1;
|
||
|
||
for (size_t i = 0; i < SRC_BYTES; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].dst_off = i * 64; /* 8 rows × 8 cols = 64 bytes per block */
|
||
meta[i].src_off = i * SRC_STRIDE * SRC_ROWS; /* RAW src offset; shader handles -3 */
|
||
meta[i].mx = (int)(xs() & 15);
|
||
}
|
||
|
||
daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, 8, src, SRC_STRIDE, N, meta);
|
||
daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, 8, src, SRC_STRIDE, N, meta);
|
||
|
||
int diff = 0;
|
||
for (int i = 0; i < N * 64; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
|
||
printf(" VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
|
||
N * 64 - diff, N * 64, 100.0 * (N * 64 - diff) / (N * 64));
|
||
|
||
free(src); free(dst_cpu); free(dst_qpu); free(meta);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock(void)
|
||
{
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
|
||
TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
if (!daedalus_ctx_has_qpu(ctx)) {
|
||
printf(" H.264 deblock: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0;
|
||
}
|
||
|
||
uint8_t *master = malloc(TOTAL);
|
||
uint8_t *dst_cpu = malloc(TOTAL);
|
||
uint8_t *dst_qpu = malloc(TOTAL);
|
||
daedalus_h264_deblock_meta *meta = calloc(N, sizeof(*meta));
|
||
if (!master || !dst_cpu || !dst_qpu || !meta) return 1;
|
||
|
||
for (int i = 0; i < TOTAL; i++) master[i] = (uint8_t)(xs() & 0xff);
|
||
memcpy(dst_cpu, master, TOTAL);
|
||
memcpy(dst_qpu, master, TOTAL);
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs() % 8);
|
||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
|
||
daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
|
||
daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);
|
||
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
|
||
printf(" H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
|
||
free(master); free(dst_cpu); free(dst_qpu); free(meta);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
/*
 * Test driver: runs the VP9 MC and H.264 deblock CPU-vs-QPU comparisons.
 *
 * Returns the OR of the individual test results, so the exit status is
 * 0 only when every test returned 0.
 */
int main(void)
{
    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");

    int fail = 0;
    fail |= test_mc();
    fail |= test_deblock();

    /* CDEF skipped here — tmp construction in C ref differs subtly
     * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate
     * for the QPU CDEF path. */

    return fail;
}
|