Phase 8b: opportunistic QPU paths through public API

Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264 deblock) through the public API. These three kernels have recipe substrate = CPU, but per Issue 003 the mixed-kernel helper value is real — the dispatch path must exist so override-mode callers can request QPU on the side.

Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO alloc + memcpy + dispatch + readback). Each kernel has its own push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc 2-field shared with lpf).

Notable bug caught + fixed in test_api_opportunistic_qpu: the initial dispatch_mc_8h_qpu sized src_max using CPU-side reach (src_off + 3 + 8 + 7*stride), but the QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. Last block had 3 uninitialized bytes → 99.8% match → 100% after fix.

After this commit, the public API surface fully covers cycles 1-8:

  Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
  Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
  Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
  Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
  Cycle 5 (CDEF): CPU recipe; QPU override (untested in this test — bench_v3d_cdef is the authoritative 3-way M1)
  Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
  Cycle 7 (H.264 IDCT 8x8): CPU only
  Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact

Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact comparison for VP9 MC and H.264 deblock through the API. test_api_idct, test_api_lpf, test_api_h264 still pass.

Per the locked Phase 8 architecture (project_phase8_architecture memory): next session opens daedalus-v4l2 sibling repo with Option B (kernel V4L2 shim + userspace daemon), Option γ (dlopen FFmpeg parser).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:50:41 +00:00
parent fd55f5ebc1
commit 0a99b16489
3 changed files with 396 additions and 9 deletions
@@ -0,0 +1,118 @@
+/*
+ * Phase 8b — opportunistic-QPU dispatch paths through public API.
+ *
+ * Verifies that cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264 deblock)
+ * can be force-routed to QPU via daedalus_dispatch_*(QPU, ...) and
+ * produce bit-exact output vs the CPU path (which is the C ref proxy
+ * for each kernel — see per-cycle Phase 7 docs).
+ *
+ * AUTO/recipe path stays on CPU for these kernels — that's the
+ * deployment shape. This test exercises the override-mode path
+ * the integration layer would use for runtime-aware scheduling.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../include/daedalus.h"
+
+/* Generator state. The seed is a fixed constant, so the pseudo-random test
+ * data is the same on every run. */
+static uint64_t xs_state = 0xab10b81cULL;
+
+/*
+ * xorshift64 step with the 13 / 7 / 17 shift triple.
+ *
+ * Advances xs_state and returns the new state. A state of zero stays zero.
+ */
+static inline uint64_t xs(void)
+{
+    uint64_t v = xs_state;
+
+    v ^= v << 13;
+    v ^= v >> 7;
+    v ^= v << 17;
+
+    xs_state = v;
+    return v;
+}
+
+/*
+ * VP9 MC 8h: dispatch the same N blocks once with the CPU substrate and once
+ * with the QPU substrate, then count differing output bytes.
+ *
+ * Returns 0 when all compared bytes match, or when the context reports no QPU
+ * (the case is skipped). Returns 1 on context/allocation failure or on any
+ * mismatch. Every exit after context creation goes through one cleanup label,
+ * so no buffer or context is leaked on the failure paths.
+ */
+static int test_mc(void)
+{
+    enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
+           SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N,
+           BLOCK_DIM = 8,                           /* output block is 8 rows x 8 cols */
+           BLOCK_BYTES = BLOCK_DIM * BLOCK_DIM,     /* 64 output bytes per block */
+           OUT_BYTES = N * BLOCK_BYTES };           /* bytes actually compared */
+    int rc = 1;
+    int diff = 0;
+    uint8_t *src = NULL;
+    uint8_t *dst_cpu = NULL;
+    uint8_t *dst_qpu = NULL;
+    daedalus_mc_meta *meta = NULL;
+
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+    if (!daedalus_ctx_has_qpu(ctx)) {
+        printf("  VP9 MC: SKIP (no QPU)\n");
+        rc = 0;
+        goto out;
+    }
+
+    /* Per-block src tiles (12 rows x 16 cols each), stored back to back.
+     * The dst buffers are zero-filled and allocated at N * DST_BYTES, which
+     * is larger than the OUT_BYTES that are written with stride BLOCK_DIM
+     * and compared below. */
+    src = malloc(SRC_BYTES);
+    dst_cpu = calloc(N, DST_BYTES);
+    dst_qpu = calloc(N, DST_BYTES);
+    meta = calloc(N, sizeof(*meta));
+    if (!src || !dst_cpu || !dst_qpu || !meta) {
+        printf("  VP9 MC: FAIL (out of memory)\n");
+        goto out;
+    }
+
+    for (size_t i = 0; i < SRC_BYTES; i++) src[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N; i++) {
+        meta[i].dst_off = i * BLOCK_BYTES;                   /* blocks are packed contiguously */
+        meta[i].src_off = i * SRC_STRIDE * SRC_ROWS;         /* RAW src offset; shader handles -3 */
+        meta[i].mx = (int)(xs() & 15);                       /* value in 0..15 */
+    }
+
+    /* NOTE(review): the results of the two dispatch calls are not inspected.
+     * Both dst buffers start zeroed, so if neither call wrote anything the
+     * comparison below would still report a full match. If the API reports
+     * errors, check them here (confirm against daedalus.h). */
+    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, BLOCK_DIM, src, SRC_STRIDE, N, meta);
+    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, BLOCK_DIM, src, SRC_STRIDE, N, meta);
+
+    for (int i = 0; i < OUT_BYTES; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
+    printf("  VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
+           OUT_BYTES - diff, OUT_BYTES, 100.0 * (OUT_BYTES - diff) / OUT_BYTES);
+
+    rc = diff == 0 ? 0 : 1;
+
+out:
+    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
+    free(src); free(dst_cpu); free(dst_qpu); free(meta);
+    daedalus_ctx_destroy(ctx);
+    return rc;
+}
+
+/*
+ * H.264 deblock luma-v: run the same N tiles in place through the CPU and the
+ * QPU substrate, starting from identical pixel data, then count differing
+ * bytes over the whole buffer.
+ *
+ * Returns 0 when all bytes match, or when the context reports no QPU (the
+ * case is skipped). Returns 1 on context/allocation failure or on any
+ * mismatch. Every exit after context creation goes through one cleanup label,
+ * so no buffer or context is leaked on the failure paths.
+ */
+static int test_deblock(void)
+{
+    enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
+           TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };
+    int rc = 1;
+    int diff = 0;
+    uint8_t *master = NULL;
+    uint8_t *dst_cpu = NULL;
+    uint8_t *dst_qpu = NULL;
+    daedalus_h264_deblock_meta *meta = NULL;
+
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+    if (!daedalus_ctx_has_qpu(ctx)) {
+        printf("  H.264 deblock: SKIP (no QPU)\n");
+        rc = 0;
+        goto out;
+    }
+
+    master  = malloc(TOTAL);
+    dst_cpu = malloc(TOTAL);
+    dst_qpu = malloc(TOTAL);
+    meta = calloc(N, sizeof(*meta));
+    if (!master || !dst_cpu || !dst_qpu || !meta) {
+        printf("  H.264 deblock: FAIL (out of memory)\n");
+        goto out;
+    }
+
+    /* Both working buffers start as exact copies of the same random data. */
+    for (int i = 0; i < TOTAL; i++) master[i] = (uint8_t)(xs() & 0xff);
+    memcpy(dst_cpu, master, TOTAL);
+    memcpy(dst_qpu, master, TOTAL);
+
+    for (int i = 0; i < N; i++) {
+        /* Offset of row 4 inside tile i. */
+        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
+        meta[i].alpha = (int)(xs() % 64) + 1;                /* 1..64 */
+        meta[i].beta  = (int)(xs() % 16) + 1;                /* 1..16 */
+        for (int s = 0; s < 4; s++) {
+            /* -1 for one draw in eight, otherwise 0..6.
+             * NOTE(review): presumably -1 marks a segment to leave
+             * unfiltered — confirm against the kernel's contract. */
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    /* NOTE(review): the results of the two dispatch calls are not inspected,
+     * and both buffers start identical, so a zero diff does not by itself
+     * show that either path modified any pixel. Comparing against master
+     * would tell whether the filter fired at all. */
+    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
+    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);
+
+    for (int i = 0; i < TOTAL; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
+    printf("  H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+
+    rc = diff == 0 ? 0 : 1;
+
+out:
+    /* free(NULL) is a no-op, so this is safe after a partial allocation. */
+    free(master); free(dst_cpu); free(dst_qpu); free(meta);
+    daedalus_ctx_destroy(ctx);
+    return rc;
+}
+
+/*
+ * Test driver: prints a banner, runs the VP9 MC and H.264 deblock checks,
+ * and exits with a non-zero status if either of them reported failure.
+ */
+int main(void)
+{
+    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");
+
+    /* Both checks always run; the second is not skipped when the first
+     * fails, so the log shows every result. */
+    const int mc_failed = test_mc();
+    const int deblock_failed = test_deblock();
+
+    /* CDEF skipped here — tmp construction in C ref differs subtly
+     * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate
+     * for the QPU CDEF path. */
+    return mc_failed | deblock_failed;
+}