Phase 8: wire IDCT QPU dispatch through public API

daedalus_ctx now owns a v3d_runner when V3D is available. The public API's dispatch_vp9_idct8 routes QPU calls through a new dispatch_idct8_qpu helper that:

  (1) lazy-creates the cycle 1 v4 pipeline on first use,
  (2) allocates 3 host-visible SSBOs per call (coeffs/dst/meta),
  (3) memcpys host->GPU,
  (4) dispatches with the v4 32-blocks-per-WG geometry,
  (5) memcpys GPU->host.

Per-call allocation is intentional for Phase 8's correctness-first scope; the buffer-pool performance optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that know they want CPU only.

test_api_idct is extended to a 3-mode matrix: CPU forced, QPU forced, AUTO recipe. All three deliver 4096/4096 bit-exact on hertz with V3D 7.1.7.0:

    recipe substrate for VP9_IDCT8: 2 (QPU)
    [CPU]  4096/4096 bit-exact
    [QPU]  4096/4096 bit-exact (real QPU dispatch through the API)
    [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4 and cycle 4 LPF wd=8 (the other two recipe-QPU kernels). Cycle 3 MC and cycle 5 CDEF only need the dispatch hook (recipe routes to CPU; QPU stays opportunistic via explicit override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:55:55 +00:00
parent 760f6a4060
commit 1085c5699c
4 changed files with 194 additions and 32 deletions
@@ -37,14 +37,37 @@ static inline uint64_t xs(void) {
    return xs_state = x;
 }

-int main(void)
+static int run_once(daedalus_substrate force,
+                    const int16_t *coeffs,
+                    const daedalus_idct8_meta *meta,
+                    const uint8_t *dst_initial,
+                    const uint8_t *dst_ref,
+                    const char *label)
 {
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
+    int has_qpu = daedalus_ctx_has_qpu(ctx);
+    printf("  [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force);
+    if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
+        printf("    SKIP — QPU unavailable on this host\n");
+        daedalus_ctx_destroy(ctx); return 0;
+    }
+    uint8_t dst[DST_BYTES];
+    memcpy(dst, dst_initial, DST_BYTES);
+    int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
+                                          coeffs, N_BLOCKS, meta);
+    if (rc) { fprintf(stderr, "    dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
+    int diffs = 0;
+    for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
+    printf("    %d / %d bytes bit-exact (%.4f%%)\n",
+           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
+    daedalus_ctx_destroy(ctx);
+    return diffs == 0 ? 0 : 1;
+}

+int main(void)
+{
    printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
-    printf("  has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
-           daedalus_ctx_has_qpu(ctx));
    printf("  recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));

@@ -61,9 +84,9 @@ int main(void)
        }
    }

-    uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
+    uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
    for (int i = 0; i < DST_BYTES; i++)
-        dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
+        dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);

    /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
     * by*8*stride + bx*8. */
@@ -87,17 +110,9 @@ int main(void)
                                              DST_STRIDE, scratch, 64);
    }

-    /* Dispatch through the public API. */
-    int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
-                                                 coeffs, N_BLOCKS, meta);
-    if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
-
-    /* Compare. */
-    int diffs = 0;
-    for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
-    printf("  bytes bit-exact: %d / %d (%.4f%%)\n",
-           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
-
-    daedalus_ctx_destroy(ctx);
-    return diffs == 0 ? 0 : 1;
+    int fail = 0;
+    fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
+    fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
+    fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
+    return fail;
 }