From 1085c5699c3a3d71e623d4d27427e101a34e3104 Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Mon, 18 May 2026 13:55:55 +0000
Subject: [PATCH] Phase 8: wire IDCT QPU dispatch through public API

daedalus_ctx now owns a v3d_runner when V3D is available. The
public daedalus_dispatch_vp9_idct8() routes QPU calls through a
new dispatch_idct8_qpu helper that: (1) lazy-creates the cycle 1
v4 pipeline on first use, (2) allocates 3 host-visible SSBOs
per call (coeffs/dst/meta), (3) memcpy host->GPU, (4) dispatches
with the v4 32-blocks-per-WG geometry, (5) memcpy GPU->host.
daedalus_core now compiles src/v3d_runner.c and links
Vulkan::Vulkan.

Per-call alloc is intentional for Phase 8 correctness-first
scope; buffer-pool perf optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that
know they want CPU only.

test_api_idct extended to a 3-mode matrix: CPU forced, QPU
forced, AUTO recipe. All three deliver 4096/4096 bit-exact
on hertz with V3D 7.1.7.0:

  recipe substrate for VP9_IDCT8: 2 (QPU)
  [CPU] 4096/4096 bit-exact
  [QPU] 4096/4096 bit-exact (real QPU dispatch through the API)
  [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4
and cycle 4 LPF wd=8 (the other two recipe-QPU kernels).
Cycle 3 MC and cycle 5 CDEF only need the dispatch hook
(recipe routes to CPU; QPU stays opportunistic via explicit
override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt        |   6 ++
 include/daedalus.h    |   4 +
 src/daedalus_core.c   | 165 ++++++++++++++++++++++++++++++++++++++----
 tests/test_api_idct.c |  51 ++++++++-----
 4 files changed, 194 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff739f6..921be42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -282,6 +282,7 @@ endif()
 
 add_library(daedalus_core STATIC
     src/daedalus_core.c
+    src/v3d_runner.c
     ${FFASM_SOURCES}
     ${FFASM_LPF_SOURCES}
     ${FFASM_MC_SOURCES}
@@ -290,7 +291,12 @@ add_library(daedalus_core STATIC
     ${DAV1D_CDEF_C_SOURCES}
 )
 target_include_directories(daedalus_core PUBLIC include)
+target_include_directories(daedalus_core PRIVATE src)
+target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
 target_compile_options(daedalus_core PRIVATE -O2)
+if (DAEDALUS_BUILD_VULKAN)
+    add_dependencies(daedalus_core daedalus_shaders)
+endif()
 
 add_executable(test_api_idct
     tests/test_api_idct.c
diff --git a/include/daedalus.h b/include/daedalus.h
index 38a9ccf..2e91795 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -70,6 +70,10 @@ typedef struct daedalus_ctx daedalus_ctx;
  * failure. */
 daedalus_ctx *daedalus_ctx_create(void);
 
+/* Like daedalus_ctx_create() but skips V3D init, so creation is fast
+ * and the context is CPU-only (QPU dispatch is unavailable on it). */
+daedalus_ctx *daedalus_ctx_create_no_qpu(void);
+
 /* Returns 1 if QPU dispatch is available on this context, 0 if
  * NEON-only.  Useful for the integration layer to short-circuit
  * QPU dispatch attempts. */
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index fd45298..4087dc0 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -1,14 +1,19 @@
 /*
- * daedalus-fourier core library — Phase 8 skeleton.
+ * daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
  *
  * Wraps cycles 1-5 kernels behind the public C API in
  * include/daedalus.h. Recipe dispatch routes per-kernel to the
  * verdict substrate from each cycle's Phase 7 doc.
  *
+ * QPU dispatch wiring status:
+ *   IDCT 8x8: wired (cycle 1 v4 shader).
+ *   Others:   stubbed (return -1); CPU path always works.
+ *
  * License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
  * dav1d BSD-2-Clause NEON snapshots.
  */
 #include "../include/daedalus.h"
+#include "v3d_runner.h"
 
 #include <stdlib.h>
 #include <stdint.h>
@@ -19,18 +24,29 @@
 /* -------------------- Context -------------------- */
 
 struct daedalus_ctx {
-    /* For Phase 8 skeleton: just a flag.  Real impl would hold the
-     * v3d_runner + per-kernel pipeline handles. */
     int has_qpu;
+    v3d_runner   *runner;              /* NULL when has_qpu == 0 */
+
+    /* Per-kernel pipelines, lazy-created on first QPU dispatch. */
+    int           idct8_pipe_ready;
+    v3d_pipeline  idct8_pipe;
 };
 
 daedalus_ctx *daedalus_ctx_create(void)
 {
     daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
     if (!ctx) return NULL;
-    /* Phase 8 deferred: real impl probes V3D Vulkan device; for now
-     * default to CPU-only (NEON paths are always available). */
+    ctx->runner = v3d_runner_create();
+    ctx->has_qpu = (ctx->runner != NULL);
+    return ctx;
+}
+
+daedalus_ctx *daedalus_ctx_create_no_qpu(void)
+{
+    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
+    if (!ctx) return NULL;
     ctx->has_qpu = 0;
+    ctx->runner = NULL;
     return ctx;
 }
 
@@ -41,6 +57,10 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
 
 void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
+    if (!ctx) return;
+    if (ctx->idct8_pipe_ready && ctx->runner)
+        v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
+    if (ctx->runner) v3d_runner_destroy(ctx->runner);
     free(ctx);
 }
 
@@ -55,7 +75,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
     case DAEDALUS_KERNEL_AV1_CDEF_8X8:   return DAEDALUS_SUBSTRATE_CPU;
     }
-    return DAEDALUS_SUBSTRATE_CPU;  /* defensive default */
+    return DAEDALUS_SUBSTRATE_CPU;
 }
 
 /* -------------------- NEON externs (per cycle bench links) ----- */
@@ -141,23 +161,140 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
     return 0;
 }
 
+/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
+
+typedef struct {              /* push constants for the IDCT 8x8 pipeline */
+    uint32_t n_blocks;        /* number of blocks in the dispatch */
+    uint32_t blocks_per_row;  /* set to 0 by dispatch_idct8_qpu */
+    uint32_t dst_stride_u8;   /* dst_stride, narrowed to uint32_t */
+    uint32_t _pad;            /* set to 0 by dispatch_idct8_qpu */
+} idct8_pc;
+
+/* Lazily build the IDCT 8x8 pipeline. Returns 0 once ready, -1 on failure. */
+static int ensure_idct8_pipeline(daedalus_ctx *ctx)
+{
+    if (!ctx->idct8_pipe_ready) {
+        int rc = v3d_runner_create_pipeline(ctx->runner, "v3d_idct8.spv",
+                                            /*n_ssbos=*/3,
+                                            /*push_const_size=*/sizeof(idct8_pc),
+                                            &ctx->idct8_pipe);
+        if (rc != 0) return -1;
+        ctx->idct8_pipe_ready = 1;
+    }
+    return 0;
+}
+
+/* Run n_blocks 8x8 IDCTs on the QPU: stage coeffs/dst/meta in three
+ * per-call buffers, dispatch one workgroup per 32 blocks, then copy the
+ * staged dst region back. Returns 0 on success, -1 on any failure. */
+static int dispatch_idct8_qpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    const int16_t *coeffs, size_t n_blocks,
+    const daedalus_idct8_meta *meta)
+{
+    /* Empty batch: nothing to do, and Vulkan forbids zero-byte buffers. */
+    if (n_blocks == 0) return 0;
+    /* Reject sizes that would overflow the byte counts or the 32-bit
+     * push constants below. */
+    if (n_blocks > UINT32_MAX / (64 * sizeof(int16_t)) ||
+        dst_stride > UINT32_MAX) return -1;
+    if (ensure_idct8_pipeline(ctx) != 0) return -1;
+
+    /* Three SSBOs are allocated per call (coeffs, dst, meta); a buffer
+     * pool is deferred, correctness first. */
+    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
+    size_t meta_bytes  = n_blocks * 2 * sizeof(uint32_t);     /* uvec2 per block */
+    /* dst staging size: furthest byte any block's 8x8 footprint reaches
+     * (start of its 8th row + 8), measured from dst[0]. */
+    size_t dst_bytes = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
+        if (end > dst_bytes) dst_bytes = end;
+    }
+
+    int rc = -1;
+    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
+    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_bytes, &buf_dst)) goto free_coeffs;
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) goto free_dst;
+
+    /* Upload. dst goes up as well because the whole staged region is
+     * copied back below, including bytes that no block overwrites. */
+    memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
+    memcpy(buf_dst.mapped, dst, dst_bytes);
+    uint32_t *m = buf_meta.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        m[2*i + 0] = meta[i].block_x;
+        m[2*i + 1] = meta[i].block_y;
+    }
+
+    /* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
+    v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) goto done;
+
+    /* WG geometry: 32 blocks per WG. */
+    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
+    idct8_pc pc = {
+        .n_blocks       = (uint32_t) n_blocks,
+        .blocks_per_row = 0,   /* unused by v4 shader (meta drives placement) */
+        .dst_stride_u8  = (uint32_t) dst_stride,
+        ._pad = 0,
+    };
+
+    /* NOTE(review): cb is never released in this function — confirm that
+     * the runner owns and recycles it. */
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto done;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto done;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                      ctx->idct8_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->idct8_pipe.layout, 0, 1,
+                            &ctx->idct8_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
+                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto done;
+
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto done;
+
+    /* Read-back dst. */
+    memcpy(dst, buf_dst.mapped, dst_bytes);
+    rc = 0;
+
+done:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+free_dst:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+free_coeffs:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
+    return rc;
+}
+
 /* -------------------- Public dispatch entry points -------------- */
 
-#define ROUTE(_kernel, _cpu_fn, ...)                                          \
+#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
     daedalus_substrate eff = sub;                                             \
     if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
     if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))          \
         eff = DAEDALUS_SUBSTRATE_CPU;                                         \
     if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__);      \
-    return -1   /* QPU path not yet wired in Phase 8 skeleton */
+    return -1   /* QPU path not yet wired for this kernel */
 
 int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
     uint8_t *dst, size_t dst_stride,
     const int16_t *coeffs, size_t n_blocks,
     const daedalus_idct8_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
-          dst, dst_stride, coeffs, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
+    return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
 }
 
 int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -193,8 +330,8 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
     const uint8_t *src, size_t src_stride,
     size_t n_blocks, const daedalus_mc_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
-          dst, dst_stride, src, src_stride, n_blocks, meta);
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
+                   dst, dst_stride, src, src_stride, n_blocks, meta);
 }
 
 int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -202,8 +339,8 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
     const uint16_t *tmp,
     size_t n_blocks, const daedalus_cdef_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
-          dst, dst_stride, tmp, n_blocks, meta);
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
+                   dst, dst_stride, tmp, n_blocks, meta);
 }
 
 /* -------------------- Recipe convenience wrappers --------------- */
diff --git a/tests/test_api_idct.c b/tests/test_api_idct.c
index 13896b3..3804fc5 100644
--- a/tests/test_api_idct.c
+++ b/tests/test_api_idct.c
@@ -37,14 +37,37 @@ static inline uint64_t xs(void) {
     return xs_state = x;
 }
 
-int main(void)
+static int run_once(daedalus_substrate force,
+                    const int16_t *coeffs,
+                    const daedalus_idct8_meta *meta,
+                    const uint8_t *dst_initial,
+                    const uint8_t *dst_ref,
+                    const char *label)
 {
     daedalus_ctx *ctx = daedalus_ctx_create();
     if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
+    int has_qpu = daedalus_ctx_has_qpu(ctx);
+    printf("  [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force);
+    if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
+        printf("    SKIP — QPU unavailable on this host\n");
+        daedalus_ctx_destroy(ctx); return 0;
+    }
+    uint8_t dst[DST_BYTES];
+    memcpy(dst, dst_initial, DST_BYTES);
+    int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
+                                          coeffs, N_BLOCKS, meta);
+    if (rc) { fprintf(stderr, "    dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
+    int diffs = 0;
+    for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
+    printf("    %d / %d bytes bit-exact (%.4f%%)\n",
+           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
+    daedalus_ctx_destroy(ctx);
+    return diffs == 0 ? 0 : 1;
+}
 
+int main(void)
+{
     printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
-    printf("  has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
-           daedalus_ctx_has_qpu(ctx));
     printf("  recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
            (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
 
@@ -61,9 +84,9 @@ int main(void)
         }
     }
 
-    uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
+    uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
     for (int i = 0; i < DST_BYTES; i++)
-        dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
+        dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);
 
     /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
      * by*8*stride + bx*8. */
@@ -87,17 +110,9 @@ int main(void)
                                               DST_STRIDE, scratch, 64);
     }
 
-    /* Dispatch through the public API. */
-    int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
-                                                 coeffs, N_BLOCKS, meta);
-    if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
-
-    /* Compare. */
-    int diffs = 0;
-    for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
-    printf("  bytes bit-exact: %d / %d (%.4f%%)\n",
-           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
-
-    daedalus_ctx_destroy(ctx);
-    return diffs == 0 ? 0 : 1;
+    int fail = 0;
+    fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
+    fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
+    fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
+    return fail;
 }