Phase 8: wire IDCT QPU dispatch through public API

daedalus_ctx now owns a v3d_runner when V3D is available. The public API's dispatch_vp9_idct8 routes QPU calls through a new dispatch_idct8_qpu helper that:

  (1) lazy-creates the cycle 1 v4 pipeline on first use,
  (2) allocates 3 host-visible SSBOs per call (coeffs/dst/meta),
  (3) memcpy host->GPU,
  (4) dispatches with the v4 32-blocks-per-WG geometry,
  (5) memcpy GPU->host.

Per-call alloc is intentional for Phase 8 correctness-first scope; buffer-pool perf optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that know they want CPU only.

test_api_idct extended to a 3-mode matrix: CPU forced, QPU forced, AUTO recipe. All three deliver 4096/4096 bit-exact on hertz with V3D 7.1.7.0:

  recipe substrate for VP9_IDCT8: 2 (QPU)
  [CPU]  4096/4096 bit-exact
  [QPU]  4096/4096 bit-exact (real QPU dispatch through the API)
  [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4 and cycle 4 LPF wd=8 (the other two recipe-QPU kernels). Cycle 3 MC and cycle 5 CDEF only need the dispatch hook (recipe routes to CPU; QPU stays opportunistic via explicit override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:55:55 +00:00
parent 760f6a4060
commit 1085c5699c
4 changed files with 194 additions and 32 deletions
@@ -282,6 +282,7 @@ endif()
 add_library(daedalus_core STATIC
    src/daedalus_core.c
    src/v3d_runner.c
    ${FFASM_SOURCES}
    ${FFASM_LPF_SOURCES}
    ${FFASM_MC_SOURCES}
@@ -290,7 +291,12 @@ add_library(daedalus_core STATIC
    ${DAV1D_CDEF_C_SOURCES}
 )
 target_include_directories(daedalus_core PUBLIC include)
 target_include_directories(daedalus_core PRIVATE src)
 target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
 target_compile_options(daedalus_core PRIVATE -O2)
 if (DAEDALUS_BUILD_VULKAN)
    add_dependencies(daedalus_core daedalus_shaders)
 endif()
 add_executable(test_api_idct
    tests/test_api_idct.c
@@ -70,6 +70,10 @@ typedef struct daedalus_ctx daedalus_ctx;
 * failure. */
 daedalus_ctx *daedalus_ctx_create(void);
 /* Same but skip V3D init — for callers that know they want CPU
 * only and want a fast-creating context. */
 daedalus_ctx *daedalus_ctx_create_no_qpu(void);
 /* Returns 1 if QPU dispatch is available on this context, 0 if
 * NEON-only.  Useful for the integration layer to short-circuit
 * QPU dispatch attempts. */
@@ -1,14 +1,19 @@
 /*
- * daedalus-fourier core library — Phase 8 skeleton.
+ * daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
 *
 * Wraps cycles 1-5 kernels behind the public C API in
 * include/daedalus.h. Recipe dispatch routes per-kernel to the
 * verdict substrate from each cycle's Phase 7 doc.
 *
 * QPU dispatch wiring status:
 *   IDCT 8x8: wired (cycle 1 v4 shader).
 *   Others:   stubbed (return -1); CPU path always works.
 *
 * License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
 * dav1d BSD-2-Clause NEON snapshots.
 */
 #include "../include/daedalus.h"
 #include "v3d_runner.h"
 #include <stdlib.h>
 #include <stdint.h>
@@ -19,18 +24,29 @@
 /* -------------------- Context -------------------- */
 /* Library context: records whether QPU dispatch is available and owns
  * the V3D runner together with the lazily created per-kernel pipelines. */
 struct daedalus_ctx {
    /* 1 when a V3D runner was created for this context, 0 for a
     * CPU-only context. */
    int has_qpu;
    v3d_runner   *runner;              /* NULL when has_qpu == 0 */
    /* Per-kernel pipelines, lazy-created on first QPU dispatch. */
    int           idct8_pipe_ready;    /* set to 1 once idct8_pipe has been created */
    v3d_pipeline  idct8_pipe;          /* released by daedalus_ctx_destroy when ready */
 };
 daedalus_ctx *daedalus_ctx_create(void)
 {
    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
    if (!ctx) return NULL;
-    /* Phase 8 deferred: real impl probes V3D Vulkan device; for now
+    ctx->runner = v3d_runner_create();
-     * default to CPU-only (NEON paths are always available). */
+    ctx->has_qpu = (ctx->runner != NULL);
    return ctx;
 }
 /* Build a CPU-only context without touching V3D at all.
  * Returns the new context, or NULL when the allocation fails. */
 daedalus_ctx *daedalus_ctx_create_no_qpu(void)
 {
    /* calloc zeroes every field, so the pipeline state starts out
     * "not ready"; the two QPU fields are still spelled out below. */
    daedalus_ctx *cpu_ctx = calloc(1, sizeof(*cpu_ctx));
    if (cpu_ctx != NULL) {
        cpu_ctx->runner  = NULL;
        cpu_ctx->has_qpu = 0;
    }
    return cpu_ctx;
 }
@@ -41,6 +57,10 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
 /* Tear down a context: release the IDCT pipeline if it was created,
  * then the runner, then the context itself. NULL is accepted. */
 void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
    if (ctx == NULL) return;
    v3d_runner *owned_runner = ctx->runner;
    if (owned_runner != NULL) {
        /* The pipeline belongs to the runner, so it must go first. */
        if (ctx->idct8_pipe_ready)
            v3d_runner_destroy_pipeline(owned_runner, &ctx->idct8_pipe);
        v3d_runner_destroy(owned_runner);
    }
    free(ctx);
 }
@@ -55,7 +75,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
    case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:   return DAEDALUS_SUBSTRATE_CPU;
    }
-    return DAEDALUS_SUBSTRATE_CPU;  /* defensive default */
+    return DAEDALUS_SUBSTRATE_CPU;
 }
 /* -------------------- NEON externs (per cycle bench links) ----- */
@@ -141,23 +161,140 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
    return 0;
 }
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
 /* Push-constant block for the IDCT 8x8 compute pipeline. Its size is
  * passed to v3d_runner_create_pipeline as push_const_size and one
  * instance is pushed per dispatch via vkCmdPushConstants, so the field
  * order and widths must stay in step with the shader. */
 typedef struct {
    uint32_t n_blocks;          /* number of 8x8 blocks in this dispatch */
    uint32_t blocks_per_row;    /* written as 0 by the dispatcher; unused by the v4 shader */
    uint32_t dst_stride_u8;     /* caller's dst_stride narrowed to 32 bits */
    uint32_t _pad;              /* always written as 0; brings the block to 16 bytes */
 } idct8_pc;
 /* Create the IDCT 8x8 compute pipeline on first use and remember that
  * it exists. Returns 0 when the pipeline is ready, -1 when creation
  * fails (the ready flag is then left clear so a later call retries). */
 static int ensure_idct8_pipeline(daedalus_ctx *ctx)
 {
    if (!ctx->idct8_pipe_ready) {
        /* Three SSBO bindings plus one idct8_pc push-constant block. */
        const int create_failed =
            (v3d_runner_create_pipeline(ctx->runner, "v3d_idct8.spv",
                                        /*n_ssbos=*/3,
                                        /*push_const_size=*/sizeof(idct8_pc),
                                        &ctx->idct8_pipe) != 0);
        if (create_failed) return -1;
        ctx->idct8_pipe_ready = 1;
    }
    return 0;
 }
 /*
  * Run the VP9 8x8 IDCT for a batch of blocks on the QPU (cycle 1 v4
  * shader).
  *
  * Stages the coefficients, the touched part of dst and the per-block
  * placement in three per-call buffers, records and submits a single
  * compute dispatch (32 blocks per workgroup), waits for it, then copies
  * the dst bytes back. Per-call allocation is deliberate for Phase 8
  * (correctness first); a buffer pool is deferred.
  *
  *   ctx         context whose runner is used for all GPU work.
  *   dst         destination surface; bytes [0, max over blocks of
  *               dst_off + 7*dst_stride + 8) are uploaded and read back.
  *   dst_stride  stride of dst; must fit in 32 bits.
  *   coeffs      n_blocks * 64 int16_t coefficients.
  *   n_blocks    number of blocks; must fit in 32 bits.
  *   meta        n_blocks placement records.
  *
  * Returns 0 on success (including the empty batch), -1 on invalid
  * arguments or any runner/Vulkan failure. dst is only written after a
  * successful submit, so it is left untouched on failure.
  */
 static int dispatch_idct8_qpu(daedalus_ctx *ctx,
    uint8_t *dst, size_t dst_stride,
    const int16_t *coeffs, size_t n_blocks,
    const daedalus_idct8_meta *meta)
 {
    /* An empty batch is a no-op. Returning here also avoids asking for
     * zero-sized buffers and recording a zero-workgroup dispatch. */
    if (n_blocks == 0) return 0;
    if (dst == NULL || coeffs == NULL || meta == NULL) return -1;
    /* The push constants carry 32-bit fields; refuse values that would
     * be silently truncated by the casts below. */
    if (n_blocks > UINT32_MAX || dst_stride > UINT32_MAX) return -1;
    /* Guard the size multiplication on targets with a narrow size_t. */
    if (n_blocks > SIZE_MAX / (64 * sizeof(int16_t))) return -1;
    if (ensure_idct8_pipeline(ctx) != 0) return -1;
    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 2 * sizeof(uint32_t);     /* uvec2 per block */
    /* Size the dst staging buffer as the smallest prefix of the caller's
     * surface that contains every block's 8x8 footprint. Phase 8 assumes
     * the surface starts at byte 0 of the buffer.
     * NOTE(review): sizing uses meta[].dst_off while the shader is fed
     * block_x/block_y; this relies on the caller keeping them consistent. */
    size_t dst_bytes = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t end = (size_t) meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
        if (end > dst_bytes) dst_bytes = end;
    }
    int rc = -1;
    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
    /* Staged acquisition: each failure unwinds only what already exists. */
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_bytes, &buf_dst)) goto free_coeffs;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) goto free_dst;
    /* Upload. Coeffs are a straight copy; dst is uploaded in full because
     * the whole region is copied back after the dispatch. */
    memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
    memcpy(buf_dst.mapped, dst, dst_bytes);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[2*i + 0] = meta[i].block_x;
        m[2*i + 1] = meta[i].block_y;
    }
    /* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
    v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
        goto free_meta;
    }
    /* WG geometry: 32 blocks per WG, rounded up. */
    uint32_t wg_count = (uint32_t)(n_blocks / 32 + (n_blocks % 32 != 0));
    idct8_pc pc = {
        .n_blocks       = (uint32_t) n_blocks,
        .blocks_per_row = 0,   /* unused by v4 shader (meta drives placement) */
        .dst_stride_u8  = (uint32_t) dst_stride,
        ._pad = 0,
    };
    /* NOTE(review): the command buffer is never explicitly released
     * here; presumably the runner owns its lifetime. Confirm. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto free_meta;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto free_meta;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->idct8_pipe.layout, 0, 1,
                            &ctx->idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors surface here; do not submit a broken buffer. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto free_meta;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto free_meta;
    /* Read-back dst. */
    memcpy(dst, buf_dst.mapped, dst_bytes);
    rc = 0;
 free_meta:
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
 free_dst:
    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
 free_coeffs:
    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
    return rc;
 }
 /* -------------------- Public dispatch entry points -------------- */
-#define ROUTE(_kernel, _cpu_fn, ...)                                          \
+#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
    daedalus_substrate eff = sub;                                             \
    if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))          \
        eff = DAEDALUS_SUBSTRATE_CPU;                                         \
    if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__);      \
-    return -1   /* QPU path not yet wired in Phase 8 skeleton */
+    return -1   /* QPU path not yet wired for this kernel */
 int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
    uint8_t *dst, size_t dst_stride,
    const int16_t *coeffs, size_t n_blocks,
    const daedalus_idct8_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
+    daedalus_substrate eff = sub;
-          dst, dst_stride, coeffs, n_blocks, meta);
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
    return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
 }
 int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -193,8 +330,8 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
    const uint8_t *src, size_t src_stride,
    size_t n_blocks, const daedalus_mc_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
-          dst, dst_stride, src, src_stride, n_blocks, meta);
+                   dst, dst_stride, src, src_stride, n_blocks, meta);
 }
 int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -202,8 +339,8 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
    const uint16_t *tmp,
    size_t n_blocks, const daedalus_cdef_meta *meta)
 {
-    ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
+    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
-          dst, dst_stride, tmp, n_blocks, meta);
+                   dst, dst_stride, tmp, n_blocks, meta);
 }
 /* -------------------- Recipe convenience wrappers --------------- */
@@ -37,14 +37,37 @@ static inline uint64_t xs(void) {
    return xs_state = x;
 }
-int main(void)
+static int run_once(daedalus_substrate force,
                    const int16_t *coeffs,
                    const daedalus_idct8_meta *meta,
                    const uint8_t *dst_initial,
                    const uint8_t *dst_ref,
                    const char *label)
 {
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
    int has_qpu = daedalus_ctx_has_qpu(ctx);
    printf("  [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force);
    if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
        printf("    SKIP — QPU unavailable on this host\n");
        daedalus_ctx_destroy(ctx); return 0;
    }
    uint8_t dst[DST_BYTES];
    memcpy(dst, dst_initial, DST_BYTES);
    int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
                                          coeffs, N_BLOCKS, meta);
    if (rc) { fprintf(stderr, "    dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
    int diffs = 0;
    for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
    printf("    %d / %d bytes bit-exact (%.4f%%)\n",
           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
    daedalus_ctx_destroy(ctx);
    return diffs == 0 ? 0 : 1;
 }
 int main(void)
 {
    printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
    printf("  has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
           daedalus_ctx_has_qpu(ctx));
    printf("  recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
@@ -61,9 +84,9 @@ int main(void)
        }
    }
-    uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
+    uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
    for (int i = 0; i < DST_BYTES; i++)
-        dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
+        dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);
    /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
     * by*8*stride + bx*8. */
@@ -87,17 +110,9 @@ int main(void)
                                              DST_STRIDE, scratch, 64);
    }
-    /* Dispatch through the public API. */
+    int fail = 0;
-    int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
+    fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
-                                                 coeffs, N_BLOCKS, meta);
+    fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
-    if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
+    fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
-
+    return fail;
    /* Compare. */
    int diffs = 0;
    for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
    printf("  bytes bit-exact: %d / %d (%.4f%%)\n",
           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
    daedalus_ctx_destroy(ctx);
    return diffs == 0 ? 0 : 1;
 }