Phase 8b: opportunistic QPU paths through public API

Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), and 8 (H.264 deblock) through the public API. These three kernels have recipe substrate = CPU, but per Issue 003 the mixed-kernel helper value is real — the dispatch path must exist so override-mode callers can request QPU on the side.

Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO alloc + memcpy + dispatch + readback). Each kernel has its own push-constant struct (mc_pc 3-field, cdef_pc 3-field, h264deblock_pc 2-field shared with lpf).

Notable bug caught + fixed in test_api_opportunistic_qpu: the initial dispatch_mc_8h_qpu sized src_max using CPU-side reach (src_off + 3 + 8 + 7*stride), but the QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. Last block had 3 uninitialized bytes → 99.8% match → 100% after fix.

After this commit, the public API surface fully covers cycles 1-8:

  Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
  Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
  Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
  Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
  Cycle 5 (CDEF): CPU recipe; QPU override (untested in this test — bench_v3d_cdef is the authoritative 3-way M1)
  Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
  Cycle 7 (H.264 IDCT 8x8): CPU only
  Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact

Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact comparison for VP9 MC and H.264 deblock through the API. test_api_idct, test_api_lpf, and test_api_h264 still pass.

Per the locked Phase 8 architecture (project_phase8_architecture memory): next session opens the daedalus-v4l2 sibling repo with Option B (kernel V4L2 shim + userspace daemon) and Option γ (dlopen FFmpeg parser).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:50:41 +00:00
parent fd55f5ebc1
commit 0a99b16489
3 changed files with 396 additions and 9 deletions
@@ -34,6 +34,12 @@ struct daedalus_ctx {
    v3d_pipeline  lpf4_pipe;
    int           lpf8_pipe_ready;
    v3d_pipeline  lpf8_pipe;
+    int           mc8h_pipe_ready;
+    v3d_pipeline  mc8h_pipe;
+    int           cdef_pipe_ready;
+    v3d_pipeline  cdef_pipe;
+    int           h264deblock_pipe_ready;
+    v3d_pipeline  h264deblock_pipe;
 };

 daedalus_ctx *daedalus_ctx_create(void)
@@ -63,9 +69,12 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
    if (!ctx) return;
    if (ctx->runner) {
-        if (ctx->idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
-        if (ctx->lpf4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
-        if (ctx->lpf8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        if (ctx->idct8_pipe_ready)       v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
+        if (ctx->lpf4_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
+        if (ctx->lpf8_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        if (ctx->mc8h_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
+        if (ctx->cdef_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
+        if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
@@ -449,6 +458,244 @@ fail:
    return -1;
 }

+/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */
+
+typedef struct {                /* push-constant block for the v3d_mc_8h.spv dispatch */
+    uint32_t n_blocks;          /* number of meta entries to process */
+    uint32_t dst_stride_u8;     /* dst row stride, in bytes */
+    uint32_t src_stride_u8;     /* src row stride, in bytes */
+    uint32_t _pad;              /* never set explicitly, so zero from the designated initializer; presumably pads the block to 16 bytes — TODO confirm against the shader */
+} mc_pc;                        /* sizeof(mc_pc) is the push-constant size given at pipeline creation */
+
+/*
+ * dispatch_mc_8h_qpu - run the VP9 MC "8h" kernel (v3d_mc_8h.spv) on the QPU.
+ *
+ * The compute pipeline is built lazily on first use and cached in ctx. Every
+ * call creates three runner buffers (meta, dst, src), fills them from the
+ * caller's memory, records one compute dispatch, waits for it, and copies the
+ * dst buffer back over the caller's dst.
+ *
+ *   ctx         context holding the runner and the cached mc8h pipeline
+ *   dst         destination bytes; dst[0 .. dst_max) is copied in and back out
+ *   dst_stride  dst row stride in bytes
+ *   src         source bytes; src[0 .. src_max) is copied to the QPU, so the
+ *               caller must have that many readable bytes
+ *   src_stride  src row stride in bytes
+ *   n_blocks    number of entries in meta
+ *   meta        per-block dst_off / src_off / mx
+ *
+ * Returns 0 on success (n_blocks == 0 is a successful no-op) and -1 on any
+ * runner or Vulkan failure. dst is only written after a successful dispatch.
+ */
+static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    const uint8_t *src, size_t src_stride,
+    size_t n_blocks, const daedalus_mc_meta *meta)
+{
+    /* Nothing to do; also avoids asking the runner for zero-byte buffers. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->mc8h_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_mc_8h.spv",
+                                       3, sizeof(mc_pc), &ctx->mc8h_pipe) != 0)
+            return -1;
+        ctx->mc8h_pipe_ready = 1;
+    }
+
+    /* Each block occupies four u32 words in the meta buffer. */
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
+
+    /* Staging sizes: one past the furthest byte any block touches, measured
+     * from the start of dst / src. */
+    size_t dst_max = 0, src_max = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        /* 8 rows of 8 output bytes. */
+        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
+        if (de > dst_max) dst_max = de;
+        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
+        size_t se = meta[i].src_off + 7 * src_stride + 15;
+        if (se > src_max) src_max = se;
+    }
+
+    int rc = -1;
+    v3d_buffer bm = {0}, bd = {0}, bs = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max,     &bd)) goto out_meta;
+    if (v3d_runner_create_buffer(ctx->runner, src_max,     &bs)) goto out_dst;
+
+    memcpy(bs.mapped, src, src_max);
+    /* dst is staged too so that bytes the shader does not write survive the
+     * full-range copy-back below. */
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = meta[i].src_off;
+        m[4*i+2] = (uint32_t) meta[i].mx;
+        m[4*i+3] = 0;
+    }
+
+    /* Binding order is meta, dst, src. */
+    v3d_buffer binds[3] = { bm, bd, bs };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3)) goto out;
+
+    /* One workgroup per 32 blocks, rounded up (presumably the shader's local
+     * size — confirm against v3d_mc_8h). */
+    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
+    mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
+                 .dst_stride_u8 = (uint32_t) dst_stride,
+                 .src_stride_u8 = (uint32_t) src_stride };
+    /* NOTE(review): cb is not released on the error paths below; assumes the
+     * runner owns command buffers from v3d_runner_alloc_cmdbuf — confirm. */
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto out;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    /* Recording can fail (e.g. out of memory); do not submit a broken buffer. */
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->mc8h_pipe.layout, 0, 1, &ctx->mc8h_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->mc8h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out;
+
+    memcpy(dst, bd.mapped, dst_max);
+    rc = 0;
+
+out:
+    v3d_runner_destroy_buffer(ctx->runner, &bs);
+out_dst:
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+out_meta:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return rc;
+}
+
+/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */
+
+typedef struct {                /* push-constant block for the v3d_cdef.spv dispatch */
+    uint32_t n_blocks;          /* number of meta entries to process */
+    uint32_t tmp_stride_u16;    /* tmp row stride in uint16_t elements; the dispatcher always passes 16 */
+    uint32_t dst_stride_u8;     /* dst row stride, in bytes */
+    uint32_t _pad;              /* never set explicitly, so zero from the designated initializer; presumably pads the block to 16 bytes — TODO confirm against the shader */
+} cdef_pc;                      /* sizeof(cdef_pc) is the push-constant size given at pipeline creation */
+
+/*
+ * dispatch_cdef_qpu - run the AV1 CDEF 8x8 kernel (v3d_cdef.spv) on the QPU.
+ *
+ * The compute pipeline is built lazily on first use and cached in ctx. Every
+ * call creates three runner buffers (meta, dst, tmp), fills them from the
+ * caller's memory, records one compute dispatch, waits for it, and copies the
+ * dst buffer back over the caller's dst.
+ *
+ *   ctx         context holding the runner and the cached cdef pipeline
+ *   dst         destination bytes; dst[0 .. dst_max) is copied in and back out
+ *   dst_stride  dst row stride in bytes
+ *   tmp         16-bit input samples, addressed with a fixed row stride of 16
+ *               elements; tmp[0 .. tmp_max_u16) is copied to the QPU
+ *   n_blocks    number of entries in meta
+ *   meta        per-block dst_off, tmp_off_u16, pri/sec strength, damping, dir
+ *
+ * Returns 0 on success (n_blocks == 0 is a successful no-op) and -1 on any
+ * runner or Vulkan failure. dst is only written after a successful dispatch.
+ */
+static int dispatch_cdef_qpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    const uint16_t *tmp,
+    size_t n_blocks, const daedalus_cdef_meta *meta)
+{
+    /* Nothing to do; also avoids asking the runner for zero-byte buffers. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->cdef_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_cdef.spv",
+                                       3, sizeof(cdef_pc), &ctx->cdef_pipe) != 0)
+            return -1;
+        ctx->cdef_pipe_ready = 1;
+    }
+
+    /* Each block occupies four u32 words in the meta buffer. */
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
+
+    /* Staging sizes: one past the furthest element any block's centre 8x8
+     * touches, measured from the start of dst / tmp. */
+    size_t dst_max = 0, tmp_max_u16 = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
+        if (de > dst_max) dst_max = de;
+        /* NOTE(review): this covers only the centre 8x8 of the block with the
+         * highest tmp_off_u16. If the shader also reads taps below or to the
+         * right of that block, the tail it needs is never staged — confirm the
+         * shader's read reach before relying on this path. */
+        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8;  /* center 8x8 in stride-16 tmp */
+        if (te > tmp_max_u16) tmp_max_u16 = te;
+    }
+    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
+
+    int rc = -1;
+    v3d_buffer bm = {0}, bd = {0}, bt = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) goto out_meta;
+    if (v3d_runner_create_buffer(ctx->runner, tmp_bytes,  &bt)) goto out_dst;
+
+    /* tmp may need padding before block-origin offset (caller-allocated). Just
+     * copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
+     * caller has the layout set up. */
+    memcpy(bt.mapped, tmp, tmp_bytes);
+    /* dst is staged too so that bytes the shader does not write survive the
+     * full-range copy-back below. */
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        uint32_t pri = (uint32_t) meta[i].pri_strength;
+        uint32_t sec = (uint32_t) meta[i].sec_strength;
+        uint32_t damping = (uint32_t) meta[i].damping;
+        m[4*i+0] = meta[i].dst_off;
+        /* Word 1 packs pri in bits 0-7, sec from bit 8, damping from bit 16. */
+        m[4*i+1] = pri | (sec << 8) | (damping << 16);
+        m[4*i+2] = meta[i].tmp_off_u16;
+        m[4*i+3] = (uint32_t) meta[i].dir;
+    }
+
+    /* Binding order is meta, dst, tmp. */
+    v3d_buffer binds[3] = { bm, bd, bt };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3)) goto out;
+
+    /* One workgroup per 4 blocks, rounded up (presumably the shader's local
+     * size — confirm against v3d_cdef). */
+    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
+    cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
+                   .tmp_stride_u16 = 16,
+                   .dst_stride_u8 = (uint32_t) dst_stride };
+    /* NOTE(review): cb is not released on the error paths below; assumes the
+     * runner owns command buffers from v3d_runner_alloc_cmdbuf — confirm. */
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto out;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    /* Recording can fail (e.g. out of memory); do not submit a broken buffer. */
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->cdef_pipe.layout, 0, 1, &ctx->cdef_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->cdef_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out;
+
+    memcpy(dst, bd.mapped, dst_max);
+    rc = 0;
+
+out:
+    v3d_runner_destroy_buffer(ctx->runner, &bt);
+out_dst:
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+out_meta:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return rc;
+}
+
+/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */
+
+typedef struct {                /* push-constant block for the v3d_h264deblock.spv dispatch */
+    uint32_t n_edges;           /* number of meta entries (edges) to process */
+    uint32_t dst_stride_u8;     /* dst row stride, in bytes */
+    uint32_t _pad0;             /* never set explicitly, so zero from the designated initializer */
+    uint32_t _pad1;             /* likewise zero; the two pads presumably round the block to 16 bytes — TODO confirm against the shader */
+} h264deblock_pc;               /* sizeof(h264deblock_pc) is the push-constant size given at pipeline creation */
+
+/*
+ * dispatch_h264_deblock_qpu - run the H.264 deblock kernel
+ * (v3d_h264deblock.spv) on the QPU.
+ *
+ * The compute pipeline is built lazily on first use and cached in ctx. Every
+ * call creates two runner buffers (meta, dst), fills them from the caller's
+ * memory, records one compute dispatch, waits for it, and copies the dst
+ * buffer back over the caller's dst. The filter works in place on dst.
+ *
+ *   ctx         context holding the runner and the cached deblock pipeline
+ *   dst         picture bytes; dst[0 .. dst_max) is copied in and back out
+ *   dst_stride  dst row stride in bytes
+ *   n_edges     number of entries in meta
+ *   meta        per-edge dst_off, alpha, beta and four tc0 values
+ *
+ * Returns 0 on success (n_edges == 0 is a successful no-op) and -1 on any
+ * runner or Vulkan failure. dst is only written after a successful dispatch.
+ */
+static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    /* Nothing to do; also avoids asking the runner for zero-byte buffers. */
+    if (n_edges == 0)
+        return 0;
+
+    if (!ctx->h264deblock_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock.spv",
+                                       2, sizeof(h264deblock_pc), &ctx->h264deblock_pipe) != 0)
+            return -1;
+        ctx->h264deblock_pipe_ready = 1;
+    }
+
+    /* Each edge occupies four u32 words in the meta buffer. */
+    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
+    size_t dst_max = 0;
+    for (size_t i = 0; i < n_edges; i++) {
+        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride.
+         * Only the upper bound sizes the buffer. NOTE(review): the lower bound
+         * presumably requires dst_off >= 4*dst_stride — confirm callers
+         * guarantee it. */
+        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
+        if (e > dst_max) dst_max = e;
+    }
+
+    int rc = -1;
+    v3d_buffer bm = {0}, bd = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) goto out_meta;
+
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_edges; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        /* Word 1: alpha in the low byte, beta shifted up by 8. */
+        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
+        /* Word 2: the four tc0 values, one per byte. Going through uint8_t
+         * keeps a negative tc0 from sign-extending into neighbouring bytes. */
+        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
+                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
+        m[4*i+3] = 0;
+    }
+
+    /* Binding order is meta, dst. */
+    v3d_buffer binds[2] = { bm, bd };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2)) goto out;
+
+    /* One workgroup per 16 edges, rounded up (presumably the shader's local
+     * size — confirm against v3d_h264deblock). */
+    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
+    h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
+                          .dst_stride_u8 = (uint32_t) dst_stride };
+    /* NOTE(review): cb is not released on the error paths below; assumes the
+     * runner owns command buffers from v3d_runner_alloc_cmdbuf — confirm. */
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto out;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    /* Recording can fail (e.g. out of memory); do not submit a broken buffer. */
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->h264deblock_pipe.layout, 0, 1, &ctx->h264deblock_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->h264deblock_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out;
+
+    memcpy(dst, bd.mapped, dst_max);
+    rc = 0;
+
+out:
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+out_meta:
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return rc;
+}
+
 /* -------------------- Public dispatch entry points -------------- */

 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
@@ -507,8 +754,14 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
    const uint8_t *src, size_t src_stride,
    size_t n_blocks, const daedalus_mc_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
-                   dst, dst_stride, src, src_stride, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta);
+    return dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta);
 }

 int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -516,8 +769,14 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
    const uint16_t *tmp,
    size_t n_blocks, const daedalus_cdef_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
-                   dst, dst_stride, tmp, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
+    return dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
 }

 int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -542,8 +801,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
    uint8_t *dst, size_t dst_stride,
    size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_DEBLOCK_LV, dispatch_h264_deblock_cpu,
-                   dst, dst_stride, n_edges, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta);
+    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }

 /* -------------------- Recipe convenience wrappers --------------- */