Phase 8: wire LPF wd=4 + wd=8 QPU through public API

Mirror the IDCT pattern (lazy pipeline + per-call SSBO alloc + dispatch + readback) for cycles 2 (LPF wd=4) and 4 (LPF wd=8). Important bug, caught empirically: the two LPF shaders disagree on push-constant slot order — wd=4 puts dst_stride_u8 at slot 1, wd=8 puts it at slot 2 (with an unused blocks_per_row at slot 1). The initial single-struct attempt silently corrupted wd=8 output (1958/2048 = 95.6 % bit-exact on test_api_lpf). Fixed by keeping separate lpf4_pc and lpf8_pc struct definitions. The dst-window calculation handles both kernels (same -4..+3 byte footprint per row). test_api_lpf exercises both kernels in CPU / QPU / AUTO modes against the C reference. All 6 mode/kernel combinations pass 2048/2048 bit-exact (32 edges × 8 rows × 8 bytes/edge). Phase 8 status after this commit: 3 of 5 kernels are wired through the API for QPU dispatch (IDCT, LPF wd=4, LPF wd=8 — i.e., all 3 QPU-default kernels per recipe). Cycle 3 MC and cycle 5 CDEF still need wiring for opportunistic-override mode but aren't needed for the recipe-AUTO path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:57:25 +00:00
parent 1085c5699c
commit eb5cfb34c4
3 changed files with 262 additions and 5 deletions
@@ -30,6 +30,10 @@ struct daedalus_ctx {
    /* Per-kernel pipelines, lazy-created on first QPU dispatch. */
    int           idct8_pipe_ready;
    v3d_pipeline  idct8_pipe;
+    int           lpf4_pipe_ready;
+    v3d_pipeline  lpf4_pipe;
+    int           lpf8_pipe_ready;
+    v3d_pipeline  lpf8_pipe;
 };

 daedalus_ctx *daedalus_ctx_create(void)
@@ -58,9 +62,12 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
 void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
    if (!ctx) return;
-    if (ctx->idct8_pipe_ready && ctx->runner)
-        v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
-    if (ctx->runner) v3d_runner_destroy(ctx->runner);
+    /* Pipelines are destroyed through the runner, so they go first and
+     * only those whose *_pipe_ready flag is set (they are lazy-created);
+     * the runner itself is destroyed last. Without a runner none of this
+     * is attempted and only the context memory is released. */
+    if (ctx->runner) {
+        if (ctx->idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
+        if (ctx->lpf4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
+        if (ctx->lpf8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        v3d_runner_destroy(ctx->runner);
+    }
    free(ctx);
 }

@@ -272,6 +279,127 @@ fail:
    return -1;
 }

+/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
+ *
+ * NOTE: the two LPF shaders disagree on push-constant slot order.
+ * v3d_lpf_h_4_8.comp:  (n_edges, dst_stride_u8, _pad, _pad)
+ * v3d_lpf_h_8_8.comp:  (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
+ *
+ * Same total size (16 bytes), but slots 1 and 2 differ: dst_stride_u8
+ * is slot 1 for wd=4 and slot 2 for wd=8. Keep separate struct
+ * definitions to avoid silent corruption — Phase 8 caught this
+ * empirically when test_api_lpf wd=8 reported a 95.6 % match.
+ */
+/* Push constants for v3d_lpf_h_4_8.comp: four 32-bit slots, 16 bytes. */
+typedef struct {
+    uint32_t n_edges;          /* slot 0 */
+    uint32_t dst_stride_u8;    /* slot 1 */
+    uint32_t _pad0, _pad1;     /* slots 2-3: padding */
+} lpf4_pc;
+
+/* Push constants for v3d_lpf_h_8_8.comp: four 32-bit slots, 16 bytes. */
+typedef struct {
+    /* slot 0 */
+    uint32_t n_edges;
+    /* slot 1: not read by the shader, but the slot must be occupied */
+    uint32_t blocks_per_row;
+    /* slot 2 */
+    uint32_t dst_stride_u8;
+    /* slot 3: padding */
+    uint32_t _pad;
+} lpf8_pc;
+
+/*
+ * Create the LPF compute pipeline on first use.
+ *
+ * `flag` is the caller's "already built" marker for `pipe`: when it is
+ * non-zero nothing happens, otherwise the pipeline is created from the
+ * SPIR-V file `spv` with two SSBO bindings and a push-constant range
+ * sized for the kernel chosen by `wd_8` (lpf8_pc if non-zero, else
+ * lpf4_pc), and `flag` is set on success.
+ *
+ * Returns 0 if the pipeline is ready, -1 if creation failed.
+ */
+static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
+                                int *flag, v3d_pipeline *pipe,
+                                const char *spv)
+{
+    uint32_t push_const_bytes;
+    int rc;
+
+    if (*flag != 0)
+        return 0;
+
+    if (wd_8)
+        push_const_bytes = (uint32_t) sizeof(lpf8_pc);
+    else
+        push_const_bytes = (uint32_t) sizeof(lpf4_pc);
+
+    rc = v3d_runner_create_pipeline(ctx->runner, spv, /*n_ssbos=*/2,
+                                    push_const_bytes, pipe);
+    if (rc != 0)
+        return -1;
+
+    *flag = 1;
+    return 0;
+}
+
+/*
+ * Run one of the horizontal LPF shaders on the QPU over `n_edges` edges.
+ *
+ *   ctx        - context owning the runner and the lazily-created pipelines
+ *   wd_8       - non-zero selects v3d_lpf_h_8_8.spv, zero v3d_lpf_h_4_8.spv
+ *   dst        - pixel buffer, filtered in place
+ *   dst_stride - distance in bytes between consecutive rows of dst
+ *   n_edges    - number of entries in meta
+ *   meta       - per-edge dst_off / E / I / H parameters
+ *
+ * The smallest byte window of dst that covers every edge is copied into
+ * a per-call buffer, filtered there, and copied back. Each edge touches
+ * bytes [dst_off - 4, dst_off + 3] on 8 consecutive rows, so an edge
+ * with dst_off < 4 would reach in front of dst and is rejected.
+ *
+ * Returns 0 on success (n_edges == 0 is a successful no-op) and -1 on
+ * invalid arguments or any runner/Vulkan failure. dst is written only
+ * after the GPU work has completed successfully.
+ */
+static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
+    uint8_t *dst, size_t dst_stride,
+    size_t n_edges, const daedalus_lpf_meta *meta)
+{
+    enum {
+        LPF_ROWS        = 8,    /* rows touched per edge */
+        LPF_REACH       = 4,    /* bytes touched on each side of dst_off */
+        META_WORDS      = 4,    /* one uvec4 per edge */
+        EDGES_PER_GROUP = 32    /* edges covered by one workgroup */
+    };
+
+    if (!ctx || !ctx->runner) return -1;
+    /* Nothing to filter; also keeps zero-sized buffers away from the runner. */
+    if (n_edges == 0) return 0;
+    if (!dst || !meta) return -1;
+
+    /* n_edges and dst_stride reach the shader as 32-bit push constants. */
+    if (n_edges > UINT32_MAX || dst_stride > UINT32_MAX) return -1;
+    if (n_edges > SIZE_MAX / (META_WORDS * sizeof(uint32_t))) return -1;
+    size_t meta_bytes = n_edges * META_WORDS * sizeof(uint32_t);
+
+    /* Smallest dst window [lo, hi) that contains every edge footprint. */
+    size_t lo = SIZE_MAX, hi = 0;
+    for (size_t i = 0; i < n_edges; i++) {
+        size_t base = meta[i].dst_off;
+        if (base < LPF_REACH) return -1;   /* footprint would start before dst */
+        size_t edge_lo = base - LPF_REACH;
+        size_t edge_hi = base + (size_t)(LPF_ROWS - 1) * dst_stride + LPF_REACH;
+        if (edge_lo < lo) lo = edge_lo;
+        if (edge_hi > hi) hi = edge_hi;
+    }
+    size_t dst_window_size = hi - lo;
+    /* Window-relative offsets are handed to the shader as uint32_t. */
+    if (dst_window_size > UINT32_MAX) return -1;
+
+    int *flag       = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
+    v3d_pipeline *p = wd_8 ? &ctx->lpf8_pipe       : &ctx->lpf4_pipe;
+    const char *spv = wd_8 ? "v3d_lpf_h_8_8.spv"   : "v3d_lpf_h_4_8.spv";
+    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;
+
+    int rc = -1;
+    v3d_buffer buf_meta = {0}, buf_dst = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) goto out_meta;
+
+    /* Upload the window and the per-edge metadata, rebased onto the window. */
+    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
+    uint32_t *m = buf_meta.mapped;
+    for (size_t i = 0; i < n_edges; i++) {
+        m[META_WORDS*i + 0] = (uint32_t)(meta[i].dst_off - lo);
+        m[META_WORDS*i + 1] = (uint32_t) meta[i].E;
+        m[META_WORDS*i + 2] = (uint32_t) meta[i].I;
+        m[META_WORDS*i + 3] = (uint32_t) meta[i].H;
+    }
+
+    v3d_buffer binds[2] = { buf_meta, buf_dst };
+    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto out_dst;
+
+    /* Round up without forming n_edges + 31, which could wrap. */
+    uint32_t wg_count = (uint32_t)(n_edges / EDGES_PER_GROUP
+                                   + (n_edges % EDGES_PER_GROUP != 0));
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto out_dst;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_dst;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            p->layout, 0, 1, &p->desc_set, 0, NULL);
+    /* The two shaders place dst_stride_u8 in different push-constant
+     * slots, so each kernel gets its own struct. */
+    if (wd_8) {
+        lpf8_pc pc = { .n_edges = (uint32_t) n_edges,
+                       .blocks_per_row = 0,
+                       .dst_stride_u8 = (uint32_t) dst_stride,
+                       ._pad = 0 };
+        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                           0, sizeof(pc), &pc);
+    } else {
+        lpf4_pc pc = { .n_edges = (uint32_t) n_edges,
+                       .dst_stride_u8 = (uint32_t) dst_stride };
+        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                           0, sizeof(pc), &pc);
+    }
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_dst;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_dst;
+
+    /* Read the filtered window back over the caller's pixels. */
+    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
+    rc = 0;
+
+out_dst:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+out_meta:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    return rc;
+}
+
 /* -------------------- Public dispatch entry points -------------- */

 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
@@ -308,7 +436,7 @@ int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
-    return -1;
+    return dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
 }

 int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -322,7 +450,7 @@ int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
        eff = DAEDALUS_SUBSTRATE_CPU;
    if (eff == DAEDALUS_SUBSTRATE_CPU)
        return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
-    return -1;
+    return dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
 }

 int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,