Phase 6 (v1+v4 production) + Phase 7 closure: R = 0.92 ± 0.03 on hertz

First QPU IDCT8 kernel running and bit-exact on V3D 7.1 via Mesa v3dv
compute. Five iterations through a Phase 7→Phase 4' loopback; production
kernel is v4.

New files:
- src/v3d_runner.{c,h} — reusable Vulkan compute plumbing (instance,
  V3D device picker, HOST_VISIBLE|COHERENT SSBOs with mmap, compute
  pipeline from .spv, enables storageBuffer{8,16}BitAccess)
- src/v3d_idct8.comp — VP9 8x8 DCT_DCT IDCT add, v4 production:
  256 invocations/WG, 2 blocks/subgroup (no idle lanes), uint8 dst SSBO
  (race-free per phase5 finding 5), unrolled writes (no chained ternary),
  oob-flag pattern (barrier-safe per phase5 finding 7)
- tests/bench_v3d_idct.c — M1' bit-exact gate + M2 throughput vs C ref
- docs/phase7.md — full iteration journey + decision verdict

CMakeLists.txt updated to build the new shader, library, and bench when
DAEDALUS_BUILD_VULKAN=ON.

Iteration record (1920x1088 luma, 32640 blocks/dispatch, N=3):

  ver  change                           R       ns/block
  v1   first-light                      0.230   533
  v2   kill ternary + 2-blocks-per-sg   0.474   258
  v3   per-pass scope oN                0.481   254 (noise)
  v4   WG 64 -> 256 invocations         0.947   129
  v5   packed uint32 coeff reads        0.938   130 (noise, reverted)
  v4   final N=3                        0.918 +/- 0.033

Bit-exactness 100.0000% across all iterations (10000-block sample on
128x128, 32640-block sample on 1080p) against both the C reference
(tests/vp9_idct8_ref.c) and the vendored FFmpeg NEON
ff_vp9_idct_idct_8x8_add_neon.

Key learning over the Phase 5 review's prediction model: the chained
ternary was NOT a spill killer on V3D 7.1 (shaderdb showed 0:0
spills:fills even in v1). The actual lever was workgroup-size-driven
latency hiding — going from 64 to 256 invocations doubled throughput with
the same compiled code (270 inst, 2 threads, 21 max-temps, 0 spills)
because the v3dv scheduler had 4x more in-flight work to overlap TMU
latency.

Verdict per phase1.md decision rules: YELLOW band (0.5 <= R < 1.0) by a
wide margin, near GREEN boundary.

Phase 1 YELLOW rule: add M4 (concurrent CPU+QPU throughput) before
honest-close or continue. M4 is the next measurement, not more shader
tuning — at R = 0.92 with all 4 A76 cores still 100% free for other work,
the question is whether the system aggregate beats pure 4-core NEON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:09:00 +00:00
parent 71db72928f
commit d66f22f333
6 changed files with 1267 additions and 1 deletions
@@ -0,0 +1,217 @@
+// daedalus-fourier — VP9 8×8 DCT_DCT inverse-transform-add, V3D 7.1.
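+// Current revision: v4 (see the local_size comment further down).
+// The notes below were written for v2 (post-Phase-7 loopback, Phase 4'
+// iteration 1) and are kept as history: their blocks-per-WG, dispatch
+// count and shared-memory figures describe v2, not the v4 kernel.
+//
+// Changes from v1 (per phase47 iteration 1 + Sonnet v3d perf research):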
+//
+//   Opt 1 — kill the chained ternary. v1's row-pass write had
+//           `(r==0)?o0:(r==1)?o1:...` inside a `for r` loop; that
+//           kept all 8 oN scalars live across 7 phi nodes and was
+//           expected to force register spills (Iago Toral 2021,
+//           blogs.igalia.com/itoral). v2 unrolls the 8 writes
+//           completely — each oN is used exactly once.
+//           (shaderdb later reported 0 spills / 0 fills even for v1,
+//           so the spill prediction did not hold on V3D 7.1.)
+//
+//   Opt 2 — 2 blocks per subgroup. v1 had 1 block per 16-lane
+//           subgroup with 8 lanes idle per phase. v2 packs 2 blocks
+//           per subgroup (one in lanes 0..7, one in lanes 8..15),
+//           and every lane runs both passes for its own block.
+//           Eliminates idle lanes AND removes the col_pass/row_pass
+//           branch divergence. 8 blocks per WG (vs 4 before),
+//           dispatch count halves from 8160 to 4080 on 1080p.
+//           Shared-mem footprint doubles to 2 KiB (still « 16 KiB).
+//
+// (Opt 3 — packed uint32 storage — was deferred at v2. It was later
+// tried as v5, gave no measurable gain, and was reverted; see the note
+// under the Coeffs buffer declaration.)
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_16bit_storage            : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+// v4: local_size 256 (was 64) — 16 subgroups × 16 lanes = 32 blocks/WG.
+// More in-flight work per WG = more latency hiding for v3d's TMU.
+// shared = 32 × 64 × 4 B = 8 KiB (still under 16 KiB).
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Coeffs {
+    int16_t coeffs[];   // N × 64 packed
+} u_coeffs;
+// (v5 tried uint32-packed reads with manual unpack — no measurable
+// perf change vs int16, added code complexity; reverted.)
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];      // H × stride bytes
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec2 meta[];       // per-block (block_x_8, block_y_8)
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint blocks_per_row;   // unused (meta drives position)
+    uint dst_stride_u8;
+    uint _pad;
+} pc;
+
+// 32 blocks per WG × 64 i32 per block × 4 B = 8192 B shared.
+shared int tmp_shared[32 * 64];
+
+// VP9 Q14 trig constants (spec §8.7.1.4).
+const int COSPI_16 = 11585;
+const int COSPI_24 =  6270;
+const int COSPI_08 = 15137;
+const int COSPI_28 =  3196;
+const int COSPI_04 = 16069;
+const int COSPI_20 =  9102;
+const int COSPI_12 = 13623;
+
+// Rounding right shift by 14 bits: adds half of 2^14 (1 << 13) to x
+// before shifting. Used to bring products with the Q14 COSPI_*
+// constants back to integer scale.
+int qround14(int x) { return (x + (1 << 13)) >> 14; }
+
+// One-dimensional 8-point inverse transform on integers.
+//   i0..i7 : the eight input samples of one row/column.
+//   o0..o7 : the eight output samples (written, never read).
+// Every multiplication by a COSPI_* constant is followed by qround14(),
+// so all intermediate values stay at the input's scale.
+void idct8_1d(int i0, int i1, int i2, int i3,
+              int i4, int i5, int i6, int i7,
+              out int o0, out int o1, out int o2, out int o3,
+              out int o4, out int o5, out int o6, out int o7)
+{
+    // Stage 1: rotations. Even-indexed inputs feed t0a..t3a,
+    // odd-indexed inputs feed t4a..t7a.
+    int t0a = qround14((i0 + i4) * COSPI_16);
+    int t1a = qround14((i0 - i4) * COSPI_16);
+    int t2a = qround14(i2 * COSPI_24 - i6 * COSPI_08);
+    int t3a = qround14(i2 * COSPI_08 + i6 * COSPI_24);
+    int t4a = qround14(i1 * COSPI_28 - i7 * COSPI_04);
+    int t5a = qround14(i5 * COSPI_12 - i3 * COSPI_20);
+    int t6a = qround14(i5 * COSPI_20 + i3 * COSPI_12);
+    int t7a = qround14(i1 * COSPI_04 + i7 * COSPI_28);
+
+    // Stage 2: add/subtract butterflies within each half.
+    int t0 = t0a + t3a, t1 = t1a + t2a;
+    int t2 = t1a - t2a, t3 = t0a - t3a;
+    int t4  = t4a + t5a;
+    int t5p = t4a - t5a;
+    int t7  = t7a + t6a;
+    int t6p = t7a - t6a;
+
+    // Stage 3: the two middle odd terms get one more COSPI_16 rotation.
+    int t5 = qround14((t6p - t5p) * COSPI_16);
+    int t6 = qround14((t6p + t5p) * COSPI_16);
+
+    // Stage 4: final butterfly — outputs n and 7-n are the sum and
+    // difference of the same pair.
+    o0 = t0 + t7; o1 = t1 + t6;
+    o2 = t2 + t5; o3 = t3 + t4;
+    o4 = t3 - t4; o5 = t2 - t5;
+    o6 = t1 - t6; o7 = t0 - t7;
+}
+
+// Entry point. Each invocation handles index k (0..7) of one 8x8 block:
+// it transforms eight coefficients in the first pass, exchanges results
+// through tmp_shared, transforms again in the second pass, and adds the
+// rounded result to eight bytes of dst, clamped to 0..255.
+void main()
+{
+    // ---- Lane / block decomposition --------------------------------
+    // 256 invocations/WG = 16 subgroups × 16 lanes/subgroup.
+    // Each subgroup packs 2 blocks (one in lanes 0..7, one in lanes 8..15).
+    // 32 blocks per WG total.
+    //
+    // Every lane runs both column and row pass for its own block —
+    // no idle lanes, no col_pass/row_pass branch divergence.
+
+    uint gid          = gl_GlobalInvocationID.x;
+    uint wg_id        = gid / 256u;
+    uint lane_in_wg   = gid & 255u;
+    uint sg_in_wg     = lane_in_wg >> 4;          // 0..15
+    uint lane_in_sg   = lane_in_wg & 15u;
+    uint block_slot   = lane_in_sg >> 3;          // 0 (lanes 0..7) or 1 (lanes 8..15)
+    uint k            = lane_in_sg & 7u;          // 0..7
+
+    uint block_local  = sg_in_wg * 2u + block_slot;   // 0..31 within WG
+    uint block_idx    = wg_id * 32u + block_local;
+
+    // OOB flag — gates work bodies, but barrier() is reached by all.
+    // Per phase5.md finding 7.
+    bool oob = (block_idx >= pc.n_blocks);
+
+    // ---- Column pass ----------------------------------------------
+    // v3 (Opt 4): scope oN inside each pass so they're dead at the
+    // barrier — v2 had them function-scope which inflated max-temps
+    // (shaderdb reported 20 max-temps / 2 threads instead of 4 threads
+    // possible). Lower temps → more hardware threads → better
+    // latency hiding.
+    if (!oob) {
+        // Gather the eight coefficients at offset k within each group of
+        // eight (stride 8 through this block's 64 coefficients).
+        uint base = block_idx * 64u;
+        int c0 = int(u_coeffs.coeffs[base + 0u * 8u + k]);
+        int c1 = int(u_coeffs.coeffs[base + 1u * 8u + k]);
+        int c2 = int(u_coeffs.coeffs[base + 2u * 8u + k]);
+        int c3 = int(u_coeffs.coeffs[base + 3u * 8u + k]);
+        int c4 = int(u_coeffs.coeffs[base + 4u * 8u + k]);
+        int c5 = int(u_coeffs.coeffs[base + 5u * 8u + k]);
+        int c6 = int(u_coeffs.coeffs[base + 6u * 8u + k]);
+        int c7 = int(u_coeffs.coeffs[base + 7u * 8u + k]);
+
+        int o0, o1, o2, o3, o4, o5, o6, o7;
+        idct8_1d(c0, c1, c2, c3, c4, c5, c6, c7,
+                 o0, o1, o2, o3, o4, o5, o6, o7);
+
+        // Transposed write: row k of tmp_shared[block_local].
+        uint tbase = block_local * 64u + k * 8u;
+        tmp_shared[tbase + 0u] = o0;
+        tmp_shared[tbase + 1u] = o1;
+        tmp_shared[tbase + 2u] = o2;
+        tmp_shared[tbase + 3u] = o3;
+        tmp_shared[tbase + 4u] = o4;
+        tmp_shared[tbase + 5u] = o5;
+        tmp_shared[tbase + 6u] = o6;
+        tmp_shared[tbase + 7u] = o7;
+    }
+
+    barrier();   // unconditional — every lane in the WG reaches this
+
+    // ---- Row pass --------------------------------------------------
+    if (!oob) {
+        // Read column k of tmp_shared[block_local].
+        uint tbase = block_local * 64u;
+        int s0 = tmp_shared[tbase + 0u * 8u + k];
+        int s1 = tmp_shared[tbase + 1u * 8u + k];
+        int s2 = tmp_shared[tbase + 2u * 8u + k];
+        int s3 = tmp_shared[tbase + 3u * 8u + k];
+        int s4 = tmp_shared[tbase + 4u * 8u + k];
+        int s5 = tmp_shared[tbase + 5u * 8u + k];
+        int s6 = tmp_shared[tbase + 6u * 8u + k];
+        int s7 = tmp_shared[tbase + 7u * 8u + k];
+
+        int o0, o1, o2, o3, o4, o5, o6, o7;
+        idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7,
+                 o0, o1, o2, o3, o4, o5, o6, o7);
+
+        // Columnar write into dst. Each lane owns column k of its block.
+        // Block position in dst from meta (in units of 8 pixels, hence
+        // the * 8u below).
+        uvec2 bp = u_meta.meta[block_idx];
+        uint block_x = bp.x;
+        uint block_y = bp.y;
+        uint dx     = block_x * 8u + k;
+        uint dy0    = block_y * 8u;
+        uint stride = pc.dst_stride_u8;
+
+        // Opt 1: 8 fully-unrolled writes — each o_i used exactly once.
+        // No chained ternary, no loop with runtime-variable index.
+        uint a0 = (dy0 + 0u) * stride + dx;
+        uint a1 = (dy0 + 1u) * stride + dx;
+        uint a2 = (dy0 + 2u) * stride + dx;
+        uint a3 = (dy0 + 3u) * stride + dx;
+        uint a4 = (dy0 + 4u) * stride + dx;
+        uint a5 = (dy0 + 5u) * stride + dx;
+        uint a6 = (dy0 + 6u) * stride + dx;
+        uint a7 = (dy0 + 7u) * stride + dx;
+
+        // Read the existing dst bytes that the residual is added to.
+        int p0 = int(u_dst.dst[a0]);
+        int p1 = int(u_dst.dst[a1]);
+        int p2 = int(u_dst.dst[a2]);
+        int p3 = int(u_dst.dst[a3]);
+        int p4 = int(u_dst.dst[a4]);
+        int p5 = int(u_dst.dst[a5]);
+        int p6 = int(u_dst.dst[a6]);
+        int p7 = int(u_dst.dst[a7]);
+
+        // Final rounding ((o + 16) >> 5), add to the existing byte, and
+        // clamp to the uint8 range before storing.
+        u_dst.dst[a0] = uint8_t(clamp(p0 + ((o0 + 16) >> 5), 0, 255));
+        u_dst.dst[a1] = uint8_t(clamp(p1 + ((o1 + 16) >> 5), 0, 255));
+        u_dst.dst[a2] = uint8_t(clamp(p2 + ((o2 + 16) >> 5), 0, 255));
+        u_dst.dst[a3] = uint8_t(clamp(p3 + ((o3 + 16) >> 5), 0, 255));
+        u_dst.dst[a4] = uint8_t(clamp(p4 + ((o4 + 16) >> 5), 0, 255));
+        u_dst.dst[a5] = uint8_t(clamp(p5 + ((o5 + 16) >> 5), 0, 255));
+        u_dst.dst[a6] = uint8_t(clamp(p6 + ((o6 + 16) >> 5), 0, 255));
+        u_dst.dst[a7] = uint8_t(clamp(p7 + ((o7 + 16) >> 5), 0, 255));
+    }
+}
@@ -0,0 +1,435 @@
+/*
+ * v3d_runner — implementation. See v3d_runner.h.
+ *
+ * License: BSD-2-Clause.
+ */
+#include "v3d_runner.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* CHK(call): evaluate a Vulkan call once; if the result is not
+ * VK_SUCCESS, print the result code, file, line and the call text to
+ * stderr and `return -1` from the enclosing function.
+ * NOTE: the return is immediate — nothing the caller acquired earlier
+ * is released, so use it only where there is nothing to clean up. */
+#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
+    fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
+            r__, __FILE__, __LINE__, #call); return -1; } } while (0)
+
+/* CHK_NULL(call): same as CHK but returns NULL, for functions that
+ * return a pointer. The same no-cleanup caveat applies. */
+#define CHK_NULL(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
+    fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
+            r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
+
+/* Runner state. Allocated zeroed by v3d_runner_create() and released by
+ * v3d_runner_destroy(). */
+struct v3d_runner {
+    VkInstance       instance;      /* created in v3d_runner_create() */
+    VkPhysicalDevice phys;          /* first device whose name contains "V3D" */
+    VkDevice         device;        /* logical device on `phys` */
+    VkQueue          queue;         /* queue 0 of `queue_family` */
+    uint32_t         queue_family;  /* first family with VK_QUEUE_COMPUTE_BIT */
+    VkCommandPool    pool;          /* command pool on `queue_family` */
+    char             device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; /* copy of deviceName */
+    VkPhysicalDeviceMemoryProperties mem_props; /* cached for buffer allocation */
+};
+
+/*
+ * Select the first physical device whose deviceName contains "V3D".
+ *
+ *   inst      instance to enumerate.
+ *   out       receives the chosen device handle on success.
+ *   name_out  receives a copy of the chosen device's deviceName.
+ *
+ * Returns 0 on success, -1 if enumeration fails, memory is exhausted,
+ * or no matching device exists (a message is written to stderr).
+ */
+static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
+                                    char name_out[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE])
+{
+    uint32_t n = 0;
+    if (vkEnumeratePhysicalDevices(inst, &n, NULL) != VK_SUCCESS || n == 0) {
+        fprintf(stderr, "v3d_runner: no Vulkan physical devices\n");
+        return -1;
+    }
+    VkPhysicalDevice *pds = calloc(n, sizeof(*pds));
+    if (!pds) return -1;
+
+    /* The fill call can fail independently of the count call. On a hard
+     * error the array contents are unspecified, so do not read them.
+     * VK_INCOMPLETE is acceptable: `n` is updated to the number of
+     * handles actually written. */
+    VkResult vr = vkEnumeratePhysicalDevices(inst, &n, pds);
+    if (vr != VK_SUCCESS && vr != VK_INCOMPLETE) {
+        fprintf(stderr, "v3d_runner: vkEnumeratePhysicalDevices failed (%d)\n",
+                vr);
+        free(pds);
+        return -1;
+    }
+
+    int picked = -1;
+    for (uint32_t i = 0; i < n; i++) {
+        VkPhysicalDeviceProperties p;
+        vkGetPhysicalDeviceProperties(pds[i], &p);
+        if (strstr(p.deviceName, "V3D") != NULL) {
+            *out = pds[i];
+            memcpy(name_out, p.deviceName, sizeof(p.deviceName));
+            picked = 0;
+            break;
+        }
+    }
+    free(pds);
+    if (picked != 0)
+        fprintf(stderr, "v3d_runner: no V3D device found (looked for "
+                        "\"V3D\" substring in deviceName)\n");
+    return picked;
+}
+
+/*
+ * Return the index of the first queue family on `phys` that advertises
+ * VK_QUEUE_COMPUTE_BIT. Returns UINT32_MAX when no family qualifies or
+ * the temporary property array cannot be allocated.
+ */
+static uint32_t pick_compute_queue_family(VkPhysicalDevice phys)
+{
+    uint32_t family_count = 0;
+    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, NULL);
+
+    VkQueueFamilyProperties *props = malloc(family_count * sizeof(*props));
+    if (props == NULL)
+        return UINT32_MAX;
+    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, props);
+
+    /* Linear scan; stop at the first compute-capable family. */
+    uint32_t found = UINT32_MAX;
+    uint32_t idx = 0;
+    while (idx < family_count && found == UINT32_MAX) {
+        if ((props[idx].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0)
+            found = idx;
+        idx++;
+    }
+
+    free(props);
+    return found;
+}
+
+/*
+ * Build a runner: Vulkan instance (API 1.3), the first "V3D" physical
+ * device, the first compute-capable queue family, a logical device with
+ * the 8-bit and 16-bit storage features requested, and a command pool
+ * whose command buffers can be reset individually.
+ *
+ * Returns the new runner, or NULL on failure after writing a message to
+ * stderr. Every failure path releases whatever was created up to that
+ * point, including the runner allocation itself.
+ */
+v3d_runner *v3d_runner_create(void)
+{
+    v3d_runner *r = calloc(1, sizeof(*r));
+    if (!r) return NULL;
+
+    /* Instance — Vulkan 1.3 to inherit 1.2 promoted features. */
+    VkApplicationInfo app = {
+        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+        .pApplicationName = "daedalus-fourier",
+        .apiVersion = VK_API_VERSION_1_3,
+    };
+    VkInstanceCreateInfo ici = {
+        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        .pApplicationInfo = &app,
+    };
+    /* Checked by hand rather than with CHK_NULL: that macro returns
+     * immediately and would leak `r`. */
+    VkResult vr = vkCreateInstance(&ici, NULL, &r->instance);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__,
+                "vkCreateInstance(&ici, NULL, &r->instance)");
+        goto fail_alloc;
+    }
+
+    if (pick_v3d_physical_device(r->instance, &r->phys, r->device_name) != 0)
+        goto fail_instance;
+
+    vkGetPhysicalDeviceMemoryProperties(r->phys, &r->mem_props);
+
+    r->queue_family = pick_compute_queue_family(r->phys);
+    if (r->queue_family == UINT32_MAX) {
+        fprintf(stderr, "v3d_runner: no compute queue family\n");
+        goto fail_instance;
+    }
+
+    /* Enable 8-bit + 16-bit storage features. Both are exposed on
+     * V3D 7.1 per vulkaninfo_v3d_7_1_7_hertz.txt; the kernel
+     * declares storageBuffer8BitAccess (uint8_t dst[]) and
+     * storageBuffer16BitAccess (int16_t coeffs[]).
+     */
+    VkPhysicalDevice16BitStorageFeatures f16 = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES,
+        .storageBuffer16BitAccess = VK_TRUE,
+        .uniformAndStorageBuffer16BitAccess = VK_TRUE,
+    };
+    VkPhysicalDevice8BitStorageFeatures f8 = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
+        .pNext = &f16,
+        .storageBuffer8BitAccess = VK_TRUE,
+        .uniformAndStorageBuffer8BitAccess = VK_TRUE,
+    };
+    VkPhysicalDeviceFeatures2 f2 = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+        .pNext = &f8,
+    };
+
+    float qprio = 1.0f;
+    VkDeviceQueueCreateInfo dqci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+        .queueFamilyIndex = r->queue_family,
+        .queueCount = 1,
+        .pQueuePriorities = &qprio,
+    };
+    VkDeviceCreateInfo dci = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext = &f2,
+        .queueCreateInfoCount = 1,
+        .pQueueCreateInfos = &dqci,
+    };
+    if (vkCreateDevice(r->phys, &dci, NULL, &r->device) != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vkCreateDevice failed\n");
+        goto fail_instance;
+    }
+    vkGetDeviceQueue(r->device, r->queue_family, 0, &r->queue);
+
+    VkCommandPoolCreateInfo cpci = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+        .queueFamilyIndex = r->queue_family,
+    };
+    if (vkCreateCommandPool(r->device, &cpci, NULL, &r->pool) != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vkCreateCommandPool failed\n");
+        goto fail_device;
+    }
+
+    return r;
+
+    /* Unwind in reverse order of creation. */
+fail_device:
+    vkDestroyDevice(r->device, NULL);
+fail_instance:
+    vkDestroyInstance(r->instance, NULL);
+fail_alloc:
+    free(r);
+    return NULL;
+}
+
+/*
+ * Tear down a runner in reverse order of creation: wait for the device
+ * to go idle, destroy the command pool, the device and the instance,
+ * then free the struct. Handles that are VK_NULL_HANDLE are skipped.
+ * Passing NULL is a no-op.
+ */
+void v3d_runner_destroy(v3d_runner *r)
+{
+    if (r == NULL)
+        return;
+
+    const int have_device = (r->device != VK_NULL_HANDLE);
+
+    if (have_device)
+        vkDeviceWaitIdle(r->device);
+
+    if (r->pool != VK_NULL_HANDLE)
+        vkDestroyCommandPool(r->device, r->pool, NULL);
+
+    if (have_device)
+        vkDestroyDevice(r->device, NULL);
+
+    if (r->instance != VK_NULL_HANDLE)
+        vkDestroyInstance(r->instance, NULL);
+
+    free(r);
+}
+
+/* Read-only accessors for runner internals. Each dereferences `r`
+ * unconditionally, so `r` must be non-NULL. The returned name points
+ * into the runner struct itself. */
+VkDevice      v3d_runner_device(v3d_runner *r)        { return r->device; }
+VkQueue       v3d_runner_queue(v3d_runner *r)         { return r->queue; }
+uint32_t      v3d_runner_queue_family(v3d_runner *r)  { return r->queue_family; }
+VkCommandPool v3d_runner_cmd_pool(v3d_runner *r)      { return r->pool; }
+const char   *v3d_runner_device_name(v3d_runner *r)   { return r->device_name; }
+
+/* ---- Buffers ---------------------------------------------------- */
+
+/*
+ * Find the lowest memory-type index that is both permitted by
+ * `type_bits` (bit i set means type i is allowed) and has every flag in
+ * `wanted` set. Returns the index, or -1 if no type qualifies.
+ */
+static int find_memory_type(VkPhysicalDeviceMemoryProperties *p,
+                            uint32_t type_bits, VkMemoryPropertyFlags wanted)
+{
+    const uint32_t type_count = p->memoryTypeCount;
+
+    for (uint32_t idx = 0; idx < type_count; idx++) {
+        const uint32_t mask = 1u << idx;
+        if ((type_bits & mask) == 0)
+            continue;   /* not allowed for this resource */
+
+        const VkMemoryPropertyFlags have = p->memoryTypes[idx].propertyFlags;
+        if ((have & wanted) != wanted)
+            continue;   /* missing at least one requested property */
+
+        return (int) idx;
+    }
+    return -1;
+}
+
+/*
+ * Create a storage buffer of `size` bytes backed by HOST_VISIBLE |
+ * HOST_COHERENT memory and map it persistently.
+ *
+ *   r     runner that owns the device.
+ *   size  requested buffer size in bytes.
+ *   out   receives the buffer, its memory, the mapped pointer and size.
+ *
+ * Returns 0 on success. On failure returns -1 after writing a message
+ * to stderr; anything created along the way is destroyed and `*out` is
+ * left zeroed, so the caller has nothing to clean up.
+ */
+int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
+{
+    VkResult vr;
+
+    memset(out, 0, sizeof(*out));
+    out->size = size;
+
+    VkBufferCreateInfo bci = {
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .size = size,
+        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
+               | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
+               | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+    };
+    vr = vkCreateBuffer(r->device, &bci, NULL, &out->buffer);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkCreateBuffer");
+        goto fail;
+    }
+
+    VkMemoryRequirements req;
+    vkGetBufferMemoryRequirements(r->device, out->buffer, &req);
+
+    /* HOST_VISIBLE | HOST_COHERENT is the unified-memory zero-copy
+     * path on Pi 5: CPU and GPU see the same LPDDR4x physical pages,
+     * no explicit flush/invalidate needed (the COHERENT bit asserts
+     * that). */
+    int mt = find_memory_type(&r->mem_props, req.memoryTypeBits,
+                              VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+                            | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+    if (mt < 0) {
+        fprintf(stderr, "v3d_runner: no HOST_VISIBLE|COHERENT memory type\n");
+        goto fail_buffer;
+    }
+
+    VkMemoryAllocateInfo mai = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .allocationSize = req.size,
+        .memoryTypeIndex = (uint32_t) mt,
+    };
+    vr = vkAllocateMemory(r->device, &mai, NULL, &out->memory);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkAllocateMemory");
+        goto fail_buffer;
+    }
+    vr = vkBindBufferMemory(r->device, out->buffer, out->memory, 0);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkBindBufferMemory");
+        goto fail_memory;
+    }
+    vr = vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0, &out->mapped);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkMapMemory");
+        goto fail_memory;
+    }
+    return 0;
+
+    /* Unwind: the buffer is destroyed before the memory it may be bound
+     * to is freed, matching v3d_runner_destroy_buffer(). */
+fail_memory:
+    vkDestroyBuffer(r->device, out->buffer, NULL);
+    vkFreeMemory(r->device, out->memory, NULL);
+    goto fail;
+fail_buffer:
+    vkDestroyBuffer(r->device, out->buffer, NULL);
+fail:
+    memset(out, 0, sizeof(*out));
+    return -1;
+}
+
+/*
+ * Release a buffer made by v3d_runner_create_buffer(): unmap it if it
+ * is mapped, destroy the VkBuffer, free its memory, and zero the
+ * struct. A NULL `buf` or one whose buffer handle is VK_NULL_HANDLE is
+ * ignored.
+ */
+void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
+{
+    if (buf == NULL)
+        return;
+    if (buf->buffer == VK_NULL_HANDLE)
+        return;
+
+    VkDevice dev = r->device;
+
+    if (buf->mapped != NULL)
+        vkUnmapMemory(dev, buf->memory);
+    vkDestroyBuffer(dev, buf->buffer, NULL);
+    vkFreeMemory(dev, buf->memory, NULL);
+
+    /* Zeroing makes a second destroy call a harmless no-op. */
+    memset(buf, 0, sizeof(*buf));
+}
+
+/* ---- Pipelines -------------------------------------------------- */
+
+/*
+ * Read a whole SPIR-V file into a freshly malloc'd buffer.
+ *
+ *   path      file to read (opened in binary mode).
+ *   out_size  receives the file size in bytes on success.
+ *
+ * The size must be positive and a multiple of 4. Returns the buffer
+ * (caller frees), or NULL after writing a message to stderr; `*out_size`
+ * is left untouched on failure.
+ */
+static uint32_t *read_spv(const char *path, size_t *out_size)
+{
+    FILE *f = fopen(path, "rb");
+    if (!f) { perror(path); return NULL; }
+
+    uint32_t *buf = NULL;
+
+    /* Size the file by seeking to its end; a failed seek must not be
+     * mistaken for a valid offset. */
+    if (fseek(f, 0, SEEK_END) != 0) {
+        perror(path);
+        goto done;
+    }
+    long sz = ftell(f);
+    if (sz <= 0 || (sz & 3)) {
+        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
+        goto done;
+    }
+    if (fseek(f, 0, SEEK_SET) != 0) {
+        perror(path);
+        goto done;
+    }
+
+    buf = malloc((size_t) sz);
+    if (!buf) {
+        fprintf(stderr, "%s: out of memory reading %ld bytes\n", path, sz);
+        goto done;
+    }
+    if (fread(buf, 1, (size_t) sz, f) != (size_t) sz) {
+        /* A short read does not necessarily set errno, so report it
+         * explicitly instead of via perror(). */
+        fprintf(stderr, "%s: short read (expected %ld bytes)\n", path, sz);
+        free(buf);
+        buf = NULL;
+        goto done;
+    }
+    *out_size = (size_t) sz;
+
+done:
+    fclose(f);
+    return buf;
+}
+
+/*
+ * Build a compute pipeline from the SPIR-V file at `spv_path`.
+ *
+ *   r                runner that owns the device.
+ *   spv_path         path of the compiled shader; entry point is "main".
+ *   n_ssbos          number of storage-buffer bindings (0..n_ssbos-1);
+ *                    must be at least 1.
+ *   push_const_size  size in bytes of the push-constant range, or 0 for
+ *                    none.
+ *   out              receives the pipeline, its layouts, a descriptor
+ *                    pool and the single descriptor set allocated from it.
+ *
+ * Returns 0 on success. On failure returns -1 after writing a message
+ * to stderr; every object created along the way is destroyed and `*out`
+ * is left zeroed.
+ */
+int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
+                               uint32_t n_ssbos, uint32_t push_const_size,
+                               v3d_pipeline *out)
+{
+    VkResult vr;
+
+    memset(out, 0, sizeof(*out));
+    out->n_ssbos = n_ssbos;
+    out->push_const_size = push_const_size;
+
+    /* A zero-sized descriptor pool entry is not valid, and calloc(0, ..)
+     * may legitimately return NULL; reject the case up front. */
+    if (n_ssbos == 0) {
+        fprintf(stderr, "v3d_runner: create_pipeline needs n_ssbos >= 1\n");
+        goto fail;
+    }
+
+    /* Descriptor set layout: n_ssbos SSBO bindings, compute-only. */
+    VkDescriptorSetLayoutBinding *binds = calloc(n_ssbos, sizeof(*binds));
+    if (!binds) goto fail;
+    for (uint32_t i = 0; i < n_ssbos; i++) {
+        binds[i] = (VkDescriptorSetLayoutBinding){
+            .binding = i,
+            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .descriptorCount = 1,
+            .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+        };
+    }
+    VkDescriptorSetLayoutCreateInfo dslci = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .bindingCount = n_ssbos,
+        .pBindings = binds,
+    };
+    vr = vkCreateDescriptorSetLayout(r->device, &dslci, NULL, &out->ds_layout);
+    free(binds);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "vkCreateDescriptorSetLayout = %d\n", vr);
+        goto fail;
+    }
+
+    VkPushConstantRange pcr = {
+        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+        .offset = 0,
+        .size = push_const_size,
+    };
+    VkPipelineLayoutCreateInfo plci = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount = 1,
+        .pSetLayouts = &out->ds_layout,
+        .pushConstantRangeCount = push_const_size ? 1 : 0,
+        .pPushConstantRanges = push_const_size ? &pcr : NULL,
+    };
+    vr = vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkCreatePipelineLayout");
+        goto fail_ds_layout;
+    }
+
+    size_t spv_size = 0;
+    uint32_t *spv = read_spv(spv_path, &spv_size);
+    if (!spv) goto fail_layout;
+    VkShaderModuleCreateInfo smci = {
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .codeSize = spv_size,
+        .pCode = spv,
+    };
+    VkShaderModule shader;
+    vr = vkCreateShaderModule(r->device, &smci, NULL, &shader);
+    free(spv);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "vkCreateShaderModule(%s) = %d\n", spv_path, vr);
+        goto fail_layout;
+    }
+
+    VkComputePipelineCreateInfo cpci = {
+        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+        .stage = {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+            .module = shader,
+            .pName = "main",
+        },
+        .layout = out->layout,
+    };
+    vr = vkCreateComputePipelines(r->device, VK_NULL_HANDLE, 1, &cpci, NULL,
+                                  &out->pipeline);
+    /* The module is only needed while the pipeline is being built. */
+    vkDestroyShaderModule(r->device, shader, NULL);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "vkCreateComputePipelines = %d\n", vr);
+        goto fail_layout;
+    }
+
+    /* Single descriptor pool + set for this pipeline. */
+    VkDescriptorPoolSize ps = {
+        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+        .descriptorCount = n_ssbos,
+    };
+    VkDescriptorPoolCreateInfo dpci = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .maxSets = 1,
+        .poolSizeCount = 1,
+        .pPoolSizes = &ps,
+    };
+    vr = vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkCreateDescriptorPool");
+        goto fail_pipeline;
+    }
+
+    VkDescriptorSetAllocateInfo dsai = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+        .descriptorPool = out->pool,
+        .descriptorSetCount = 1,
+        .pSetLayouts = &out->ds_layout,
+    };
+    vr = vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set);
+    if (vr != VK_SUCCESS) {
+        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
+                vr, __FILE__, __LINE__, "vkAllocateDescriptorSets");
+        goto fail_pool;
+    }
+    return 0;
+
+    /* Unwind in reverse order of creation. */
+fail_pool:
+    vkDestroyDescriptorPool(r->device, out->pool, NULL);
+fail_pipeline:
+    vkDestroyPipeline(r->device, out->pipeline, NULL);
+fail_layout:
+    vkDestroyPipelineLayout(r->device, out->layout, NULL);
+fail_ds_layout:
+    vkDestroyDescriptorSetLayout(r->device, out->ds_layout, NULL);
+fail:
+    memset(out, 0, sizeof(*out));
+    return -1;
+}
+
+/*
+ * Destroy everything v3d_runner_create_pipeline() produced and zero the
+ * struct. A NULL `p` or one whose pipeline handle is VK_NULL_HANDLE is
+ * ignored.
+ */
+void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
+{
+    if (p == NULL)
+        return;
+    if (p->pipeline == VK_NULL_HANDLE)
+        return;
+
+    VkDevice dev = r->device;
+
+    vkDestroyPipeline(dev, p->pipeline, NULL);
+    vkDestroyPipelineLayout(dev, p->layout, NULL);
+    /* Destroying the pool also releases the descriptor set taken from it. */
+    vkDestroyDescriptorPool(dev, p->pool, NULL);
+    vkDestroyDescriptorSetLayout(dev, p->ds_layout, NULL);
+
+    memset(p, 0, sizeof(*p));
+}
+
+/*
+ * Point binding i of the pipeline's descriptor set at bufs[i], covering
+ * each buffer from offset 0 for its full recorded size.
+ *
+ *   n  number of entries in `bufs`; must equal p->n_ssbos.
+ *
+ * Returns 0 on success, -1 on a count mismatch or allocation failure.
+ */
+int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
+                            const v3d_buffer *bufs, uint32_t n)
+{
+    if (p->n_ssbos != n) {
+        fprintf(stderr, "bind_buffers: n=%u != pipeline n_ssbos=%u\n",
+                n, p->n_ssbos);
+        return -1;
+    }
+
+    /* Both scratch arrays start zeroed, so only the non-zero fields of
+     * each write need to be filled in below. */
+    VkDescriptorBufferInfo *infos  = calloc(n, sizeof(*infos));
+    VkWriteDescriptorSet   *writes = calloc(n, sizeof(*writes));
+    int rc = -1;
+
+    if (infos != NULL && writes != NULL) {
+        for (uint32_t slot = 0; slot < n; slot++) {
+            VkDescriptorBufferInfo *info = &infos[slot];
+            info->buffer = bufs[slot].buffer;
+            info->offset = 0;
+            info->range  = bufs[slot].size;
+
+            VkWriteDescriptorSet *w = &writes[slot];
+            w->sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+            w->dstSet          = p->desc_set;
+            w->dstBinding      = slot;
+            w->descriptorCount = 1;
+            w->descriptorType  = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+            w->pBufferInfo     = info;
+        }
+        vkUpdateDescriptorSets(r->device, n, writes, 0, NULL);
+        rc = 0;
+    }
+
+    free(infos);
+    free(writes);
+    return rc;
+}
+
+/* ---- Command buffers ------------------------------------------- */
+
+/*
+ * Allocate one primary command buffer from the runner's command pool.
+ * Returns the buffer, or VK_NULL_HANDLE if the allocation fails.
+ */
+VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r)
+{
+    VkCommandBufferAllocateInfo info;
+    memset(&info, 0, sizeof(info));
+    info.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+    info.commandPool        = r->pool;
+    info.level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+    info.commandBufferCount = 1;
+
+    VkCommandBuffer cmd = VK_NULL_HANDLE;
+    VkResult status = vkAllocateCommandBuffers(r->device, &info, &cmd);
+    if (status == VK_SUCCESS)
+        return cmd;
+    return VK_NULL_HANDLE;
+}
+
+/*
+ * Submit one command buffer to the runner's queue and block until the
+ * queue is idle. No fence or semaphores are used; completion is
+ * observed through vkQueueWaitIdle().
+ *
+ * Returns 0 on success, -1 if either Vulkan call fails (CHK logs the
+ * failure to stderr).
+ */
+int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb)
+{
+    VkSubmitInfo si = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .commandBufferCount = 1,
+        .pCommandBuffers = &cb,
+    };
+    CHK(vkQueueSubmit(r->queue, 1, &si, VK_NULL_HANDLE));
+    CHK(vkQueueWaitIdle(r->queue));
+    return 0;
+}
@@ -0,0 +1,96 @@
+/*
+ * v3d_runner — minimal Vulkan compute plumbing for V3D 7.1 on Pi 5.
+ *
+ * Factored out of tests/bench_vulkan_dispatch.c so successive kernel
+ * benches can reuse the device/queue/buffer/pipeline machinery
+ * without copy-paste. Kept deliberately small and concrete — no
+ * generality beyond what daedalus-fourier needs.
+ *
+ * License: BSD-2-Clause.
+ */
+#ifndef DAEDALUS_V3D_RUNNER_H
+#define DAEDALUS_V3D_RUNNER_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <vulkan/vulkan.h>
+
+typedef struct v3d_runner v3d_runner;
+
+/* Host-visible SSBO. .mapped is a CPU-side pointer to .size bytes.
+ * Filled in by v3d_runner_create_buffer() and zeroed again by
+ * v3d_runner_destroy_buffer(). */
+typedef struct {
+    VkBuffer        buffer;   /* storage buffer handle */
+    VkDeviceMemory  memory;   /* allocation bound to `buffer` at offset 0 */
+    void           *mapped;   /* host mapping of the whole allocation */
+    size_t          size;     /* size in bytes requested at creation */
+} v3d_buffer;
+
+/* Compute pipeline + its descriptor set (one set per pipeline).
+ * Filled in by v3d_runner_create_pipeline() and zeroed again by
+ * v3d_runner_destroy_pipeline(). */
+typedef struct {
+    VkPipeline             pipeline;         /* the compute pipeline */
+    VkPipelineLayout       layout;           /* one set layout + optional push constants */
+    VkDescriptorSetLayout  ds_layout;        /* n_ssbos storage-buffer bindings */
+    VkDescriptorPool       pool;             /* pool sized for exactly one set */
+    VkDescriptorSet        desc_set;         /* the set allocated from `pool` */
+    uint32_t               n_ssbos;          /* binding count given at creation */
+    uint32_t               push_const_size;  /* push-constant bytes given at creation (0 = none) */
+} v3d_pipeline;
+
+/*
+ * Create runner: Vulkan instance, V3D physical device, logical
+ * device with storageBuffer{8,16}BitAccess features enabled,
+ * compute queue, command pool.
+ *
+ * Returns NULL on failure (writes errors to stderr).
+ */
+v3d_runner *v3d_runner_create(void);
+void        v3d_runner_destroy(v3d_runner *r);
+
+/* Expose a few internals for code that wants direct vkCmd*. */
+VkDevice         v3d_runner_device(v3d_runner *r);
+VkQueue          v3d_runner_queue(v3d_runner *r);
+uint32_t         v3d_runner_queue_family(v3d_runner *r);
+VkCommandPool    v3d_runner_cmd_pool(v3d_runner *r);
+const char      *v3d_runner_device_name(v3d_runner *r);
+
+/* Storage buffer, HOST_VISIBLE | HOST_COHERENT, mapped on the
+ * host side. The mapping persists for the lifetime of the buffer.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int  v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
+void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
+
+/* Compute pipeline from a SPIR-V file path. The descriptor-set
+ * layout exposes `n_ssbos` storage buffer bindings at binding
+ * indices 0..n_ssbos-1, all visible to the compute stage. A push
+ * constant range of `push_const_size` bytes is added if non-zero.
+ *
+ * The single descriptor set is pre-allocated; bind buffers via
+ * v3d_runner_bind_buffers().
+ */
+int  v3d_runner_create_pipeline(v3d_runner *r,
+                                const char  *spv_path,
+                                uint32_t     n_ssbos,
+                                uint32_t     push_const_size,
+                                v3d_pipeline *out);
+void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p);
+
+/* Bind SSBOs to the pipeline's descriptor set. `bufs` must have
+ * exactly `p->n_ssbos` entries, in binding order. Idempotent —
+ * rebind freely between dispatches if buffers change.
+ */
+int  v3d_runner_bind_buffers(v3d_runner   *r,
+                             v3d_pipeline *p,
+                             const v3d_buffer *bufs,
+                             uint32_t      n);
+
+/* Allocate a primary command buffer from the runner's pool. */
+VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
+
+/* Submit `cb` to the queue and wait for completion. The classic
+ * timed operation. Returns 0 on success.
+ */
+int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb);
+
+#endif /* DAEDALUS_V3D_RUNNER_H */