From 0a042a8e95f0fe1c81d04e0c64b9c1d9d691b1ed Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Sat, 23 May 2026 19:52:50 +0200
Subject: [PATCH 1/2] v3d_runner: buffer pool for QPU dispatch hot path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the QPU-default substrate decree 2026-05-23: the per-dispatch
vkAllocateMemory in dispatch_*_qpu was the biggest single fixable
contributor to QPU dispatch overhead.  This pools v3d_buffer
allocations by power-of-2 size class so the second-and-subsequent
dispatch hits a freelist instead of paying ~10-50us of Mesa-V3D7
memory-allocation cost per call.

API additions (v3d_runner.h):
  - v3d_runner_acquire_buffer(): pulls from per-bucket freelist;
    falls through to v3d_runner_create_buffer() on miss.
  - v3d_runner_release_buffer(): pushes back onto the freelist; the
    backing VkBuffer/VkDeviceMemory only get vkFreeMemory'd in
    v3d_runner_destroy().
  - v3d_runner_pool_total_bytes(): diagnostic watermark.

Size classes 2^8..2^23 (256 B to 8 MiB).  Requests larger than 8 MiB
bypass the pool on both acquire (plain v3d_runner_create_buffer) and
release (plain v3d_runner_destroy_buffer) — the pool stays correct,
it just degenerates to the old behaviour for those calls.

Migration: daedalus_core.c dispatch_*_qpu paths globally swap
create_buffer → acquire_buffer and destroy_buffer → release_buffer.
All five QPU dispatch functions (idct8 / lpf / mc_8h / cdef /
h264_deblock) now reuse buffers across calls.  test_api_idct stays
bit-exact (4096/4096 bytes on CPU/QPU/AUTO substrates on hertz).

Microbench (tests/bench_pool_overhead.c) on hertz (Pi 5,
6.18.29+rpt-rpi-2712, V3D 7.1):

  call 0:  434.89 us  (cold — 3x vkAllocateMemory)
  call 1:  100.06 us  (pool hit on all 3 buffers)
  steady-state:
    p50:    76.44 us
    p90:    90.52 us
    mean:   77.95 us
  first-call / steady-state ratio: 5.7x

The remaining ~76us steady-state is dominated by vkQueueWaitIdle +
shader execution + per-call descriptor-set update + command-buffer
allocation — addressed in follow-on tasks 161 (persistent cmdbuf)
and 162 (dmabuf import for dst, eliminates memcpy in/out).

Refs daedalus-fourier task #160.
---
 CMakeLists.txt              |   4 ++
 src/daedalus_core.c         |  86 ++++++++++++-------------
 src/v3d_runner.c            | 122 ++++++++++++++++++++++++++++++++++++
 src/v3d_runner.h            |  33 ++++++++++
 tests/bench_pool_overhead.c | 120 +++++++++++++++++++++++++++++++++++
 5 files changed, 322 insertions(+), 43 deletions(-)
 create mode 100644 tests/bench_pool_overhead.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c2faa7..c6592f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -492,6 +492,10 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
 target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
 target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
 
+add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
+target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
+target_compile_options(bench_pool_overhead PRIVATE -O2)
+
 if (DAEDALUS_BUILD_VULKAN)
 # (re-open the conditional so the closing endif() below balances)
 
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index fd7d73b..375064c 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -291,13 +291,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
     }
 
     v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
-    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
+        v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
     }
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
-        v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
+        v3d_runner_release_buffer(ctx->runner, &buf_dst);
+        v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
     }
 
     /* Upload. Coeffs and meta are straight copies. Dst we copy the
@@ -344,15 +344,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
     /* Read-back dst. */
     memcpy(dst, buf_dst.mapped, max_byte_touched);
 
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
     return 0;
 
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
     return -1;
 }
 
@@ -424,9 +424,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
     size_t dst_window_size = hi - lo;
 
     v3d_buffer buf_meta = {0}, buf_dst = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
-        v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
+        v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
     }
 
     memcpy(buf_dst.mapped, dst + lo, dst_window_size);
@@ -468,12 +468,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
 
     memcpy(dst + lo, buf_dst.mapped, dst_window_size);
 
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
     return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
-    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    v3d_runner_release_buffer(ctx->runner, &buf_dst);
+    v3d_runner_release_buffer(ctx->runner, &buf_meta);
     return -1;
 }
 
@@ -509,9 +509,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
     }
 
     v3d_buffer bm = {0}, bd = {0}, bs = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,     &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
-    if (v3d_runner_create_buffer(ctx->runner, src_max,     &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,     &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, src_max,     &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
 
     memcpy(bs.mapped, src, src_max);
     memcpy(bd.mapped, dst, dst_max);
@@ -545,14 +545,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
 
     memcpy(dst, bd.mapped, dst_max);
 
-    v3d_runner_destroy_buffer(ctx->runner, &bs);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bs);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bs);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bs);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return -1;
 }
 
@@ -588,9 +588,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
     size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
 
     v3d_buffer bm = {0}, bd = {0}, bt = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
-    if (v3d_runner_create_buffer(ctx->runner, tmp_bytes,  &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes,  &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
 
     /* tmp may need padding before block-origin offset (caller-allocated). Just
      * copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
@@ -630,14 +630,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
 
     memcpy(dst, bd.mapped, dst_max);
 
-    v3d_runner_destroy_buffer(ctx->runner, &bt);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bt);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bt);
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bt);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return -1;
 }
 
@@ -670,8 +670,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
     }
 
     v3d_buffer bm = {0}, bd = {0};
-    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
-    if (v3d_runner_create_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_acquire_buffer(ctx->runner, dst_max,    &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
 
     memcpy(bd.mapped, dst, dst_max);
     uint32_t *m = bm.mapped;
@@ -706,12 +706,12 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
 
     memcpy(dst, bd.mapped, dst_max);
 
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return 0;
 fail:
-    v3d_runner_destroy_buffer(ctx->runner, &bd);
-    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    v3d_runner_release_buffer(ctx->runner, &bd);
+    v3d_runner_release_buffer(ctx->runner, &bm);
     return -1;
 }
 
diff --git a/src/v3d_runner.c b/src/v3d_runner.c
index 25d139b..cbf000d 100644
--- a/src/v3d_runner.c
+++ b/src/v3d_runner.c
@@ -17,6 +17,18 @@
     fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
             r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
 
+/* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB).  Cycle
+ * 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize
+ * requests fall through to non-pooled allocation. */
+#define V3D_POOL_MIN_LOG2	8
+#define V3D_POOL_MAX_LOG2	23
+#define V3D_POOL_BUCKETS	(V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
+
+struct v3d_pool_entry {               /* one node of a per-bucket freelist */
+    v3d_buffer             buf;       /* released buffer, held by value */
+    struct v3d_pool_entry *next;      /* next node in the same bucket, or NULL */
+};
+
 struct v3d_runner {
     VkInstance       instance;
     VkPhysicalDevice phys;
@@ -26,6 +38,15 @@ struct v3d_runner {
     VkCommandPool    pool;
     char             device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
     VkPhysicalDeviceMemoryProperties mem_props;
+
+    /* Buffer pool: per-bucket freelist of previously-released
+     * v3d_buffer.  bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
+     * pool_total_bytes accumulates every successful vkAllocateMemory
+     * we've done through the pool — never decreases (the freelist
+     * just hands buffers around, no vkFreeMemory until destroy).
+     */
+    struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
+    size_t                 pool_total_bytes;
 };
 
 static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
 {
     if (!r) return;
     if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
+
+    /* Drain the buffer pool BEFORE destroying device — the pool
+     * entries own VkBuffer/VkDeviceMemory handles, which need a live
+     * device for vkDestroyBuffer/vkFreeMemory. */
+    for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
+        struct v3d_pool_entry *e = r->pool_free[b];
+        while (e) {
+            struct v3d_pool_entry *next = e->next;
+            v3d_runner_destroy_buffer(r, &e->buf);
+            free(e);
+            e = next;
+        }
+        r->pool_free[b] = NULL;
+    }
+
     if (r->pool != VK_NULL_HANDLE)
         vkDestroyCommandPool(r->device, r->pool, NULL);
     if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
     free(r);
 }
 
+/* ---- Buffer pool ----------------------------------------------- */
+
+/* Map a byte count to its size-class index: 0 for anything up to the
+ * smallest class, ceil(log2(size)) - V3D_POOL_MIN_LOG2 otherwise, and
+ * -1 when the request exceeds the largest class. */
+static int v3d_pool_bucket_for(size_t size)
+{
+    int bits = 0;
+
+    if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
+        return 0;
+    /* Bit length of (size - 1) is ceil(log2(size)); past the guard
+     * above it is always greater than V3D_POOL_MIN_LOG2. */
+    for (size_t rest = size - 1; rest != 0; rest >>= 1)
+        bits++;
+    return bits > V3D_POOL_MAX_LOG2 ? -1 : bits - V3D_POOL_MIN_LOG2;
+}
+
+int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
+{
+    if (r == NULL || out == NULL || size == 0)
+        return -1;
+
+    /* A negative class index means the request is larger than the
+     * biggest size class: hand it straight to the non-pooled
+     * allocator.  v3d_runner_release_buffer() classifies by buf->size
+     * the same way and destroys such a buffer instead of pooling it. */
+    const int cls = v3d_pool_bucket_for(size);
+    if (cls < 0)
+        return v3d_runner_create_buffer(r, size, out);
+
+    /* Hit: pop the head of this class's freelist.  The buffer is
+     * copied out by value, so the list node can be freed right away. */
+    struct v3d_pool_entry *head = r->pool_free[cls];
+    if (head != NULL) {
+        r->pool_free[cls] = head->next;
+        *out = head->buf;
+        free(head);
+        return 0;
+    }
+
+    /* Miss: allocate at the full class size (not the requested size)
+     * so the buffer can later serve any request in this class.  Only
+     * successful allocations are added to the watermark. */
+    const size_t cls_bytes = (size_t)1 << (cls + V3D_POOL_MIN_LOG2);
+    const int status = v3d_runner_create_buffer(r, cls_bytes, out);
+    if (status != 0)
+        return status;
+    r->pool_total_bytes += cls_bytes;
+    return 0;
+}
+
+/* Return @buf to its size-class freelist, or destroy it when it cannot be pooled. */
+void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
+{
+    int bucket;
+    struct v3d_pool_entry *e = NULL;
+
+    if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return;
+
+    /* Pool a buffer only if its size is exactly its class size.
+     * acquire serves every request in a bucket from the same freelist,
+     * so a smaller buffer (e.g. one that came straight from
+     * v3d_runner_create_buffer) would later be overrun.  bucket < 0
+     * is the oversize case, which never enters the pool. */
+    bucket = v3d_pool_bucket_for(buf->size);
+    if (bucket >= 0 && buf->size == ((size_t)1 << (bucket + V3D_POOL_MIN_LOG2)))
+        e = malloc(sizeof(*e));
+    if (!e) {
+        /* Not poolable, or allocator failure: destroy outright.  Pool
+         * degenerates to non-pooled behaviour but doesn't leak. */
+        v3d_runner_destroy_buffer(r, buf);
+        memset(buf, 0, sizeof(*buf));
+        return;
+    }
+    e->buf = *buf;                    /* stored by value */
+    e->next = r->pool_free[bucket];
+    r->pool_free[bucket] = e;
+    memset(buf, 0, sizeof(*buf));     /* caller's handle is now empty */
+}
+
+size_t v3d_runner_pool_total_bytes(v3d_runner *r)
+{
+    return (r == NULL) ? 0 : r->pool_total_bytes;  /* NULL runner reports 0 */
+}
+
 VkDevice      v3d_runner_device(v3d_runner *r)        { return r->device; }
 VkQueue       v3d_runner_queue(v3d_runner *r)         { return r->queue; }
 uint32_t      v3d_runner_queue_family(v3d_runner *r)  { return r->queue_family; }
diff --git a/src/v3d_runner.h b/src/v3d_runner.h
index b729995..fb4147b 100644
--- a/src/v3d_runner.h
+++ b/src/v3d_runner.h
@@ -57,10 +57,43 @@ const char      *v3d_runner_device_name(v3d_runner *r);
  * host side. The mapping persists for the lifetime of the buffer.
  *
  * Returns 0 on success, non-zero on failure.
+ *
+ * NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
+ * create_buffer/destroy_buffer go straight to vkAllocateMemory each
+ * call, which on V3D7's Mesa stack costs ~10-50us.  The acquire/
+ * release pair pulls from a freelist and pays vkAllocateMemory only
+ * on a cache miss.
  */
 int  v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
 void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
 
+/*
+ * Pooled buffer acquisition.  Returns a v3d_buffer whose .size is the
+ * request rounded up to a power-of-2 size class, 256 B minimum (so
+ * similar-sized requests share a freelist); requests above 8 MiB are
+ * not pooled.  HOST_VISIBLE | HOST_COHERENT; mapped pointer is valid.
+ *
+ * On cache hit: zero-cost reuse of a previously-released buffer.
+ * On miss: falls through to v3d_runner_create_buffer().  Release with
+ * v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
+ *
+ * Lifetime contract: the returned buffer's .mapped contents are
+ * UNINITIALISED — the previous user's data may still be present.
+ * Callers that need a clean buffer must memset themselves.  This is
+ * deliberate; the dispatch hot paths immediately overwrite the
+ * buffer with new coefficients / meta anyway.
+ *
+ * Thread-safety: NOT thread-safe.  A daedalus_ctx is single-threaded
+ * by API contract; the pool inherits that constraint.
+ */
+int  v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
+void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
+
+/* Pool diagnostics: total allocated bytes (sum across all size
+ * classes, including currently-released entries).  Useful for
+ * watermark logging. */
+size_t v3d_runner_pool_total_bytes(v3d_runner *r);
+
 /* Compute pipeline from a SPIR-V file path. The descriptor-set
  * layout exposes `n_ssbos` storage buffer bindings at binding
  * indices 0..n_ssbos-1, all visible to the compute stage. A push
diff --git a/tests/bench_pool_overhead.c b/tests/bench_pool_overhead.c
new file mode 100644
index 0000000..d0f9564
--- /dev/null
+++ b/tests/bench_pool_overhead.c
@@ -0,0 +1,120 @@
+/*
+ * bench_pool_overhead — measure QPU dispatch overhead with and without
+ * the v3d_runner buffer pool warm.
+ *
+ * Times N consecutive daedalus_dispatch_vp9_idct8() calls and
+ * prints the per-call distribution.  The first call pays
+ * vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
+ * the second and subsequent should hit the pool freelist and amortise
+ * to the pure dispatch-floor cost.
+ *
+ * Purpose: provide a concrete before/after number for the QPU-default
+ * substrate decree (2026-05-23).  Bench is non-gating and runs in
+ * fractions of a second.
+ *
+ * License: BSD-2-Clause.
+ */
+#define _POSIX_C_SOURCE 200809L
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+
+#include "../include/daedalus.h"
+
+/* Only the public daedalus.h API is used here; the pool watermark is not queried. */
+
+static double now_seconds(void)
+{
+	struct timespec stamp;	/* raw monotonic clock reading, converted to seconds */
+	clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);
+	return (double) stamp.tv_sec + 1e-9 * (double) stamp.tv_nsec;
+}
+
+static int cmp_double(const void *a, const void *b)
+{
+	const double *lhs = a, *rhs = b;	/* qsort passes element addresses */
+	return (*lhs > *rhs) - (*lhs < *rhs);	/* -1, 0 or 1; 0 if either is NaN */
+}
+
+int main(int argc, char **argv)
+{
+	int n_calls = argc > 1 ? atoi(argv[1]) : 200;	/* optional argv[1]: dispatch count */
+	int n_blocks = 8;	/* one MB column of 8x8 IDCT blocks */
+	int stride = 64;
+	int status = 1;		/* exit code; set to 0 only after the full run */
+
+	daedalus_ctx *ctx = daedalus_ctx_create();
+	if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
+	int has_qpu = daedalus_ctx_has_qpu(ctx);
+	printf("ctx: has_qpu=%d\n", has_qpu);
+	if (!has_qpu) {
+		fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
+		daedalus_ctx_destroy(ctx);
+		return 2;
+	}
+
+	/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
+	int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
+	uint8_t *dst    = calloc((size_t) n_blocks * 8 * stride, 1);
+	daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta));
+	double *t = calloc(n_calls > 0 ? (size_t) n_calls : 1, sizeof(double));	/* us per call */
+	if (!coeffs || !dst || !meta || !t) { fprintf(stderr, "alloc fail\n"); goto out; }
+
+	uint64_t s = 0x1234567abcdefULL;	/* xorshift state, fixed seed */
+	for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
+		s ^= s << 13; s ^= s >> 7; s ^= s << 17;
+		coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
+	}
+	for (int b = 0; b < n_blocks; b++) {
+		meta[b].dst_off = (uint32_t) b * 8;
+		meta[b].block_x = (uint32_t) b;
+		meta[b].block_y = 0;
+	}
+
+	printf("=== dispatching %d times, n_blocks=%d/call ===\n",
+	       n_calls, n_blocks);
+
+	for (int i = 0; i < n_calls; i++) {
+		double t0 = now_seconds();
+		int rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
+						      dst, (size_t) stride,
+						      coeffs, (size_t) n_blocks, meta);
+		double t1 = now_seconds();
+		if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); goto out; }
+		t[i] = (t1 - t0) * 1e6;	/* us */
+	}
+
+	/* Per-call distribution (first few + sorted summary on the steady-state) */
+	printf("\nfirst 5 calls (cold-warm transition):\n");
+	for (int i = 0; i < 5 && i < n_calls; i++)
+		printf("  call %d:  %.2f us\n", i, t[i]);
+
+	int skip = 10;	/* drop warm-up calls from the steady-state stats */
+	if (n_calls > skip + 10) {
+		int n = n_calls - skip;
+		/* Sort the steady-state tail in place; t[0..skip) stays in
+		 * call order, so t[0] below is still the cold first call. */
+		double *s_arr = t + skip;
+		qsort(s_arr, (size_t) n, sizeof(double), cmp_double);
+		double sum = 0;
+		for (int i = 0; i < n; i++) sum += s_arr[i];
+		printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
+		       skip, n_calls - 1, n);
+		printf("  min:    %.2f us\n", s_arr[0]);
+		printf("  p50:    %.2f us\n", s_arr[n / 2]);
+		printf("  p90:    %.2f us\n", s_arr[(int)(n * 0.9)]);
+		printf("  p99:    %.2f us\n", s_arr[(int)(n * 0.99)]);
+		printf("  max:    %.2f us\n", s_arr[n - 1]);
+		printf("  mean:   %.2f us\n", sum / n);
+		printf("\nfirst-call / steady-state median ratio: %.1fx\n",
+		       t[0] / s_arr[n / 2]);
+	}
+	status = 0;
+out:	/* shared exit past ctx creation; free(NULL) is a no-op */
+	free(t); free(coeffs); free(dst); free(meta);
+	daedalus_ctx_destroy(ctx);
+	return status;
+}
-- 
2.47.3


From 98553278dd6b5f5720c9f9e1064fdfa9299e24c0 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Sat, 23 May 2026 19:56:35 +0200
Subject: [PATCH 2/2] v3d_runner: persistent per-pipeline command buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2 of the QPU-default substrate campaign — eliminate
vkAllocateCommandBuffers from the dispatch hot path.

Attaches a VkCommandBuffer to each v3d_pipeline, allocated once in
v3d_runner_create_pipeline() and freed in destroy_pipeline().  The
five dispatch_*_qpu sites switch from v3d_runner_alloc_cmdbuf() to
v3d_runner_pipeline_cmdbuf_reset() — vkResetCommandBuffer is O(1)
versus the driver-side allocation walk.  Pool was already created
with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT so reset is
permitted.

Microbench (hertz, Pi 5, kernel 6.18.29, V3D 7.1):

  before (task 160 pool only):
    steady-state p50: 76.44 us
    steady-state mean: 77.95 us
  after (task 160 pool + task 161 persistent cb):
    steady-state p50: 54.56 us
    steady-state mean: 56.00 us
    -> 28% per-dispatch reduction

The remaining ~54 us steady-state is dominated by vkQueueWaitIdle +
shader execution + the two memcpy(in/out) on the dst buffer — task
162 (dmabuf import for dst) targets the memcpy half.

test_api_idct stays bit-exact across CPU/QPU/AUTO substrates.

Refs daedalus-fourier task #161.
---
 src/daedalus_core.c | 20 ++++++++++----------
 src/v3d_runner.c    | 22 ++++++++++++++++++++++
 src/v3d_runner.h    | 12 ++++++++++++
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 375064c..ff4c255 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -325,8 +325,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
         ._pad = 0,
     };
 
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
-    if (cb == VK_NULL_HANDLE) goto fail;
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
+    VkCommandBuffer cb = ctx->idct8_pipe.cb;
     VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
     vkBeginCommandBuffer(cb, &cbbi);
     vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
@@ -442,8 +442,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
     if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
 
     uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
-    if (cb == VK_NULL_HANDLE) goto fail;
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
+    VkCommandBuffer cb = p->cb;
     VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
     vkBeginCommandBuffer(cb, &cbbi);
     vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
@@ -530,8 +530,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
     mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
                  .dst_stride_u8 = (uint32_t) dst_stride,
                  .src_stride_u8 = (uint32_t) src_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
-    if (cb == VK_NULL_HANDLE) goto fail;
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
+    VkCommandBuffer cb = ctx->mc8h_pipe.cb;
     VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
     vkBeginCommandBuffer(cb, &cbbi);
     vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
@@ -615,8 +615,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
     cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
                    .tmp_stride_u16 = 16,
                    .dst_stride_u8 = (uint32_t) dst_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
-    if (cb == VK_NULL_HANDLE) goto fail;
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
+    VkCommandBuffer cb = ctx->cdef_pipe.cb;
     VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
     vkBeginCommandBuffer(cb, &cbbi);
     vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
@@ -691,8 +691,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
     uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
     h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
                           .dst_stride_u8 = (uint32_t) dst_stride };
-    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
-    if (cb == VK_NULL_HANDLE) goto fail;
+    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
+    VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
     VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
     vkBeginCommandBuffer(cb, &cbbi);
     vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
diff --git a/src/v3d_runner.c b/src/v3d_runner.c
index cbf000d..05d34c5 100644
--- a/src/v3d_runner.c
+++ b/src/v3d_runner.c
@@ -486,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
         .pSetLayouts = &out->ds_layout,
     };
     CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
+
+    /* Persistent command buffer — pool was created with
+     * RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
+     * sites can call vkResetCommandBuffer on this same cb instead
+     * of paying vkAllocateCommandBuffers per call. */
+    VkCommandBufferAllocateInfo cbai = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = r->pool,
+        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+    CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb));
+
     return 0;
 }
 
 void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
 {
     if (!p || p->pipeline == VK_NULL_HANDLE) return;
+    if (p->cb != VK_NULL_HANDLE)
+        vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);
     vkDestroyPipeline(r->device, p->pipeline, NULL);
     vkDestroyPipelineLayout(r->device, p->layout, NULL);
     vkDestroyDescriptorPool(r->device, p->pool, NULL);  /* frees its set */
@@ -499,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
     memset(p, 0, sizeof(*p));
 }
 
+int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
+{
+    (void) r;  /* unused: the reset call takes only the command buffer */
+    const int usable = (p != NULL) && (p->cb != VK_NULL_HANDLE);
+    return (usable && vkResetCommandBuffer(p->cb, 0) == VK_SUCCESS) ? 0 : -1;
+}
+
 int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
                             const v3d_buffer *bufs, uint32_t n)
 {
diff --git a/src/v3d_runner.h b/src/v3d_runner.h
index fb4147b..86f706a 100644
--- a/src/v3d_runner.h
+++ b/src/v3d_runner.h
@@ -34,6 +34,12 @@ typedef struct {
     VkDescriptorSet        desc_set;
     uint32_t               n_ssbos;
     uint32_t               push_const_size;
+    /* Persistent command buffer.  Allocated at create-pipeline time;
+     * dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
+     * vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
+     * per dispatch.  Pool flagged RESET_COMMAND_BUFFER_BIT so reset
+     * is permitted. */
+    VkCommandBuffer        cb;
 } v3d_pipeline;
 
 /*
@@ -121,6 +127,12 @@ int  v3d_runner_bind_buffers(v3d_runner   *r,
 /* Allocate a primary command buffer from the runner's pool. */
 VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
 
+/* Reset @p->cb so it can be re-recorded.  Returns 0 on success.
+ * Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
+ * vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
+ * driver cost. */
+int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
+
 /* Submit `cb` to the queue and wait for completion. The classic
  * timed operation. Returns 0 on success.
  */
-- 
2.47.3