From 0a042a8e95f0fe1c81d04e0c64b9c1d9d691b1ed Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sat, 23 May 2026 19:52:50 +0200 Subject: [PATCH 1/2] v3d_runner: buffer pool for QPU dispatch hot path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the QPU-default substrate decree 2026-05-23: the per-dispatch vkAllocateMemory in dispatch_*_qpu was the biggest single fixable contributor to QPU dispatch overhead. This pools v3d_buffer allocations by power-of-2 size class so the second-and-subsequent dispatch hits a freelist instead of paying ~10-50us of Mesa-V3D7 memory-allocation cost per call. API additions (v3d_runner.h): - v3d_runner_acquire_buffer(): pulls from per-bucket freelist; falls through to v3d_runner_create_buffer() on miss. - v3d_runner_release_buffer(): pushes back onto the freelist; the backing VkBuffer/VkDeviceMemory only get vkFreeMemory'd in v3d_runner_destroy(). - v3d_runner_pool_total_bytes(): diagnostic watermark. Size classes 2^8..2^23 (256 B to 8 MiB). Oversize requests fall through to non-pooled (vkAllocateMemory) for both ends — pool stays correct, just degenerates to old behaviour for those calls. Migration: daedalus_core.c dispatch_*_qpu paths globally swap create_buffer → acquire_buffer and destroy_buffer → release_buffer. All five QPU dispatch functions (idct8 / lpf / mc_8h / cdef / h264_deblock) now reuse buffers across calls. test_api_idct stays bit-exact (4096/4096 bytes on CPU/QPU/AUTO substrates on hertz). 
Microbench (tests/bench_pool_overhead.c) on hertz (Pi 5, 6.18.29+rpt-rpi-2712, V3D 7.1): call 0: 434.89 us (cold — 3x vkAllocateMemory) call 1: 100.06 us (pool hit on all 3 buffers) steady-state: p50: 76.44 us p90: 90.52 us mean: 77.95 us first-call / steady-state ratio: 5.7x The remaining ~76us steady-state is dominated by vkQueueWaitIdle + shader execution + per-call descriptor-set update + command-buffer allocation — addressed in follow-on tasks 161 (persistent cmdbuf) and 162 (dmabuf import for dst, eliminates memcpy in/out). Refs daedalus-fourier task #160. --- CMakeLists.txt | 4 ++ src/daedalus_core.c | 86 ++++++++++++------------- src/v3d_runner.c | 122 ++++++++++++++++++++++++++++++++++++ src/v3d_runner.h | 33 ++++++++++ tests/bench_pool_overhead.c | 120 +++++++++++++++++++++++++++++++++++ 5 files changed, 322 insertions(+), 43 deletions(-) create mode 100644 tests/bench_pool_overhead.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c2faa7..c6592f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -492,6 +492,10 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c) target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core) target_compile_options(test_api_opportunistic_qpu PRIVATE -O2) +add_executable(bench_pool_overhead tests/bench_pool_overhead.c) +target_link_libraries(bench_pool_overhead PRIVATE daedalus_core) +target_compile_options(bench_pool_overhead PRIVATE -O2) + if (DAEDALUS_BUILD_VULKAN) # (re-open the conditional so the closing endif() below balances) diff --git a/src/daedalus_core.c b/src/daedalus_core.c index fd7d73b..375064c 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -291,13 +291,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx, } v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0}; - if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1; - if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) { - 
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1; + if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1; + if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) { + v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1; } - if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) { - v3d_runner_destroy_buffer(ctx->runner, &buf_dst); - v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1; + if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) { + v3d_runner_release_buffer(ctx->runner, &buf_dst); + v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1; } /* Upload. Coeffs and meta are straight copies. Dst we copy the @@ -344,15 +344,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx, /* Read-back dst. */ memcpy(dst, buf_dst.mapped, max_byte_touched); - v3d_runner_destroy_buffer(ctx->runner, &buf_meta); - v3d_runner_destroy_buffer(ctx->runner, &buf_dst); - v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); + v3d_runner_release_buffer(ctx->runner, &buf_meta); + v3d_runner_release_buffer(ctx->runner, &buf_dst); + v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return 0; fail: - v3d_runner_destroy_buffer(ctx->runner, &buf_meta); - v3d_runner_destroy_buffer(ctx->runner, &buf_dst); - v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); + v3d_runner_release_buffer(ctx->runner, &buf_meta); + v3d_runner_release_buffer(ctx->runner, &buf_dst); + v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1; } @@ -424,9 +424,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8, size_t dst_window_size = hi - lo; v3d_buffer buf_meta = {0}, buf_dst = {0}; - if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1; - if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) { - v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1; + if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1; + if 
(v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) { + v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1; } memcpy(buf_dst.mapped, dst + lo, dst_window_size); @@ -468,12 +468,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8, memcpy(dst + lo, buf_dst.mapped, dst_window_size); - v3d_runner_destroy_buffer(ctx->runner, &buf_dst); - v3d_runner_destroy_buffer(ctx->runner, &buf_meta); + v3d_runner_release_buffer(ctx->runner, &buf_dst); + v3d_runner_release_buffer(ctx->runner, &buf_meta); return 0; fail: - v3d_runner_destroy_buffer(ctx->runner, &buf_dst); - v3d_runner_destroy_buffer(ctx->runner, &buf_meta); + v3d_runner_release_buffer(ctx->runner, &buf_dst); + v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1; } @@ -509,9 +509,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx, } v3d_buffer bm = {0}, bd = {0}, bs = {0}; - if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1; - if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; } - if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; } + if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1; + if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; } + if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; } memcpy(bs.mapped, src, src_max); memcpy(bd.mapped, dst, dst_max); @@ -545,14 +545,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx, memcpy(dst, bd.mapped, dst_max); - v3d_runner_destroy_buffer(ctx->runner, &bs); - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bs); + v3d_runner_release_buffer(ctx->runner, &bd); + 
v3d_runner_release_buffer(ctx->runner, &bm); return 0; fail: - v3d_runner_destroy_buffer(ctx->runner, &bs); - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bs); + v3d_runner_release_buffer(ctx->runner, &bd); + v3d_runner_release_buffer(ctx->runner, &bm); return -1; } @@ -588,9 +588,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx, size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t); v3d_buffer bm = {0}, bd = {0}, bt = {0}; - if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1; - if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; } - if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; } + if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1; + if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; } + if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; } /* tmp may need padding before block-origin offset (caller-allocated). 
Just * copy from caller; we assume meta[i].tmp_off_u16 is consistent with how @@ -630,14 +630,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx, memcpy(dst, bd.mapped, dst_max); - v3d_runner_destroy_buffer(ctx->runner, &bt); - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bt); + v3d_runner_release_buffer(ctx->runner, &bd); + v3d_runner_release_buffer(ctx->runner, &bm); return 0; fail: - v3d_runner_destroy_buffer(ctx->runner, &bt); - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bt); + v3d_runner_release_buffer(ctx->runner, &bd); + v3d_runner_release_buffer(ctx->runner, &bm); return -1; } @@ -670,8 +670,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx, } v3d_buffer bm = {0}, bd = {0}; - if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1; - if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; } + if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1; + if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; } memcpy(bd.mapped, dst, dst_max); uint32_t *m = bm.mapped; @@ -706,12 +706,12 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx, memcpy(dst, bd.mapped, dst_max); - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bd); + v3d_runner_release_buffer(ctx->runner, &bm); return 0; fail: - v3d_runner_destroy_buffer(ctx->runner, &bd); - v3d_runner_destroy_buffer(ctx->runner, &bm); + v3d_runner_release_buffer(ctx->runner, &bd); + v3d_runner_release_buffer(ctx->runner, &bm); return -1; } diff --git a/src/v3d_runner.c b/src/v3d_runner.c index 25d139b..cbf000d 100644 --- a/src/v3d_runner.c +++ b/src/v3d_runner.c @@ -17,6 +17,18 @@ 
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \ r__, __FILE__, __LINE__, #call); return NULL; } } while (0) +/* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB). Cycle + * 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize + * requests fall through to non-pooled allocation. */ +#define V3D_POOL_MIN_LOG2 8 +#define V3D_POOL_MAX_LOG2 23 +#define V3D_POOL_BUCKETS (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1) + +struct v3d_pool_entry { + v3d_buffer buf; + struct v3d_pool_entry *next; +}; + struct v3d_runner { VkInstance instance; VkPhysicalDevice phys; @@ -26,6 +38,15 @@ struct v3d_runner { VkCommandPool pool; char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; VkPhysicalDeviceMemoryProperties mem_props; + + /* Buffer pool: per-bucket freelist of previously-released + * v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2. + * pool_total_bytes accumulates every successful vkAllocateMemory + * we've done through the pool — never decreases (the freelist + * just hands buffers around, no vkFreeMemory until destroy). + */ + struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS]; + size_t pool_total_bytes; }; static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out, @@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r) { if (!r) return; if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device); + + /* Drain the buffer pool BEFORE destroying device — the pool + * entries own VkBuffer/VkDeviceMemory handles, which need a live + * device for vkDestroyBuffer/vkFreeMemory. 
*/ + for (int b = 0; b < V3D_POOL_BUCKETS; b++) { + struct v3d_pool_entry *e = r->pool_free[b]; + while (e) { + struct v3d_pool_entry *next = e->next; + v3d_runner_destroy_buffer(r, &e->buf); + free(e); + e = next; + } + r->pool_free[b] = NULL; + } + if (r->pool != VK_NULL_HANDLE) vkDestroyCommandPool(r->device, r->pool, NULL); if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL); @@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r) free(r); } +/* ---- Buffer pool ----------------------------------------------- */ + +/* ceil_log2 for buffer pool bucket selection. */ +static int v3d_pool_bucket_for(size_t size) +{ + int log2; + size_t m; + + if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2)) + return 0; + m = size - 1; + log2 = 0; + while (m) { log2++; m >>= 1; } + if (log2 < V3D_POOL_MIN_LOG2) log2 = V3D_POOL_MIN_LOG2; + if (log2 > V3D_POOL_MAX_LOG2) return -1; + return log2 - V3D_POOL_MIN_LOG2; +} + +int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out) +{ + int bucket; + size_t bucket_size; + struct v3d_pool_entry *e; + int rc; + + if (!r || !out || size == 0) return -1; + + bucket = v3d_pool_bucket_for(size); + if (bucket < 0) { + /* Oversize — fall through to non-pooled allocation. Caller + * still calls v3d_runner_release_buffer(), which detects the + * oversize bucket via bucket_for() and destroys. */ + return v3d_runner_create_buffer(r, size, out); + } + bucket_size = (size_t)1 << (bucket + V3D_POOL_MIN_LOG2); + + e = r->pool_free[bucket]; + if (e) { + r->pool_free[bucket] = e->next; + *out = e->buf; + free(e); + return 0; + } + + /* Miss — allocate fresh at the bucket size. Subsequent acquire/ + * release for the same bucket reuses this buffer. 
*/ + rc = v3d_runner_create_buffer(r, bucket_size, out); + if (rc == 0) + r->pool_total_bytes += bucket_size; + return rc; +} + +void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf) +{ + int bucket; + struct v3d_pool_entry *e; + + if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return; + + bucket = v3d_pool_bucket_for(buf->size); + if (bucket < 0) { + /* Oversize — destroy outright; never made it into the pool. */ + v3d_runner_destroy_buffer(r, buf); + memset(buf, 0, sizeof(*buf)); + return; + } + + e = malloc(sizeof(*e)); + if (!e) { + /* Allocator failure: just destroy. Pool degenerates to + * non-pooled behaviour but doesn't leak. */ + v3d_runner_destroy_buffer(r, buf); + memset(buf, 0, sizeof(*buf)); + return; + } + e->buf = *buf; + e->next = r->pool_free[bucket]; + r->pool_free[bucket] = e; + memset(buf, 0, sizeof(*buf)); +} + +size_t v3d_runner_pool_total_bytes(v3d_runner *r) +{ + return r ? r->pool_total_bytes : 0; +} + VkDevice v3d_runner_device(v3d_runner *r) { return r->device; } VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; } uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; } diff --git a/src/v3d_runner.h b/src/v3d_runner.h index b729995..fb4147b 100644 --- a/src/v3d_runner.h +++ b/src/v3d_runner.h @@ -57,10 +57,43 @@ const char *v3d_runner_device_name(v3d_runner *r); * host side. The mapping persists for the lifetime of the buffer. * * Returns 0 on success, non-zero on failure. + * + * NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path — + * create_buffer/destroy_buffer go straight to vkAllocateMemory each + * call, which on V3D7's Mesa stack costs ~10-50us. The acquire/ + * release pair pulls from a freelist and pays vkAllocateMemory only + * on a cache miss. */ int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out); void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf); +/* + * Pooled buffer acquisition. 
Returns a v3d_buffer whose .size is the + * smallest power-of-2 size class (256 B minimum) >= the requested size; + * requests above 8 MiB bypass the pool and are created at the requested size. + * Backed by HOST_VISIBLE | HOST_COHERENT memory; mapped pointer is valid. + * + * On cache hit: reuses a previously-released buffer without a Vulkan allocation. + * On miss: falls through to v3d_runner_create_buffer(). Release with + * v3d_runner_release_buffer(); pool drains in v3d_runner_destroy(). + * + * Lifetime contract: the returned buffer's .mapped contents are + * UNINITIALISED — the previous user's data may still be present. + * Callers that need a clean buffer must memset themselves. This is + * deliberate; the dispatch hot paths immediately overwrite the + * buffer with new coefficients / meta anyway. + * + * Thread-safety: NOT thread-safe. A daedalus_ctx is single-threaded + * by API contract; the pool inherits that constraint. + */ +int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out); +void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf); + +/* Pool diagnostics: total allocated bytes (sum across all size + * classes, including currently-released entries). Useful for + * watermark logging. */ +size_t v3d_runner_pool_total_bytes(v3d_runner *r); + /* Compute pipeline from a SPIR-V file path. The descriptor-set * layout exposes `n_ssbos` storage buffer bindings at binding * indices 0..n_ssbos-1, all visible to the compute stage. A push diff --git a/tests/bench_pool_overhead.c b/tests/bench_pool_overhead.c new file mode 100644 index 0000000..d0f9564 --- /dev/null +++ b/tests/bench_pool_overhead.c @@ -0,0 +1,120 @@ +/* + * bench_pool_overhead — measure QPU dispatch overhead with and without + * the v3d_runner buffer pool warm. + * + * Times N consecutive daedalus_dispatch_vp9_idct8 calls and + * prints the per-call distribution.
The first call pays + * vkAllocateMemory (typically tens of microseconds on V3D7's Mesa); + * the second and subsequent should hit the pool freelist and amortise + * to the pure dispatch-floor cost. + * + * Purpose: provide a concrete before/after number for the QPU-default + * substrate decree (2026-05-23). Bench is non-gating and runs in + * fractions of a second. + * + * License: BSD-2-Clause. + */ +#define _POSIX_C_SOURCE 200809L + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "../include/daedalus.h" + +extern size_t v3d_runner_pool_total_bytes(void *); /* unused in this bench; NOTE(review): the real prototype takes v3d_runner * (src/v3d_runner.h) */ + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +static int cmp_double(const void *a, const void *b) +{ + double da = *(const double *)a, db = *(const double *)b; + return da < db ? -1 : da > db ? 1 : 0; +} + +int main(int argc, char **argv) +{ + int n_calls = argc > 1 ? atoi(argv[1]) : 200; + int n_blocks = 8; /* one MB column of 8x8 IDCT blocks */ + int stride = 64; + + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; } + int has_qpu = daedalus_ctx_has_qpu(ctx); + printf("ctx: has_qpu=%d\n", has_qpu); + if (!has_qpu) { + fprintf(stderr, "QPU not available on this device; bench needs V3D\n"); + daedalus_ctx_destroy(ctx); + return 2; + } + + /* Build a representative IDCT 8x8 batch and warm a dst buffer.
*/ + int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t)); + uint8_t *dst = calloc((size_t) n_blocks * 8 * stride, 1); + daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta)); + if (!coeffs || !dst || !meta) { fprintf(stderr, "alloc fail\n"); return 1; } + + uint64_t s = 0x1234567abcdefULL; + for (size_t i = 0; i < (size_t) n_blocks * 64; i++) { + s ^= s << 13; s ^= s >> 7; s ^= s << 17; + coeffs[i] = (int16_t)(s & 0x7ff) - 0x400; + } + for (int b = 0; b < n_blocks; b++) { + meta[b].dst_off = (uint32_t) b * 8; + meta[b].block_x = (uint32_t) b; + meta[b].block_y = 0; + } + + double *t = malloc((size_t) n_calls * sizeof(double)); + int rc; + + printf("=== dispatching %d times, n_blocks=%d/call ===\n", + n_calls, n_blocks); + + for (int i = 0; i < n_calls; i++) { + double t0 = now_seconds(); + rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU, + dst, (size_t) stride, + coeffs, (size_t) n_blocks, meta); + double t1 = now_seconds(); + if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); return 1; } + t[i] = (t1 - t0) * 1e6; /* us */ + } + + /* Per-call distribution (first few + sorted summary on the steady-state) */ + printf("\nfirst 5 calls (cold-warm transition):\n"); + for (int i = 0; i < 5 && i < n_calls; i++) + printf(" call %d: %.2f us\n", i, t[i]); + + int skip = 10; /* drop warm-up calls from the steady-state stats */ + if (n_calls > skip + 10) { + int n = n_calls - skip; + double *s_arr = malloc((size_t) n * sizeof(double)); + memcpy(s_arr, t + skip, (size_t) n * sizeof(double)); + qsort(s_arr, (size_t) n, sizeof(double), cmp_double); + double sum = 0; + for (int i = 0; i < n; i++) sum += s_arr[i]; + printf("\nsteady-state stats (calls %d..%d, n=%d):\n", + skip, n_calls - 1, n); + printf(" min: %.2f us\n", s_arr[0]); + printf(" p50: %.2f us\n", s_arr[n / 2]); + printf(" p90: %.2f us\n", s_arr[(int)(n * 0.9)]); + printf(" p99: %.2f us\n", s_arr[(int)(n * 0.99)]); + printf(" max: %.2f us\n", s_arr[n - 1]); + printf(" 
mean: %.2f us\n", sum / n); + printf("\nfirst-call / steady-state median ratio: %.1fx\n", + t[0] / s_arr[n / 2]); + free(s_arr); + } + + free(t); free(coeffs); free(dst); free(meta); + daedalus_ctx_destroy(ctx); + return 0; +} -- 2.47.3 From 98553278dd6b5f5720c9f9e1064fdfa9299e24c0 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sat, 23 May 2026 19:56:35 +0200 Subject: [PATCH 2/2] v3d_runner: persistent per-pipeline command buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the QPU-default substrate campaign — eliminate vkAllocateCommandBuffers from the dispatch hot path. Attaches a VkCommandBuffer to each v3d_pipeline, allocated once in v3d_runner_create_pipeline() and freed in destroy_pipeline(). The five dispatch_*_qpu sites switch from v3d_runner_alloc_cmdbuf() to v3d_runner_pipeline_cmdbuf_reset() — vkResetCommandBuffer is O(1) versus the driver-side allocation walk. Pool was already created with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT so reset is permitted. Microbench (hertz, Pi 5, kernel 6.18.29, V3D 7.1): before (task 160 pool only): steady-state p50: 76.44 us steady-state mean: 77.95 us after (task 160 pool + task 161 persistent cb): steady-state p50: 54.56 us steady-state mean: 56.00 us -> 28% per-dispatch reduction The remaining ~54 us steady-state is dominated by vkQueueWaitIdle + shader execution + the two memcpy(in/out) on the dst buffer — task 162 (dmabuf import for dst) targets the memcpy half. test_api_idct stays bit-exact across CPU/QPU/AUTO substrates. Refs daedalus-fourier task #161. 
--- src/daedalus_core.c | 20 ++++++++++---------- src/v3d_runner.c | 22 ++++++++++++++++++++++ src/v3d_runner.h | 12 ++++++++++++ 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 375064c..ff4c255 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -325,8 +325,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx, ._pad = 0, }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail; + VkCommandBuffer cb = ctx->idct8_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, @@ -442,8 +442,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8, if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail; uint32_t wg_count = (uint32_t)((n_edges + 31) / 32); - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail; + VkCommandBuffer cb = p->cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline); @@ -530,8 +530,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx, mc_pc pc = { .n_blocks = (uint32_t) n_blocks, .dst_stride_u8 = (uint32_t) dst_stride, .src_stride_u8 = (uint32_t) src_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail; + VkCommandBuffer cb = ctx->mc8h_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, 
ctx->mc8h_pipe.pipeline); @@ -615,8 +615,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx, cdef_pc pc = { .n_blocks = (uint32_t) n_blocks, .tmp_stride_u16 = 16, .dst_stride_u8 = (uint32_t) dst_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail; + VkCommandBuffer cb = ctx->cdef_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline); @@ -691,8 +691,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx, uint32_t wg_count = (uint32_t)((n_edges + 15) / 16); h264deblock_pc pc = { .n_edges = (uint32_t) n_edges, .dst_stride_u8 = (uint32_t) dst_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail; + VkCommandBuffer cb = ctx->h264deblock_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline); diff --git a/src/v3d_runner.c b/src/v3d_runner.c index cbf000d..05d34c5 100644 --- a/src/v3d_runner.c +++ b/src/v3d_runner.c @@ -486,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path, .pSetLayouts = &out->ds_layout, }; CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set)); + + /* Persistent command buffer — pool was created with + * RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch + * sites can call vkResetCommandBuffer on this same cb instead + * of paying vkAllocateCommandBuffers per call. 
*/ + VkCommandBufferAllocateInfo cbai = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = r->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb)); + return 0; } void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p) { if (!p || p->pipeline == VK_NULL_HANDLE) return; + if (p->cb != VK_NULL_HANDLE) + vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb); vkDestroyPipeline(r->device, p->pipeline, NULL); vkDestroyPipelineLayout(r->device, p->layout, NULL); vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */ @@ -499,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p) memset(p, 0, sizeof(*p)); } +int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p) +{ + (void) r; + if (!p || p->cb == VK_NULL_HANDLE) return -1; + return vkResetCommandBuffer(p->cb, 0) == VK_SUCCESS ? 0 : -1; +} + int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p, const v3d_buffer *bufs, uint32_t n) { diff --git a/src/v3d_runner.h b/src/v3d_runner.h index fb4147b..86f706a 100644 --- a/src/v3d_runner.h +++ b/src/v3d_runner.h @@ -34,6 +34,12 @@ typedef struct { VkDescriptorSet desc_set; uint32_t n_ssbos; uint32_t push_const_size; + /* Persistent command buffer. Allocated at create-pipeline time; + * dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to + * vkResetCommandBuffer instead of paying vkAllocateCommandBuffers + * per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset + * is permitted. */ + VkCommandBuffer cb; } v3d_pipeline; /* @@ -121,6 +127,12 @@ int v3d_runner_bind_buffers(v3d_runner *r, /* Allocate a primary command buffer from the runner's pool. */ VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r); +/* Reset @p->cb so it can be re-recorded. Returns 0 on success. 
+ * Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path — + * vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us + * driver cost. */ +int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p); + /* Submit `cb` to the queue and wait for completion. The classic * timed operation. Returns 0 on success. */ -- 2.47.3