From 98553278dd6b5f5720c9f9e1064fdfa9299e24c0 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sat, 23 May 2026 19:56:35 +0200 Subject: [PATCH] v3d_runner: persistent per-pipeline command buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the QPU-default substrate campaign — eliminate vkAllocateCommandBuffers from the dispatch hot path. Attaches a VkCommandBuffer to each v3d_pipeline, allocated once in v3d_runner_create_pipeline() and freed in destroy_pipeline(). The five dispatch_*_qpu sites switch from v3d_runner_alloc_cmdbuf() to v3d_runner_pipeline_cmdbuf_reset() — vkResetCommandBuffer is O(1) versus the driver-side allocation walk. Pool was already created with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT so reset is permitted. Microbench (hertz, Pi 5, kernel 6.18.29, V3D 7.1): before (task 160 pool only): steady-state p50: 76.44 us steady-state mean: 77.95 us after (task 160 pool + task 161 persistent cb): steady-state p50: 54.56 us steady-state mean: 56.00 us -> 28% per-dispatch reduction The remaining ~54 us steady-state is dominated by vkQueueWaitIdle + shader execution + the two memcpy(in/out) on the dst buffer — task 162 (dmabuf import for dst) targets the memcpy half. test_api_idct stays bit-exact across CPU/QPU/AUTO substrates. Refs daedalus-fourier task #161. 
--- src/daedalus_core.c | 20 ++++++++++---------- src/v3d_runner.c | 22 ++++++++++++++++++++++ src/v3d_runner.h | 12 ++++++++++++ 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 375064c..ff4c255 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -325,8 +325,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx, ._pad = 0, }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail; + VkCommandBuffer cb = ctx->idct8_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, @@ -442,8 +442,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8, if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail; uint32_t wg_count = (uint32_t)((n_edges + 31) / 32); - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail; + VkCommandBuffer cb = p->cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline); @@ -530,8 +530,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx, mc_pc pc = { .n_blocks = (uint32_t) n_blocks, .dst_stride_u8 = (uint32_t) dst_stride, .src_stride_u8 = (uint32_t) src_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail; + VkCommandBuffer cb = ctx->mc8h_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, 
ctx->mc8h_pipe.pipeline); @@ -615,8 +615,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx, cdef_pc pc = { .n_blocks = (uint32_t) n_blocks, .tmp_stride_u16 = 16, .dst_stride_u8 = (uint32_t) dst_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail; + VkCommandBuffer cb = ctx->cdef_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline); @@ -691,8 +691,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx, uint32_t wg_count = (uint32_t)((n_edges + 15) / 16); h264deblock_pc pc = { .n_edges = (uint32_t) n_edges, .dst_stride_u8 = (uint32_t) dst_stride }; - VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); - if (cb == VK_NULL_HANDLE) goto fail; + if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail; + VkCommandBuffer cb = ctx->h264deblock_pipe.cb; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline); diff --git a/src/v3d_runner.c b/src/v3d_runner.c index cbf000d..05d34c5 100644 --- a/src/v3d_runner.c +++ b/src/v3d_runner.c @@ -486,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path, .pSetLayouts = &out->ds_layout, }; CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set)); + + /* Persistent command buffer — pool was created with + * RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch + * sites can call vkResetCommandBuffer on this same cb instead + * of paying vkAllocateCommandBuffers per call. 
*/ + VkCommandBufferAllocateInfo cbai = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = r->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb)); + return 0; } void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p) { if (!p || p->pipeline == VK_NULL_HANDLE) return; + if (p->cb != VK_NULL_HANDLE) + vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb); vkDestroyPipeline(r->device, p->pipeline, NULL); vkDestroyPipelineLayout(r->device, p->layout, NULL); vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */ @@ -499,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p) memset(p, 0, sizeof(*p)); } +int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p) +{ + (void) r; + if (!p || p->cb == VK_NULL_HANDLE) return -1; + return vkResetCommandBuffer(p->cb, 0) == VK_SUCCESS ? 0 : -1; +} + int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p, const v3d_buffer *bufs, uint32_t n) { diff --git a/src/v3d_runner.h b/src/v3d_runner.h index fb4147b..86f706a 100644 --- a/src/v3d_runner.h +++ b/src/v3d_runner.h @@ -34,6 +34,12 @@ typedef struct { VkDescriptorSet desc_set; uint32_t n_ssbos; uint32_t push_const_size; + /* Persistent command buffer. Allocated at create-pipeline time; + * dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to + * vkResetCommandBuffer instead of paying vkAllocateCommandBuffers + * per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset + * is permitted. */ + VkCommandBuffer cb; } v3d_pipeline; /* @@ -121,6 +127,12 @@ int v3d_runner_bind_buffers(v3d_runner *r, /* Allocate a primary command buffer from the runner's pool. */ VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r); +/* Reset @p->cb so it can be re-recorded. Returns 0 on success. 
+ * Returns -1 if @p is NULL or has no cb, or if the reset fails. + * Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path: O(1) + * vkResetCommandBuffer vs vkAllocateCommandBuffers' ~1-5us cost. */ +int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p); + /* Submit `cb` to the queue and wait for completion. The classic * timed operation. Returns 0 on success. */