v3d_runner: buffer pool for QPU dispatch hot path #6
@@ -492,6 +492,10 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
|
||||
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
|
||||
|
||||
add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
|
||||
target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
|
||||
target_compile_options(bench_pool_overhead PRIVATE -O2)
|
||||
|
||||
if (DAEDALUS_BUILD_VULKAN)
|
||||
# (re-open the conditional so the closing endif() below balances)
|
||||
|
||||
|
||||
+53
-53
@@ -291,13 +291,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
|
||||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||
@@ -325,8 +325,8 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
._pad = 0,
|
||||
};
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->idct8_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||
@@ -344,15 +344,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
/* Read-back dst. */
|
||||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -424,9 +424,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
size_t dst_window_size = hi - lo;
|
||||
|
||||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
|
||||
}
|
||||
|
||||
memcpy(buf_dst.mapped, dst + lo, dst_window_size);
|
||||
@@ -442,8 +442,8 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
|
||||
|
||||
uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
|
||||
VkCommandBuffer cb = p->cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
|
||||
@@ -468,12 +468,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
|
||||
memcpy(dst + lo, buf_dst.mapped, dst_window_size);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -509,9 +509,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bs = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bs.mapped, src, src_max);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
@@ -530,8 +530,8 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||
.src_stride_u8 = (uint32_t) src_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->mc8h_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
|
||||
@@ -545,14 +545,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -588,9 +588,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bt = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
/* tmp may need padding before block-origin offset (caller-allocated). Just
|
||||
* copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
|
||||
@@ -615,8 +615,8 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||||
.tmp_stride_u16 = 16,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->cdef_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
|
||||
@@ -630,14 +630,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -670,8 +670,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
@@ -691,8 +691,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
|
||||
h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
|
||||
.dst_stride_u8 = (uint32_t) dst_stride };
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||
if (cb == VK_NULL_HANDLE) goto fail;
|
||||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
|
||||
VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
|
||||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||
vkBeginCommandBuffer(cb, &cbbi);
|
||||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
|
||||
@@ -706,12 +706,12 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,18 @@
|
||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
||||
|
||||
/* Power-of-2 size classes from 2^8 (256 B) up to 2^23 (8 MiB). Cycle
|
||||
* 1's largest dispatch with n_blocks ≈ 8K is well under 8 MiB; oversize
|
||||
* requests fall through to non-pooled allocation. */
|
||||
#define V3D_POOL_MIN_LOG2 8
|
||||
#define V3D_POOL_MAX_LOG2 23
|
||||
#define V3D_POOL_BUCKETS (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
|
||||
|
||||
struct v3d_pool_entry {
|
||||
v3d_buffer buf;
|
||||
struct v3d_pool_entry *next;
|
||||
};
|
||||
|
||||
struct v3d_runner {
|
||||
VkInstance instance;
|
||||
VkPhysicalDevice phys;
|
||||
@@ -26,6 +38,15 @@ struct v3d_runner {
|
||||
VkCommandPool pool;
|
||||
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
||||
VkPhysicalDeviceMemoryProperties mem_props;
|
||||
|
||||
/* Buffer pool: per-bucket freelist of previously-released
|
||||
* v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
|
||||
* pool_total_bytes accumulates every successful vkAllocateMemory
|
||||
* we've done through the pool — never decreases (the freelist
|
||||
* just hands buffers around, no vkFreeMemory until destroy).
|
||||
*/
|
||||
struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
|
||||
size_t pool_total_bytes;
|
||||
};
|
||||
|
||||
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
||||
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
{
|
||||
if (!r) return;
|
||||
if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
|
||||
|
||||
/* Drain the buffer pool BEFORE destroying device — the pool
|
||||
* entries own VkBuffer/VkDeviceMemory handles, which need a live
|
||||
* device for vkDestroyBuffer/vkFreeMemory. */
|
||||
for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
|
||||
struct v3d_pool_entry *e = r->pool_free[b];
|
||||
while (e) {
|
||||
struct v3d_pool_entry *next = e->next;
|
||||
v3d_runner_destroy_buffer(r, &e->buf);
|
||||
free(e);
|
||||
e = next;
|
||||
}
|
||||
r->pool_free[b] = NULL;
|
||||
}
|
||||
|
||||
if (r->pool != VK_NULL_HANDLE)
|
||||
vkDestroyCommandPool(r->device, r->pool, NULL);
|
||||
if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
|
||||
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
free(r);
|
||||
}
|
||||
|
||||
/* ---- Buffer pool ----------------------------------------------- */
|
||||
|
||||
/* ceil_log2 for buffer pool bucket selection. */
|
||||
static int v3d_pool_bucket_for(size_t size)
|
||||
{
|
||||
int log2;
|
||||
size_t m;
|
||||
|
||||
if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
|
||||
return 0;
|
||||
m = size - 1;
|
||||
log2 = 0;
|
||||
while (m) { log2++; m >>= 1; }
|
||||
if (log2 < V3D_POOL_MIN_LOG2) log2 = V3D_POOL_MIN_LOG2;
|
||||
if (log2 > V3D_POOL_MAX_LOG2) return -1;
|
||||
return log2 - V3D_POOL_MIN_LOG2;
|
||||
}
|
||||
|
||||
/* Hand out a buffer of at least `size` bytes, preferring a pooled one.
 *
 * - Invalid arguments (NULL runner, NULL out, size 0): returns -1.
 * - Oversize request (no size class fits): plain v3d_runner_create_buffer()
 *   at the requested size; nothing is accounted to the pool.
 * - Freelist hit: pops the head entry of the matching bucket into *out.
 * - Freelist miss: creates a new buffer at the full class capacity and adds
 *   that capacity to pool_total_bytes on success.
 *
 * Returns 0 on success, otherwise the failure code of the underlying
 * create call (or -1 for bad arguments). */
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    if (r == NULL || out == NULL || size == 0)
        return -1;

    const int idx = v3d_pool_bucket_for(size);
    if (idx < 0) {
        /* Too big for any class: allocate unpooled. The matching release
         * call re-derives the class from the buffer size and destroys it. */
        return v3d_runner_create_buffer(r, size, out);
    }

    struct v3d_pool_entry *head = r->pool_free[idx];
    if (head != NULL) {
        /* Hit: unlink the head node and give its buffer to the caller. */
        r->pool_free[idx] = head->next;
        *out = head->buf;
        free(head);
        return 0;
    }

    /* Miss: allocate at class capacity so the buffer can serve any later
     * request that maps to the same bucket. */
    const size_t class_bytes = (size_t)1 << (idx + V3D_POOL_MIN_LOG2);
    const int status = v3d_runner_create_buffer(r, class_bytes, out);
    if (status != 0)
        return status;

    r->pool_total_bytes += class_bytes;
    return 0;
}
|
||||
|
||||
/* Return a buffer to the pool, or destroy it when it cannot be pooled.
 *
 * No-op when r or buf is NULL or buf holds no VkBuffer. In every other
 * case *buf is zeroed on return, so the caller's handle cannot be reused
 * or released twice.
 *
 * A buffer is parked on a freelist only when its size is EXACTLY the
 * capacity of its size class. v3d_runner_acquire_buffer() pops freelist
 * entries without re-checking their size, so an undersized entry (e.g. a
 * buffer made directly with v3d_runner_create_buffer() at a non-power-of-2
 * size) would later be handed to a caller expecting the full class
 * capacity and be overrun. Such buffers, oversize buffers, and buffers
 * released while malloc() is failing are destroyed instead.
 *
 * NOTE(review): relies on v3d_buffer.size holding the size the buffer was
 * created with, as the bucket lookup here already did — confirm against
 * v3d_runner_create_buffer(). */
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
{
    int bucket;
    struct v3d_pool_entry *e = NULL;

    if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return;

    bucket = v3d_pool_bucket_for(buf->size);

    /* Only exact-capacity buffers are poolable; e stays NULL otherwise
     * (and also when the node allocation itself fails). */
    if (bucket >= 0 &&
        buf->size == ((size_t)1 << (bucket + V3D_POOL_MIN_LOG2))) {
        e = malloc(sizeof(*e));
    }

    if (!e) {
        /* Not poolable: degrade to non-pooled behaviour without leaking. */
        v3d_runner_destroy_buffer(r, buf);
        memset(buf, 0, sizeof(*buf));
        return;
    }

    /* Push onto the head of the bucket's freelist. */
    e->buf = *buf;
    e->next = r->pool_free[bucket];
    r->pool_free[bucket] = e;
    memset(buf, 0, sizeof(*buf));
}
|
||||
|
||||
/* Pool diagnostic: the runner's running byte total for buffers created on
 * pool misses. A NULL runner reports 0. */
size_t v3d_runner_pool_total_bytes(v3d_runner *r)
{
    if (r == NULL)
        return 0;
    return r->pool_total_bytes;
}
|
||||
|
||||
VkDevice v3d_runner_device(v3d_runner *r) { return r->device; }
|
||||
VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; }
|
||||
uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; }
|
||||
@@ -364,12 +486,27 @@ int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
|
||||
.pSetLayouts = &out->ds_layout,
|
||||
};
|
||||
CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
|
||||
|
||||
/* Persistent command buffer — pool was created with
|
||||
* RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
|
||||
* sites can call vkResetCommandBuffer on this same cb instead
|
||||
* of paying vkAllocateCommandBuffers per call. */
|
||||
VkCommandBufferAllocateInfo cbai = {
|
||||
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
||||
.commandPool = r->pool,
|
||||
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
||||
.commandBufferCount = 1,
|
||||
};
|
||||
CHK(vkAllocateCommandBuffers(r->device, &cbai, &out->cb));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
||||
{
|
||||
if (!p || p->pipeline == VK_NULL_HANDLE) return;
|
||||
if (p->cb != VK_NULL_HANDLE)
|
||||
vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);
|
||||
vkDestroyPipeline(r->device, p->pipeline, NULL);
|
||||
vkDestroyPipelineLayout(r->device, p->layout, NULL);
|
||||
vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */
|
||||
@@ -377,6 +514,13 @@ void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
|
||||
memset(p, 0, sizeof(*p));
|
||||
}
|
||||
|
||||
/* Reset the pipeline's persistent command buffer so it can be recorded
 * again.
 *
 * r is accepted for API symmetry but not used.
 * Returns 0 when vkResetCommandBuffer succeeds; -1 when p is NULL, p has
 * no command buffer, or the reset fails. */
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
{
    (void) r;

    if (p == NULL || p->cb == VK_NULL_HANDLE)
        return -1;

    if (vkResetCommandBuffer(p->cb, 0) != VK_SUCCESS)
        return -1;

    return 0;
}
|
||||
|
||||
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
|
||||
const v3d_buffer *bufs, uint32_t n)
|
||||
{
|
||||
|
||||
@@ -34,6 +34,12 @@ typedef struct {
|
||||
VkDescriptorSet desc_set;
|
||||
uint32_t n_ssbos;
|
||||
uint32_t push_const_size;
|
||||
/* Persistent command buffer. Allocated at create-pipeline time;
|
||||
* dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
|
||||
* vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
|
||||
* per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset
|
||||
* is permitted. */
|
||||
VkCommandBuffer cb;
|
||||
} v3d_pipeline;
|
||||
|
||||
/*
|
||||
@@ -57,10 +63,43 @@ const char *v3d_runner_device_name(v3d_runner *r);
|
||||
* host side. The mapping persists for the lifetime of the buffer.
|
||||
*
|
||||
* Returns 0 on success, non-zero on failure.
|
||||
*
|
||||
* NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
|
||||
* create_buffer/destroy_buffer go straight to vkAllocateMemory each
|
||||
* call, which on V3D7's Mesa stack costs ~10-50us. The acquire/
|
||||
* release pair pulls from a freelist and pays vkAllocateMemory only
|
||||
* on a cache miss.
|
||||
*/
|
||||
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||
|
||||
/*
|
||||
* Pooled buffer acquisition. Returns a v3d_buffer whose .size is the
|
||||
* smallest power-of-2 >= the requested size (so callers can pool
|
||||
* across similar-sized requests). Backed by HOST_VISIBLE |
|
||||
* HOST_COHERENT memory; mapped pointer is valid.
|
||||
*
|
||||
* On cache hit: zero-cost reuse of a previously-released buffer.
|
||||
* On miss: falls through to v3d_runner_create_buffer(). Release with
|
||||
* v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
|
||||
*
|
||||
* Lifetime contract: the returned buffer's .mapped contents are
|
||||
* UNINITIALISED — the previous user's data may still be present.
|
||||
* Callers that need a clean buffer must memset themselves. This is
|
||||
* deliberate; the dispatch hot paths immediately overwrite the
|
||||
* buffer with new coefficients / meta anyway.
|
||||
*
|
||||
* Thread-safety: NOT thread-safe. A daedalus_ctx is single-threaded
|
||||
* by API contract; the pool inherits that constraint.
|
||||
*/
|
||||
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||
|
||||
/* Pool diagnostics: total allocated bytes (sum across all size
|
||||
* classes, including currently-released entries). Useful for
|
||||
* watermark logging. */
|
||||
size_t v3d_runner_pool_total_bytes(v3d_runner *r);
|
||||
|
||||
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
||||
* layout exposes `n_ssbos` storage buffer bindings at binding
|
||||
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
||||
@@ -88,6 +127,12 @@ int v3d_runner_bind_buffers(v3d_runner *r,
|
||||
/* Allocate a primary command buffer from the runner's pool. */
|
||||
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
||||
|
||||
/* Reset @p->cb so it can be re-recorded. Returns 0 on success.
|
||||
* Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
|
||||
* vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
|
||||
* driver cost. */
|
||||
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
|
||||
|
||||
/* Submit `cb` to the queue and wait for completion. The classic
|
||||
* timed operation. Returns 0 on success.
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* bench_pool_overhead — measure QPU dispatch overhead with and without
|
||||
* the v3d_runner buffer pool warm.
|
||||
*
|
||||
* Times N consecutive daedalus_dispatch_vp9_idct8 calls and
|
||||
* prints the per-call distribution. The first call pays
|
||||
* vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
|
||||
* the second and subsequent should hit the pool freelist and amortise
|
||||
* to the pure dispatch-floor cost.
|
||||
*
|
||||
* Purpose: provide a concrete before/after number for the QPU-default
|
||||
* substrate decree (2026-05-23). Bench is non-gating and runs in
|
||||
* fractions of a second.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "../include/daedalus.h"
|
||||
|
||||
extern size_t v3d_runner_pool_total_bytes(void *); /* exposed if we wanted it */
|
||||
|
||||
static double now_seconds(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
|
||||
return ts.tv_sec + ts.tv_nsec * 1e-9;
|
||||
}
|
||||
|
||||
/* qsort() comparator ordering doubles ascending: -1, 0 or 1. */
static int cmp_double(const void *a, const void *b)
{
    const double lhs = *(const double *) a;
    const double rhs = *(const double *) b;

    /* Each comparison yields 0 or 1, so the difference is the sign. */
    return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int n_calls = argc > 1 ? atoi(argv[1]) : 200;
|
||||
int n_blocks = 8; /* one MB column of 8x8 IDCT blocks */
|
||||
int stride = 64;
|
||||
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||
printf("ctx: has_qpu=%d\n", has_qpu);
|
||||
if (!has_qpu) {
|
||||
fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
|
||||
int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
|
||||
uint8_t *dst = calloc((size_t) n_blocks * 8 * stride, 1);
|
||||
daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta));
|
||||
if (!coeffs || !dst || !meta) { fprintf(stderr, "alloc fail\n"); return 1; }
|
||||
|
||||
uint64_t s = 0x1234567abcdefULL;
|
||||
for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
|
||||
s ^= s << 13; s ^= s >> 7; s ^= s << 17;
|
||||
coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
|
||||
}
|
||||
for (int b = 0; b < n_blocks; b++) {
|
||||
meta[b].dst_off = (uint32_t) b * 8;
|
||||
meta[b].block_x = (uint32_t) b;
|
||||
meta[b].block_y = 0;
|
||||
}
|
||||
|
||||
double *t = malloc((size_t) n_calls * sizeof(double));
|
||||
int rc;
|
||||
|
||||
printf("=== dispatching %d times, n_blocks=%d/call ===\n",
|
||||
n_calls, n_blocks);
|
||||
|
||||
for (int i = 0; i < n_calls; i++) {
|
||||
double t0 = now_seconds();
|
||||
rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
|
||||
dst, (size_t) stride,
|
||||
coeffs, (size_t) n_blocks, meta);
|
||||
double t1 = now_seconds();
|
||||
if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); return 1; }
|
||||
t[i] = (t1 - t0) * 1e6; /* us */
|
||||
}
|
||||
|
||||
/* Per-call distribution (first few + sorted summary on the steady-state) */
|
||||
printf("\nfirst 5 calls (cold-warm transition):\n");
|
||||
for (int i = 0; i < 5 && i < n_calls; i++)
|
||||
printf(" call %d: %.2f us\n", i, t[i]);
|
||||
|
||||
int skip = 10; /* drop warm-up calls from the steady-state stats */
|
||||
if (n_calls > skip + 10) {
|
||||
int n = n_calls - skip;
|
||||
double *s_arr = malloc((size_t) n * sizeof(double));
|
||||
memcpy(s_arr, t + skip, (size_t) n * sizeof(double));
|
||||
qsort(s_arr, (size_t) n, sizeof(double), cmp_double);
|
||||
double sum = 0;
|
||||
for (int i = 0; i < n; i++) sum += s_arr[i];
|
||||
printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
|
||||
skip, n_calls - 1, n);
|
||||
printf(" min: %.2f us\n", s_arr[0]);
|
||||
printf(" p50: %.2f us\n", s_arr[n / 2]);
|
||||
printf(" p90: %.2f us\n", s_arr[(int)(n * 0.9)]);
|
||||
printf(" p99: %.2f us\n", s_arr[(int)(n * 0.99)]);
|
||||
printf(" max: %.2f us\n", s_arr[n - 1]);
|
||||
printf(" mean: %.2f us\n", sum / n);
|
||||
printf("\nfirst-call / steady-state median ratio: %.1fx\n",
|
||||
t[0] / s_arr[n / 2]);
|
||||
free(s_arr);
|
||||
}
|
||||
|
||||
free(t); free(coeffs); free(dst); free(meta);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user