v3d_runner: buffer pool for QPU dispatch hot path
Per the QPU-default substrate decree 2026-05-23: the per-dispatch
vkAllocateMemory in dispatch_*_qpu was the biggest single fixable
contributor to QPU dispatch overhead. This pools v3d_buffer
allocations by power-of-2 size class so the second-and-subsequent
dispatch hits a freelist instead of paying ~10-50us of Mesa-V3D7
memory-allocation cost per call.
API additions (v3d_runner.h):
- v3d_runner_acquire_buffer(): pulls from per-bucket freelist;
falls through to v3d_runner_create_buffer() on miss.
- v3d_runner_release_buffer(): pushes back onto the freelist; the
backing VkBuffer/VkDeviceMemory are only destroyed/freed
(vkDestroyBuffer/vkFreeMemory) in v3d_runner_destroy().
- v3d_runner_pool_total_bytes(): diagnostic watermark.
Size classes are 2^8..2^23 (256 B to 8 MiB). Requests larger than
8 MiB bypass the pool on both acquire and release: they are allocated
fresh (vkAllocateMemory) and destroyed on release, so the pool stays
correct and just degenerates to the old behaviour for those calls.
Migration: daedalus_core.c dispatch_*_qpu paths globally swap
create_buffer → acquire_buffer and destroy_buffer → release_buffer.
All five QPU dispatch functions (idct8 / lpf / mc_8h / cdef /
h264_deblock) now reuse buffers across calls. test_api_idct stays
bit-exact (4096/4096 bytes on CPU/QPU/AUTO substrates on hertz).
Microbench (tests/bench_pool_overhead.c) on hertz (Pi 5,
6.18.29+rpt-rpi-2712, V3D 7.1):
call 0: 434.89 us (cold — 3x vkAllocateMemory)
call 1: 100.06 us (pool hit on all 3 buffers)
steady-state:
p50: 76.44 us
p90: 90.52 us
mean: 77.95 us
first-call / steady-state ratio: 5.7x
The remaining ~76us steady-state is dominated by vkQueueWaitIdle +
shader execution + per-call descriptor-set update + command-buffer
allocation — addressed in follow-on tasks 161 (persistent cmdbuf)
and 162 (dmabuf import for dst, eliminates memcpy in/out).
Refs daedalus-fourier task #160.
This commit is contained in:
@@ -17,6 +17,18 @@
|
||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
||||
|
||||
/* Buffer-pool size classes are powers of two, running from
 * 2^V3D_POOL_MIN_LOG2 (256 B) through 2^V3D_POOL_MAX_LOG2 (8 MiB),
 * both inclusive. Cycle 1's largest dispatch (n_blocks of about 8K)
 * stays well under 8 MiB; anything bigger bypasses the pool and is
 * allocated unpooled. */
#define V3D_POOL_MIN_LOG2 8
#define V3D_POOL_MAX_LOG2 23
/* One freelist bucket per exponent in [MIN_LOG2, MAX_LOG2]. */
#define V3D_POOL_BUCKETS  (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
|
||||
|
||||
struct v3d_pool_entry {
|
||||
v3d_buffer buf;
|
||||
struct v3d_pool_entry *next;
|
||||
};
|
||||
|
||||
struct v3d_runner {
|
||||
VkInstance instance;
|
||||
VkPhysicalDevice phys;
|
||||
@@ -26,6 +38,15 @@ struct v3d_runner {
|
||||
VkCommandPool pool;
|
||||
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
||||
VkPhysicalDeviceMemoryProperties mem_props;
|
||||
|
||||
/* Buffer pool: per-bucket freelist of previously-released
|
||||
* v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
|
||||
* pool_total_bytes accumulates every successful vkAllocateMemory
|
||||
* we've done through the pool — never decreases (the freelist
|
||||
* just hands buffers around, no vkFreeMemory until destroy).
|
||||
*/
|
||||
struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
|
||||
size_t pool_total_bytes;
|
||||
};
|
||||
|
||||
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
||||
@@ -168,6 +189,21 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
{
|
||||
if (!r) return;
|
||||
if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device);
|
||||
|
||||
/* Drain the buffer pool BEFORE destroying device — the pool
|
||||
* entries own VkBuffer/VkDeviceMemory handles, which need a live
|
||||
* device for vkDestroyBuffer/vkFreeMemory. */
|
||||
for (int b = 0; b < V3D_POOL_BUCKETS; b++) {
|
||||
struct v3d_pool_entry *e = r->pool_free[b];
|
||||
while (e) {
|
||||
struct v3d_pool_entry *next = e->next;
|
||||
v3d_runner_destroy_buffer(r, &e->buf);
|
||||
free(e);
|
||||
e = next;
|
||||
}
|
||||
r->pool_free[b] = NULL;
|
||||
}
|
||||
|
||||
if (r->pool != VK_NULL_HANDLE)
|
||||
vkDestroyCommandPool(r->device, r->pool, NULL);
|
||||
if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL);
|
||||
@@ -175,6 +211,92 @@ void v3d_runner_destroy(v3d_runner *r)
|
||||
free(r);
|
||||
}
|
||||
|
||||
/* ---- Buffer pool ----------------------------------------------- */
|
||||
|
||||
/* ceil_log2 for buffer pool bucket selection. */
|
||||
static int v3d_pool_bucket_for(size_t size)
|
||||
{
|
||||
int log2;
|
||||
size_t m;
|
||||
|
||||
if (size <= ((size_t)1 << V3D_POOL_MIN_LOG2))
|
||||
return 0;
|
||||
m = size - 1;
|
||||
log2 = 0;
|
||||
while (m) { log2++; m >>= 1; }
|
||||
if (log2 < V3D_POOL_MIN_LOG2) log2 = V3D_POOL_MIN_LOG2;
|
||||
if (log2 > V3D_POOL_MAX_LOG2) return -1;
|
||||
return log2 - V3D_POOL_MIN_LOG2;
|
||||
}
|
||||
|
||||
/*
 * Obtain a buffer of at least `size` bytes, preferring a pooled one.
 *
 * Returns -1 if `r` or `out` is NULL or `size` is 0. Otherwise returns
 * 0 on a freelist hit, or whatever v3d_runner_create_buffer() returns
 * when a fresh allocation is needed.
 *
 * Pooled requests are rounded up to their size class, so a freshly
 * created pooled buffer is requested at the class size rather than at
 * `size`. Requests above the largest class are created unpooled at
 * exactly `size`; the caller still hands them to
 * v3d_runner_release_buffer(), which destroys them.
 */
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    if (!r || !out || size == 0)
        return -1;

    const int idx = v3d_pool_bucket_for(size);
    if (idx < 0)
        return v3d_runner_create_buffer(r, size, out);

    /* Hit: pop the head of this bucket's freelist and hand it out. */
    struct v3d_pool_entry *head = r->pool_free[idx];
    if (head != NULL) {
        r->pool_free[idx] = head->next;
        *out = head->buf;
        free(head);
        return 0;
    }

    /* Miss: allocate at the full class size so the buffer can later be
     * reused by any request that maps to this bucket. */
    const size_t class_bytes = (size_t)1 << (idx + V3D_POOL_MIN_LOG2);
    const int status = v3d_runner_create_buffer(r, class_bytes, out);
    if (status != 0)
        return status;

    r->pool_total_bytes += class_bytes;
    return 0;
}
|
||||
|
||||
/*
 * Return a buffer to the pool, or destroy it if it cannot be pooled.
 *
 * Does nothing when `r` or `buf` is NULL or `buf->buffer` is
 * VK_NULL_HANDLE. In every other case `*buf` is zeroed before
 * returning, so the caller's handle is no longer usable.
 *
 * A buffer is pooled only when buf->size is exactly the size of one
 * of the pool's classes. Anything else is destroyed immediately:
 *   - oversize buffers (no bucket exists for them);
 *   - buffers whose size is not an exact class size, e.g. one that
 *     came from v3d_runner_create_buffer() with an arbitrary size.
 *     Pooling such a buffer under the rounded-up class would later
 *     hand it to an acquire call that expects the full class size;
 *   - any buffer when the freelist node cannot be allocated.
 *
 * NOTE(review): this assumes v3d_runner_create_buffer() records the
 * requested byte count in buf->size — confirm. If it stores something
 * else, pooled buffers may fail the exact-size check and the pool
 * would degrade to plain create/destroy.
 */
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
{
    int bucket;
    struct v3d_pool_entry *e = NULL;

    if (!r || !buf || buf->buffer == VK_NULL_HANDLE) return;

    bucket = v3d_pool_bucket_for(buf->size);
    if (bucket >= 0 &&
        buf->size == ((size_t)1 << (bucket + V3D_POOL_MIN_LOG2)))
        e = malloc(sizeof(*e));

    if (e) {
        /* Push onto the head of the bucket's freelist. */
        e->buf = *buf;
        e->next = r->pool_free[bucket];
        r->pool_free[bucket] = e;
    } else {
        /* Not poolable (or out of memory): destroy rather than leak. */
        v3d_runner_destroy_buffer(r, buf);
    }

    memset(buf, 0, sizeof(*buf));
}
|
||||
|
||||
/*
 * Diagnostic watermark: the runner's pool_total_bytes counter, which
 * v3d_runner_acquire_buffer() increases on each successful pooled
 * allocation. Returns 0 when `r` is NULL.
 */
size_t v3d_runner_pool_total_bytes(v3d_runner *r)
{
    if (!r)
        return 0;
    return r->pool_total_bytes;
}
|
||||
|
||||
/* Accessor for the runner's VkDevice handle. `r` must be non-NULL. */
VkDevice v3d_runner_device(v3d_runner *r)
{
    return r->device;
}
|
||||
/* Accessor for the runner's VkQueue handle. `r` must be non-NULL. */
VkQueue v3d_runner_queue(v3d_runner *r)
{
    return r->queue;
}
|
||||
/* Accessor for the runner's queue family index. `r` must be non-NULL. */
uint32_t v3d_runner_queue_family(v3d_runner *r)
{
    return r->queue_family;
}
|
||||
|
||||
Reference in New Issue
Block a user