Files
daedalus-fourier/src/v3d_runner.c
T
claude-noether 98553278dd v3d_runner: persistent per-pipeline command buffer
Phase 2 of the QPU-default substrate campaign — eliminate
vkAllocateCommandBuffers from the dispatch hot path.

Attaches a VkCommandBuffer to each v3d_pipeline, allocated once in
v3d_runner_create_pipeline() and freed in destroy_pipeline().  The
five dispatch_*_qpu sites switch from v3d_runner_alloc_cmdbuf() to
v3d_runner_pipeline_cmdbuf_reset() — vkResetCommandBuffer is O(1)
versus the driver-side allocation walk.  Pool was already created
with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT so reset is
permitted.

Microbench (hertz, Pi 5, kernel 6.18.29, V3D 7.1):

  before (task 160 pool only):
    steady-state p50: 76.44 us
    steady-state mean: 77.95 us
  after (task 160 pool + task 161 persistent cb):
    steady-state p50: 54.56 us
    steady-state mean: 56.00 us
    -> 28% per-dispatch reduction

The remaining ~54 us steady-state is dominated by vkQueueWaitIdle +
shader execution + the two memcpy(in/out) on the dst buffer — task
162 (dmabuf import for dst) targets the memcpy half.

test_api_idct stays bit-exact across CPU/QPU/AUTO substrates.

Refs daedalus-fourier task #161.
2026-05-23 19:56:35 +02:00

580 lines
19 KiB
C

/*
* v3d_runner — implementation. See v3d_runner.h.
*
* License: BSD-2-Clause.
*/
#include "v3d_runner.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * CHK(call): evaluate a Vulkan call that yields a VkResult. On anything
 * other than VK_SUCCESS, print the result code, source location and the
 * stringized call to stderr, then `return -1` from the ENCLOSING function.
 * Only usable in functions returning int. The return is hidden inside the
 * macro, so nothing acquired earlier in the caller is released by it.
 */
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
r__, __FILE__, __LINE__, #call); return -1; } } while (0)
/*
 * CHK_NULL(call): same as CHK, but returns NULL from the enclosing
 * function; for functions returning a pointer.
 */
#define CHK_NULL(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
/*
 * Buffer-pool size classes: one bucket per power of two, from 2^8 (256 B)
 * up to 2^23 (8 MiB) inclusive. Cycle 1's largest dispatch with
 * n_blocks ≈ 8K is well under 8 MiB; oversize requests fall through to
 * non-pooled allocation.
 */
#define V3D_POOL_MIN_LOG2 8   /* log2 of the smallest bucket size */
#define V3D_POOL_MAX_LOG2 23  /* log2 of the largest bucket size */
/* Number of buckets, both end points included. */
#define V3D_POOL_BUCKETS (V3D_POOL_MAX_LOG2 - V3D_POOL_MIN_LOG2 + 1)
/* Freelist node: one released buffer parked in a pool bucket. */
struct v3d_pool_entry {
    v3d_buffer buf;               /* the parked buffer, stored by value */
    struct v3d_pool_entry *next;  /* next node in the same bucket, or NULL */
};
/*
 * Runner state: the Vulkan objects created by v3d_runner_create() plus the
 * buffer pool. Allocated zeroed with calloc(), torn down by
 * v3d_runner_destroy().
 */
struct v3d_runner {
    VkInstance instance;
    VkPhysicalDevice phys;        /* device whose name contains "V3D" */
    VkDevice device;
    VkQueue queue;                /* queue 0 of queue_family */
    uint32_t queue_family;        /* first family advertising COMPUTE */
    VkCommandPool pool;           /* created with RESET_COMMAND_BUFFER_BIT */
    char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
    VkPhysicalDeviceMemoryProperties mem_props; /* cached for memory-type lookup */
    /* Buffer pool: per-bucket freelist of previously-released
     * v3d_buffer. bucket index = ceil_log2(size) - V3D_POOL_MIN_LOG2.
     * pool_total_bytes accumulates every successful vkAllocateMemory
     * we've done through the pool — never decreases (the freelist
     * just hands buffers around, no vkFreeMemory until destroy).
     */
    struct v3d_pool_entry *pool_free[V3D_POOL_BUCKETS];
    size_t pool_total_bytes;
};
/*
 * Select the first Vulkan physical device whose deviceName contains the
 * substring "V3D".
 *
 * inst      instance to enumerate.
 * out       receives the selected physical device on success.
 * name_out  receives the device name (VK_MAX_PHYSICAL_DEVICE_NAME_SIZE
 *           bytes are copied) on success.
 *
 * Returns 0 when a device was selected, -1 otherwise (no devices,
 * allocation or enumeration failure, or no name match). A diagnostic is
 * printed to stderr on every failure except allocation failure.
 */
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
                                    char name_out[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE])
{
    uint32_t n = 0;
    if (vkEnumeratePhysicalDevices(inst, &n, NULL) != VK_SUCCESS || n == 0) {
        fprintf(stderr, "v3d_runner: no Vulkan physical devices\n");
        return -1;
    }

    VkPhysicalDevice *pds = calloc(n, sizeof(*pds));
    if (!pds)
        return -1;

    /* The fill call can fail too. VK_INCOMPLETE still means `n` valid
     * entries were written (more devices appeared between the two
     * calls); any other non-success result leaves the array unusable. */
    VkResult vr = vkEnumeratePhysicalDevices(inst, &n, pds);
    if (vr != VK_SUCCESS && vr != VK_INCOMPLETE) {
        fprintf(stderr, "v3d_runner: vkEnumeratePhysicalDevices failed (%d)\n",
                vr);
        free(pds);
        return -1;
    }

    int picked = -1;
    for (uint32_t i = 0; i < n; i++) {
        VkPhysicalDeviceProperties p;
        vkGetPhysicalDeviceProperties(pds[i], &p);
        if (strstr(p.deviceName, "V3D") != NULL) {
            *out = pds[i];
            memcpy(name_out, p.deviceName, sizeof(p.deviceName));
            picked = 0;
            break;
        }
    }
    free(pds);

    if (picked != 0)
        fprintf(stderr, "v3d_runner: no V3D device found (looked for "
                "\"V3D\" substring in deviceName)\n");
    return picked;
}
/*
 * Return the index of the first queue family of `phys` that advertises
 * VK_QUEUE_COMPUTE_BIT, or UINT32_MAX when there is none or the temporary
 * property array cannot be allocated.
 */
static uint32_t pick_compute_queue_family(VkPhysicalDevice phys)
{
    uint32_t family_count = 0;
    uint32_t found = UINT32_MAX;
    VkQueueFamilyProperties *families;

    /* Standard two-call pattern: count first, then fill. */
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, NULL);
    families = malloc(family_count * sizeof(*families));
    if (families == NULL)
        return UINT32_MAX;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, families);

    uint32_t idx = 0;
    while (idx < family_count) {
        if ((families[idx].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0) {
            found = idx;
            break;
        }
        idx++;
    }

    free(families);
    return found;
}
v3d_runner *v3d_runner_create(void)
{
v3d_runner *r = calloc(1, sizeof(*r));
if (!r) return NULL;
/* Instance — Vulkan 1.3 to inherit 1.2 promoted features. */
VkApplicationInfo app = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "daedalus-fourier",
.apiVersion = VK_API_VERSION_1_3,
};
VkInstanceCreateInfo ici = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app,
};
CHK_NULL(vkCreateInstance(&ici, NULL, &r->instance));
if (pick_v3d_physical_device(r->instance, &r->phys, r->device_name) != 0) {
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
vkGetPhysicalDeviceMemoryProperties(r->phys, &r->mem_props);
r->queue_family = pick_compute_queue_family(r->phys);
if (r->queue_family == UINT32_MAX) {
fprintf(stderr, "v3d_runner: no compute queue family\n");
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
/* Enable 8-bit + 16-bit storage features. Both are exposed on
* V3D 7.1 per vulkaninfo_v3d_7_1_7_hertz.txt; the kernel
* declares storageBuffer8BitAccess (uint8_t dst[]) and
* storageBuffer16BitAccess (int16_t coeffs[]).
*/
VkPhysicalDevice16BitStorageFeatures f16 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES,
.storageBuffer16BitAccess = VK_TRUE,
.uniformAndStorageBuffer16BitAccess = VK_TRUE,
};
VkPhysicalDevice8BitStorageFeatures f8 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
.pNext = &f16,
.storageBuffer8BitAccess = VK_TRUE,
.uniformAndStorageBuffer8BitAccess = VK_TRUE,
};
VkPhysicalDeviceFeatures2 f2 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
.pNext = &f8,
};
float qprio = 1.0f;
VkDeviceQueueCreateInfo dqci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = r->queue_family,
.queueCount = 1,
.pQueuePriorities = &qprio,
};
VkDeviceCreateInfo dci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.pNext = &f2,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &dqci,
};
if (vkCreateDevice(r->phys, &dci, NULL, &r->device) != VK_SUCCESS) {
fprintf(stderr, "v3d_runner: vkCreateDevice failed\n");
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
vkGetDeviceQueue(r->device, r->queue_family, 0, &r->queue);
VkCommandPoolCreateInfo cpci = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.queueFamilyIndex = r->queue_family,
};
if (vkCreateCommandPool(r->device, &cpci, NULL, &r->pool) != VK_SUCCESS) {
fprintf(stderr, "v3d_runner: vkCreateCommandPool failed\n");
vkDestroyDevice(r->device, NULL);
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
return r;
}
/*
 * Tear down a runner created by v3d_runner_create(). NULL is a no-op.
 * Waits for the device to go idle, destroys every buffer parked in the
 * pool, then the command pool, device and instance, and finally frees the
 * runner itself.
 */
void v3d_runner_destroy(v3d_runner *r)
{
    if (r == NULL)
        return;

    if (r->device != VK_NULL_HANDLE)
        vkDeviceWaitIdle(r->device);

    /* The pooled buffers own VkBuffer/VkDeviceMemory handles, so they
     * must go while the device is still alive. Pop each bucket's list
     * head until the bucket is empty. */
    for (int bucket = 0; bucket < V3D_POOL_BUCKETS; bucket++) {
        struct v3d_pool_entry *head;
        while ((head = r->pool_free[bucket]) != NULL) {
            r->pool_free[bucket] = head->next;
            v3d_runner_destroy_buffer(r, &head->buf);
            free(head);
        }
    }

    if (r->pool != VK_NULL_HANDLE)
        vkDestroyCommandPool(r->device, r->pool, NULL);
    if (r->device != VK_NULL_HANDLE)
        vkDestroyDevice(r->device, NULL);
    if (r->instance != VK_NULL_HANDLE)
        vkDestroyInstance(r->instance, NULL);

    free(r);
}
/* ---- Buffer pool ----------------------------------------------- */
/*
 * Map a byte size to its pool bucket index.
 *
 * Sizes up to 2^V3D_POOL_MIN_LOG2 land in bucket 0. Larger sizes use
 * ceil(log2(size)) - V3D_POOL_MIN_LOG2. Returns -1 when the size exceeds
 * the largest bucket (2^V3D_POOL_MAX_LOG2), i.e. the request is oversize.
 */
static int v3d_pool_bucket_for(size_t size)
{
    const size_t smallest = (size_t)1 << V3D_POOL_MIN_LOG2;
    int bits = 0;

    if (size <= smallest)
        return 0;

    /* Bit length of (size - 1) equals ceil(log2(size)) for size >= 2.
     * Since size > smallest here, bits always exceeds V3D_POOL_MIN_LOG2,
     * so no lower clamp is needed. */
    for (size_t rest = size - 1; rest != 0; rest >>= 1)
        bits++;

    if (bits > V3D_POOL_MAX_LOG2)
        return -1;
    return bits - V3D_POOL_MIN_LOG2;
}
/*
 * Hand out a buffer of at least `size` bytes, reusing a pooled one when
 * the matching bucket has a free entry.
 *
 * Returns 0 on success with *out filled in, -1 on invalid arguments
 * (NULL r/out or size 0), or whatever v3d_runner_create_buffer() returns
 * when a fresh allocation is needed. Pooled buffers are sized to the
 * bucket (a power of two), not to the requested size.
 */
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    if (r == NULL || out == NULL || size == 0)
        return -1;

    const int bucket = v3d_pool_bucket_for(size);
    if (bucket < 0) {
        /* Oversize: allocate outside the pool at the exact size.
         * v3d_runner_release_buffer() recomputes the bucket from the
         * buffer size, sees it is oversize, and destroys it. */
        return v3d_runner_create_buffer(r, size, out);
    }

    struct v3d_pool_entry *hit = r->pool_free[bucket];
    if (hit != NULL) {
        /* Pool hit: unlink the head node and hand its buffer over. */
        r->pool_free[bucket] = hit->next;
        *out = hit->buf;
        free(hit);
        return 0;
    }

    /* Pool miss: allocate at the full bucket size so the buffer can
     * serve any later request that maps to the same bucket. */
    const size_t bucket_size = (size_t)1 << (bucket + V3D_POOL_MIN_LOG2);
    const int rc = v3d_runner_create_buffer(r, bucket_size, out);
    if (rc != 0)
        return rc;

    r->pool_total_bytes += bucket_size;
    return 0;
}
/*
 * Give a buffer back. Buffers whose size maps to a pool bucket are parked
 * on that bucket's freelist for reuse; oversize buffers are destroyed.
 * In every case *buf is zeroed before returning. NULL arguments or an
 * already-empty buffer are a no-op.
 */
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf)
{
    if (r == NULL || buf == NULL || buf->buffer == VK_NULL_HANDLE)
        return;

    const int bucket = v3d_pool_bucket_for(buf->size);

    /* Only poolable sizes need a freelist node. */
    struct v3d_pool_entry *node = NULL;
    if (bucket >= 0)
        node = malloc(sizeof(*node));

    if (node == NULL) {
        /* Either oversize (never belonged to the pool) or the node
         * allocation failed: destroy outright so nothing leaks. */
        v3d_runner_destroy_buffer(r, buf);
        memset(buf, 0, sizeof(*buf));
        return;
    }

    /* Push onto the head of the bucket's freelist. */
    node->buf = *buf;
    node->next = r->pool_free[bucket];
    r->pool_free[bucket] = node;
    memset(buf, 0, sizeof(*buf));
}
/*
 * Total bytes of bucket-sized buffers allocated through the pool so far.
 * Returns 0 for a NULL runner.
 */
size_t v3d_runner_pool_total_bytes(v3d_runner *r)
{
    if (r == NULL)
        return 0;
    return r->pool_total_bytes;
}
/*
 * Plain accessors for the runner's Vulkan objects. None of them checks
 * `r` for NULL; the caller must pass a valid runner.
 */

/* Logical device. */
VkDevice v3d_runner_device(v3d_runner *r)
{
    return r->device;
}

/* The queue retrieved from the device at creation time. */
VkQueue v3d_runner_queue(v3d_runner *r)
{
    return r->queue;
}

/* Index of the queue family the queue and command pool belong to. */
uint32_t v3d_runner_queue_family(v3d_runner *r)
{
    return r->queue_family;
}

/* Command pool shared by all command buffers of this runner. */
VkCommandPool v3d_runner_cmd_pool(v3d_runner *r)
{
    return r->pool;
}

/* Name of the selected physical device; storage is owned by the runner. */
const char *v3d_runner_device_name(v3d_runner *r)
{
    return r->device_name;
}
/* ---- Buffers ---------------------------------------------------- */
/*
 * Find the lowest memory type index that is both allowed by `type_bits`
 * (bit i set means type i is acceptable) and has every property flag in
 * `wanted`. Returns the index, or -1 when no type qualifies.
 */
static int find_memory_type(VkPhysicalDeviceMemoryProperties *p,
                            uint32_t type_bits, VkMemoryPropertyFlags wanted)
{
    uint32_t idx = 0;

    while (idx < p->memoryTypeCount) {
        const VkMemoryPropertyFlags have = p->memoryTypes[idx].propertyFlags;
        const int allowed = ((type_bits >> idx) & 1u) != 0;

        if (allowed && (have & wanted) == wanted)
            return (int) idx;
        idx++;
    }
    return -1;
}
/*
 * Local helper for v3d_runner_create_buffer(): run a Vulkan call and, on
 * any result other than VK_SUCCESS, print the usual diagnostic and jump
 * to the given cleanup label (instead of returning and leaking).
 */
#define V3D_BUF_TRY(call, label) do { VkResult vr_ = (call); \
    if (vr_ != VK_SUCCESS) { \
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
                vr_, __FILE__, __LINE__, #call); \
        goto label; } } while (0)

/*
 * Create a storage buffer of `size` bytes backed by HOST_VISIBLE |
 * HOST_COHERENT memory and map it persistently.
 *
 * r     runner owning the device.
 * size  buffer size in bytes.
 * out   receives buffer, memory, mapped pointer and size on success.
 *
 * Returns 0 on success, -1 on failure (NULL r/out, a failing Vulkan call,
 * or no suitable memory type). On failure nothing is leaked: any
 * partially created objects are destroyed and *out is zeroed, so calling
 * v3d_runner_destroy_buffer() on it afterwards is a harmless no-op.
 */
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    if (!r || !out)
        return -1;

    memset(out, 0, sizeof(*out));
    out->size = size;

    VkBufferCreateInfo bci = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = size,
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
               | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
               | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
    };
    V3D_BUF_TRY(vkCreateBuffer(r->device, &bci, NULL, &out->buffer), fail);

    VkMemoryRequirements req;
    vkGetBufferMemoryRequirements(r->device, out->buffer, &req);

    /* HOST_VISIBLE | HOST_COHERENT is the unified-memory zero-copy
     * path on Pi 5: CPU and GPU see the same LPDDR4x physical pages,
     * no explicit flush/invalidate needed (the COHERENT bit asserts
     * that). */
    int mt = find_memory_type(&r->mem_props, req.memoryTypeBits,
                              VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
                              | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
    if (mt < 0) {
        fprintf(stderr, "v3d_runner: no HOST_VISIBLE|COHERENT memory type\n");
        goto fail_buffer;
    }

    VkMemoryAllocateInfo mai = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = req.size,
        .memoryTypeIndex = (uint32_t) mt,
    };
    V3D_BUF_TRY(vkAllocateMemory(r->device, &mai, NULL, &out->memory),
                fail_buffer);
    V3D_BUF_TRY(vkBindBufferMemory(r->device, out->buffer, out->memory, 0),
                fail_memory);
    V3D_BUF_TRY(vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0,
                            &out->mapped),
                fail_memory);
    return 0;

    /* Unwind in the same order v3d_runner_destroy_buffer() uses: buffer
     * first, then its memory. No vkUnmapMemory here: these paths are only
     * reached when no mapping was established. */
fail_memory:
    vkDestroyBuffer(r->device, out->buffer, NULL);
    vkFreeMemory(r->device, out->memory, NULL);
    goto fail;
fail_buffer:
    vkDestroyBuffer(r->device, out->buffer, NULL);
fail:
    memset(out, 0, sizeof(*out));
    return -1;
}
#undef V3D_BUF_TRY
/*
 * Destroy a buffer: unmap it if mapped, destroy the VkBuffer, free its
 * memory, and zero *buf. A NULL pointer or a buffer whose handle is
 * already VK_NULL_HANDLE is a no-op.
 */
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
{
    if (buf == NULL)
        return;
    if (buf->buffer == VK_NULL_HANDLE)
        return;

    if (buf->mapped != NULL)
        vkUnmapMemory(r->device, buf->memory);

    vkDestroyBuffer(r->device, buf->buffer, NULL);
    vkFreeMemory(r->device, buf->memory, NULL);

    /* Leave the struct empty so a second destroy is harmless. */
    memset(buf, 0, sizeof(*buf));
}
/* ---- Pipelines -------------------------------------------------- */
/*
 * Read a whole SPIR-V binary into memory.
 *
 * path      file to read (opened in binary mode).
 * out_size  receives the file size in bytes on success; left untouched on
 *           failure.
 *
 * Returns a malloc()ed buffer the caller must free(), or NULL on failure
 * (NULL arguments, open/seek/read error, allocation failure, or a file
 * size that is zero or not a multiple of 4). A diagnostic naming the file
 * is printed to stderr for every failure except NULL arguments.
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    if (!path || !out_size)
        return NULL;

    FILE *f = fopen(path, "rb");
    if (!f) {
        perror(path);
        return NULL;
    }

    uint32_t *buf = NULL;

    /* Determine the size by seeking; every step can fail (e.g. on a
     * non-seekable stream), so check each one. */
    long sz = -1;
    if (fseek(f, 0, SEEK_END) == 0)
        sz = ftell(f);
    if (sz < 0 || fseek(f, 0, SEEK_SET) != 0) {
        perror(path);
        goto fail;
    }

    /* SPIR-V is a stream of 32-bit words. */
    if (sz == 0 || (sz & 3)) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
        goto fail;
    }

    buf = malloc((size_t) sz);
    if (!buf) {
        fprintf(stderr, "%s: out of memory reading %ld bytes\n", path, sz);
        goto fail;
    }

    if (fread(buf, 1, (size_t) sz, f) != (size_t) sz) {
        /* Only consult errno when the stream reports an error; a plain
         * short read does not set it. */
        if (ferror(f))
            perror(path);
        else
            fprintf(stderr, "%s: short read (expected %ld bytes)\n", path, sz);
        goto fail;
    }

    fclose(f);
    *out_size = (size_t) sz;
    return buf;

fail:
    free(buf);
    fclose(f);
    return NULL;
}
/*
 * Local helper for v3d_runner_create_pipeline(): run a Vulkan call and,
 * on any result other than VK_SUCCESS, print the usual diagnostic and
 * jump to the given cleanup label (instead of returning and leaking).
 */
#define V3D_PIPE_TRY(call, label) do { VkResult vr_ = (call); \
    if (vr_ != VK_SUCCESS) { \
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
                vr_, __FILE__, __LINE__, #call); \
        goto label; } } while (0)

/*
 * Build a compute pipeline from a SPIR-V file.
 *
 * r                runner owning the device and command pool.
 * spv_path         path of the SPIR-V binary; entry point is "main".
 * n_ssbos          number of storage-buffer bindings (bindings 0..n-1 of
 *                  descriptor set 0).
 * push_const_size  size in bytes of the compute-stage push-constant
 *                  range, or 0 for none.
 * out              receives the pipeline, its layouts, a descriptor pool
 *                  with one set, and a persistent command buffer.
 *
 * Returns 0 on success, -1 on failure. On failure every object created so
 * far is destroyed and *out is zeroed, so nothing leaks and a later
 * v3d_runner_destroy_pipeline() on it is a no-op.
 */
int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
                               uint32_t n_ssbos, uint32_t push_const_size,
                               v3d_pipeline *out)
{
    if (!r || !spv_path || !out)
        return -1;

    memset(out, 0, sizeof(*out));
    out->n_ssbos = n_ssbos;
    out->push_const_size = push_const_size;

    /* Descriptor set layout: n_ssbos SSBO bindings, compute-only. */
    VkDescriptorSetLayoutBinding *binds = calloc(n_ssbos, sizeof(*binds));
    if (!binds)
        goto fail;
    for (uint32_t i = 0; i < n_ssbos; i++) {
        binds[i] = (VkDescriptorSetLayoutBinding){
            .binding = i,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .descriptorCount = 1,
            .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        };
    }
    VkDescriptorSetLayoutCreateInfo dslci = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
        .bindingCount = n_ssbos,
        .pBindings = binds,
    };
    VkResult vr = vkCreateDescriptorSetLayout(r->device, &dslci, NULL,
                                              &out->ds_layout);
    free(binds);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateDescriptorSetLayout = %d\n", vr);
        goto fail;
    }

    /* Pipeline layout: the one set layout plus an optional push-constant
     * range covering [0, push_const_size). */
    VkPushConstantRange pcr = {
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .offset = 0,
        .size = push_const_size,
    };
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        .setLayoutCount = 1,
        .pSetLayouts = &out->ds_layout,
        .pushConstantRangeCount = push_const_size ? 1 : 0,
        .pPushConstantRanges = push_const_size ? &pcr : NULL,
    };
    V3D_PIPE_TRY(vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout),
                 fail_ds_layout);

    /* Shader module: only needed while the pipeline is being created. */
    size_t spv_size = 0;
    uint32_t *spv = read_spv(spv_path, &spv_size);
    if (!spv)
        goto fail_layout;
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_size,
        .pCode = spv,
    };
    VkShaderModule shader;
    vr = vkCreateShaderModule(r->device, &smci, NULL, &shader);
    free(spv);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateShaderModule(%s) = %d\n", spv_path, vr);
        goto fail_layout;
    }

    VkComputePipelineCreateInfo cpci = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .stage = {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = shader,
            .pName = "main",
        },
        .layout = out->layout,
    };
    vr = vkCreateComputePipelines(r->device, VK_NULL_HANDLE, 1, &cpci, NULL,
                                  &out->pipeline);
    vkDestroyShaderModule(r->device, shader, NULL);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateComputePipelines = %d\n", vr);
        goto fail_layout;
    }

    /* Single descriptor pool + set for this pipeline. */
    VkDescriptorPoolSize ps = {
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .descriptorCount = n_ssbos,
    };
    VkDescriptorPoolCreateInfo dpci = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
        .maxSets = 1,
        .poolSizeCount = 1,
        .pPoolSizes = &ps,
    };
    V3D_PIPE_TRY(vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool),
                 fail_pipeline);
    VkDescriptorSetAllocateInfo dsai = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .descriptorPool = out->pool,
        .descriptorSetCount = 1,
        .pSetLayouts = &out->ds_layout,
    };
    V3D_PIPE_TRY(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set),
                 fail_desc_pool);

    /* Persistent command buffer — pool was created with
     * RESET_COMMAND_BUFFER_BIT (see v3d_runner_create) so dispatch
     * sites can call vkResetCommandBuffer on this same cb instead
     * of paying vkAllocateCommandBuffers per call. */
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = r->pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    V3D_PIPE_TRY(vkAllocateCommandBuffers(r->device, &cbai, &out->cb),
                 fail_desc_pool);
    return 0;

    /* Staged unwind in reverse creation order. Destroying the descriptor
     * pool also frees the set allocated from it. */
fail_desc_pool:
    vkDestroyDescriptorPool(r->device, out->pool, NULL);
fail_pipeline:
    vkDestroyPipeline(r->device, out->pipeline, NULL);
fail_layout:
    vkDestroyPipelineLayout(r->device, out->layout, NULL);
fail_ds_layout:
    vkDestroyDescriptorSetLayout(r->device, out->ds_layout, NULL);
fail:
    memset(out, 0, sizeof(*out));
    return -1;
}
#undef V3D_PIPE_TRY
/*
 * Destroy everything a pipeline owns: its command buffer (if any), the
 * pipeline, pipeline layout, descriptor pool (which also frees its set)
 * and descriptor set layout, then zero *p. A NULL pointer or a pipeline
 * whose handle is VK_NULL_HANDLE is a no-op.
 */
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
{
    if (p == NULL)
        return;
    if (p->pipeline == VK_NULL_HANDLE)
        return;

    /* The command buffer goes back to the runner's shared command pool. */
    if (p->cb != VK_NULL_HANDLE)
        vkFreeCommandBuffers(r->device, r->pool, 1, &p->cb);

    vkDestroyPipeline(r->device, p->pipeline, NULL);
    vkDestroyPipelineLayout(r->device, p->layout, NULL);
    vkDestroyDescriptorPool(r->device, p->pool, NULL);
    vkDestroyDescriptorSetLayout(r->device, p->ds_layout, NULL);

    memset(p, 0, sizeof(*p));
}
/*
 * Reset the pipeline's persistent command buffer so it can be recorded
 * again. Returns 0 on success, -1 when `p` is NULL, has no command
 * buffer, or vkResetCommandBuffer fails. `r` is unused.
 */
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p)
{
    (void) r;

    if (p == NULL || p->cb == VK_NULL_HANDLE)
        return -1;
    if (vkResetCommandBuffer(p->cb, 0) != VK_SUCCESS)
        return -1;
    return 0;
}
/*
 * Point the pipeline's descriptor set at `n` buffers: bufs[i] is bound,
 * whole (offset 0, range = its size), to storage-buffer binding i.
 *
 * Returns 0 on success, -1 when `n` differs from the pipeline's n_ssbos
 * (with a diagnostic) or when the temporary arrays cannot be allocated.
 */
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
                            const v3d_buffer *bufs, uint32_t n)
{
    if (n != p->n_ssbos) {
        fprintf(stderr, "bind_buffers: n=%u != pipeline n_ssbos=%u\n",
                n, p->n_ssbos);
        return -1;
    }

    VkDescriptorBufferInfo *infos = calloc(n, sizeof(*infos));
    VkWriteDescriptorSet *writes = calloc(n, sizeof(*writes));
    if (infos == NULL || writes == NULL) {
        free(infos);
        free(writes);
        return -1;
    }

    /* Both arrays come back zeroed from calloc, so only the fields that
     * matter need to be filled in. */
    for (uint32_t slot = 0; slot < n; slot++) {
        VkDescriptorBufferInfo *info = &infos[slot];
        VkWriteDescriptorSet *write = &writes[slot];

        info->buffer = bufs[slot].buffer;
        info->offset = 0;
        info->range = bufs[slot].size;

        write->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        write->dstSet = p->desc_set;
        write->dstBinding = slot;
        write->descriptorCount = 1;
        write->descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        write->pBufferInfo = info;
    }

    vkUpdateDescriptorSets(r->device, n, writes, 0, NULL);

    free(infos);
    free(writes);
    return 0;
}
/* ---- Command buffers ------------------------------------------- */
/*
 * Allocate one primary command buffer from the runner's command pool.
 * Returns the new command buffer, or VK_NULL_HANDLE if allocation fails.
 */
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r)
{
    VkCommandBuffer fresh = VK_NULL_HANDLE;
    VkCommandBufferAllocateInfo request = {0};

    request.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    request.commandPool = r->pool;
    request.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    request.commandBufferCount = 1;

    const VkResult status = vkAllocateCommandBuffers(r->device, &request,
                                                     &fresh);
    return status == VK_SUCCESS ? fresh : VK_NULL_HANDLE;
}
/*
 * Submit one command buffer to the runner's queue and block until the
 * queue is idle. No fence or semaphores are used. Returns 0 on success,
 * -1 (with a diagnostic on stderr) if the submit or the wait fails.
 */
int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb)
{
    VkSubmitInfo si = {0};

    si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    si.commandBufferCount = 1;
    si.pCommandBuffers = &cb;

    /* Synchronous dispatch: the queue-wide wait doubles as the
     * completion signal. */
    CHK(vkQueueSubmit(r->queue, 1, &si, VK_NULL_HANDLE));
    CHK(vkQueueWaitIdle(r->queue));
    return 0;
}