98553278dd
Phase 2 of the QPU-default substrate campaign — eliminate
vkAllocateCommandBuffers from the dispatch hot path.
Attaches a VkCommandBuffer to each v3d_pipeline, allocated once in
v3d_runner_create_pipeline() and freed in destroy_pipeline(). The
five dispatch_*_qpu sites switch from v3d_runner_alloc_cmdbuf() to
v3d_runner_pipeline_cmdbuf_reset() — vkResetCommandBuffer is O(1)
versus the driver-side allocation walk. Pool was already created
with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT so reset is
permitted.
Microbench (hertz, Pi 5, kernel 6.18.29, V3D 7.1):
before (task 160 pool only):
steady-state p50: 76.44 us
steady-state mean: 77.95 us
after (task 160 pool + task 161 persistent cb):
steady-state p50: 54.56 us
steady-state mean: 56.00 us
-> 28% per-dispatch reduction
The remaining ~54 us steady-state is dominated by vkQueueWaitIdle +
shader execution + the two memcpy(in/out) on the dst buffer — task
162 (dmabuf import for dst) targets the memcpy half.
test_api_idct stays bit-exact across CPU/QPU/AUTO substrates.
Refs daedalus-fourier task #161.
142 lines
5.5 KiB
C
142 lines
5.5 KiB
C
/*
 * v3d_runner — minimal Vulkan compute plumbing for V3D 7.1 on Pi 5.
 *
 * Factored out of tests/bench_vulkan_dispatch.c so successive kernel
 * benches can reuse the device/queue/buffer/pipeline machinery
 * without copy-paste.  Kept deliberately small and concrete — no
 * generality beyond what daedalus-fourier needs.
 *
 * License: BSD-2-Clause.
 */
|
|
#ifndef DAEDALUS_V3D_RUNNER_H
|
|
#define DAEDALUS_V3D_RUNNER_H
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <vulkan/vulkan.h>
|
|
|
|
/* Opaque runner handle.  Only this forward declaration appears in the
 * header, so the struct layout is not visible to includers; they hold a
 * v3d_runner * and pass it to the v3d_runner_* functions declared below. */
typedef struct v3d_runner v3d_runner;
|
|
|
|
/* Host-visible SSBO. .mapped is a CPU-side pointer to .size bytes. */
|
|
typedef struct {
|
|
VkBuffer buffer;
|
|
VkDeviceMemory memory;
|
|
void *mapped;
|
|
size_t size;
|
|
} v3d_buffer;
|
|
|
|
/* Compute pipeline + its descriptor set (one set per pipeline). */
|
|
typedef struct {
|
|
VkPipeline pipeline;
|
|
VkPipelineLayout layout;
|
|
VkDescriptorSetLayout ds_layout;
|
|
VkDescriptorPool pool;
|
|
VkDescriptorSet desc_set;
|
|
uint32_t n_ssbos;
|
|
uint32_t push_const_size;
|
|
/* Persistent command buffer. Allocated at create-pipeline time;
|
|
* dispatch sites use v3d_runner_pipeline_cmdbuf_reset() to
|
|
* vkResetCommandBuffer instead of paying vkAllocateCommandBuffers
|
|
* per dispatch. Pool flagged RESET_COMMAND_BUFFER_BIT so reset
|
|
* is permitted. */
|
|
VkCommandBuffer cb;
|
|
} v3d_pipeline;
|
|
|
|
/*
|
|
* Create runner: Vulkan instance, V3D physical device, logical
|
|
* device with storageBuffer{8,16}BitAccess features enabled,
|
|
* compute queue, command pool.
|
|
*
|
|
* Returns NULL on failure (writes errors to stderr).
|
|
*/
|
|
v3d_runner *v3d_runner_create(void);
|
|
void v3d_runner_destroy(v3d_runner *r);
|
|
|
|
/* Expose a few internals for code that wants direct vkCmd*. */
|
|
VkDevice v3d_runner_device(v3d_runner *r);
|
|
VkQueue v3d_runner_queue(v3d_runner *r);
|
|
uint32_t v3d_runner_queue_family(v3d_runner *r);
|
|
VkCommandPool v3d_runner_cmd_pool(v3d_runner *r);
|
|
const char *v3d_runner_device_name(v3d_runner *r);
|
|
|
|
/* Storage buffer, HOST_VISIBLE | HOST_COHERENT, mapped on the
|
|
* host side. The mapping persists for the lifetime of the buffer.
|
|
*
|
|
* Returns 0 on success, non-zero on failure.
|
|
*
|
|
* NOTE: prefer v3d_runner_acquire_buffer() on the dispatch hot path —
|
|
* create_buffer/destroy_buffer go straight to vkAllocateMemory each
|
|
* call, which on V3D7's Mesa stack costs ~10-50us. The acquire/
|
|
* release pair pulls from a freelist and pays vkAllocateMemory only
|
|
* on a cache miss.
|
|
*/
|
|
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
|
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
|
|
|
/*
|
|
* Pooled buffer acquisition. Returns a v3d_buffer whose .size is the
|
|
* smallest power-of-2 >= the requested size (so callers can pool
|
|
* across similar-sized requests). Backed by HOST_VISIBLE |
|
|
* HOST_COHERENT memory; mapped pointer is valid.
|
|
*
|
|
* On cache hit: zero-cost reuse of a previously-released buffer.
|
|
* On miss: falls through to v3d_runner_create_buffer(). Release with
|
|
* v3d_runner_release_buffer(); pool drains in v3d_runner_destroy().
|
|
*
|
|
* Lifetime contract: the returned buffer's .mapped contents are
|
|
* UNINITIALISED — the previous user's data may still be present.
|
|
* Callers that need a clean buffer must memset themselves. This is
|
|
* deliberate; the dispatch hot paths immediately overwrite the
|
|
* buffer with new coefficients / meta anyway.
|
|
*
|
|
* Thread-safety: NOT thread-safe. A daedalus_ctx is single-threaded
|
|
* by API contract; the pool inherits that constraint.
|
|
*/
|
|
int v3d_runner_acquire_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
|
void v3d_runner_release_buffer(v3d_runner *r, v3d_buffer *buf);
|
|
|
|
/* Pool diagnostics: total allocated bytes (sum across all size
|
|
* classes, including currently-released entries). Useful for
|
|
* watermark logging. */
|
|
size_t v3d_runner_pool_total_bytes(v3d_runner *r);
|
|
|
|
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
|
* layout exposes `n_ssbos` storage buffer bindings at binding
|
|
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
|
* constant range of `push_const_size` bytes is added if non-zero.
|
|
*
|
|
* The single descriptor set is pre-allocated; bind buffers via
|
|
* v3d_runner_bind_buffers().
|
|
*/
|
|
int v3d_runner_create_pipeline(v3d_runner *r,
|
|
const char *spv_path,
|
|
uint32_t n_ssbos,
|
|
uint32_t push_const_size,
|
|
v3d_pipeline *out);
|
|
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p);
|
|
|
|
/* Bind SSBOs to the pipeline's descriptor set. `bufs` must have
|
|
* exactly `p->n_ssbos` entries, in binding order. Idempotent —
|
|
* rebind freely between dispatches if buffers change.
|
|
*/
|
|
int v3d_runner_bind_buffers(v3d_runner *r,
|
|
v3d_pipeline *p,
|
|
const v3d_buffer *bufs,
|
|
uint32_t n);
|
|
|
|
/* Allocate a primary command buffer from the runner's pool. */
|
|
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
|
|
|
/* Reset @p->cb so it can be re-recorded. Returns 0 on success.
|
|
* Replaces v3d_runner_alloc_cmdbuf() on the dispatch hot path —
|
|
* vkResetCommandBuffer is O(1) vs vkAllocateCommandBuffers' ~1-5us
|
|
* driver cost. */
|
|
int v3d_runner_pipeline_cmdbuf_reset(v3d_runner *r, v3d_pipeline *p);
|
|
|
|
/* Submit `cb` to the queue and wait for completion. The classic
|
|
* timed operation. Returns 0 on success.
|
|
*/
|
|
int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb);
|
|
|
|
#endif /* DAEDALUS_V3D_RUNNER_H */
|