Files
daedalus-fourier/tests/bench_vulkan_dispatch.c
marfrit dcbbc77038 Path B pivot + Phase 0-3 closed with first baseline numbers
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.

Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.

Phases closed:
  Phase 0 — substrate audit; Path A blocked, Path B open;
            codec-back-end-fits-QPU finding (docs/phase0.md)
  Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
            publish-before-measure R = M2/M3 decision rules
            (docs/phase1.md)
  Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
            under external/ffmpeg-snapshot/ (PROVENANCE.md pins
            commit f46e514 + per-file SHA-256s) (docs/phase2.md)
  Phase 3 — real baseline measurements on hertz (docs/phase3.md):
              M1 bit-exact            100.0000 % (10000/10000)
              M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
              M5a empty Vulkan submit 22.66 us
              M5b 1-WG noop dispatch  55.60 us
              M5 delta                32.95 us/dispatch
            => per-dispatch overhead is ~455x per-NEON-block cost;
               Phase 4 must batch at frame level or close to it.

Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.

Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 11:30:12 +00:00

280 lines
9.3 KiB
C

/*
* Phase 3 — Vulkan compute dispatch-overhead microbench (M5).
*
* Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
* v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
* noop compute shader. Establishes the floor below which kernel
* batching is mandatory.
*
* Two measurements:
* M5a: empty command-buffer submit (no dispatch at all)
* M5b: 1-workgroup dispatch of an empty shader
*
* The delta M5b - M5a isolates the cost of executing one
* vkCmdBindPipeline + vkCmdDispatch pair from the fixed
* per-vkQueueSubmit cost.
*
* Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
* Run: ./bench_vulkan_dispatch [--iters N] [--spv PATH]
*
* License: BSD-2-Clause (daedalus-fourier).
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
/*
 * CHK(call): evaluate a Vulkan call exactly once and abort on failure.
 *
 * Any result other than VK_SUCCESS is reported on stderr (numeric result,
 * source file and line, and the stringified call expression) and the
 * process terminates with exit status 1. Wrapped in do { } while (0) so
 * it behaves as a single statement.
 */
#define CHK(call) \
    do { \
        const VkResult chk_status_ = (call); \
        if (chk_status_ != VK_SUCCESS) { \
            fprintf(stderr, "vulkan error %d at %s:%d (%s)\n", \
                    chk_status_, __FILE__, __LINE__, #call); \
            exit(1); \
        } \
    } while (0)
/*
 * Read CLOCK_MONOTONIC_RAW and return the reading as seconds in a double
 * (whole seconds plus the nanosecond part scaled by 1e-9).
 *
 * If clock_gettime() fails, the error is reported with perror() and the
 * process exits with status 1, so a timestamp is never computed from an
 * unset timespec.
 */
static double now_seconds(void)
{
    struct timespec ts = {0};

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
        perror("clock_gettime");
        exit(1);
    }
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}
/*
 * Load the whole file at `path` into a freshly malloc'd uint32_t buffer.
 *
 * path:     file to read; opened in binary mode.
 * out_size: receives the file size in bytes.
 *
 * Returns the buffer, which the caller owns and releases with free().
 * Never returns NULL: on any failure (open, seek, size that is zero or
 * not a multiple of 4, allocation, short read) a diagnostic is written
 * to stderr and the process exits with status 1.
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *f = fopen(path, "rb");
    if (!f) {
        perror(path);
        exit(1);
    }

    /* Measure the file by seeking to its end, then rewind for the read. */
    if (fseek(f, 0, SEEK_END) != 0) {
        perror(path);
        exit(1);
    }
    long sz = ftell(f);
    if (sz < 0 || fseek(f, 0, SEEK_SET) != 0) {
        perror(path);
        exit(1);
    }

    /* The data is returned as 32-bit words, so the byte count must be a
     * non-zero multiple of 4. */
    if (sz == 0 || (sz & 3)) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
        exit(1);
    }

    size_t nbytes = (size_t)sz;
    uint32_t *buf = malloc(nbytes);
    if (!buf) {
        fprintf(stderr, "%s: cannot allocate %zu bytes\n", path, nbytes);
        exit(1);
    }

    if (fread(buf, 1, nbytes, f) != nbytes) {
        /* errno is only meaningful when the stream reports an error; a
         * plain short read (EOF) gets its own message. */
        if (ferror(f)) {
            perror(path);
        } else {
            fprintf(stderr, "%s: short read (expected %zu bytes)\n",
                    path, nbytes);
        }
        exit(1);
    }

    fclose(f);
    *out_size = nbytes;
    return buf;
}
int main(int argc, char **argv)
{
int iters = 100000;
const char *spv_path = "noop.spv";
static struct option opts[] = {
{"iters", required_argument, 0, 'i'},
{"spv", required_argument, 0, 's'},
{"help", no_argument, 0, 'h'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
switch (c) {
case 'i': iters = atoi(optarg); break;
case 's': spv_path = optarg; break;
case 'h':
fprintf(stderr,
"Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
return 0;
default:
return 2;
}
}
/* ---- Instance ---- */
VkApplicationInfo app = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "daedalus-fourier-bench",
.apiVersion = VK_API_VERSION_1_3,
};
VkInstanceCreateInfo ici = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app,
};
VkInstance instance;
CHK(vkCreateInstance(&ici, NULL, &instance));
/* ---- Pick V3D physical device (skip llvmpipe) ---- */
uint32_t pd_count = 0;
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds));
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
VkPhysicalDevice phys = VK_NULL_HANDLE;
VkPhysicalDeviceProperties props = {0};
for (uint32_t i = 0; i < pd_count; i++) {
vkGetPhysicalDeviceProperties(pds[i], &props);
printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
i, props.deviceName,
VK_VERSION_MAJOR(props.apiVersion),
VK_VERSION_MINOR(props.apiVersion),
VK_VERSION_PATCH(props.apiVersion),
props.vendorID);
if (strstr(props.deviceName, "V3D") != NULL) {
phys = pds[i];
}
}
if (phys == VK_NULL_HANDLE) {
fprintf(stderr, "no V3D device found; bailing.\n");
return 1;
}
vkGetPhysicalDeviceProperties(phys, &props);
printf("selected: %s\n", props.deviceName);
free(pds);
/* ---- Compute queue family ---- */
uint32_t qfc = 0;
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp));
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
uint32_t qfi = (uint32_t) -1;
for (uint32_t i = 0; i < qfc; i++) {
if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
qfi = i; break;
}
}
if (qfi == (uint32_t) -1) {
fprintf(stderr, "no compute queue family\n");
return 1;
}
free(qfp);
/* ---- Logical device ---- */
float qprio = 1.0f;
VkDeviceQueueCreateInfo dqci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = qfi,
.queueCount = 1,
.pQueuePriorities = &qprio,
};
VkDeviceCreateInfo dci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &dqci,
};
VkDevice dev;
CHK(vkCreateDevice(phys, &dci, NULL, &dev));
VkQueue queue;
vkGetDeviceQueue(dev, qfi, 0, &queue);
/* ---- Command pool + buffers ---- */
VkCommandPoolCreateInfo cpci = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.queueFamilyIndex = qfi,
};
VkCommandPool pool;
CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));
VkCommandBuffer cb_empty, cb_dispatch;
VkCommandBufferAllocateInfo cbai = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.commandPool = pool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = 1,
};
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));
/* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
VkPipelineLayoutCreateInfo plci = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
};
VkPipelineLayout playout;
CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));
/* ---- Compute pipeline from noop SPIR-V ---- */
size_t spv_size = 0;
uint32_t *spv = read_spv(spv_path, &spv_size);
VkShaderModuleCreateInfo smci = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
.codeSize = spv_size,
.pCode = spv,
};
VkShaderModule shader;
CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
free(spv);
VkComputePipelineCreateInfo cpci2 = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = shader,
.pName = "main",
},
.layout = playout,
};
VkPipeline pipe;
CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));
/* ---- Record both command buffers once, reuse for every iteration ---- */
VkCommandBufferBeginInfo cbbi = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
};
CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
CHK(vkEndCommandBuffer(cb_empty));
CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
vkCmdDispatch(cb_dispatch, 1, 1, 1);
CHK(vkEndCommandBuffer(cb_dispatch));
VkSubmitInfo si_empty = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.commandBufferCount = 1, .pCommandBuffers = &cb_empty,
};
VkSubmitInfo si_disp = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
};
/* ---- Warm-up ---- */
for (int i = 0; i < 100; i++) {
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
CHK(vkQueueWaitIdle(queue));
}
/* ---- M5a: empty CB submit+wait ---- */
double t0 = now_seconds();
for (int i = 0; i < iters; i++) {
CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
CHK(vkQueueWaitIdle(queue));
}
double t1 = now_seconds();
double m5a_per = (t1 - t0) / iters * 1e6; /* µs */
/* ---- M5b: 1-WG noop dispatch submit+wait ---- */
double t2 = now_seconds();
for (int i = 0; i < iters; i++) {
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
CHK(vkQueueWaitIdle(queue));
}
double t3 = now_seconds();
double m5b_per = (t3 - t2) / iters * 1e6; /* µs */
printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
printf(" iters per measurement: %d\n", iters);
printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per);
printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
m5b_per - m5a_per);
printf("\n");
printf(" Implication for kernel batching:\n");
printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
printf(" a single-block dispatch costs %.0fx more in overhead\n",
m5b_per * 1e3 / 100.0);
printf(" -> batch at least %.0f blocks per dispatch to break even.\n",
m5b_per * 1e3 / 100.0);
/* ---- Tear down (minimal — process exit handles the rest) ---- */
vkDestroyPipeline(dev, pipe, NULL);
vkDestroyShaderModule(dev, shader, NULL);
vkDestroyPipelineLayout(dev, playout, NULL);
vkDestroyCommandPool(dev, pool, NULL);
vkDestroyDevice(dev, NULL);
vkDestroyInstance(instance, NULL);
return 0;
}