dcbbc77038
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.
Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.
Phases closed:
Phase 0 — substrate audit; Path A blocked, Path B open;
codec-back-end-fits-QPU finding (docs/phase0.md)
Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
publish-before-measure R = M2/M3 decision rules
(docs/phase1.md)
Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
under external/ffmpeg-snapshot/ (PROVENANCE.md pins
commit f46e514 + per-file SHA-256s) (docs/phase2.md)
Phase 3 — real baseline measurements on hertz (docs/phase3.md):
M1 bit-exact 100.0000 % (10000/10000)
M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block)
M5a empty Vulkan submit 22.66 us
M5b 1-WG noop dispatch 55.60 us
M5 delta 32.95 us/dispatch
=> per-dispatch overhead is ~455x per-NEON-block cost;
Phase 4 must batch at frame level or close to it.
Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.
Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
280 lines
9.3 KiB
C
280 lines
9.3 KiB
C
/*
 * Phase 3 — Vulkan compute dispatch-overhead microbench (M5).
 *
 * Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
 * v3dv: the vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
 * noop compute shader. Any kernel whose useful work per dispatch is
 * smaller than this floor must be batched.
 *
 * Two measurements:
 *   M5a: empty command-buffer submit (no dispatch at all)
 *   M5b: 1-workgroup dispatch of an empty shader
 *
 * The delta M5b - M5a isolates the cost added by one pipeline bind
 * plus one vkCmdDispatch from the per-vkQueueSubmit cost.
 *
 * Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
 * Run:   ./bench_vulkan_dispatch [--iters N] [--spv PATH]
 *
 * License: BSD-2-Clause (daedalus-fourier).
 */
|
|
#define _POSIX_C_SOURCE 200809L
|
|
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <vulkan/vulkan.h>
|
|
|
|
/*
 * CHK(call): evaluate a Vulkan call exactly once. If the resulting
 * VkResult is anything other than VK_SUCCESS, print the numeric code,
 * the source location and the text of the call to stderr, then exit(1).
 */
#define CHK(call)                                                   \
    do {                                                            \
        VkResult chk_status_ = (call);                              \
        if (chk_status_ != VK_SUCCESS) {                            \
            fprintf(stderr, "vulkan error %d at %s:%d (%s)\n",      \
                    chk_status_, __FILE__, __LINE__, #call);        \
            exit(1);                                                \
        }                                                           \
    } while (0)
|
|
|
|
/*
 * Read CLOCK_MONOTONIC_RAW and return it as fractional seconds.
 * The return value of clock_gettime() is not inspected.
 */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole_seconds = stamp.tv_sec;
    const double fraction = stamp.tv_nsec * 1e-9;
    return whole_seconds + fraction;
}
|
|
|
|
/*
 * Load a SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 * path      file to read (opened in binary mode).
 * out_size  receives the file length in bytes on success.
 *
 * Returns the buffer; the caller owns it and must free() it.
 *
 * This function never returns on failure: an open, seek, allocation or
 * read error, or a file whose size is not a positive multiple of 4
 * bytes (SPIR-V is a stream of 32-bit words), prints a diagnostic to
 * stderr and calls exit(1).
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *f = fopen(path, "rb");
    if (!f) { perror(path); exit(1); }

    if (fseek(f, 0, SEEK_END) != 0) { perror(path); exit(1); }

    /* ftell() reports failure as -1, which the size check also rejects. */
    long sz = ftell(f);
    if (sz <= 0 || (sz & 3)) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
        exit(1);
    }

    if (fseek(f, 0, SEEK_SET) != 0) { perror(path); exit(1); }

    uint32_t *buf = malloc((size_t)sz);
    if (!buf) {
        fprintf(stderr, "%s: cannot allocate %ld bytes\n", path, sz);
        exit(1);
    }

    if (fread(buf, 1, (size_t)sz, f) != (size_t)sz) {
        /* A short read at EOF does not set errno, so only use perror()
         * when the stream's error indicator is actually set. */
        if (ferror(f)) {
            perror(path);
        } else {
            fprintf(stderr, "%s: short read (expected %ld bytes)\n", path, sz);
        }
        exit(1);
    }
    fclose(f);

    *out_size = (size_t)sz;
    return buf;
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
int iters = 100000;
|
|
const char *spv_path = "noop.spv";
|
|
|
|
static struct option opts[] = {
|
|
{"iters", required_argument, 0, 'i'},
|
|
{"spv", required_argument, 0, 's'},
|
|
{"help", no_argument, 0, 'h'},
|
|
{0,0,0,0}
|
|
};
|
|
for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
|
|
switch (c) {
|
|
case 'i': iters = atoi(optarg); break;
|
|
case 's': spv_path = optarg; break;
|
|
case 'h':
|
|
fprintf(stderr,
|
|
"Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
|
|
return 0;
|
|
default:
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
/* ---- Instance ---- */
|
|
VkApplicationInfo app = {
|
|
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
|
|
.pApplicationName = "daedalus-fourier-bench",
|
|
.apiVersion = VK_API_VERSION_1_3,
|
|
};
|
|
VkInstanceCreateInfo ici = {
|
|
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
|
|
.pApplicationInfo = &app,
|
|
};
|
|
VkInstance instance;
|
|
CHK(vkCreateInstance(&ici, NULL, &instance));
|
|
|
|
/* ---- Pick V3D physical device (skip llvmpipe) ---- */
|
|
uint32_t pd_count = 0;
|
|
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
|
|
VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds));
|
|
CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
|
|
VkPhysicalDevice phys = VK_NULL_HANDLE;
|
|
VkPhysicalDeviceProperties props = {0};
|
|
for (uint32_t i = 0; i < pd_count; i++) {
|
|
vkGetPhysicalDeviceProperties(pds[i], &props);
|
|
printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
|
|
i, props.deviceName,
|
|
VK_VERSION_MAJOR(props.apiVersion),
|
|
VK_VERSION_MINOR(props.apiVersion),
|
|
VK_VERSION_PATCH(props.apiVersion),
|
|
props.vendorID);
|
|
if (strstr(props.deviceName, "V3D") != NULL) {
|
|
phys = pds[i];
|
|
}
|
|
}
|
|
if (phys == VK_NULL_HANDLE) {
|
|
fprintf(stderr, "no V3D device found; bailing.\n");
|
|
return 1;
|
|
}
|
|
vkGetPhysicalDeviceProperties(phys, &props);
|
|
printf("selected: %s\n", props.deviceName);
|
|
free(pds);
|
|
|
|
/* ---- Compute queue family ---- */
|
|
uint32_t qfc = 0;
|
|
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
|
|
VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp));
|
|
vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
|
|
uint32_t qfi = (uint32_t) -1;
|
|
for (uint32_t i = 0; i < qfc; i++) {
|
|
if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
|
|
qfi = i; break;
|
|
}
|
|
}
|
|
if (qfi == (uint32_t) -1) {
|
|
fprintf(stderr, "no compute queue family\n");
|
|
return 1;
|
|
}
|
|
free(qfp);
|
|
|
|
/* ---- Logical device ---- */
|
|
float qprio = 1.0f;
|
|
VkDeviceQueueCreateInfo dqci = {
|
|
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
|
|
.queueFamilyIndex = qfi,
|
|
.queueCount = 1,
|
|
.pQueuePriorities = &qprio,
|
|
};
|
|
VkDeviceCreateInfo dci = {
|
|
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
|
|
.queueCreateInfoCount = 1,
|
|
.pQueueCreateInfos = &dqci,
|
|
};
|
|
VkDevice dev;
|
|
CHK(vkCreateDevice(phys, &dci, NULL, &dev));
|
|
VkQueue queue;
|
|
vkGetDeviceQueue(dev, qfi, 0, &queue);
|
|
|
|
/* ---- Command pool + buffers ---- */
|
|
VkCommandPoolCreateInfo cpci = {
|
|
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
|
|
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
|
|
.queueFamilyIndex = qfi,
|
|
};
|
|
VkCommandPool pool;
|
|
CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));
|
|
|
|
VkCommandBuffer cb_empty, cb_dispatch;
|
|
VkCommandBufferAllocateInfo cbai = {
|
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
|
.commandPool = pool,
|
|
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
|
.commandBufferCount = 1,
|
|
};
|
|
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
|
|
CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));
|
|
|
|
/* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
|
|
VkPipelineLayoutCreateInfo plci = {
|
|
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
|
|
};
|
|
VkPipelineLayout playout;
|
|
CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));
|
|
|
|
/* ---- Compute pipeline from noop SPIR-V ---- */
|
|
size_t spv_size = 0;
|
|
uint32_t *spv = read_spv(spv_path, &spv_size);
|
|
VkShaderModuleCreateInfo smci = {
|
|
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
|
|
.codeSize = spv_size,
|
|
.pCode = spv,
|
|
};
|
|
VkShaderModule shader;
|
|
CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
|
|
free(spv);
|
|
|
|
VkComputePipelineCreateInfo cpci2 = {
|
|
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
|
|
.stage = {
|
|
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
|
|
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
|
|
.module = shader,
|
|
.pName = "main",
|
|
},
|
|
.layout = playout,
|
|
};
|
|
VkPipeline pipe;
|
|
CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));
|
|
|
|
/* ---- Record both command buffers once, reuse for every iteration ---- */
|
|
VkCommandBufferBeginInfo cbbi = {
|
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
|
|
};
|
|
|
|
CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
|
|
CHK(vkEndCommandBuffer(cb_empty));
|
|
|
|
CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
|
|
vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
|
|
vkCmdDispatch(cb_dispatch, 1, 1, 1);
|
|
CHK(vkEndCommandBuffer(cb_dispatch));
|
|
|
|
VkSubmitInfo si_empty = {
|
|
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
|
|
.commandBufferCount = 1, .pCommandBuffers = &cb_empty,
|
|
};
|
|
VkSubmitInfo si_disp = {
|
|
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
|
|
.commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
|
|
};
|
|
|
|
/* ---- Warm-up ---- */
|
|
for (int i = 0; i < 100; i++) {
|
|
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
|
|
CHK(vkQueueWaitIdle(queue));
|
|
}
|
|
|
|
/* ---- M5a: empty CB submit+wait ---- */
|
|
double t0 = now_seconds();
|
|
for (int i = 0; i < iters; i++) {
|
|
CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
|
|
CHK(vkQueueWaitIdle(queue));
|
|
}
|
|
double t1 = now_seconds();
|
|
double m5a_per = (t1 - t0) / iters * 1e6; /* µs */
|
|
|
|
/* ---- M5b: 1-WG noop dispatch submit+wait ---- */
|
|
double t2 = now_seconds();
|
|
for (int i = 0; i < iters; i++) {
|
|
CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
|
|
CHK(vkQueueWaitIdle(queue));
|
|
}
|
|
double t3 = now_seconds();
|
|
double m5b_per = (t3 - t2) / iters * 1e6; /* µs */
|
|
|
|
printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
|
|
printf(" iters per measurement: %d\n", iters);
|
|
printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per);
|
|
printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
|
|
printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
|
|
m5b_per - m5a_per);
|
|
printf("\n");
|
|
printf(" Implication for kernel batching:\n");
|
|
printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
|
|
printf(" a single-block dispatch costs %.0fx more in overhead\n",
|
|
m5b_per * 1e3 / 100.0);
|
|
printf(" -> batch at least %.0f blocks per dispatch to break even.\n",
|
|
m5b_per * 1e3 / 100.0);
|
|
|
|
/* ---- Tear down (minimal — process exit handles the rest) ---- */
|
|
vkDestroyPipeline(dev, pipe, NULL);
|
|
vkDestroyShaderModule(dev, shader, NULL);
|
|
vkDestroyPipelineLayout(dev, playout, NULL);
|
|
vkDestroyCommandPool(dev, pool, NULL);
|
|
vkDestroyDevice(dev, NULL);
|
|
vkDestroyInstance(instance, NULL);
|
|
return 0;
|
|
}
|