/* * Phase 3 — Vulkan compute dispatch-overhead microbench (M5). * * Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa * v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a * noop compute shader. Establishes the floor below which kernel * batching is mandatory. * * Two measurements: * M5a: empty command-buffer submit (no dispatch at all) * M5b: 1-workgroup dispatch of an empty shader * * The delta M5b - M5a isolates the per-vkCmdDispatch cost from * the per-vkQueueSubmit cost. * * Build: cmake -DDAEDALUS_BUILD_VULKAN=ON .. * Run: ./bench_vulkan_dispatch [--iters N] [--spv PATH] * * License: BSD-2-Clause (daedalus-fourier). */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include #define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \ fprintf(stderr, "vulkan error %d at %s:%d (%s)\n", r__, __FILE__, __LINE__, #call); \ exit(1); } } while (0) static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } static uint32_t *read_spv(const char *path, size_t *out_size) { FILE *f = fopen(path, "rb"); if (!f) { perror(path); exit(1); } fseek(f, 0, SEEK_END); long sz = ftell(f); fseek(f, 0, SEEK_SET); if (sz <= 0 || (sz & 3)) { fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz); exit(1); } uint32_t *buf = malloc(sz); if (!buf || fread(buf, 1, sz, f) != (size_t)sz) { perror("read"); exit(1); } fclose(f); *out_size = sz; return buf; } int main(int argc, char **argv) { int iters = 100000; const char *spv_path = "noop.spv"; static struct option opts[] = { {"iters", required_argument, 0, 'i'}, {"spv", required_argument, 0, 's'}, {"help", no_argument, 0, 'h'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) { switch (c) { case 'i': iters = atoi(optarg); break; case 's': spv_path = optarg; break; case 'h': fprintf(stderr, "Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]); return 0; default: return 2; } } /* ---- Instance ---- */ VkApplicationInfo app = { .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, .pApplicationName = "daedalus-fourier-bench", .apiVersion = VK_API_VERSION_1_3, }; VkInstanceCreateInfo ici = { .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, .pApplicationInfo = &app, }; VkInstance instance; CHK(vkCreateInstance(&ici, NULL, &instance)); /* ---- Pick V3D physical device (skip llvmpipe) ---- */ uint32_t pd_count = 0; CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL)); VkPhysicalDevice *pds = malloc(pd_count * sizeof(*pds)); CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds)); VkPhysicalDevice phys = VK_NULL_HANDLE; VkPhysicalDeviceProperties props = {0}; for (uint32_t i = 0; i < pd_count; i++) { vkGetPhysicalDeviceProperties(pds[i], &props); printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n", i, props.deviceName, VK_VERSION_MAJOR(props.apiVersion), VK_VERSION_MINOR(props.apiVersion), VK_VERSION_PATCH(props.apiVersion), props.vendorID); if (strstr(props.deviceName, "V3D") != NULL) { phys = pds[i]; } } if (phys == VK_NULL_HANDLE) { fprintf(stderr, "no V3D device found; bailing.\n"); return 1; } vkGetPhysicalDeviceProperties(phys, &props); printf("selected: %s\n", props.deviceName); free(pds); /* ---- Compute queue family ---- */ uint32_t qfc = 0; vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL); VkQueueFamilyProperties *qfp = malloc(qfc * sizeof(*qfp)); vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp); uint32_t qfi = (uint32_t) -1; for (uint32_t i = 0; i < qfc; i++) { if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) { qfi = i; break; } } if (qfi == (uint32_t) -1) { fprintf(stderr, "no compute queue family\n"); return 1; } free(qfp); /* ---- Logical device ---- */ float qprio = 1.0f; VkDeviceQueueCreateInfo dqci = { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, .queueFamilyIndex = qfi, .queueCount = 1, .pQueuePriorities = &qprio, }; VkDeviceCreateInfo dci = { .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .queueCreateInfoCount = 1, .pQueueCreateInfos = &dqci, }; VkDevice dev; CHK(vkCreateDevice(phys, &dci, NULL, &dev)); VkQueue queue; vkGetDeviceQueue(dev, qfi, 0, &queue); /* ---- Command pool + buffers ---- */ VkCommandPoolCreateInfo cpci = { .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, .queueFamilyIndex = qfi, }; VkCommandPool pool; CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool)); VkCommandBuffer cb_empty, cb_dispatch; VkCommandBufferAllocateInfo cbai = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .commandPool = pool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, .commandBufferCount = 1, }; CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty)); CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch)); /* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */ VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, }; VkPipelineLayout playout; CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout)); /* ---- Compute pipeline from noop SPIR-V ---- */ size_t spv_size = 0; uint32_t *spv = read_spv(spv_path, &spv_size); VkShaderModuleCreateInfo smci = { .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .codeSize = spv_size, .pCode = spv, }; VkShaderModule shader; CHK(vkCreateShaderModule(dev, &smci, NULL, &shader)); free(spv); VkComputePipelineCreateInfo cpci2 = { .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = shader, .pName = "main", }, .layout = playout, }; VkPipeline pipe; CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe)); /* ---- Record both command buffers once, reuse for every iteration ---- */ VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, }; CHK(vkBeginCommandBuffer(cb_empty, &cbbi)); CHK(vkEndCommandBuffer(cb_empty)); CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi)); vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe); vkCmdDispatch(cb_dispatch, 1, 1, 1); CHK(vkEndCommandBuffer(cb_dispatch)); VkSubmitInfo si_empty = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, .pCommandBuffers = &cb_empty, }; VkSubmitInfo si_disp = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, .pCommandBuffers = &cb_dispatch, }; /* ---- Warm-up ---- */ for (int i = 0; i < 100; i++) { CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE)); CHK(vkQueueWaitIdle(queue)); } /* ---- M5a: empty CB submit+wait ---- */ double t0 = now_seconds(); for (int i = 0; i < iters; i++) { CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE)); CHK(vkQueueWaitIdle(queue)); } double t1 = now_seconds(); double m5a_per = (t1 - t0) / iters * 1e6; /* µs */ /* ---- M5b: 1-WG noop dispatch submit+wait ---- */ double t2 = now_seconds(); for (int i = 0; i < iters; i++) { CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE)); CHK(vkQueueWaitIdle(queue)); } double t3 = now_seconds(); double m5b_per = (t3 - t2) / iters * 1e6; /* µs */ printf("\n=== M5: Vulkan compute dispatch overhead ===\n"); printf(" iters per measurement: %d\n", iters); printf(" M5a empty CB submit+wait: %.2f µs/op\n", m5a_per); printf(" M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per); printf(" delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n", m5b_per - m5a_per); printf("\n"); printf(" Implication for kernel batching:\n"); printf(" if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n"); printf(" a single-block dispatch costs %.0fx more in overhead\n", m5b_per * 1e3 / 100.0); printf(" -> batch at least %.0f blocks per dispatch to break even.\n", m5b_per * 1e3 / 100.0); /* ---- Tear down (minimal — process exit handles the rest) ---- */ vkDestroyPipeline(dev, pipe, NULL); vkDestroyShaderModule(dev, shader, NULL); vkDestroyPipelineLayout(dev, playout, NULL); vkDestroyCommandPool(dev, pool, NULL); vkDestroyDevice(dev, NULL); vkDestroyInstance(instance, NULL); return 0; }