Files
daedalus-fourier/src/v3d_runner.c
T
marfrit d66f22f333 Phase 6 (v1+v4 production) + Phase 7 closure: R = 0.92 ± 0.03 on hertz
First QPU IDCT8 kernel running and bit-exact on V3D 7.1 via Mesa
v3dv compute. Five iterations through a Phase 7→Phase 4' loopback;
production kernel is v4.

New files:
- src/v3d_runner.{c,h}  — reusable Vulkan compute plumbing (instance,
                          V3D device picker, HOST_VISIBLE|COHERENT
                          SSBOs with mmap, compute pipeline from .spv,
                          enables storageBuffer{8,16}BitAccess)
- src/v3d_idct8.comp    — VP9 8x8 DCT_DCT IDCT add, v4 production:
                          256 invocations/WG, 2 blocks/subgroup
                          (no idle lanes), uint8 dst SSBO (race-free
                          per phase5 finding 5), unrolled writes
                          (no chained ternary), oob-flag pattern
                          (barrier-safe per phase5 finding 7)
- tests/bench_v3d_idct.c — M1' bit-exact gate + M2 throughput vs C ref
- docs/phase7.md         — full iteration journey + decision verdict

CMakeLists.txt updated to build the new shader, library, and bench
when DAEDALUS_BUILD_VULKAN=ON.

Iteration record (1920x1088 luma, 32640 blocks/dispatch, N=3):

  ver  change                              R       ns/block
  v1   first-light                         0.230   533
  v2   kill ternary + 2-blocks-per-sg      0.474   258
  v3   per-pass scope oN                   0.481   254  (noise)
  v4   WG 64 -> 256 invocations            0.947   129
  v5   packed uint32 coeff reads           0.938   130  (noise, reverted)
  v4 final N=3                             0.918 +/- 0.033

Bit-exactness 100.0000% across all iterations (10000-block sample
on 128x128, 32640-block sample on 1080p) against both the C
reference (tests/vp9_idct8_ref.c) and the vendored FFmpeg NEON
ff_vp9_idct_idct_8x8_add_neon.

Key learning over the Phase 5 review's prediction model: the
chained ternary was NOT a spill killer on V3D 7.1 (shaderdb
showed 0:0 spills:fills even in v1). The actual lever was
workgroup-size-driven latency hiding — going from 64 to 256
invocations doubled throughput with the same compiled code
(270 inst, 2 threads, 21 max-temps, 0 spills) because the
v3dv scheduler had 4x more in-flight work to overlap TMU
latency.

Verdict per phase1.md decision rules: YELLOW band (0.5 <= R < 1.0)
by a wide margin, near GREEN boundary. Phase 1 YELLOW rule:
add M4 (concurrent CPU+QPU throughput) before honest-close or
continue. M4 is the next measurement, not more shader tuning —
at R = 0.92 with all 4 A76 cores still 100% free for other work,
the question is whether the system aggregate beats pure 4-core
NEON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:09:00 +00:00

436 lines
15 KiB
C

/*
* v3d_runner — implementation. See v3d_runner.h.
*
* License: BSD-2-Clause.
*/
#include "v3d_runner.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * CHK(call): evaluate a Vulkan call exactly once. If the result is anything
 * other than VK_SUCCESS, print the numeric result together with the call
 * site (file, line, stringified call) to stderr and make the ENCLOSING
 * function return -1. Only usable inside functions returning int.
 */
#define CHK(call)                                                          \
    do {                                                                   \
        VkResult chk_result_ = (call);                                     \
        if (chk_result_ != VK_SUCCESS) {                                   \
            fprintf(stderr,                                                \
                    "v3d_runner: vulkan error %d at %s:%d (%s)\n",         \
                    chk_result_, __FILE__, __LINE__, #call);               \
            return -1;                                                     \
        }                                                                  \
    } while (0)

/*
 * CHK_NULL(call): same check and diagnostic as CHK, but the enclosing
 * function returns NULL on failure. Only usable inside functions returning
 * a pointer. Note that the early return skips any cleanup the caller would
 * otherwise have done.
 */
#define CHK_NULL(call)                                                     \
    do {                                                                   \
        VkResult chk_result_ = (call);                                     \
        if (chk_result_ != VK_SUCCESS) {                                   \
            fprintf(stderr,                                                \
                    "v3d_runner: vulkan error %d at %s:%d (%s)\n",         \
                    chk_result_, __FILE__, __LINE__, #call);               \
            return NULL;                                                   \
        }                                                                  \
    } while (0)
/*
 * Private state behind the opaque v3d_runner handle. Allocated zeroed by
 * v3d_runner_create() and released by v3d_runner_destroy().
 */
struct v3d_runner {
    /* Vulkan instance created in v3d_runner_create(). */
    VkInstance instance;
    /* Physical device selected by pick_v3d_physical_device(). */
    VkPhysicalDevice phys;
    /* Logical device created on `phys`. */
    VkDevice device;
    /* Queue 0 of `queue_family`, fetched right after device creation. */
    VkQueue queue;
    /* Index of the queue family chosen by pick_compute_queue_family(). */
    uint32_t queue_family;
    /* Command pool bound to `queue_family`. */
    VkCommandPool pool;
    /* Copy of the selected device's VkPhysicalDeviceProperties.deviceName. */
    char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
    /* Memory properties of `phys`; consulted when allocating buffers. */
    VkPhysicalDeviceMemoryProperties mem_props;
};
/*
 * Select the first physical device whose deviceName contains the
 * substring "V3D".
 *
 *   inst      instance whose physical devices are enumerated
 *   out       receives the selected device handle (written only on success)
 *   name_out  receives a copy of the device name (written only on success)
 *
 * Returns 0 when a device was selected and -1 otherwise. A diagnostic is
 * printed to stderr on every failure except allocation failure of the
 * temporary handle array.
 */
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
                                    char name_out[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE])
{
    uint32_t n = 0;
    if (vkEnumeratePhysicalDevices(inst, &n, NULL) != VK_SUCCESS || n == 0) {
        fprintf(stderr, "v3d_runner: no Vulkan physical devices\n");
        return -1;
    }

    VkPhysicalDevice *pds = malloc(n * sizeof(*pds));
    if (!pds)
        return -1;

    /* The second call can fail too; without this check the loop below
     * would read uninitialised handles. Vulkan error codes are negative,
     * whereas VK_INCOMPLETE is a positive status that still fills the
     * first n entries, so only negative results are treated as fatal. */
    if (vkEnumeratePhysicalDevices(inst, &n, pds) < 0) {
        fprintf(stderr, "v3d_runner: vkEnumeratePhysicalDevices failed\n");
        free(pds);
        return -1;
    }

    int picked = -1;
    for (uint32_t i = 0; i < n; i++) {
        VkPhysicalDeviceProperties p;
        vkGetPhysicalDeviceProperties(pds[i], &p);
        if (strstr(p.deviceName, "V3D") != NULL) {
            *out = pds[i];
            /* deviceName is a fixed-size array, so copy it whole. */
            memcpy(name_out, p.deviceName, sizeof(p.deviceName));
            picked = 0;
            break;
        }
    }
    free(pds);

    if (picked != 0)
        fprintf(stderr, "v3d_runner: no V3D device found (looked for "
                        "\"V3D\" substring in deviceName)\n");
    return picked;
}
/*
 * Return the index of the first queue family on `phys` that advertises
 * VK_QUEUE_COMPUTE_BIT, or UINT32_MAX when there is none or the temporary
 * property array cannot be allocated.
 */
static uint32_t pick_compute_queue_family(VkPhysicalDevice phys)
{
    uint32_t family_count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, NULL);

    VkQueueFamilyProperties *families =
        malloc(family_count * sizeof(*families));
    if (families == NULL) {
        return UINT32_MAX;
    }
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, families);

    /* Scan in index order and stop at the first compute-capable family. */
    uint32_t chosen = UINT32_MAX;
    uint32_t idx = 0;
    while (idx < family_count && chosen == UINT32_MAX) {
        if ((families[idx].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0) {
            chosen = idx;
        }
        idx++;
    }

    free(families);
    return chosen;
}
v3d_runner *v3d_runner_create(void)
{
v3d_runner *r = calloc(1, sizeof(*r));
if (!r) return NULL;
/* Instance — Vulkan 1.3 to inherit 1.2 promoted features. */
VkApplicationInfo app = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "daedalus-fourier",
.apiVersion = VK_API_VERSION_1_3,
};
VkInstanceCreateInfo ici = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &app,
};
CHK_NULL(vkCreateInstance(&ici, NULL, &r->instance));
if (pick_v3d_physical_device(r->instance, &r->phys, r->device_name) != 0) {
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
vkGetPhysicalDeviceMemoryProperties(r->phys, &r->mem_props);
r->queue_family = pick_compute_queue_family(r->phys);
if (r->queue_family == UINT32_MAX) {
fprintf(stderr, "v3d_runner: no compute queue family\n");
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
/* Enable 8-bit + 16-bit storage features. Both are exposed on
* V3D 7.1 per vulkaninfo_v3d_7_1_7_hertz.txt; the kernel
* declares storageBuffer8BitAccess (uint8_t dst[]) and
* storageBuffer16BitAccess (int16_t coeffs[]).
*/
VkPhysicalDevice16BitStorageFeatures f16 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES,
.storageBuffer16BitAccess = VK_TRUE,
.uniformAndStorageBuffer16BitAccess = VK_TRUE,
};
VkPhysicalDevice8BitStorageFeatures f8 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
.pNext = &f16,
.storageBuffer8BitAccess = VK_TRUE,
.uniformAndStorageBuffer8BitAccess = VK_TRUE,
};
VkPhysicalDeviceFeatures2 f2 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
.pNext = &f8,
};
float qprio = 1.0f;
VkDeviceQueueCreateInfo dqci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = r->queue_family,
.queueCount = 1,
.pQueuePriorities = &qprio,
};
VkDeviceCreateInfo dci = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.pNext = &f2,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &dqci,
};
if (vkCreateDevice(r->phys, &dci, NULL, &r->device) != VK_SUCCESS) {
fprintf(stderr, "v3d_runner: vkCreateDevice failed\n");
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
vkGetDeviceQueue(r->device, r->queue_family, 0, &r->queue);
VkCommandPoolCreateInfo cpci = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.queueFamilyIndex = r->queue_family,
};
if (vkCreateCommandPool(r->device, &cpci, NULL, &r->pool) != VK_SUCCESS) {
fprintf(stderr, "v3d_runner: vkCreateCommandPool failed\n");
vkDestroyDevice(r->device, NULL);
vkDestroyInstance(r->instance, NULL);
free(r);
return NULL;
}
return r;
}
/*
 * Tear down a runner and free it. Accepts NULL (no-op). Waits for the
 * device to go idle before destroying the command pool, the device and
 * finally the instance; any handle still equal to VK_NULL_HANDLE is
 * skipped.
 */
void v3d_runner_destroy(v3d_runner *r)
{
    if (r == NULL) {
        return;
    }

    const int have_device = (r->device != VK_NULL_HANDLE);

    if (have_device) {
        /* Nothing may still be executing when the objects go away. */
        vkDeviceWaitIdle(r->device);
    }
    if (r->pool != VK_NULL_HANDLE) {
        vkDestroyCommandPool(r->device, r->pool, NULL);
    }
    if (have_device) {
        vkDestroyDevice(r->device, NULL);
    }
    if (r->instance != VK_NULL_HANDLE) {
        vkDestroyInstance(r->instance, NULL);
    }

    free(r);
}
/*
 * Read-only accessors for the runner's fields. Each one dereferences `r`
 * unconditionally, so `r` must not be NULL.
 */

/* Logical device owned by the runner. */
VkDevice v3d_runner_device(v3d_runner *r)
{
    return r->device;
}

/* Queue fetched from the runner's device at creation time. */
VkQueue v3d_runner_queue(v3d_runner *r)
{
    return r->queue;
}

/* Index of the queue family the runner's queue belongs to. */
uint32_t v3d_runner_queue_family(v3d_runner *r)
{
    return r->queue_family;
}

/* Command pool owned by the runner. */
VkCommandPool v3d_runner_cmd_pool(v3d_runner *r)
{
    return r->pool;
}

/* Name of the selected physical device; the storage lives inside `r`. */
const char *v3d_runner_device_name(v3d_runner *r)
{
    return r->device_name;
}
/* ---- Buffers ---------------------------------------------------- */
/*
 * Find the lowest-indexed memory type that is both allowed by `type_bits`
 * (bit i set means type i is acceptable) and has every property flag in
 * `wanted`.
 *
 * Returns the memory type index, or -1 when no type qualifies.
 */
static int find_memory_type(VkPhysicalDeviceMemoryProperties *p,
                            uint32_t type_bits, VkMemoryPropertyFlags wanted)
{
    for (uint32_t idx = 0; idx < p->memoryTypeCount; ++idx) {
        const uint32_t type_mask = 1u << idx;

        if ((type_bits & type_mask) == 0) {
            continue;   /* not permitted for this resource */
        }
        if ((p->memoryTypes[idx].propertyFlags & wanted) != wanted) {
            continue;   /* missing at least one requested property */
        }
        return (int) idx;
    }
    return -1;
}
/*
 * Create a storage buffer of `size` bytes backed by HOST_VISIBLE |
 * HOST_COHERENT memory and map it persistently.
 *
 *   r     runner that owns the device
 *   size  buffer size in bytes; must be non-zero
 *   out   receives buffer, memory, mapped pointer and size
 *
 * Returns 0 on success. On failure returns -1, prints a diagnostic to
 * stderr, releases anything created so far and leaves `*out` zeroed, so a
 * later v3d_runner_destroy_buffer() on it is a harmless no-op.
 */
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    int vk_rc;  /* holds VkResult values */

    memset(out, 0, sizeof(*out));

    /* VkBufferCreateInfo requires size > 0; reject early rather than hand
     * invalid input to the driver. */
    if (size == 0) {
        fprintf(stderr, "v3d_runner: refusing to create a zero-sized buffer\n");
        return -1;
    }

    VkBufferCreateInfo bci = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = size,
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
               | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
               | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
    };
    vk_rc = vkCreateBuffer(r->device, &bci, NULL, &out->buffer);
    if (vk_rc != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vk_rc, __FILE__, __LINE__,
                "vkCreateBuffer(r->device, &bci, NULL, &out->buffer)");
        goto fail;
    }

    VkMemoryRequirements req;
    vkGetBufferMemoryRequirements(r->device, out->buffer, &req);

    /* HOST_VISIBLE | HOST_COHERENT is the unified-memory zero-copy
     * path on Pi 5: CPU and GPU see the same LPDDR4x physical pages,
     * no explicit flush/invalidate needed (the COHERENT bit asserts
     * that). */
    int mt = find_memory_type(&r->mem_props, req.memoryTypeBits,
                              VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
                              | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
    if (mt < 0) {
        fprintf(stderr, "v3d_runner: no HOST_VISIBLE|COHERENT memory type\n");
        goto fail_buffer;
    }

    VkMemoryAllocateInfo mai = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = req.size,
        .memoryTypeIndex = (uint32_t) mt,
    };
    vk_rc = vkAllocateMemory(r->device, &mai, NULL, &out->memory);
    if (vk_rc != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vk_rc, __FILE__, __LINE__,
                "vkAllocateMemory(r->device, &mai, NULL, &out->memory)");
        goto fail_buffer;
    }

    vk_rc = vkBindBufferMemory(r->device, out->buffer, out->memory, 0);
    if (vk_rc != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vk_rc, __FILE__, __LINE__,
                "vkBindBufferMemory(r->device, out->buffer, out->memory, 0)");
        goto fail_memory;
    }

    vk_rc = vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0,
                        &out->mapped);
    if (vk_rc != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vk_rc, __FILE__, __LINE__,
                "vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0, &out->mapped)");
        goto fail_memory;
    }

    out->size = size;
    return 0;

    /* Staged teardown: release only what was successfully created. */
fail_memory:
    vkFreeMemory(r->device, out->memory, NULL);
fail_buffer:
    vkDestroyBuffer(r->device, out->buffer, NULL);
fail:
    memset(out, 0, sizeof(*out));
    return -1;
}
/*
 * Release a buffer created by v3d_runner_create_buffer(): unmap it if it
 * is mapped, destroy the VkBuffer, free its memory, then zero `*buf`.
 * Does nothing when `buf` is NULL or holds no buffer handle.
 */
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
{
    if (buf == NULL) {
        return;
    }
    if (buf->buffer == VK_NULL_HANDLE) {
        return;
    }

    if (buf->mapped != NULL) {
        vkUnmapMemory(r->device, buf->memory);
    }
    vkDestroyBuffer(r->device, buf->buffer, NULL);
    vkFreeMemory(r->device, buf->memory, NULL);

    /* Zeroing makes a repeated destroy on the same struct a no-op. */
    memset(buf, 0, sizeof(*buf));
}
/* ---- Pipelines -------------------------------------------------- */
/*
 * Load a whole SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 *   path      file to read (opened in binary mode)
 *   out_size  receives the file size in bytes; written only on success
 *
 * Returns the buffer (caller frees it), or NULL after printing a
 * diagnostic to stderr when the file cannot be opened, its size is not a
 * positive multiple of 4, allocation fails, or the read comes up short.
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        return NULL;
    }

    /* Size the file by seeking to its end, then rewind for the read. */
    fseek(fp, 0, SEEK_END);
    const long nbytes = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    /* SPIR-V is a stream of 32-bit words, so the byte count must be a
     * positive multiple of 4 (this also catches ftell() returning -1). */
    if (nbytes <= 0 || (nbytes & 3) != 0) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, nbytes);
        fclose(fp);
        return NULL;
    }

    uint32_t *words = malloc((size_t) nbytes);
    if (words != NULL
        && fread(words, 1, (size_t) nbytes, fp) == (size_t) nbytes) {
        fclose(fp);
        *out_size = (size_t) nbytes;
        return words;
    }

    /* Allocation failure or short read. */
    perror("read");
    fclose(fp);
    free(words);
    return NULL;
}
/*
 * Build a compute pipeline from a SPIR-V file, together with its
 * descriptor-set layout, pipeline layout, a one-set descriptor pool and
 * the descriptor set itself.
 *
 *   r                runner that owns the device
 *   spv_path         path of the SPIR-V binary; entry point must be "main"
 *   n_ssbos          number of storage-buffer bindings (bindings 0..n-1)
 *   push_const_size  push-constant range size in bytes; 0 means none
 *   out              receives all created objects
 *
 * Returns 0 on success. On failure returns -1, prints a diagnostic to
 * stderr, destroys every object created so far and leaves `*out` zeroed.
 * The cleanup is done here because v3d_runner_destroy_pipeline() returns
 * early when `pipeline` is null and so cannot reclaim a partially built
 * pipeline.
 */
int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
                               uint32_t n_ssbos, uint32_t push_const_size,
                               v3d_pipeline *out)
{
    VkResult vr;

    memset(out, 0, sizeof(*out));

    /* Descriptor set layout: n_ssbos SSBO bindings, compute-only. */
    VkDescriptorSetLayoutBinding *binds = calloc(n_ssbos, sizeof(*binds));
    if (!binds)
        goto fail;
    for (uint32_t i = 0; i < n_ssbos; i++) {
        binds[i] = (VkDescriptorSetLayoutBinding){
            .binding = i,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .descriptorCount = 1,
            .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        };
    }
    VkDescriptorSetLayoutCreateInfo dslci = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
        .bindingCount = n_ssbos,
        .pBindings = binds,
    };
    vr = vkCreateDescriptorSetLayout(r->device, &dslci, NULL,
                                     &out->ds_layout);
    free(binds);    /* only needed for the duration of the create call */
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateDescriptorSetLayout = %d\n", vr);
        goto fail;
    }

    /* Pipeline layout: the single set plus an optional push-constant range. */
    VkPushConstantRange pcr = {
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .offset = 0,
        .size = push_const_size,
    };
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        .setLayoutCount = 1,
        .pSetLayouts = &out->ds_layout,
        .pushConstantRangeCount = push_const_size ? 1 : 0,
        .pPushConstantRanges = push_const_size ? &pcr : NULL,
    };
    vr = vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vr, __FILE__, __LINE__,
                "vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout)");
        goto fail_ds_layout;
    }

    /* Shader module: only needed until the pipeline has been created. */
    size_t spv_size = 0;
    uint32_t *spv = read_spv(spv_path, &spv_size);
    if (!spv)
        goto fail_layout;
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_size,
        .pCode = spv,
    };
    VkShaderModule shader;
    vr = vkCreateShaderModule(r->device, &smci, NULL, &shader);
    free(spv);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateShaderModule(%s) = %d\n", spv_path, vr);
        goto fail_layout;
    }

    VkComputePipelineCreateInfo cpci = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .stage = {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = shader,
            .pName = "main",
        },
        .layout = out->layout,
    };
    vr = vkCreateComputePipelines(r->device, VK_NULL_HANDLE, 1, &cpci, NULL,
                                  &out->pipeline);
    vkDestroyShaderModule(r->device, shader, NULL);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "vkCreateComputePipelines = %d\n", vr);
        goto fail_layout;
    }

    /* Single descriptor pool + set for this pipeline. */
    VkDescriptorPoolSize ps = {
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .descriptorCount = n_ssbos,
    };
    VkDescriptorPoolCreateInfo dpci = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
        .maxSets = 1,
        .poolSizeCount = 1,
        .pPoolSizes = &ps,
    };
    vr = vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vr, __FILE__, __LINE__,
                "vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool)");
        goto fail_pipeline;
    }

    VkDescriptorSetAllocateInfo dsai = {
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .descriptorPool = out->pool,
        .descriptorSetCount = 1,
        .pSetLayouts = &out->ds_layout,
    };
    vr = vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set);
    if (vr != VK_SUCCESS) {
        fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n",
                vr, __FILE__, __LINE__,
                "vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set)");
        goto fail_pool;
    }

    out->n_ssbos = n_ssbos;
    out->push_const_size = push_const_size;
    return 0;

    /* Staged teardown: each label destroys only objects that were
     * successfully created before the failing step, newest first. */
fail_pool:
    vkDestroyDescriptorPool(r->device, out->pool, NULL);
fail_pipeline:
    vkDestroyPipeline(r->device, out->pipeline, NULL);
fail_layout:
    vkDestroyPipelineLayout(r->device, out->layout, NULL);
fail_ds_layout:
    vkDestroyDescriptorSetLayout(r->device, out->ds_layout, NULL);
fail:
    memset(out, 0, sizeof(*out));
    return -1;
}
/*
 * Destroy everything v3d_runner_create_pipeline() produced and zero `*p`.
 * Does nothing when `p` is NULL or its pipeline handle is null.
 */
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
{
    if (p == NULL) {
        return;
    }
    if (p->pipeline == VK_NULL_HANDLE) {
        return;
    }

    vkDestroyPipeline(r->device, p->pipeline, NULL);
    vkDestroyPipelineLayout(r->device, p->layout, NULL);
    /* Destroying the pool also releases the descriptor set allocated
     * from it, so desc_set needs no separate free. */
    vkDestroyDescriptorPool(r->device, p->pool, NULL);
    vkDestroyDescriptorSetLayout(r->device, p->ds_layout, NULL);

    memset(p, 0, sizeof(*p));
}
/*
 * Point the pipeline's descriptor set at `n` buffers: bufs[i] is written
 * to storage-buffer binding i, covering bytes [0, bufs[i].size).
 *
 * `n` must equal the pipeline's n_ssbos. Returns 0 on success, -1 on a
 * count mismatch (with a diagnostic on stderr) or when the temporary
 * arrays cannot be allocated.
 */
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
                            const v3d_buffer *bufs, uint32_t n)
{
    if (p->n_ssbos != n) {
        fprintf(stderr, "bind_buffers: n=%u != pipeline n_ssbos=%u\n",
                n, p->n_ssbos);
        return -1;
    }

    VkDescriptorBufferInfo *infos = calloc(n, sizeof(*infos));
    VkWriteDescriptorSet *writes = calloc(n, sizeof(*writes));
    int rc = -1;

    if (infos != NULL && writes != NULL) {
        for (uint32_t slot = 0; slot < n; ++slot) {
            infos[slot] = (VkDescriptorBufferInfo){
                .buffer = bufs[slot].buffer,
                .offset = 0,
                .range = bufs[slot].size,
            };
            /* Each write references its own info entry. */
            writes[slot] = (VkWriteDescriptorSet){
                .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                .dstSet = p->desc_set,
                .dstBinding = slot,
                .descriptorCount = 1,
                .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                .pBufferInfo = &infos[slot],
            };
        }
        vkUpdateDescriptorSets(r->device, n, writes, 0, NULL);
        rc = 0;
    }

    free(infos);
    free(writes);
    return rc;
}
/* ---- Command buffers ------------------------------------------- */
/*
 * Allocate one primary command buffer from the runner's command pool.
 * Returns the new handle, or VK_NULL_HANDLE when allocation fails.
 */
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r)
{
    VkCommandBuffer handle = VK_NULL_HANDLE;
    const VkCommandBufferAllocateInfo request = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = r->pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };

    const int allocated =
        vkAllocateCommandBuffers(r->device, &request, &handle) == VK_SUCCESS;

    /* Return an explicit null handle on failure instead of whatever the
     * failed call may have left in `handle`. */
    return allocated ? handle : VK_NULL_HANDLE;
}
/*
 * Submit one command buffer to the runner's queue with no fence and no
 * semaphores, then block until the queue is idle.
 *
 * Returns 0 on success, or -1 (via CHK, which prints the Vulkan error to
 * stderr) when either the submit or the wait fails.
 */
int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb)
{
    VkSubmitInfo si = { 0 };

    si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    si.commandBufferCount = 1;
    si.pCommandBuffers = &cb;

    CHK(vkQueueSubmit(r->queue, 1, &si, VK_NULL_HANDLE));
    /* Fully synchronous: nothing is in flight once this returns 0. */
    CHK(vkQueueWaitIdle(r->queue));
    return 0;
}