/*
 * Phase 3 — Vulkan compute dispatch-overhead microbench (M5).
 *
 * Measures the per-dispatch wall-clock floor on V3D 7.1 via Mesa
 * v3dv: vkQueueSubmit + vkQueueWaitIdle round-trip cost for a
 * noop compute shader. Establishes the floor below which kernel
 * batching is mandatory.
 *
 * Two measurements:
 *   M5a: empty command-buffer submit (no dispatch at all)
 *   M5b: 1-workgroup dispatch of an empty shader
 *
 * The delta M5b - M5a isolates the per-vkCmdDispatch cost (together
 * with the pipeline bind recorded alongside it) from the
 * per-vkQueueSubmit cost.
 *
 * Build: cmake -DDAEDALUS_BUILD_VULKAN=ON ..
 * Run:   ./bench_vulkan_dispatch [--iters N] [--spv PATH]
 *
 * License: BSD-2-Clause (daedalus-fourier).
 */
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>

/*
 * CHK(call): evaluate `call` exactly once and compare its VkResult with
 * VK_SUCCESS. On any other result, print the numeric result code, the
 * source file and line, and the text of the call to stderr, then end the
 * process with exit status 1.
 */
#define CHK(call)                                                     \
    do {                                                              \
        VkResult chk_res_ = (call);                                   \
        if (chk_res_ != VK_SUCCESS) {                                 \
            fprintf(stderr, "vulkan error %d at %s:%d (%s)\n",        \
                    chk_res_, __FILE__, __LINE__, #call);             \
            exit(1);                                                  \
        }                                                             \
    } while (0)

/*
 * Read CLOCK_MONOTONIC_RAW and return the reading in seconds as a double:
 * the whole-second field plus the nanosecond field scaled by 1e-9.
 * The return value of clock_gettime() is not inspected.
 */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole_sec = stamp.tv_sec;
    const double frac_sec  = stamp.tv_nsec * 1e-9;
    return whole_sec + frac_sec;
}

/*
 * Load a SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 * path:     file to read (opened in binary mode).
 * out_size: receives the file size in bytes.
 *
 * Returns the buffer; the caller owns it and releases it with free().
 * The file size must be positive and a multiple of 4. Every failure
 * prints a diagnostic to stderr and ends the process with exit status 1,
 * so this function never returns NULL.
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *f = fopen(path, "rb");
    if (!f) { perror(path); exit(1); }

    /* The size is found by seeking to the end; each step is checked
     * rather than trusting a possibly stale position. */
    if (fseek(f, 0, SEEK_END) != 0) {
        perror(path);
        fclose(f);
        exit(1);
    }
    long sz = ftell(f);
    if (sz < 0) {
        perror(path);
        fclose(f);
        exit(1);
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        perror(path);
        fclose(f);
        exit(1);
    }
    if (sz == 0 || (sz & 3)) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz);
        fclose(f);
        exit(1);
    }

    size_t nbytes = (size_t)sz;
    uint32_t *buf = malloc(nbytes);
    if (!buf) {
        perror("malloc");
        fclose(f);
        exit(1);
    }

    if (fread(buf, 1, nbytes, f) != nbytes) {
        /* A short read with no stream error means EOF; errno carries no
         * useful information then, so perror() is only used for real
         * I/O errors. */
        if (ferror(f)) {
            perror(path);
        } else {
            fprintf(stderr, "%s: unexpected end of file\n", path);
        }
        free(buf);
        fclose(f);
        exit(1);
    }

    fclose(f);
    *out_size = nbytes;
    return buf;
}

/*
 * Parse `text` as a strictly positive base-10 int.
 * Returns 0 and stores the value in *out on success; returns -1 and leaves
 * *out untouched when the text is empty, has trailing characters, or is
 * outside the range 1..INT_MAX.
 */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;

    errno = 0;
    long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        v <= 0 || v > INT_MAX) {
        return -1;
    }
    *out = (int)v;
    return 0;
}

/*
 * Benchmark entry point.
 *
 * Options: --iters N (timed iterations per measurement, default 100000),
 *          --spv PATH (compute shader SPIR-V, default "noop.spv"),
 *          --help.
 *
 * Creates a Vulkan instance and device on the physical device whose name
 * contains "V3D", records one empty command buffer and one containing a
 * pipeline bind plus a 1x1x1 dispatch, then times `iters` rounds of
 * vkQueueSubmit + vkQueueWaitIdle for each and prints the per-op averages.
 *
 * Returns 0 on success, 1 when no suitable device/queue is found or an
 * allocation fails, 2 on a command-line error. Vulkan failures terminate
 * the process through CHK.
 */
int main(int argc, char **argv)
{
    enum { WARMUP_ITERS = 100 };

    int iters = 100000;
    const char *spv_path = "noop.spv";

    static struct option opts[] = {
        {"iters", required_argument, 0, 'i'},
        {"spv",   required_argument, 0, 's'},
        {"help",  no_argument,       0, 'h'},
        {0,0,0,0}
    };
    for (int c; (c = getopt_long(argc, argv, "i:s:h", opts, 0)) != -1;) {
        switch (c) {
        case 'i':
            /* iters is used as a divisor below, so zero, negative or
             * non-numeric input is rejected up front. */
            if (parse_positive_int(optarg, &iters) != 0) {
                fprintf(stderr,
                    "%s: --iters expects a positive integer, got '%s'\n",
                    argv[0], optarg);
                return 2;
            }
            break;
        case 's': spv_path = optarg; break;
        case 'h':
            fprintf(stderr,
                "Usage: %s [--iters N] [--spv noop.spv]\n", argv[0]);
            return 0;
        default:
            return 2;
        }
    }

    /* ---- Instance ---- */
    VkApplicationInfo app = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "daedalus-fourier-bench",
        .apiVersion = VK_API_VERSION_1_3,
    };
    VkInstanceCreateInfo ici = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app,
    };
    VkInstance instance;
    CHK(vkCreateInstance(&ici, NULL, &instance));

    /* ---- Pick V3D physical device (skip llvmpipe) ---- */
    uint32_t pd_count = 0;
    CHK(vkEnumeratePhysicalDevices(instance, &pd_count, NULL));
    /* Allocate at least one slot so a zero count still yields a valid
     * pointer and falls through to the "no V3D device" report. */
    VkPhysicalDevice *pds = calloc(pd_count ? pd_count : 1, sizeof(*pds));
    if (!pds) {
        perror("calloc");
        vkDestroyInstance(instance, NULL);
        return 1;
    }
    CHK(vkEnumeratePhysicalDevices(instance, &pd_count, pds));
    VkPhysicalDevice phys = VK_NULL_HANDLE;
    VkPhysicalDeviceProperties props = {0};
    for (uint32_t i = 0; i < pd_count; i++) {
        vkGetPhysicalDeviceProperties(pds[i], &props);
        printf("device %u: %s (api %u.%u.%u, vendor 0x%04x)\n",
               i, props.deviceName,
               VK_VERSION_MAJOR(props.apiVersion),
               VK_VERSION_MINOR(props.apiVersion),
               VK_VERSION_PATCH(props.apiVersion),
               props.vendorID);
        /* No break: when several names match, the last one is selected. */
        if (strstr(props.deviceName, "V3D") != NULL) {
            phys = pds[i];
        }
    }
    free(pds);
    pds = NULL;
    if (phys == VK_NULL_HANDLE) {
        fprintf(stderr, "no V3D device found; bailing.\n");
        vkDestroyInstance(instance, NULL);
        return 1;
    }
    /* props currently holds the last enumerated device; refresh it for
     * the one actually chosen. */
    vkGetPhysicalDeviceProperties(phys, &props);
    printf("selected: %s\n", props.deviceName);

    /* ---- Compute queue family ---- */
    uint32_t qfc = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, NULL);
    VkQueueFamilyProperties *qfp = calloc(qfc ? qfc : 1, sizeof(*qfp));
    if (!qfp) {
        perror("calloc");
        vkDestroyInstance(instance, NULL);
        return 1;
    }
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &qfc, qfp);
    uint32_t qfi = (uint32_t) -1;   /* sentinel: no family found yet */
    for (uint32_t i = 0; i < qfc; i++) {
        if (qfp[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
            qfi = i; break;
        }
    }
    free(qfp);
    qfp = NULL;
    if (qfi == (uint32_t) -1) {
        fprintf(stderr, "no compute queue family\n");
        vkDestroyInstance(instance, NULL);
        return 1;
    }

    /* ---- Logical device ---- */
    float qprio = 1.0f;
    VkDeviceQueueCreateInfo dqci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
        .queueFamilyIndex = qfi,
        .queueCount = 1,
        .pQueuePriorities = &qprio,
    };
    VkDeviceCreateInfo dci = {
        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
        .queueCreateInfoCount = 1,
        .pQueueCreateInfos = &dqci,
    };
    VkDevice dev;
    CHK(vkCreateDevice(phys, &dci, NULL, &dev));
    VkQueue queue;
    vkGetDeviceQueue(dev, qfi, 0, &queue);

    /* ---- Command pool + buffers ---- */
    VkCommandPoolCreateInfo cpci = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
        .queueFamilyIndex = qfi,
    };
    VkCommandPool pool;
    CHK(vkCreateCommandPool(dev, &cpci, NULL, &pool));

    VkCommandBuffer cb_empty, cb_dispatch;
    VkCommandBufferAllocateInfo cbai = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };
    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_empty));
    CHK(vkAllocateCommandBuffers(dev, &cbai, &cb_dispatch));

    /* ---- Pipeline layout (empty: no descriptors, no push constants) ---- */
    VkPipelineLayoutCreateInfo plci = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
    };
    VkPipelineLayout playout;
    CHK(vkCreatePipelineLayout(dev, &plci, NULL, &playout));

    /* ---- Compute pipeline from noop SPIR-V ---- */
    size_t spv_size = 0;
    uint32_t *spv = read_spv(spv_path, &spv_size);
    VkShaderModuleCreateInfo smci = {
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .codeSize = spv_size,
        .pCode = spv,
    };
    VkShaderModule shader;
    CHK(vkCreateShaderModule(dev, &smci, NULL, &shader));
    free(spv);

    VkComputePipelineCreateInfo cpci2 = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .stage = {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = shader,
            .pName = "main",
        },
        .layout = playout,
    };
    VkPipeline pipe;
    CHK(vkCreateComputePipelines(dev, VK_NULL_HANDLE, 1, &cpci2, NULL, &pipe));

    /* ---- Record both command buffers once, reuse for every iteration ---- */
    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    };

    CHK(vkBeginCommandBuffer(cb_empty, &cbbi));
    CHK(vkEndCommandBuffer(cb_empty));

    CHK(vkBeginCommandBuffer(cb_dispatch, &cbbi));
    vkCmdBindPipeline(cb_dispatch, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
    vkCmdDispatch(cb_dispatch, 1, 1, 1);
    CHK(vkEndCommandBuffer(cb_dispatch));

    VkSubmitInfo si_empty = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb_empty,
    };
    VkSubmitInfo si_disp = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .commandBufferCount = 1, .pCommandBuffers = &cb_dispatch,
    };

    /* ---- Warm-up (untimed, dispatch command buffer only) ---- */
    for (int i = 0; i < WARMUP_ITERS; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }

    /* ---- M5a: empty CB submit+wait ---- */
    double t0 = now_seconds();
    for (int i = 0; i < iters; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_empty, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }
    double t1 = now_seconds();
    double m5a_per = (t1 - t0) / iters * 1e6;  /* µs */

    /* ---- M5b: 1-WG noop dispatch submit+wait ---- */
    double t2 = now_seconds();
    for (int i = 0; i < iters; i++) {
        CHK(vkQueueSubmit(queue, 1, &si_disp, VK_NULL_HANDLE));
        CHK(vkQueueWaitIdle(queue));
    }
    double t3 = now_seconds();
    double m5b_per = (t3 - t2) / iters * 1e6;  /* µs */

    printf("\n=== M5: Vulkan compute dispatch overhead ===\n");
    printf("  iters per measurement: %d\n", iters);
    printf("  M5a empty CB submit+wait:           %.2f µs/op\n", m5a_per);
    printf("  M5b 1-WG noop dispatch submit+wait: %.2f µs/op\n", m5b_per);
    printf("  delta (per-vkCmdDispatch + per-pipeline-bind): %.2f µs\n",
           m5b_per - m5a_per);
    printf("\n");
    printf("  Implication for kernel batching:\n");
    printf("    if QPU IDCT8 = ~ 100ns/block (best case, hypothetical),\n");
    /* m5b_per is in µs; *1e3 converts to ns before dividing by the
     * assumed 100 ns per block. */
    printf("    a single-block dispatch costs %.0fx more in overhead\n",
           m5b_per * 1e3 / 100.0);
    printf("    -> batch at least %.0f blocks per dispatch to break even.\n",
           m5b_per * 1e3 / 100.0);

    /* ---- Tear down (minimal — process exit handles the rest) ---- */
    vkDestroyPipeline(dev, pipe, NULL);
    vkDestroyShaderModule(dev, shader, NULL);
    vkDestroyPipelineLayout(dev, playout, NULL);
    vkDestroyCommandPool(dev, pool, NULL);
    vkDestroyDevice(dev, NULL);
    vkDestroyInstance(instance, NULL);
    return 0;
}