daedalus-fourier/tests/bench_v3d_lpf.c

/*
 * Cycle 2 Phase 6 — QPU bench for VP9 4-tap inner loop filter on V3D 7.1.
 *
 * Reports:
 *   M1''  (correctness): bit-exact rate, QPU output vs C reference
 *   M2''  (throughput):  QPU sustained Medge/s over K dispatched batches
 *   fm/hev pass rates    (phase5'' finding 8 instrumentation)
 *
 * Asserts the two contracts from k2_deblock_phase4.md §4
 * (phase5'' findings 2+4): m.x ≥ 4, dst_stride ≥ 4.
 *
 * License: BSD-2-Clause.
 */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>

#include "v3d_runner.h"

/* C reference implementation of the filter under test; defined in another
 * translation unit. As called from this file: dst points at column 4 of
 * row 0 of one 8x8 edge tile, stride is the byte distance between rows of
 * that tile, and E / I / H are the per-edge thresholds. The filter writes
 * its result back through dst (in place). */
extern void daedalus_vp9_loop_filter_h_4_8_ref(
    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);

/* --- RNG / generators (match bench_neon_lpf.c shape) ------------- */

/* State of the 64-bit xorshift generator below. A zero state is a fixed
 * point (every draw would return 0), so callers seed it non-zero. */
static uint64_t xs_state;

/* Advance the generator by one step (shifts 13 / 7 / 17) and return the
 * new state, which doubles as the random draw. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs_state = s;
    return s;
}

/* Geometry of one synthetic edge tile: EDGE_H rows of EDGE_W pixels, with
 * consecutive rows EDGE_STRIDE bytes apart. EDGE_BYTES is the size of one
 * whole tile. */
#define EDGE_STRIDE 8
#define EDGE_W      8
#define EDGE_H      8
#define EDGE_BYTES  (EDGE_H * EDGE_STRIDE)   /* 64 */

/*
 * Fill one EDGE_H x EDGE_W tile (row pitch EDGE_STRIDE) with a synthetic
 * vertical edge: columns 0..3 sit around one random level, columns 4..7
 * around another, each pixel perturbed by bounded random noise and clamped
 * to the 0..255 range.
 *
 * buf must have room for EDGE_BYTES bytes. Consumes 3 + EDGE_H * EDGE_W
 * draws from xs().
 */
static void gen_edge_pixels(uint8_t *buf)
{
    const int left_level  = (int)(xs() % 200) + 20;
    const int right_level = (int)(xs() % 200) + 20;
    const int amplitude   = (int)(xs() % 30);
    const int span        = 2 * amplitude + 1;   /* noise lies in [-amplitude, +amplitude] */

    for (int row = 0; row < EDGE_H; row++) {
        uint8_t *line = buf + row * EDGE_STRIDE;

        for (int col = 0; col < EDGE_W; col++) {
            int jitter = (int)(xs() % span) - amplitude;
            int level  = (col < 4 ? left_level : right_level) + jitter;

            if (level < 0)
                level = 0;
            else if (level > 255)
                level = 255;

            line[col] = (uint8_t) level;
        }
    }
}

/*
 * Draw one random threshold triple for an edge, in this order:
 *   *E in [0, 80], *I in [0, 40], *H in [0, 10].
 * Consumes exactly three draws from xs().
 */
static void gen_thresholds(int *E, int *I, int *H)
{
    const uint64_t e_draw = xs() % 81;
    *E = (int) e_draw;

    const uint64_t i_draw = xs() % 41;
    *I = (int) i_draw;

    const uint64_t h_draw = xs() % 11;
    *H = (int) h_draw;
}

/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus the nanosecond field scaled by 1e-9). */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double frac = now.tv_nsec * 1e-9;
    return now.tv_sec + frac;
}

/* --- Push constants — match shader layout ------------------------ */

/* Push-constant block passed to the compute shader: four 32-bit words.
 * Field order and sizes have to stay in step with the shader's own
 * declaration of this block. */
typedef struct {
    uint32_t n_edges;        /* number of edges in the dispatch */
    uint32_t dst_stride_u8;  /* byte distance between rows of one edge in dst */
    uint32_t _pad0;          /* padding; main() writes 0 */
    uint32_t _pad1;          /* padding; main() writes 0 */
} push_consts;

/* --- Pre-flight: fm/hev rate on the same RNG seed (informational) - */

/*
 * Informational estimate of how often the fm (filter mask) and hev tests
 * fire on the synthetic workload.
 *
 * The generator is temporarily re-seeded with `seed` (or the built-in
 * default when seed == 0), n_edges edge tiles plus threshold triples are
 * drawn in the same order the main data set uses, and the generator state
 * is restored before returning, so the caller's random sequence is not
 * disturbed.
 *
 * Only row 0 of each tile is examined — a per-edge sample rather than a
 * per-row count, which is enough for instrumentation.
 *
 *   *fm_rate   fraction of sampled edges whose row 0 passes fm
 *   *hev_rate  fraction of the fm-passing edges that also pass hev
 *              (the caller reports it as "of fm-passing")
 *
 * Either rate is reported as 0.0 when its denominator is zero.
 */
static void estimate_pass_rates(uint64_t seed, int n_edges,
                                double *fm_rate, double *hev_rate)
{
    const uint64_t saved = xs_state;
    xs_state = seed ? seed : 0xa57edbeef5717ULL;

    int fm_pass = 0, hev_pass = 0;
    uint8_t buf[EDGE_BYTES];

    for (int i = 0; i < n_edges; i++) {
        int E, I, H;
        gen_edge_pixels(buf);
        gen_thresholds(&E, &I, &H);

        /* d points at column 4 of row 0: p3..p0 are the four pixels to
         * its left, q0..q3 the four starting at d. */
        const uint8_t *d = buf + 4;
        const int p3 = d[-4], p2 = d[-3], p1 = d[-2], p0 = d[-1];
        const int q0 = d[ 0], q1 = d[+1], q2 = d[+2], q3 = d[+3];

        const int aP1P0 = abs(p1 - p0);
        const int aQ1Q0 = abs(q1 - q0);

        /* Mirror of the C reference's fm test: every neighbour step
         * within I, and the weighted step across the edge within E. */
        const int fm = (abs(p3 - p2) <= I) && (abs(p2 - p1) <= I) &&
                       (aP1P0 <= I) && (aQ1Q0 <= I) &&
                       (abs(q2 - q1) <= I) && (abs(q3 - q2) <= I) &&
                       (abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E);
        if (!fm)
            continue;

        fm_pass++;
        if (aP1P0 > H || aQ1Q0 > H)
            hev_pass++;
    }

    *fm_rate  = n_edges > 0 ? (double) fm_pass  / n_edges : 0.0;
    /* hev is only evaluated on fm-passing edges, so that is its population. */
    *hev_rate = fm_pass > 0 ? (double) hev_pass / fm_pass : 0.0;

    xs_state = saved;
}

/* --- Main ------------------------------------------------------- */

int main(int argc, char **argv)
{
    int n_edges = 65536;
    int iters = 100;
    int verify_only = 0;
    uint64_t seed = 0;
    const char *spv_path = "v3d_lpf_h_4_8.spv";

    static struct option opts[] = {
        {"edges",       required_argument, 0, 'e'},
        {"iters",       required_argument, 0, 'i'},
        {"seed",        required_argument, 0, 's'},
        {"spv",         required_argument, 0, 'S'},
        {"verify-only", no_argument,       0, 'V'},
        {0,0,0,0}
    };
    for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) {
        switch (c) {
        case 'e': n_edges = atoi(optarg); break;
        case 'i': iters = atoi(optarg); break;
        case 's': seed = strtoull(optarg, 0, 0); break;
        case 'S': spv_path = optarg; break;
        case 'V': verify_only = 1; break;
        default: return 2;
        }
    }

    xs_state = seed ? seed : 0xa57edbeef5717ULL;

    /* --- Setup ---- */
    v3d_runner *r = v3d_runner_create();
    if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
    printf("=== v3d LPF h_4_8 bench ===\n");
    printf("  device:  %s\n", v3d_runner_device_name(r));
    printf("  n_edges: %d  iters: %d  seed: 0x%016llx\n",
           n_edges, iters, (unsigned long long) (seed ? seed : 0xa57edbeef5717ULL));

    /* Per-edge layout in dst buffer: edge i occupies bytes
     * [i*64 .. i*64+63]. The "edge center" (column 4 of row 0) is at
     * byte offset i*64 + 4. Stride between rows of the same edge = 8. */
    size_t dst_bytes  = (size_t) n_edges * EDGE_BYTES;
    size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t);   /* uvec4 per edge */

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
    if (v3d_runner_create_buffer(r, dst_bytes,  &buf_dst))  return 1;

    /* Master pixel set + thresholds — kept stable across iters. */
    uint8_t *master_pred = malloc(dst_bytes);
    uint8_t *expected    = malloc(dst_bytes);
    int     *Es = malloc(n_edges * sizeof(int));
    int     *Is = malloc(n_edges * sizeof(int));
    int     *Hs = malloc(n_edges * sizeof(int));
    if (!master_pred || !expected || !Es || !Is || !Hs) { fprintf(stderr, "alloc\n"); return 1; }

    for (int i = 0; i < n_edges; i++) {
        gen_edge_pixels(master_pred + (size_t)i * EDGE_BYTES);
        gen_thresholds(&Es[i], &Is[i], &Hs[i]);
    }

    /* Build C-ref expected output (separate copies, since the filter
     * mutates dst in place). */
    memcpy(expected, master_pred, dst_bytes);
    for (int i = 0; i < n_edges; i++) {
        daedalus_vp9_loop_filter_h_4_8_ref(
            expected + (size_t)i * EDGE_BYTES + 4,   /* col 4 of this edge */
            EDGE_STRIDE, Es[i], Is[i], Hs[i]);
    }

    /* Populate GPU buffers. Asserts enforce phase4 §4 contracts. */
    uint32_t *meta = (uint32_t *) buf_meta.mapped;
    uint32_t dst_stride_u8 = EDGE_STRIDE;
    assert(dst_stride_u8 >= 4 && "phase4 §4 contract 2 violated");
    for (int i = 0; i < n_edges; i++) {
        uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4);
        assert(mx >= 4 && "phase4 §4 contract 1 violated");
        meta[4*i + 0] = mx;
        meta[4*i + 1] = (uint32_t) Es[i];
        meta[4*i + 2] = (uint32_t) Is[i];
        meta[4*i + 3] = (uint32_t) Hs[i];
    }
    memcpy(buf_dst.mapped, master_pred, dst_bytes);

    /* --- Pre-flight estimate of fm/hev pass rates --- */
    double fm_rate, hev_rate;
    estimate_pass_rates(seed, 10000, &fm_rate, &hev_rate);
    printf("  fm pass rate:  %.2f%% (10k-edge sample)\n",  fm_rate  * 100);
    printf("  hev pass rate: %.2f%% (of fm-passing)\n",    hev_rate * 100);

    /* --- Pipeline --- */
    v3d_pipeline pipe = {0};
    if (v3d_runner_create_pipeline(r, spv_path,
                                   /*n_ssbos=*/2,
                                   /*push_const_size=*/sizeof(push_consts),
                                   &pipe)) return 1;
    v3d_buffer bind_bufs[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 2)) return 1;

    const uint32_t edges_per_wg = 32;
    uint32_t group_count_x = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg);
    printf("  dispatch: %u WGs × 256 invocations = %u edges (rounded up from %d)\n",
           group_count_x, group_count_x * edges_per_wg, n_edges);

    push_consts pc = {
        .n_edges       = (uint32_t) n_edges,
        .dst_stride_u8 = dst_stride_u8,
        ._pad0 = 0, ._pad1 = 0,
    };

    /* Record command buffer once. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
    if (cb == VK_NULL_HANDLE) return 1;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    vkBeginCommandBuffer(cb, &cbbi);
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, group_count_x, 1, 1);
    vkEndCommandBuffer(cb);

    /* --- M1'': bit-exact verification --- */
    printf("\n=== M1'': QPU vs C-reference bit-exact ===\n");
    memcpy(buf_dst.mapped, master_pred, dst_bytes);
    if (v3d_runner_submit_wait(r, cb)) return 1;

    int mismatch_edges = 0;
    int total_byte_diffs = 0;
    int prints = 0;
    for (int i = 0; i < n_edges; i++) {
        const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * EDGE_BYTES;
        const uint8_t *e = expected + (size_t)i * EDGE_BYTES;
        if (memcmp(q, e, EDGE_BYTES) != 0) {
            int diffs = 0;
            for (int j = 0; j < EDGE_BYTES; j++) if (q[j] != e[j]) diffs++;
            total_byte_diffs += diffs;
            if (prints < 3) {
                fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d): %d/64 bytes differ\n",
                        i, Es[i], Is[i], Hs[i], diffs);
                fprintf(stderr, "  ref:");
                for (int r0 = 0; r0 < 8; r0++) {
                    fprintf(stderr, "\n    r%d ", r0);
                    for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]);
                }
                fprintf(stderr, "\n  qpu:");
                for (int r0 = 0; r0 < 8; r0++) {
                    fprintf(stderr, "\n    r%d ", r0);
                    for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]);
                }
                fprintf(stderr, "\n");
                prints++;
            }
            mismatch_edges++;
        }
    }
    printf("  edges bit-exact: %d / %d (%.4f%%)\n",
           n_edges - mismatch_edges, n_edges,
           100.0 * (n_edges - mismatch_edges) / n_edges);
    printf("  total byte diffs: %d / %zu (%.4f%%)\n",
           total_byte_diffs, (size_t) n_edges * EDGE_BYTES,
           100.0 * total_byte_diffs / ((double) n_edges * EDGE_BYTES));

    if (mismatch_edges > 0) {
        fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
        v3d_runner_destroy_pipeline(r, &pipe);
        v3d_runner_destroy_buffer(r, &buf_dst);
        v3d_runner_destroy_buffer(r, &buf_meta);
        v3d_runner_destroy(r);
        return 1;
    }

    if (verify_only) {
        v3d_runner_destroy_pipeline(r, &pipe);
        v3d_runner_destroy_buffer(r, &buf_dst);
        v3d_runner_destroy_buffer(r, &buf_meta);
        v3d_runner_destroy(r);
        return 0;
    }

    /* --- M2'': throughput --- */
    printf("\n=== M2'': QPU throughput ===\n");

    for (int i = 0; i < 10; i++) {     /* warm-up */
        memcpy(buf_dst.mapped, master_pred, dst_bytes);
        if (v3d_runner_submit_wait(r, cb)) return 1;
    }

    double t0 = now_seconds();
    for (int i = 0; i < iters; i++) {
        memcpy(buf_dst.mapped, master_pred, dst_bytes);
        if (v3d_runner_submit_wait(r, cb)) return 1;
    }
    double t1 = now_seconds();

    double s0 = now_seconds();
    for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_pred, dst_bytes);
    double s1 = now_seconds();

    double kernel_seconds = (t1 - t0) - (s1 - s0);
    double total_edges = (double) n_edges * iters;
    double medges_s = total_edges / kernel_seconds / 1e6;

    printf("  edges/dispatch:  %d\n", n_edges);
    printf("  iters:           %d\n", iters);
    printf("  total edges:     %.0f\n", total_edges);
    printf("  elapsed (kernel)=%.6f s  (setup-subtracted)\n", kernel_seconds);
    printf("  elapsed (setup) =%.6f s\n", s1 - s0);
    printf("  M2'' throughput = %.3f Medge/s\n", medges_s);
    printf("  per-edge        = %.1f ns\n", kernel_seconds / total_edges * 1e9);
    printf("  per-dispatch    = %.1f us\n", kernel_seconds / iters * 1e6);

    double M3pp = 48.285;   /* from k2_deblock_phase3.md */
    double Rpp  = medges_s / M3pp;
    printf("\n  Cycle 2 NEON M3'' = %.3f Medge/s\n", M3pp);
    printf("  R'' = M2''/M3''   = %.3f\n", Rpp);
    if      (Rpp >= 1.0) printf("  decision band     = GREEN: QPU beats NEON in isolation\n");
    else if (Rpp >= 0.5) printf("  decision band     = YELLOW: M4'' decides\n");
    else if (Rpp >= 0.1) printf("  decision band     = ORANGE: M4'' may still rescue (cycle-1 calibration)\n");
    else                 printf("  decision band     = RED: structural mismatch\n");

    v3d_runner_destroy_pipeline(r, &pipe);
    v3d_runner_destroy_buffer(r, &buf_dst);
    v3d_runner_destroy_buffer(r, &buf_meta);
    v3d_runner_destroy(r);
    free(master_pred); free(expected); free(Es); free(Is); free(Hs);
    return 0;
}