daedalus-decoder/tests/bench_flush_frame.c

/* SPDX-License-Identifier: BSD-2-Clause */
/* Needed for CLOCK_MONOTONIC under strict -std=c11 (CMAKE_C_EXTENSIONS=OFF). */
#define _POSIX_C_SOURCE 200809L
/*
 * bench_flush_frame — IDCT-layer throughput baseline.
 *
 * Times daedalus_decoder_flush_frame at a configurable coded
 * resolution with random coefficients (the dispatch path doesn't
 * care if the residuals are meaningful, only the layout / counts /
 * bit-exactness; perf is independent of coefficient content).
 *
 * NOT a ctest — produces wall-time numbers, doesn't pass/fail.
 * Invoke manually after a build:
 *
 *   ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate]
 *
 * Defaults: 1920 1088 100 5 auto
 *
 * The [substrate] argument selects the dispatch path:
 *   auto — recipe table picks (V3D7 when available, else NEON)
 *   cpu  — force NEON path
 *   qpu  — force V3D7 path (fails on hosts without it)
 *
 * Run both to quantify the substrate gap.  The "QPU is default
 * substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md)
 * is a policy claim; this bench is how we measure whether the policy
 * pays off for the IDCT layer specifically.
 *
 * The first `warmup` iterations are excluded from the timing
 * average because the daedalus-fourier shader pool needs to
 * materialise pipelines + buffer pool entries on the first few
 * calls (cycle 8b buffer-pool work amortises this; this bench is
 * how we'd notice if that ever regresses).
 *
 * Output gives:
 *   - per-frame mean / median / p99 latency
 *   - frames per second steady-state
 *   - vs. the 30 fps @ 1080p target from the user's
 *     project_30fps_floor_is_fine.md memory
 *
 * NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4).  It does
 * NOT include intra prediction, MC, or deblock — those land in
 * Stage 2+ / 4.  A 30 fps number here is necessary-but-not-sufficient
 * for the final decoder hitting the same.
 */

#include "daedalus_decoder.h"

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* State word of the xorshift64 generator below.  Seed it with a non-zero
 * value before use: a zero state maps to zero on every step. */
static uint64_t xs64_state;

/* Advance the generator by one step (shift triple 13 / 7 / 17), store the
 * result back into xs64_state and return it as the next pseudo-random
 * 64-bit word. */
static uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs64_state = s;
    return s;
}

/* qsort comparator for an array of double, ascending order.
 * Returns -1, 0 or 1; any pair that compares neither less nor greater
 * (equal values, or a NaN operand) yields 0. */
static int cmp_double(const void *a, const void *b)
{
    const double *lhs = a;
    const double *rhs = b;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return 1;
    return 0;
}

/* Read CLOCK_MONOTONIC and return the reading in milliseconds as a double
 * (whole seconds scaled up, nanoseconds scaled down).  Only differences
 * between two readings are meaningful. */
static double now_ms(void)
{
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);

    const double sec_part_ms  = t.tv_sec * 1000.0;
    const double nsec_part_ms = t.tv_nsec / 1.0e6;
    return sec_part_ms + nsec_part_ms;
}

/*
 * Parse the decimal integer in `text` into *out, accepting only values in
 * [min_val, max_val].  Unlike atoi this rejects empty strings, trailing
 * garbage and out-of-range input instead of silently producing 0 or an
 * unspecified value.
 *
 * Returns 0 on success.  On failure prints a diagnostic naming `what` to
 * stderr, leaves *out untouched and returns -1.
 */
static int parse_int_arg(const char *text, const char *what,
                         int min_val, int max_val, int *out)
{
    char *end = NULL;

    errno = 0;
    long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        v < min_val || v > max_val) {
        fprintf(stderr, "invalid %s '%s' (want integer in [%d, %d])\n",
                what, text, min_val, max_val);
        return -1;
    }
    *out = (int) v;
    return 0;
}

/*
 * Benchmark driver.
 *
 *   argv[1] width      (default 1920)
 *   argv[2] height     (default 1088)
 *   argv[3] iters      (default 100)
 *   argv[4] warmup     (default 5, must be < iters)
 *   argv[5] substrate  (auto | cpu | qpu, default auto)
 *
 * Returns 0 after printing the statistics, or when the decoder context
 * cannot be created (reported as SKIP); returns 1 on invalid arguments,
 * allocation failure or any decoder call failing.
 */
int main(int argc, char **argv)
{
    enum {
        /* Coefficients handed to the decoder per macroblock.
         * NOTE(review): presumably 256 luma + 2 x 64 chroma — confirm
         * against daedalus_decoder.h. */
        COEFFS_PER_MB = 384,
        /* At least one whole 16x16 macroblock in each direction. */
        MIN_DIM       = 16,
        /* Sanity bound: keeps width * height, the MB count and every
         * buffer size below within int / 32-bit size_t range, and the MB
         * coordinates within the uint16_t they are cast to. */
        MAX_DIM       = 16384
    };

    int width  = 1920;
    int height = 1088;
    int iters  = 100;
    int warmup = 5;

    if (argc > 1 && parse_int_arg(argv[1], "width",  MIN_DIM, MAX_DIM, &width)  != 0)
        return 1;
    if (argc > 2 && parse_int_arg(argv[2], "height", MIN_DIM, MAX_DIM, &height) != 0)
        return 1;
    if (argc > 3 && parse_int_arg(argv[3], "iters",  1, INT_MAX, &iters)  != 0)
        return 1;
    /* A negative warmup would leave the leading sample slots unwritten. */
    if (argc > 4 && parse_int_arg(argv[4], "warmup", 0, INT_MAX, &warmup) != 0)
        return 1;

    daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
    const char *sub_name = "auto";
    if (argc > 5) {
        if      (!strcmp(argv[5], "cpu"))  { sub = DAEDALUS_DECODER_SUBSTRATE_CPU;  sub_name = "cpu"; }
        else if (!strcmp(argv[5], "qpu"))  { sub = DAEDALUS_DECODER_SUBSTRATE_QPU;  sub_name = "qpu"; }
        else if (!strcmp(argv[5], "auto")) { /* default */ }
        else {
            fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]);
            return 1;
        }
    }

    if (warmup >= iters) {
        fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
        return 1;
    }

    int mb_w = width  / 16;
    int mb_h = height / 16;
    int n_mbs = mb_w * mb_h;
    printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n",
           width, height, n_mbs, iters, warmup, sub_name);

    /* Everything released at `cleanup`.  Declared (and nulled) before the
     * first goto so the cleanup code never sees an indeterminate pointer. */
    int rc = 1;
    daedalus_decoder *dec = NULL;
    int16_t (*per_mb)[COEFFS_PER_MB] = NULL;
    uint8_t *mb_8x8 = NULL;
    uint8_t *out_y  = NULL;
    uint8_t *out_uv = NULL;
    double  *samples = NULL;

    dec = daedalus_decoder_create(width, height);
    if (!dec) {
        /* Treated as a skip, not a failure: exit status stays 0. */
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        return 0;
    }
    if (daedalus_decoder_set_substrate(dec, sub) != 0) {
        fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
        goto cleanup;
    }
    printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));

    /* Pre-generate per-MB random coeffs once.  We re-append the same
     * per-MB buffer across iterations — the dispatch path doesn't
     * cache anything per-MB across frames, so this is representative. */
    xs64_state = 0xfeedface5a5a5a5aULL;
    per_mb = malloc((size_t) n_mbs * sizeof(*per_mb));
    mb_8x8 = malloc((size_t) n_mbs);
    if (!per_mb || !mb_8x8) {
        fprintf(stderr, "alloc fail\n");
        goto cleanup;
    }
    for (int mb = 0; mb < n_mbs; mb++) {
        /* Coefficients uniformly drawn from [-512, 511]. */
        for (int i = 0; i < COEFFS_PER_MB; i++)
            per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
        mb_8x8[mb] = (mb & 1) ? 1 : 0;  /* same 50/50 mix as bit-exact test */
    }

    /* Output planes: luma is width * height bytes, the second plane half
     * of that; both are passed to flush_frame with a stride of `width`. */
    size_t y_size  = (size_t) width * (size_t) height;
    size_t uv_size = y_size / 2;
    out_y  = malloc(y_size);
    out_uv = malloc(uv_size);
    if (!out_y || !out_uv) {
        fprintf(stderr, "alloc fail\n");
        goto cleanup;
    }

    /* Sample buffer for per-iteration timings (post-warmup).  calloc
     * checks the count * size product for overflow. */
    int sample_count = iters - warmup;   /* >= 1: 0 <= warmup < iters */
    samples = calloc((size_t) sample_count, sizeof(*samples));
    if (!samples) {
        fprintf(stderr, "alloc fail\n");
        goto cleanup;
    }

    for (int it = 0; it < iters; it++) {
        /* Re-append all MBs for the frame.  flush_frame resets
         * mbs_appended to 0 internally on completion, so this loop
         * is exactly the cost we'd pay per real frame. */
        struct daedalus_decoder_mb_input mb = {0};
        for (int my = 0; my < mb_h; my++) {
            for (int mx = 0; mx < mb_w; mx++) {
                int idx = my * mb_w + mx;
                mb.mb_x = (uint16_t) mx;
                mb.mb_y = (uint16_t) my;
                mb.coeffs = per_mb[idx];
                mb.transform_8x8 = mb_8x8[idx];
                if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                    fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
                    goto cleanup;
                }
            }
        }

        /* Only the flush call itself is timed; the append loop is not. */
        double t0 = now_ms();
        int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
                                                out_uv, (size_t) width);
        double t1 = now_ms();
        if (frc != 0) {
            fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
            goto cleanup;
        }

        if (it >= warmup) samples[it - warmup] = t1 - t0;
    }

    /* Stats over the sorted post-warmup samples. */
    qsort(samples, (size_t) sample_count, sizeof(double), cmp_double);
    double sum = 0;
    for (int i = 0; i < sample_count; i++) sum += samples[i];
    double mean   = sum / sample_count;
    double median = samples[sample_count / 2];
    /* Widen before multiplying so a large sample_count cannot overflow int. */
    double p99    = samples[(size_t) (((long long) sample_count * 99) / 100)];
    double min_   = samples[0];
    double max_   = samples[sample_count - 1];

    printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
    printf("  min    = %7.3f ms\n", min_);
    printf("  median = %7.3f ms\n", median);
    printf("  mean   = %7.3f ms\n", mean);
    printf("  p99    = %7.3f ms\n", p99);
    printf("  max    = %7.3f ms\n", max_);

    double fps_mean   = 1000.0 / mean;
    double fps_median = 1000.0 / median;
    printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
    printf("  mean   = %.1f fps\n", fps_mean);
    printf("  median = %.1f fps\n", fps_median);
    printf("  target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
    if (fps_median >= 30.0)
        printf("  status = MEETS target (with %.1fx headroom for "
               "intra/MC/deblock)\n", fps_median / 30.0);
    else
        printf("  status = BELOW target (need %.1fx speedup just at IDCT)\n",
               30.0 / fps_median);

    rc = 0;

cleanup:
    /* Single exit for every path after the decoder exists. */
    free(samples);
    free(out_uv);
    free(out_y);
    free(mb_8x8);
    free(per_mb);
    daedalus_decoder_destroy(dec);
    return rc;
}