/* * bench_pool_overhead — measure QPU dispatch overhead with and without * the v3d_runner buffer pool warm. * * Times N consecutive daedalus_recipe_dispatch_vp9_idct8 calls and * prints the per-call distribution. The first call pays * vkAllocateMemory (typically tens of microseconds on V3D7's Mesa); * the second and subsequent should hit the pool freelist and amortise * to the pure dispatch-floor cost. * * Purpose: provide a concrete before/after number for the QPU-default * substrate decree (2026-05-23). Bench is non-gating and runs in * fractions of a second. * * License: BSD-2-Clause. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include "../include/daedalus.h" extern size_t v3d_runner_pool_total_bytes(void *); /* exposed if we wanted it */ static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } static int cmp_double(const void *a, const void *b) { double da = *(const double *)a, db = *(const double *)b; return da < db ? -1 : da > db ? 1 : 0; } int main(int argc, char **argv) { int n_calls = argc > 1 ? atoi(argv[1]) : 200; int n_blocks = 8; /* one MB column of 8x8 IDCT blocks */ int stride = 64; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; } int has_qpu = daedalus_ctx_has_qpu(ctx); printf("ctx: has_qpu=%d\n", has_qpu); if (!has_qpu) { fprintf(stderr, "QPU not available on this device; bench needs V3D\n"); daedalus_ctx_destroy(ctx); return 2; } /* Build a representative IDCT 8x8 batch and warm a dst buffer. */ int16_t *coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t)); uint8_t *dst = calloc((size_t) n_blocks * 8 * stride, 1); daedalus_idct8_meta *meta = calloc((size_t) n_blocks, sizeof(*meta)); if (!coeffs || !dst || !meta) { fprintf(stderr, "alloc fail\n"); return 1; } uint64_t s = 0x1234567abcdefULL; for (size_t i = 0; i < (size_t) n_blocks * 64; i++) { s ^= s << 13; s ^= s >> 7; s ^= s << 17; coeffs[i] = (int16_t)(s & 0x7ff) - 0x400; } for (int b = 0; b < n_blocks; b++) { meta[b].dst_off = (uint32_t) b * 8; meta[b].block_x = (uint32_t) b; meta[b].block_y = 0; } double *t = malloc((size_t) n_calls * sizeof(double)); int rc; printf("=== dispatching %d times, n_blocks=%d/call ===\n", n_calls, n_blocks); for (int i = 0; i < n_calls; i++) { double t0 = now_seconds(); rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU, dst, (size_t) stride, coeffs, (size_t) n_blocks, meta); double t1 = now_seconds(); if (rc) { fprintf(stderr, "dispatch %d rc=%d\n", i, rc); return 1; } t[i] = (t1 - t0) * 1e6; /* us */ } /* Per-call distribution (first few + sorted summary on the steady-state) */ printf("\nfirst 5 calls (cold-warm transition):\n"); for (int i = 0; i < 5 && i < n_calls; i++) printf(" call %d: %.2f us\n", i, t[i]); int skip = 10; /* drop warm-up calls from the steady-state stats */ if (n_calls > skip + 10) { int n = n_calls - skip; double *s_arr = malloc((size_t) n * sizeof(double)); memcpy(s_arr, t + skip, (size_t) n * sizeof(double)); qsort(s_arr, (size_t) n, sizeof(double), cmp_double); double sum = 0; for (int i = 0; i < n; i++) sum += s_arr[i]; printf("\nsteady-state stats (calls %d..%d, n=%d):\n", skip, n_calls - 1, n); printf(" min: %.2f us\n", s_arr[0]); printf(" p50: %.2f us\n", s_arr[n / 2]); printf(" p90: %.2f us\n", s_arr[(int)(n * 0.9)]); printf(" p99: %.2f us\n", s_arr[(int)(n * 0.99)]); printf(" max: %.2f us\n", s_arr[n - 1]); printf(" mean: %.2f us\n", sum / n); printf("\nfirst-call / steady-state median ratio: %.1fx\n", t[0] / s_arr[n / 2]); free(s_arr); } free(t); free(coeffs); free(dst); free(meta); daedalus_ctx_destroy(ctx); return 0; }