daedalus-fourier/tests/bench_pool_overhead.c

/*
 * bench_pool_overhead — measure QPU dispatch overhead with and without
 * the v3d_runner buffer pool warm.
 *
 * Times N consecutive daedalus_dispatch_vp9_idct8 calls and
 * prints the per-call distribution.  The first call pays
 * vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
 * the second and subsequent should hit the pool freelist and amortise
 * to the pure dispatch-floor cost.
 *
 * Purpose: provide a concrete before/after number for the QPU-default
 * substrate decree (2026-05-23).  Bench is non-gating and runs in
 * fractions of a second.
 *
 * License: BSD-2-Clause.
 */
#define _POSIX_C_SOURCE 200809L

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>

#include "../include/daedalus.h"

/*
 * Prototype for a v3d_runner pool-size query.  Nothing in the visible part
 * of this file calls it, so the benchmark does not depend on the symbol
 * being linked in.
 * NOTE(review): the parameter is an untyped pointer here; confirm the
 * signature against the runner's real definition before using it, or drop
 * the declaration if it is not going to be used.
 */
extern size_t v3d_runner_pool_total_bytes(void *);

/*
 * Read a monotonic clock and return the reading in seconds as a double.
 *
 * CLOCK_MONOTONIC_RAW is preferred; it is a Linux extension rather than a
 * POSIX clock, so the helper falls back to CLOCK_MONOTONIC when the raw
 * clock is not defined at build time or the call fails at run time.
 *
 * Returns 0.0 if no monotonic clock could be read, so a failed call never
 * leads to an uninitialised timespec being used.
 */
static double now_seconds(void)
{
	struct timespec ts = { 0 };
	int have_time = 0;

#ifdef CLOCK_MONOTONIC_RAW
	have_time = (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == 0);
#endif
#ifdef CLOCK_MONOTONIC
	if (!have_time)
		have_time = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
#endif
	if (!have_time)
		return 0.0;

	/* Whole seconds plus the nanosecond remainder scaled to seconds. */
	return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}

/*
 * Three-way comparison of two doubles for ascending order.
 *
 * a and b point at the doubles to compare.  Returns -1 when *a is smaller,
 * 1 when *a is larger, and 0 otherwise (equal or unordered operands).
 */
static int cmp_double(const void *a, const void *b)
{
	const double *lhs = a;
	const double *rhs = b;

	/* Each relational test yields 0 or 1, so the difference is -1, 0 or 1. */
	return (*lhs > *rhs) - (*lhs < *rhs);
}

/*
 * Largest call count accepted on the command line.  Keeps the value inside
 * int range and the timing array's byte size inside a 32-bit size_t.
 */
enum { BENCH_MAX_CALLS = 100000000 };

/*
 * Parse the optional call-count argument as a base-10 integer.
 *
 * Stores the value in *out and returns 0 when arg is a complete number in
 * [1, BENCH_MAX_CALLS]; returns -1 (leaving *out untouched) for empty or
 * non-numeric text, trailing characters, or an out-of-range value.
 * strtol saturates at LONG_MIN/LONG_MAX on overflow, which the range check
 * rejects as well.
 */
static int parse_call_count(const char *arg, int *out)
{
	char *end = NULL;
	long value = strtol(arg, &end, 10);

	if (end == arg || *end != '\0')
		return -1;
	if (value < 1 || value > BENCH_MAX_CALLS)
		return -1;

	*out = (int) value;
	return 0;
}

/*
 * Benchmark entry point.
 *
 * Usage: bench_pool_overhead [n_calls]   (default 200)
 *
 * Dispatches the same IDCT batch n_calls times on the QPU substrate, timing
 * each call, then prints the first few timings and sorted summary
 * statistics for the calls after the warm-up window.
 *
 * Exit status: 0 on success, 1 on a bad argument, context-creation,
 * allocation or dispatch failure, 2 when the context reports no QPU.
 * Every exit taken after the context exists goes through the cleanup label
 * so the buffers and the context are always released.
 */
int main(int argc, char **argv)
{
	int n_calls = 200;
	int n_blocks = 8;	/* one MB column of 8x8 IDCT blocks */
	int stride = 64;
	int skip = 10;		/* drop warm-up calls from the steady-state stats */
	int status = 1;
	int has_qpu;
	uint64_t s = 0x1234567abcdefULL;

	int16_t *coeffs = NULL;
	uint8_t *dst = NULL;
	daedalus_idct8_meta *meta = NULL;
	double *t = NULL;
	double *sorted = NULL;
	daedalus_ctx *ctx;

	if (argc > 1 && parse_call_count(argv[1], &n_calls) != 0) {
		fprintf(stderr, "usage: %s [n_calls]  (1 <= n_calls <= %d)\n",
			argv[0], BENCH_MAX_CALLS);
		return 1;
	}

	ctx = daedalus_ctx_create();
	if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
	has_qpu = daedalus_ctx_has_qpu(ctx);
	printf("ctx: has_qpu=%d\n", has_qpu);
	if (!has_qpu) {
		fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
		status = 2;
		goto out;
	}

	/* Build a representative IDCT 8x8 batch and warm a dst buffer. */
	coeffs = calloc((size_t) n_blocks * 64, sizeof(int16_t));
	dst    = calloc((size_t) n_blocks * 8 * stride, 1);
	meta   = calloc((size_t) n_blocks, sizeof(*meta));
	t      = malloc((size_t) n_calls * sizeof(double));
	if (!coeffs || !dst || !meta || !t) {
		fprintf(stderr, "alloc fail\n");
		goto out;
	}

	/* Deterministic xorshift fill: coefficients land in [-0x400, 0x3ff]. */
	for (size_t i = 0; i < (size_t) n_blocks * 64; i++) {
		s ^= s << 13; s ^= s >> 7; s ^= s << 17;
		coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
	}
	for (int b = 0; b < n_blocks; b++) {
		meta[b].dst_off = (uint32_t) b * 8;
		meta[b].block_x = (uint32_t) b;
		meta[b].block_y = 0;
	}

	printf("=== dispatching %d times, n_blocks=%d/call ===\n",
	       n_calls, n_blocks);

	for (int i = 0; i < n_calls; i++) {
		double t0 = now_seconds();
		int rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
						      dst, (size_t) stride,
						      coeffs, (size_t) n_blocks, meta);
		double t1 = now_seconds();
		if (rc) {
			fprintf(stderr, "dispatch %d rc=%d\n", i, rc);
			goto out;
		}
		t[i] = (t1 - t0) * 1e6;	/* us */
	}

	/* Per-call distribution (first few + sorted summary on the steady-state) */
	printf("\nfirst 5 calls (cold-warm transition):\n");
	for (int i = 0; i < 5 && i < n_calls; i++)
		printf("  call %d:  %.2f us\n", i, t[i]);

	/* Only summarise when more than 10 samples remain after the warm-up. */
	if (n_calls > skip + 10) {
		int n = n_calls - skip;
		double sum = 0;

		sorted = malloc((size_t) n * sizeof(double));
		if (!sorted) {
			fprintf(stderr, "alloc fail\n");
			goto out;
		}
		memcpy(sorted, t + skip, (size_t) n * sizeof(double));
		qsort(sorted, (size_t) n, sizeof(double), cmp_double);
		for (int i = 0; i < n; i++) sum += sorted[i];

		printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
		       skip, n_calls - 1, n);
		printf("  min:    %.2f us\n", sorted[0]);
		printf("  p50:    %.2f us\n", sorted[n / 2]);
		/* n * 0.9 and n * 0.99 truncate to at most n - 1, so both stay in range. */
		printf("  p90:    %.2f us\n", sorted[(int)(n * 0.9)]);
		printf("  p99:    %.2f us\n", sorted[(int)(n * 0.99)]);
		printf("  max:    %.2f us\n", sorted[n - 1]);
		printf("  mean:   %.2f us\n", sum / n);
		printf("\nfirst-call / steady-state median ratio: %.1fx\n",
		       t[0] / sorted[n / 2]);
	}

	status = 0;

out:
	/* free(NULL) is a no-op, so partially completed setups unwind safely. */
	free(sorted);
	free(t);
	free(meta);
	free(dst);
	free(coeffs);
	daedalus_ctx_destroy(ctx);
	return status;
}