Files
daedalus-fourier/tests/bench_pool_overhead.c
T
claude-noether 0a042a8e95 v3d_runner: buffer pool for QPU dispatch hot path
Per the QPU-default substrate decree 2026-05-23: the per-dispatch
vkAllocateMemory in dispatch_*_qpu was the biggest single fixable
contributor to QPU dispatch overhead.  This pools v3d_buffer
allocations by power-of-2 size class so the second-and-subsequent
dispatch hits a freelist instead of paying ~10-50us of Mesa-V3D7
memory-allocation cost per call.

API additions (v3d_runner.h):
  - v3d_runner_acquire_buffer(): pulls from per-bucket freelist;
    falls through to v3d_runner_create_buffer() on miss.
  - v3d_runner_release_buffer(): pushes back onto the freelist; the
    backing VkBuffer/VkDeviceMemory only get vkFreeMemory'd in
    v3d_runner_destroy().
  - v3d_runner_pool_total_bytes(): diagnostic watermark.

Size classes 2^8..2^23 (256 B to 8 MiB).  Oversize requests fall
through to non-pooled (vkAllocateMemory) for both ends — pool stays
correct, just degenerates to old behaviour for those calls.

Migration: daedalus_core.c dispatch_*_qpu paths globally swap
create_buffer → acquire_buffer and destroy_buffer → release_buffer.
All five QPU dispatch functions (idct8 / lpf / mc_8h / cdef /
h264_deblock) now reuse buffers across calls.  test_api_idct stays
bit-exact (4096/4096 bytes on CPU/QPU/AUTO substrates on hertz).

Microbench (tests/bench_pool_overhead.c) on hertz (Pi 5,
6.18.29+rpt-rpi-2712, V3D 7.1):

  call 0:  434.89 us  (cold — 3x vkAllocateMemory)
  call 1:  100.06 us  (pool hit on all 3 buffers)
  steady-state:
    p50:    76.44 us
    p90:    90.52 us
    mean:   77.95 us
  first-call / steady-state ratio: 5.7x

The remaining ~76us steady-state is dominated by vkQueueWaitIdle +
shader execution + per-call descriptor-set update + command-buffer
allocation — addressed in follow-on tasks 161 (persistent cmdbuf)
and 162 (dmabuf import for dst, eliminates memcpy in/out).

Refs daedalus-fourier task #160.
2026-05-23 19:52:50 +02:00

121 lines
3.9 KiB
C

/*
* bench_pool_overhead — measure QPU dispatch overhead with and without
* the v3d_runner buffer pool warm.
*
* Times N consecutive daedalus_dispatch_vp9_idct8 calls and
* prints the per-call distribution. The first call pays
* vkAllocateMemory (typically tens of microseconds on V3D7's Mesa);
* the second and subsequent should hit the pool freelist and amortise
* to the pure dispatch-floor cost.
*
* Purpose: provide a concrete before/after number for the QPU-default
* substrate decree (2026-05-23). Bench is non-gating and runs in
* fractions of a second.
*
* License: BSD-2-Clause.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include "../include/daedalus.h"
/*
 * Forward declaration of a v3d_runner pool diagnostic. It is declared here
 * so the bench could print it, but nothing in this file calls it.
 * NOTE(review): the void * argument is presumably the runner handle —
 * confirm against v3d_runner.h before using it.
 */
extern size_t v3d_runner_pool_total_bytes(void *); /* exposed if we wanted it */
/*
 * Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus the nanosecond field scaled by 1e-9).
 * The return value of clock_gettime() is not inspected.
 */
static double now_seconds(void)
{
    struct timespec stamp;
    double whole;
    double frac;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    frac  = stamp.tv_nsec * 1e-9;
    whole = stamp.tv_sec;
    return whole + frac;
}
/*
 * qsort() comparator for doubles, ascending order.
 * Returns -1, 1 or 0; 0 is also the result when neither operand
 * compares less than the other.
 */
static int cmp_double(const void *a, const void *b)
{
    const double *lhs = a;
    const double *rhs = b;

    if (*lhs < *rhs) {
        return -1;
    }
    if (*lhs > *rhs) {
        return 1;
    }
    return 0;
}
/*
 * Time repeated QPU IDCT8 dispatches and print the per-call latency
 * distribution.
 *
 * Usage: bench_pool_overhead [n_calls]      (default 200)
 *
 * The first few calls are printed individually; when enough calls were
 * made, sorted min/p50/p90/p99/max/mean are printed for the calls after
 * the warm-up window, plus the ratio of call 0 to the steady-state median.
 *
 * Exit status:
 *   0  success
 *   1  bad argument, context creation failure, allocation failure or a
 *      dispatch returning non-zero
 *   2  the context reports no QPU
 */
int main(int argc, char **argv)
{
    enum {
        DEFAULT_CALLS = 200,
        MAX_CALLS     = 10000000, /* bounds the timing array (one double per call) */
        N_BLOCKS      = 8,        /* one MB column of 8x8 IDCT blocks */
        STRIDE        = 64,       /* handed to the dispatch as the dst stride */
        SHOWN_CALLS   = 5,        /* must agree with the "first 5 calls" heading */
        WARMUP_CALLS  = 10,       /* dropped from the steady-state stats */
        MIN_STEADY    = 10        /* stats are skipped unless more than this many calls remain */
    };

    int status = 1;
    int n_calls = DEFAULT_CALLS;
    int has_qpu;
    daedalus_ctx *ctx = NULL;
    int16_t *coeffs = NULL;
    uint8_t *dst = NULL;
    daedalus_idct8_meta *meta = NULL;
    double *t = NULL;
    double *sorted = NULL;

    /* Validate the optional call count before touching the device:
     * reject non-numeric input, trailing junk, and values outside
     * 1..MAX_CALLS (a strtol overflow saturates and lands out of range). */
    if (argc > 1) {
        char *end = NULL;
        long v = strtol(argv[1], &end, 10);

        if (end == argv[1] || *end != '\0' || v < 1 || v > MAX_CALLS) {
            fprintf(stderr, "usage: %s [n_calls]  (1..%d)\n",
                    argv[0], (int) MAX_CALLS);
            return 1;
        }
        n_calls = (int) v;
    }

    ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "ctx create failed\n");
        return 1;
    }

    has_qpu = daedalus_ctx_has_qpu(ctx);
    printf("ctx: has_qpu=%d\n", has_qpu);
    if (!has_qpu) {
        fprintf(stderr, "QPU not available on this device; bench needs V3D\n");
        status = 2;
        goto out;
    }

    /* Build a representative IDCT 8x8 batch and a zeroed dst buffer,
     * plus one timing slot per call. */
    coeffs = calloc((size_t) N_BLOCKS * 64, sizeof *coeffs);
    dst    = calloc((size_t) N_BLOCKS * 8 * STRIDE, 1);
    meta   = calloc((size_t) N_BLOCKS, sizeof *meta);
    t      = malloc((size_t) n_calls * sizeof *t);
    if (!coeffs || !dst || !meta || !t) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }

    /* Fixed-seed xorshift generator: every run gets the same coefficients,
     * each in the range -1024..1023. */
    {
        uint64_t s = 0x1234567abcdefULL;

        for (size_t i = 0; i < (size_t) N_BLOCKS * 64; i++) {
            s ^= s << 13;
            s ^= s >> 7;
            s ^= s << 17;
            coeffs[i] = (int16_t)(s & 0x7ff) - 0x400;
        }
    }

    /* Blocks sit side by side on row 0, 8 apart in dst. */
    for (int b = 0; b < N_BLOCKS; b++) {
        meta[b].dst_off = (uint32_t) b * 8;
        meta[b].block_x = (uint32_t) b;
        meta[b].block_y = 0;
    }

    printf("=== dispatching %d times, n_blocks=%d/call ===\n",
           n_calls, (int) N_BLOCKS);
    for (int i = 0; i < n_calls; i++) {
        double t0 = now_seconds();
        int rc = daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_QPU,
                                             dst, (size_t) STRIDE,
                                             coeffs, (size_t) N_BLOCKS, meta);
        double t1 = now_seconds();

        if (rc) {
            fprintf(stderr, "dispatch %d rc=%d\n", i, rc);
            goto out;
        }
        t[i] = (t1 - t0) * 1e6; /* us */
    }

    /* Per-call distribution (first few + sorted summary on the steady-state) */
    printf("\nfirst 5 calls (cold-warm transition):\n");
    for (int i = 0; i < SHOWN_CALLS && i < n_calls; i++) {
        printf(" call %d: %.2f us\n", i, t[i]);
    }

    if (n_calls > WARMUP_CALLS + MIN_STEADY) {
        int n = n_calls - WARMUP_CALLS;
        double sum = 0;

        /* Sort a copy so t[] keeps call order (t[0] is needed below). */
        sorted = malloc((size_t) n * sizeof *sorted);
        if (!sorted) {
            fprintf(stderr, "alloc fail\n");
            goto out;
        }
        memcpy(sorted, t + WARMUP_CALLS, (size_t) n * sizeof *sorted);
        qsort(sorted, (size_t) n, sizeof *sorted, cmp_double);
        for (int i = 0; i < n; i++) {
            sum += sorted[i];
        }

        printf("\nsteady-state stats (calls %d..%d, n=%d):\n",
               (int) WARMUP_CALLS, n_calls - 1, n);
        printf(" min: %.2f us\n", sorted[0]);
        printf(" p50: %.2f us\n", sorted[n / 2]);
        printf(" p90: %.2f us\n", sorted[(int)(n * 0.9)]);
        printf(" p99: %.2f us\n", sorted[(int)(n * 0.99)]);
        printf(" max: %.2f us\n", sorted[n - 1]);
        printf(" mean: %.2f us\n", sum / n);
        printf("\nfirst-call / steady-state median ratio: %.1fx\n",
               t[0] / sorted[n / 2]);
    }

    status = 0;

out:
    /* Single exit: free(NULL) is a no-op, and ctx is non-NULL here. */
    free(sorted);
    free(t);
    free(coeffs);
    free(dst);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return status;
}