phase1: add IDCT-layer throughput benchmark (bench_flush_frame) #8

Merged
marfrit merged 1 commits from noether/phase1-bench-flush into main 2026-05-24 21:03:11 +00:00
2 changed files with 200 additions and 0 deletions
Showing only changes of commit 352373a9be - Show all commits
+11
View File
@@ -128,6 +128,17 @@ add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
# gets slow we'll split into a CTest LABEL for opt-in.
add_test(NAME idct_bitexact_1080p COMMAND test_idct_bitexact 1920 1088)
# ---- Benchmarks (not gated by ctest) ------------------------------
#
# Build-time only; user runs them by hand when checking perf. Adding
# them as ctest would make every CI run slow and the numbers would
# get drowned in pass/fail noise. See the header of each .c for what
# they measure.
# Benchmark executable built from the single source tests/bench_flush_frame.c.
add_executable(bench_flush_frame tests/bench_flush_frame.c)
# Link against the daedalus_decoder target; PRIVATE because nothing
# consumes this executable as a link dependency.
target_link_libraries(bench_flush_frame PRIVATE daedalus_decoder)
# Add -O2 to this target's compile flags only (PRIVATE scope).
target_compile_options(bench_flush_frame PRIVATE -O2)
# ---- Install ------------------------------------------------------
#
# Library + public header. Stage 2/3 will add a pkg-config file and
+189
View File
@@ -0,0 +1,189 @@
/* SPDX-License-Identifier: BSD-2-Clause */
/* Needed for clock_gettime() / CLOCK_MONOTONIC when building with strict
 * -std=c11 (i.e. CMAKE_C_EXTENSIONS=OFF, so no GNU extensions). */
#define _POSIX_C_SOURCE 200809L
/*
* bench_flush_frame — IDCT-layer throughput baseline.
*
* Times daedalus_decoder_flush_frame at a configurable coded
* resolution with random coefficients (the dispatch path doesn't
* care if the residuals are meaningful, only the layout / counts /
* bit-exactness; perf is independent of coefficient content).
*
* NOT a ctest — produces wall-time numbers, doesn't pass/fail.
* Invoke manually after a build:
*
* ./build/bench_flush_frame [width] [height] [iters] [warmup]
*
* Defaults: 1920 1088 100 5
*
* The first `warmup` iterations are excluded from the timing
* average because the daedalus-fourier shader pool needs to
* materialise pipelines + buffer pool entries on the first few
* calls (cycle 8b buffer-pool work amortises this; this bench is
* how we'd notice if that ever regresses).
*
* Output gives:
* - per-frame mean / median / p99 latency
* - frames per second steady-state
* - vs. the 30 fps @ 1080p target from the user's
* project_30fps_floor_is_fine.md memory
*
* NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does
* NOT include intra prediction, MC, or deblock — those land in
* Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient
* for the final decoder hitting the same.
*/
#include "daedalus_decoder.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Generator state shared by every xs64() call; main() seeds it before
 * the first draw. A state of zero is a fixed point (it stays zero). */
static uint64_t xs64_state;

/*
 * Advance the generator by one shift/xor round (<<13, >>7, <<17),
 * store the result back into xs64_state and return it as the next
 * pseudo-random 64-bit value.
 */
static uint64_t xs64(void)
{
    uint64_t v = xs64_state;

    v ^= v << 13;
    v ^= v >> 7;
    v ^= v << 17;

    xs64_state = v;
    return v;
}
/*
 * qsort() comparator that orders doubles ascending.
 *
 * a, b: pointers to the two double elements being compared.
 * Returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise (equal values,
 * or an unordered pair where neither comparison holds).
 */
static int cmp_double(const void *a, const void *b)
{
    const double *lhs = a;
    const double *rhs = b;

    if (*lhs < *rhs) {
        return -1;
    }
    if (*lhs > *rhs) {
        return 1;
    }
    return 0;
}
/*
 * Read CLOCK_MONOTONIC and return it in milliseconds as a double:
 * whole seconds times 1000 plus nanoseconds divided by 1e6.
 * The return value of clock_gettime() is not inspected.
 */
static double now_ms(void)
{
    struct timespec now;
    double sec_part_ms;
    double nsec_part_ms;

    clock_gettime(CLOCK_MONOTONIC, &now);

    sec_part_ms = now.tv_sec * 1000.0;
    nsec_part_ms = now.tv_nsec / 1.0e6;
    return sec_part_ms + nsec_part_ms;
}
/* Upper bounds on the command-line arguments. They keep mb_w * mb_h,
 * the sample count and the percentile index arithmetic comfortably
 * inside int / size_t range. */
enum {
    BENCH_MIN_DIM   = 16,       /* at least one 16x16 macroblock per axis */
    BENCH_MAX_DIM   = 16384,
    BENCH_MAX_ITERS = 1000000
};

/*
 * Parse one decimal command-line argument.
 *
 * what:   argument name used in the diagnostic.
 * text:   the argv string to parse.
 * lo, hi: inclusive range of accepted values.
 * out:    receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success. Returns -1 and prints a diagnostic to stderr if
 * the text is empty, has trailing characters, or is outside [lo, hi].
 */
static int parse_int_arg(const char *what, const char *text,
                         long lo, long hi, int *out)
{
    char *end = NULL;
    long v = strtol(text, &end, 10);

    /* On overflow strtol saturates to LONG_MIN / LONG_MAX, which the
     * range test rejects, so errno does not need to be consulted. */
    if (end == text || *end != '\0' || v < lo || v > hi) {
        fprintf(stderr, "invalid %s '%s' (expected integer in [%ld, %ld])\n",
                what, text, lo, hi);
        return -1;
    }
    *out = (int) v;
    return 0;
}

/*
 * Benchmark entry point.
 *
 * Usage: bench_flush_frame [width] [height] [iters] [warmup]
 * Defaults: 1920 1088 100 5
 *
 * For each iteration every macroblock is appended to the decoder, then a
 * single daedalus_decoder_flush_frame() call is timed. Only the flush
 * call is inside the timed window; the append loop is not. Iterations
 * before `warmup` are run but not recorded.
 *
 * Returns 0 after printing statistics, or when the decoder context
 * cannot be created (reported as SKIP). Returns 1 on invalid arguments,
 * allocation failure, or a decoder call failing.
 */
int main(int argc, char **argv)
{
    int width  = 1920;
    int height = 1088;
    int iters  = 100;
    int warmup = 5;
    int rc = 1;

    /* Everything released at `out` starts as NULL so that the single
     * cleanup path is valid no matter how far setup got. */
    daedalus_decoder *dec = NULL;
    int16_t (*per_mb)[384] = NULL;
    uint8_t *mb_8x8 = NULL;
    uint8_t *out_y = NULL;
    uint8_t *out_uv = NULL;
    double *samples = NULL;

    if (argc > 1 && parse_int_arg("width", argv[1],
                                  BENCH_MIN_DIM, BENCH_MAX_DIM, &width) != 0) {
        return 1;
    }
    if (argc > 2 && parse_int_arg("height", argv[2],
                                  BENCH_MIN_DIM, BENCH_MAX_DIM, &height) != 0) {
        return 1;
    }
    if (argc > 3 && parse_int_arg("iters", argv[3],
                                  1, BENCH_MAX_ITERS, &iters) != 0) {
        return 1;
    }
    if (argc > 4 && parse_int_arg("warmup", argv[4],
                                  0, BENCH_MAX_ITERS, &warmup) != 0) {
        return 1;
    }
    if (warmup >= iters) {
        fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
        return 1;
    }

    int mb_w = width / 16;
    int mb_h = height / 16;
    int n_mbs = mb_w * mb_h;
    printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup)\n",
           width, height, n_mbs, iters, warmup);

    dec = daedalus_decoder_create(width, height);
    if (!dec) {
        /* Deliberately a success exit: no usable device is a skip,
         * not a failure. Nothing has been allocated yet. */
        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
        return 0;
    }
    printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));

    /* Pre-generate per-MB random coefficients once, from a fixed seed so
     * runs are repeatable. The same buffers are re-appended on every
     * iteration. */
    xs64_state = 0xfeedface5a5a5a5aULL;
    per_mb = malloc((size_t) n_mbs * sizeof(*per_mb));
    mb_8x8 = malloc((size_t) n_mbs);
    if (!per_mb || !mb_8x8) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }
    for (int mb = 0; mb < n_mbs; mb++) {
        for (int i = 0; i < 384; i++) {
            /* Uniform in [-512, 511]. */
            per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
        }
        mb_8x8[mb] = (mb & 1) ? 1 : 0;  /* alternate 4x4 / 8x8 transform flag */
    }

    size_t y_size  = (size_t) width * height;
    size_t uv_size = (size_t) width * height / 2;
    out_y  = malloc(y_size);
    out_uv = malloc(uv_size);
    if (!out_y || !out_uv) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }

    /* One timing slot per post-warmup iteration. Argument validation
     * guarantees 1 <= sample_count <= BENCH_MAX_ITERS. */
    int sample_count = iters - warmup;
    samples = malloc((size_t) sample_count * sizeof(*samples));
    if (!samples) {
        fprintf(stderr, "alloc fail\n");
        goto out;
    }

    for (int it = 0; it < iters; it++) {
        /* Re-append all MBs for the frame. This happens before t0 is
         * taken, so append cost is NOT part of the reported latency. */
        struct daedalus_decoder_mb_input mb = {0};
        for (int my = 0; my < mb_h; my++) {
            for (int mx = 0; mx < mb_w; mx++) {
                int idx = my * mb_w + mx;
                mb.mb_x = (uint16_t) mx;
                mb.mb_y = (uint16_t) my;
                mb.coeffs = per_mb[idx];
                mb.transform_8x8 = mb_8x8[idx];
                if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                    fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
                    goto out;
                }
            }
        }

        double t0 = now_ms();
        int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
                                               out_uv, (size_t) width);
        double t1 = now_ms();
        if (frc != 0) {
            fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
            goto out;
        }
        if (it >= warmup) {
            samples[it - warmup] = t1 - t0;
        }
    }

    /* Stats over the sorted samples. */
    qsort(samples, (size_t) sample_count, sizeof(*samples), cmp_double);
    double sum = 0;
    for (int i = 0; i < sample_count; i++) {
        sum += samples[i];
    }
    double mean   = sum / sample_count;
    double median = samples[sample_count / 2];
    /* Index computed in size_t so the * 99 cannot overflow int. */
    double p99    = samples[(size_t) sample_count * 99 / 100];
    double min_   = samples[0];
    double max_   = samples[sample_count - 1];

    printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
    printf(" min = %7.3f ms\n", min_);
    printf(" median = %7.3f ms\n", median);
    printf(" mean = %7.3f ms\n", mean);
    printf(" p99 = %7.3f ms\n", p99);
    printf(" max = %7.3f ms\n", max_);

    double fps_mean   = 1000.0 / mean;
    double fps_median = 1000.0 / median;
    printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
    printf(" mean = %.1f fps\n", fps_mean);
    printf(" median = %.1f fps\n", fps_median);
    printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
    if (fps_median >= 30.0) {
        printf(" status = MEETS target (with %.1fx headroom for "
               "intra/MC/deblock)\n", fps_median / 30.0);
    } else {
        printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n",
               30.0 / fps_median);
    }

    rc = 0;

out:
    /* Single exit path once the decoder exists; free(NULL) is a no-op. */
    free(samples);
    free(out_uv);
    free(out_y);
    free(mb_8x8);
    free(per_mb);
    daedalus_decoder_destroy(dec);
    return rc;
}