352373a9be
Establishes a steady-state baseline for the Path C frame-level
dispatch architecture. Times daedalus_decoder_flush_frame at a
configurable coded resolution with random coefficients, reporting
per-frame latency stats and fps.
NOT a ctest — produces wall-time numbers, doesn't pass/fail. Run
manually:
./build/bench_flush_frame [width] [height] [iters] [warmup]
Defaults to 1920x1088, 100 iters, 5-frame warmup (excludes shader-
pipeline-pool materialisation cost from the timing average).
Measured on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):
$ ./build/bench_flush_frame
bench_flush_frame: 1920x1088 (8160 MBs), 100 iters (5 warmup)
ctx has_qpu=1
flush_frame (post-warmup, 95 samples):
min = 9.699 ms
median = 9.905 ms
mean = 10.014 ms
p99 = 12.011 ms
max = 12.011 ms
throughput (steady-state, IDCT only — NO intra/MC/deblock):
mean = 99.9 fps
median = 101.0 fps
target = 30.0 fps (project_30fps_floor_is_fine.md)
status = MEETS target (with 3.4x headroom for intra/MC/deblock)
Interpretation:
Per-frame work measured:
- CPU partition + flat-pack of 8160 MBs into luma_4x4, luma_8x8,
chroma meta+coeffs buffers
- 3 GPU dispatches (luma 4x4, luma 8x8, chroma 4x4) with their
respective vkQueueSubmit + vkQueueWaitIdle round-trips
- CPU NV12 interleave (chroma planar → UV)
- calloc/free for scratch_y / coeffs / meta buffers
Doing all of that in ~10 ms means the Path C design bet pays off:
ONE Vulkan submit per dispatch (the cycle 8b buffer pool keeps the
amortised cost low) is the right granularity. The per-block
dispatch fail-mode that motivated Path C (~6500 ms/frame from the
libavcodec substitution arc) is roughly 650x slower than this.
The 3.4x headroom (101 fps measured vs. the 30 fps target) leaves a
budget of ~23 ms/frame (33.3 ms frame period minus the ~10 ms spent
here) for the remaining decode work (intra prediction wavefront, MC,
deblock). Those stages combined need to fit inside that budget at
steady state for the end-to-end decoder to hit 30 fps at 1080p.
p99 latency 12 ms means even worst-case frames clear the 33-ms
deadline (30 fps) easily; tail latency isn't a concern at this
stage.
What this number does NOT validate:
- Intra prediction shader dispatch overhead (likely per-anti-diagonal
or per-MB-wavefront; dispatch count goes up)
- MC dispatch (per qpel-block; up to several per MB)
- Deblock dispatch (4 edges per MB; per-edge meta entries)
- Real H.264 streams (random coeffs ≠ real residuals; perf shape
of memory access is content-independent, but cache pressure may
differ at scale).
190 lines
6.5 KiB
C
190 lines
6.5 KiB
C
/* SPDX-License-Identifier: BSD-2-Clause */
|
|
/* Needed for CLOCK_MONOTONIC when building as strict -std=c11 (CMAKE_C_EXTENSIONS=OFF). */
|
|
#define _POSIX_C_SOURCE 200809L
|
|
/*
 * bench_flush_frame — IDCT-layer throughput baseline.
 *
 * Times daedalus_decoder_flush_frame at a configurable coded
 * resolution with random coefficients (the dispatch path doesn't
 * care if the residuals are meaningful, only the layout / counts /
 * bit-exactness; perf is independent of coefficient content).
 *
 * NOT a ctest — produces wall-time numbers, doesn't pass/fail.
 * Invoke manually after a build:
 *
 *   ./build/bench_flush_frame [width] [height] [iters] [warmup]
 *
 * Defaults: 1920 1088 100 5
 *
 * The first `warmup` iterations are excluded from the timing
 * average because the daedalus-fourier shader pool needs to
 * materialise pipelines + buffer pool entries on the first few
 * calls (cycle 8b buffer-pool work amortises this; this bench is
 * how we'd notice if that ever regresses).
 *
 * Output gives:
 *   - per-frame mean / median / p99 latency
 *   - frames per second steady-state
 *   - vs. the 30 fps @ 1080p target from the user's
 *     project_30fps_floor_is_fine.md memory
 *
 * NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does
 * NOT include intra prediction, MC, or deblock — those land in
 * Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient
 * for the final decoder hitting the same.
 */
|
|
|
|
#include "daedalus_decoder.h"

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
|
|
|
|
/*
 * State word of the xorshift64 generator below.
 *
 * Zero is a fixed point of the xorshift step (0 ^ (0 << k) == 0), so a
 * zero state would make xs64() return 0 forever. The state therefore
 * starts at an arbitrary non-zero constant; callers that want a
 * reproducible sequence still assign their own seed before first use.
 */
static uint64_t xs64_state = 0x9e3779b97f4a7c15ULL;

/*
 * Advance the generator by one xorshift step (shift triple 13 / 7 / 17),
 * store the new state in xs64_state and return it.
 *
 * Deterministic for a given seed and not thread-safe (single file-scope
 * state word). Intended only for generating benchmark input data.
 */
static uint64_t xs64(void)
{
    uint64_t x = xs64_state;

    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;

    xs64_state = x;
    return x;
}
|
|
|
|
/*
 * qsort() comparator ordering doubles ascending.
 *
 * a, b: pointers to the two double elements being compared.
 * Returns -1 if *a < *b, 1 if *a > *b, and 0 otherwise. Because both
 * relational tests are false for NaN, a NaN compares "equal" to every
 * value; the benchmark only feeds it finite timing deltas.
 */
static int cmp_double(const void *a, const void *b)
{
    const double da = *(const double *)a;
    const double db = *(const double *)b;

    /* Branch-free three-way compare; avoids the overflow/truncation
     * problems of returning (int)(da - db). */
    return (da > db) - (da < db);
}
|
|
|
|
/*
 * Current CLOCK_MONOTONIC reading in milliseconds, as a double.
 *
 * The absolute value is only meaningful as a difference between two
 * calls. If the clock cannot be read, the benchmark's numbers would be
 * meaningless (and `ts` would be read uninitialised), so the process
 * reports the error and exits instead of returning garbage.
 */
static double now_ms(void)
{
    struct timespec ts;

    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime(CLOCK_MONOTONIC)");
        exit(EXIT_FAILURE);
    }

    /* seconds -> ms, nanoseconds -> ms */
    return (double) ts.tv_sec * 1000.0 + (double) ts.tv_nsec / 1.0e6;
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
int width = argc > 1 ? atoi(argv[1]) : 1920;
|
|
int height = argc > 2 ? atoi(argv[2]) : 1088;
|
|
int iters = argc > 3 ? atoi(argv[3]) : 100;
|
|
int warmup = argc > 4 ? atoi(argv[4]) : 5;
|
|
|
|
if (warmup >= iters) {
|
|
fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
|
|
return 1;
|
|
}
|
|
|
|
int mb_w = width / 16;
|
|
int mb_h = height / 16;
|
|
int n_mbs = mb_w * mb_h;
|
|
printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup)\n",
|
|
width, height, n_mbs, iters, warmup);
|
|
|
|
daedalus_decoder *dec = daedalus_decoder_create(width, height);
|
|
if (!dec) {
|
|
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
|
return 0;
|
|
}
|
|
printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));
|
|
|
|
/* Pre-generate per-MB random coeffs once. We re-append the same
|
|
* per-MB buffer across iterations — the dispatch path doesn't
|
|
* cache anything per-MB across frames, so this is representative. */
|
|
xs64_state = 0xfeedface5a5a5a5aULL;
|
|
int16_t (*per_mb)[384] = malloc((size_t) n_mbs * sizeof(*per_mb));
|
|
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
|
|
if (!per_mb || !mb_8x8) {
|
|
fprintf(stderr, "alloc fail\n");
|
|
return 1;
|
|
}
|
|
for (int mb = 0; mb < n_mbs; mb++) {
|
|
for (int i = 0; i < 384; i++)
|
|
per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
|
|
mb_8x8[mb] = (mb & 1) ? 1 : 0; /* same 50/50 mix as bit-exact test */
|
|
}
|
|
|
|
size_t y_size = (size_t) width * height;
|
|
size_t uv_size = (size_t) width * height / 2;
|
|
uint8_t *out_y = malloc(y_size);
|
|
uint8_t *out_uv = malloc(uv_size);
|
|
if (!out_y || !out_uv) {
|
|
fprintf(stderr, "alloc fail\n");
|
|
return 1;
|
|
}
|
|
|
|
/* Sample buffer for per-iteration timings (post-warmup). */
|
|
int sample_count = iters - warmup;
|
|
double *samples = malloc((size_t) sample_count * sizeof(double));
|
|
if (!samples) return 1;
|
|
|
|
for (int it = 0; it < iters; it++) {
|
|
/* Re-append all MBs for the frame. flush_frame resets
|
|
* mbs_appended to 0 internally on completion, so this loop
|
|
* is exactly the cost we'd pay per real frame. */
|
|
struct daedalus_decoder_mb_input mb = {0};
|
|
for (int my = 0; my < mb_h; my++) {
|
|
for (int mx = 0; mx < mb_w; mx++) {
|
|
int idx = my * mb_w + mx;
|
|
mb.mb_x = (uint16_t) mx;
|
|
mb.mb_y = (uint16_t) my;
|
|
mb.coeffs = per_mb[idx];
|
|
mb.transform_8x8 = mb_8x8[idx];
|
|
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
|
|
fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
double t0 = now_ms();
|
|
int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
|
|
out_uv, (size_t) width);
|
|
double t1 = now_ms();
|
|
if (frc != 0) {
|
|
fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
|
|
return 1;
|
|
}
|
|
|
|
if (it >= warmup) samples[it - warmup] = t1 - t0;
|
|
}
|
|
|
|
/* Stats. */
|
|
qsort(samples, (size_t) sample_count, sizeof(double), cmp_double);
|
|
double sum = 0;
|
|
for (int i = 0; i < sample_count; i++) sum += samples[i];
|
|
double mean = sum / sample_count;
|
|
double median = samples[sample_count / 2];
|
|
double p99 = samples[(sample_count * 99) / 100];
|
|
double min_ = samples[0];
|
|
double max_ = samples[sample_count - 1];
|
|
|
|
printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
|
|
printf(" min = %7.3f ms\n", min_);
|
|
printf(" median = %7.3f ms\n", median);
|
|
printf(" mean = %7.3f ms\n", mean);
|
|
printf(" p99 = %7.3f ms\n", p99);
|
|
printf(" max = %7.3f ms\n", max_);
|
|
|
|
double fps_mean = 1000.0 / mean;
|
|
double fps_median = 1000.0 / median;
|
|
printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
|
|
printf(" mean = %.1f fps\n", fps_mean);
|
|
printf(" median = %.1f fps\n", fps_median);
|
|
printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
|
|
if (fps_median >= 30.0)
|
|
printf(" status = MEETS target (with %.1fx headroom for "
|
|
"intra/MC/deblock)\n", fps_median / 30.0);
|
|
else
|
|
printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n",
|
|
30.0 / fps_median);
|
|
|
|
free(samples);
|
|
free(out_uv);
|
|
free(out_y);
|
|
free(mb_8x8);
|
|
free(per_mb);
|
|
daedalus_decoder_destroy(dec);
|
|
return 0;
|
|
}
|