0b6482bc8f
Extends bench_flush_frame with an argv[5] substrate selector
(auto/cpu/qpu). Same enum as test_idct_bitexact's argv[4] — keeps
both binaries' CLI in sync.
The whole point of plumbing the selector through is to put a number
on the "QPU is default substrate" decree (2026-05-23,
feedback_qpu_is_default_substrate.md) for the IDCT layer
specifically. The decree said: "What can be done, will be done in
QPU. Dispatch overhead is fixable defect." This measurement
quantifies the unfixed defect.
Bench config: 1920x1088, 100 iters, 5 warmup, half 4x4 / half 8x8
luma MBs + chroma always 4x4. Pi 5 / V3D 7.1 / daedalus-fourier
0.1.0 (with cycle 6/7/9 H.264 IDCT shaders). Hertz, idle system.
Results (flush_frame latency per frame in ms; fps = 1000 / median):
substrate   min     median  mean    p99     fps (median)
─────────────────────────────────────────────────────────────
CPU NEON     8.75    9.27   11.10   33.06   107.8
QPU V3D7    31.92   37.77   37.67   47.27    26.5
AUTO        31.99   33.19   36.04   92.23    30.1
Targets: 30 fps @ 1080p (project_30fps_floor_is_fine.md).
Stages NOT yet measured: intra prediction, MC, deblock.
Interpretation:
- For the IDCT-only workload at frame batch granularity, CPU NEON
is 4.1x faster than QPU V3D7.
- AUTO → recipe table → QPU per the decree → right at the 30 fps
target (30.1 fps median; 26.5 fps with QPU forced), with no
headroom for the remaining decoder stages.
- The earlier "101 fps median at 1080p" measurement reported in
PR #8's commit was actually the CPU NEON path — the daedalus-
fourier install on hertz at the time predated the cycle 6 H.264
QPU shader, so recipe AUTO silently fell back to CPU NEON.
PR #8's "Path C is viable" conclusion stands, but the substrate
label was wrong. Apologies for the misleading number.
What this means for the campaign:
- The decree's "fixable defect" claim is still aspirational for
the H.264 IDCT shaders. The current QPU shader dispatch costs
~3.6 ms per IDCT round-trip (luma 4x4 + luma 8x8 + chroma 4x4 =
~10 ms total cf. CPU's 2.3 ms), which dominates over the compute.
- daedalus-decoder doesn't need to take a position on this — the
AUTO path follows the recipe table and respects the decree.
The substrate selector is the escape hatch when consumers want
to override.
- For the libavcodec intercept patch when it lands, the right
move is probably to start with CPU NEON for IDCT and switch to
QPU once the dispatch overhead drops (issue #162 dmabuf import
+ further pool work on the daedalus-fourier side).
No source change to flush_frame itself; this is purely a measurement
add. The bench is opt-in (not a ctest) — these numbers belong in
commit messages and the campaign log, not in CI gating.
216 lines
7.6 KiB
C
216 lines
7.6 KiB
C
/* SPDX-License-Identifier: BSD-2-Clause */
|
|
/* Needed for clock_gettime() / CLOCK_MONOTONIC under strict -std=c11 (CMAKE_C_EXTENSIONS=OFF). */
|
|
#define _POSIX_C_SOURCE 200809L
|
|
/*
|
|
* bench_flush_frame — IDCT-layer throughput baseline.
|
|
*
|
|
* Times daedalus_decoder_flush_frame at a configurable coded
|
|
* resolution with random coefficients (the dispatch path doesn't
|
|
* care if the residuals are meaningful, only the layout / counts /
|
|
* bit-exactness; perf is independent of coefficient content).
|
|
*
|
|
* NOT a ctest — produces wall-time numbers, doesn't pass/fail.
|
|
* Invoke manually after a build:
|
|
*
|
|
* ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate]
|
|
*
|
|
* Defaults: 1920 1088 100 5 auto
|
|
*
|
|
* The [substrate] argument selects the dispatch path:
|
|
* auto — recipe table picks (V3D7 when available, else NEON)
|
|
* cpu — force NEON path
|
|
* qpu — force V3D7 path (fails on hosts without it)
|
|
*
|
|
 * Run once with cpu and once with qpu to quantify the substrate gap. The "QPU is default
|
|
* substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md)
|
|
* is a policy claim; this bench is how we measure whether the policy
|
|
* pays off for the IDCT layer specifically.
|
|
*
|
|
* The first `warmup` iterations are excluded from the timing
|
|
* average because the daedalus-fourier shader pool needs to
|
|
* materialise pipelines + buffer pool entries on the first few
|
|
* calls (cycle 8b buffer-pool work amortises this; this bench is
|
|
* how we'd notice if that ever regresses).
|
|
*
|
|
* Output gives:
|
|
* - per-frame mean / median / p99 latency
|
|
* - frames per second steady-state
|
|
* - vs. the 30 fps @ 1080p target from the user's
|
|
* project_30fps_floor_is_fine.md memory
|
|
*
|
|
* NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does
|
|
* NOT include intra prediction, MC, or deblock — those land in
|
|
* Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient
|
|
* for the final decoder hitting the same.
|
|
*/
|
|
|
|
#include "daedalus_decoder.h"
|
|
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
|
|
/* State word of the generator below. It must be seeded with a non-zero
 * value before the first call: a zero state maps to zero forever. */
static uint64_t xs64_state;

/*
 * xs64 - advance the file-local 64-bit xorshift generator by one step.
 *
 * Applies the shift/xor triple (<<13, >>7, <<17) to xs64_state, stores
 * the result back and returns it. Deterministic for a given seed; not
 * suitable for anything beyond generating reproducible test data.
 */
static uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
|
|
|
|
/*
 * cmp_double - qsort() comparator ordering doubles ascending.
 *
 * Returns -1 when *a < *b, 1 when *a > *b and 0 otherwise (which
 * includes the case where either operand is NaN).
 */
static int cmp_double(const void *a, const void *b)
{
    const double lhs = *(const double *)a;
    const double rhs = *(const double *)b;

    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}
|
|
|
|
static double now_ms(void)
|
|
{
|
|
struct timespec ts;
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
return ts.tv_sec * 1000.0 + ts.tv_nsec / 1.0e6;
|
|
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
int width = argc > 1 ? atoi(argv[1]) : 1920;
|
|
int height = argc > 2 ? atoi(argv[2]) : 1088;
|
|
int iters = argc > 3 ? atoi(argv[3]) : 100;
|
|
int warmup = argc > 4 ? atoi(argv[4]) : 5;
|
|
|
|
daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
|
|
const char *sub_name = "auto";
|
|
if (argc > 5) {
|
|
if (!strcmp(argv[5], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; }
|
|
else if (!strcmp(argv[5], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; }
|
|
else if (!strcmp(argv[5], "auto")) { /* default */ }
|
|
else {
|
|
fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
if (warmup >= iters) {
|
|
fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
|
|
return 1;
|
|
}
|
|
|
|
int mb_w = width / 16;
|
|
int mb_h = height / 16;
|
|
int n_mbs = mb_w * mb_h;
|
|
printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n",
|
|
width, height, n_mbs, iters, warmup, sub_name);
|
|
|
|
daedalus_decoder *dec = daedalus_decoder_create(width, height);
|
|
if (!dec) {
|
|
fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
|
|
return 0;
|
|
}
|
|
if (daedalus_decoder_set_substrate(dec, sub) != 0) {
|
|
fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
|
|
return 1;
|
|
}
|
|
printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));
|
|
|
|
/* Pre-generate per-MB random coeffs once. We re-append the same
|
|
* per-MB buffer across iterations — the dispatch path doesn't
|
|
* cache anything per-MB across frames, so this is representative. */
|
|
xs64_state = 0xfeedface5a5a5a5aULL;
|
|
int16_t (*per_mb)[384] = malloc((size_t) n_mbs * sizeof(*per_mb));
|
|
uint8_t *mb_8x8 = malloc((size_t) n_mbs);
|
|
if (!per_mb || !mb_8x8) {
|
|
fprintf(stderr, "alloc fail\n");
|
|
return 1;
|
|
}
|
|
for (int mb = 0; mb < n_mbs; mb++) {
|
|
for (int i = 0; i < 384; i++)
|
|
per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
|
|
mb_8x8[mb] = (mb & 1) ? 1 : 0; /* same 50/50 mix as bit-exact test */
|
|
}
|
|
|
|
size_t y_size = (size_t) width * height;
|
|
size_t uv_size = (size_t) width * height / 2;
|
|
uint8_t *out_y = malloc(y_size);
|
|
uint8_t *out_uv = malloc(uv_size);
|
|
if (!out_y || !out_uv) {
|
|
fprintf(stderr, "alloc fail\n");
|
|
return 1;
|
|
}
|
|
|
|
/* Sample buffer for per-iteration timings (post-warmup). */
|
|
int sample_count = iters - warmup;
|
|
double *samples = malloc((size_t) sample_count * sizeof(double));
|
|
if (!samples) return 1;
|
|
|
|
for (int it = 0; it < iters; it++) {
|
|
/* Re-append all MBs for the frame. flush_frame resets
|
|
* mbs_appended to 0 internally on completion, so this loop
|
|
* is exactly the cost we'd pay per real frame. */
|
|
struct daedalus_decoder_mb_input mb = {0};
|
|
for (int my = 0; my < mb_h; my++) {
|
|
for (int mx = 0; mx < mb_w; mx++) {
|
|
int idx = my * mb_w + mx;
|
|
mb.mb_x = (uint16_t) mx;
|
|
mb.mb_y = (uint16_t) my;
|
|
mb.coeffs = per_mb[idx];
|
|
mb.transform_8x8 = mb_8x8[idx];
|
|
if (daedalus_decoder_append_mb(dec, &mb) != 0) {
|
|
fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
double t0 = now_ms();
|
|
int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
|
|
out_uv, (size_t) width);
|
|
double t1 = now_ms();
|
|
if (frc != 0) {
|
|
fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
|
|
return 1;
|
|
}
|
|
|
|
if (it >= warmup) samples[it - warmup] = t1 - t0;
|
|
}
|
|
|
|
/* Stats. */
|
|
qsort(samples, (size_t) sample_count, sizeof(double), cmp_double);
|
|
double sum = 0;
|
|
for (int i = 0; i < sample_count; i++) sum += samples[i];
|
|
double mean = sum / sample_count;
|
|
double median = samples[sample_count / 2];
|
|
double p99 = samples[(sample_count * 99) / 100];
|
|
double min_ = samples[0];
|
|
double max_ = samples[sample_count - 1];
|
|
|
|
printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
|
|
printf(" min = %7.3f ms\n", min_);
|
|
printf(" median = %7.3f ms\n", median);
|
|
printf(" mean = %7.3f ms\n", mean);
|
|
printf(" p99 = %7.3f ms\n", p99);
|
|
printf(" max = %7.3f ms\n", max_);
|
|
|
|
double fps_mean = 1000.0 / mean;
|
|
double fps_median = 1000.0 / median;
|
|
printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
|
|
printf(" mean = %.1f fps\n", fps_mean);
|
|
printf(" median = %.1f fps\n", fps_median);
|
|
printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
|
|
if (fps_median >= 30.0)
|
|
printf(" status = MEETS target (with %.1fx headroom for "
|
|
"intra/MC/deblock)\n", fps_median / 30.0);
|
|
else
|
|
printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n",
|
|
30.0 / fps_median);
|
|
|
|
free(samples);
|
|
free(out_uv);
|
|
free(out_y);
|
|
free(mb_8x8);
|
|
free(per_mb);
|
|
daedalus_decoder_destroy(dec);
|
|
return 0;
|
|
}
|