diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97b98f0..db9e726 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,17 @@ add_test(NAME idct_bitexact COMMAND test_idct_bitexact)
 # gets slow we'll split into a CTest LABEL for opt-in.
 add_test(NAME idct_bitexact_1080p COMMAND test_idct_bitexact 1920 1088)
 
+# ---- Benchmarks (not gated by ctest) ------------------------------
+#
+# Build-time only; user runs them by hand when checking perf. Adding
+# them as ctest would make every CI run slow and the numbers would
+# get drowned in pass/fail noise. See the header of each .c for what
+# they measure.
+
+add_executable(bench_flush_frame tests/bench_flush_frame.c)
+target_link_libraries(bench_flush_frame PRIVATE daedalus_decoder)
+target_compile_options(bench_flush_frame PRIVATE -O2)
+
 # ---- Install ------------------------------------------------------
 #
 # Library + public header. Stage 2/3 will add a pkg-config file and
diff --git a/tests/bench_flush_frame.c b/tests/bench_flush_frame.c
new file mode 100644
index 0000000..84cd6bd
--- /dev/null
+++ b/tests/bench_flush_frame.c
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/* Needed for CLOCK_MONOTONIC under -std=c11 (CMAKE_C_EXTENSIONS=OFF). */
+#define _POSIX_C_SOURCE 200809L
+/*
+ * bench_flush_frame — IDCT-layer throughput baseline.
+ *
+ * Times daedalus_decoder_flush_frame at a configurable coded
+ * resolution with random coefficients (the dispatch path doesn't
+ * care if the residuals are meaningful, only the layout / counts /
+ * bit-exactness; perf is independent of coefficient content).
+ *
+ * NOT a ctest — produces wall-time numbers, doesn't pass/fail.
+ * Invoke manually after a build:
+ *
+ *     ./build/bench_flush_frame [width] [height] [iters] [warmup]
+ *
+ * Defaults: 1920 1088 100 5
+ *
+ * The first `warmup` iterations are excluded from the timing
+ * average because the daedalus-fourier shader pool needs to
+ * materialise pipelines + buffer pool entries on the first few
+ * calls (cycle 8b buffer-pool work amortises this; this bench is
+ * how we'd notice if that ever regresses).
+ *
+ * Output gives:
+ *   - per-frame mean / median / p99 latency
+ *   - frames per second steady-state
+ *   - vs. the 30 fps @ 1080p target from the user's
+ *     project_30fps_floor_is_fine.md memory
+ *
+ * NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does
+ * NOT include intra prediction, MC, or deblock — those land in
+ * Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient
+ * for the final decoder hitting the same.
+ */
+
+#include "daedalus_decoder.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/* Deterministic xorshift PRNG so every run benchmarks identical
+ * coefficients. xs64_state must be seeded non-zero before use. */
+static uint64_t xs64_state;
+static uint64_t xs64(void)
+{
+    uint64_t x = xs64_state;
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs64_state = x;
+}
+
+/* qsort comparator: ascending order of doubles. */
+static int cmp_double(const void *a, const void *b)
+{
+    double da = *(const double *)a, db = *(const double *)b;
+    return (da > db) - (da < db);
+}
+
+/* Monotonic wall-clock time in milliseconds. */
+static double now_ms(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1.0e6;
+}
+
+/*
+ * Parse a decimal command-line integer into *out.
+ *
+ * Returns 0 on success, -1 if `s` is empty, has trailing junk, or
+ * does not fit in an int. Used instead of atoi so a typo is reported
+ * rather than silently read as 0.
+ */
+static int parse_int_arg(const char *s, int *out)
+{
+    char *end;
+    errno = 0;
+    long v = strtol(s, &end, 10);
+    if (end == s || *end != '\0' || errno == ERANGE ||
+        v < INT_MIN || v > INT_MAX)
+        return -1;
+    *out = (int) v;
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    /* Positional args; anything not supplied keeps its default. */
+    int width = 1920;
+    int height = 1088;
+    int iters = 100;
+    int warmup = 5;
+    int *const args[] = { &width, &height, &iters, &warmup };
+
+    for (int i = 1; i < argc && i <= 4; i++) {
+        if (parse_int_arg(argv[i], args[i - 1]) != 0) {
+            fprintf(stderr, "bad argument '%s'\n", argv[i]);
+            return 1;
+        }
+    }
+
+    /* Need at least one whole 16x16 macroblock, otherwise n_mbs is 0
+     * and there is nothing to time. */
+    if (width < 16 || height < 16) {
+        fprintf(stderr, "width/height (%dx%d) must each be >= 16\n",
+                width, height);
+        return 1;
+    }
+    /* A negative warmup would leave the first -warmup sample slots
+     * unwritten, so the stats would be computed over garbage. */
+    if (warmup < 0) {
+        fprintf(stderr, "warmup (%d) must be >= 0\n", warmup);
+        return 1;
+    }
+    if (warmup >= iters) {
+        fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
+        return 1;
+    }
+
+    int mb_w = width / 16;
+    int mb_h = height / 16;
+    int n_mbs = mb_w * mb_h;
+    printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup)\n",
+           width, height, n_mbs, iters, warmup);
+
+    daedalus_decoder *dec = daedalus_decoder_create(width, height);
+    if (!dec) {
+        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
+        return 0;
+    }
+    printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));
+
+    /* Pre-generate per-MB random coeffs once. We re-append the same
+     * per-MB buffer across iterations — the dispatch path doesn't
+     * cache anything per-MB across frames, so this is representative. */
+    xs64_state = 0xfeedface5a5a5a5aULL;
+    int16_t (*per_mb)[384] = malloc((size_t) n_mbs * sizeof(*per_mb));
+    uint8_t *mb_8x8 = malloc((size_t) n_mbs);
+    if (!per_mb || !mb_8x8) {
+        fprintf(stderr, "alloc fail\n");
+        return 1;
+    }
+    for (int mb = 0; mb < n_mbs; mb++) {
+        for (int i = 0; i < 384; i++)
+            per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
+        mb_8x8[mb] = (mb & 1) ? 1 : 0; /* same 50/50 mix as bit-exact test */
+    }
+
+    size_t y_size = (size_t) width * height;
+    size_t uv_size = (size_t) width * height / 2;
+    uint8_t *out_y = malloc(y_size);
+    uint8_t *out_uv = malloc(uv_size);
+    if (!out_y || !out_uv) {
+        fprintf(stderr, "alloc fail\n");
+        return 1;
+    }
+
+    /* Sample buffer for per-iteration timings (post-warmup). */
+    int sample_count = iters - warmup;
+    double *samples = malloc((size_t) sample_count * sizeof(double));
+    if (!samples) return 1;
+
+    for (int it = 0; it < iters; it++) {
+        /* Re-append all MBs for the frame. flush_frame resets
+         * mbs_appended to 0 internally on completion, so this loop
+         * is exactly the cost we'd pay per real frame. */
+        struct daedalus_decoder_mb_input mb = {0};
+        for (int my = 0; my < mb_h; my++) {
+            for (int mx = 0; mx < mb_w; mx++) {
+                int idx = my * mb_w + mx;
+                mb.mb_x = (uint16_t) mx;
+                mb.mb_y = (uint16_t) my;
+                mb.coeffs = per_mb[idx];
+                mb.transform_8x8 = mb_8x8[idx];
+                if (daedalus_decoder_append_mb(dec, &mb) != 0) {
+                    fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx);
+                    return 1;
+                }
+            }
+        }
+
+        double t0 = now_ms();
+        int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width,
+                                               out_uv, (size_t) width);
+        double t1 = now_ms();
+        if (frc != 0) {
+            fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it);
+            return 1;
+        }
+
+        if (it >= warmup) samples[it - warmup] = t1 - t0;
+    }
+
+    /* Stats. */
+    qsort(samples, (size_t) sample_count, sizeof(double), cmp_double);
+    double sum = 0;
+    for (int i = 0; i < sample_count; i++) sum += samples[i];
+    double mean = sum / sample_count;
+    double median = samples[sample_count / 2];
+    double p99 = samples[(sample_count * 99) / 100];
+    double min_ = samples[0];
+    double max_ = samples[sample_count - 1];
+
+    printf("\nflush_frame (post-warmup, %d samples):\n", sample_count);
+    printf(" min = %7.3f ms\n", min_);
+    printf(" median = %7.3f ms\n", median);
+    printf(" mean = %7.3f ms\n", mean);
+    printf(" p99 = %7.3f ms\n", p99);
+    printf(" max = %7.3f ms\n", max_);
+
+    double fps_mean = 1000.0 / mean;
+    double fps_median = 1000.0 / median;
+    printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n");
+    printf(" mean = %.1f fps\n", fps_mean);
+    printf(" median = %.1f fps\n", fps_median);
+    printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n");
+    if (fps_median >= 30.0)
+        printf(" status = MEETS target (with %.1fx headroom for "
+               "intra/MC/deblock)\n", fps_median / 30.0);
+    else
+        printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n",
+               30.0 / fps_median);
+
+    free(samples);
+    free(out_uv);
+    free(out_y);
+    free(mb_8x8);
+    free(per_mb);
+    daedalus_decoder_destroy(dec);
+    return 0;
+}