/* SPDX-License-Identifier: BSD-2-Clause */ /* Needed for CLOCK_MONOTONIC under -std=c11 -CMAKE_C_EXTENSIONS=OFF. */ #define _POSIX_C_SOURCE 200809L /* * bench_flush_frame — IDCT-layer throughput baseline. * * Times daedalus_decoder_flush_frame at a configurable coded * resolution with random coefficients (the dispatch path doesn't * care if the residuals are meaningful, only the layout / counts / * bit-exactness; perf is independent of coefficient content). * * NOT a ctest — produces wall-time numbers, doesn't pass/fail. * Invoke manually after a build: * * ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate] * * Defaults: 1920 1088 100 5 auto * * The [substrate] argument selects the dispatch path: * auto — recipe table picks (V3D7 when available, else NEON) * cpu — force NEON path * qpu — force V3D7 path (fails on hosts without it) * * Run both to quantify the substrate gap. The "QPU is default * substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md) * is a policy claim; this bench is how we measure whether the policy * pays off for the IDCT layer specifically. * * The first `warmup` iterations are excluded from the timing * average because the daedalus-fourier shader pool needs to * materialise pipelines + buffer pool entries on the first few * calls (cycle 8b buffer-pool work amortises this; this bench is * how we'd notice if that ever regresses). * * Output gives: * - per-frame mean / median / p99 latency * - frames per second steady-state * - vs. the 30 fps @ 1080p target from the user's * project_30fps_floor_is_fine.md memory * * NB: this is IDCT-only (luma 4x4 + 8x8 + chroma 4x4). It does * NOT include intra prediction, MC, or deblock — those land in * Stage 2+ / 4. A 30 fps number here is necessary-but-not-sufficient * for the final decoder hitting the same. */ #include "daedalus_decoder.h" #include #include #include #include #include static uint64_t xs64_state; static uint64_t xs64(void) { uint64_t x = xs64_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs64_state = x; } static int cmp_double(const void *a, const void *b) { double da = *(const double *)a, db = *(const double *)b; return (da > db) - (da < db); } static double now_ms(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec * 1000.0 + ts.tv_nsec / 1.0e6; } int main(int argc, char **argv) { int width = argc > 1 ? atoi(argv[1]) : 1920; int height = argc > 2 ? atoi(argv[2]) : 1088; int iters = argc > 3 ? atoi(argv[3]) : 100; int warmup = argc > 4 ? atoi(argv[4]) : 5; daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO; const char *sub_name = "auto"; if (argc > 5) { if (!strcmp(argv[5], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; } else if (!strcmp(argv[5], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; } else if (!strcmp(argv[5], "auto")) { /* default */ } else { fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]); return 1; } } if (warmup >= iters) { fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters); return 1; } int mb_w = width / 16; int mb_h = height / 16; int n_mbs = mb_w * mb_h; printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n", width, height, n_mbs, iters, warmup, sub_name); daedalus_decoder *dec = daedalus_decoder_create(width, height); if (!dec) { fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n"); return 0; } if (daedalus_decoder_set_substrate(dec, sub) != 0) { fprintf(stderr, "set_substrate(%s) failed\n", sub_name); return 1; } printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec)); /* Pre-generate per-MB random coeffs once. We re-append the same * per-MB buffer across iterations — the dispatch path doesn't * cache anything per-MB across frames, so this is representative. */ xs64_state = 0xfeedface5a5a5a5aULL; int16_t (*per_mb)[384] = malloc((size_t) n_mbs * sizeof(*per_mb)); uint8_t *mb_8x8 = malloc((size_t) n_mbs); if (!per_mb || !mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; } for (int mb = 0; mb < n_mbs; mb++) { for (int i = 0; i < 384; i++) per_mb[mb][i] = (int16_t)((int)(xs64() % 1024) - 512); mb_8x8[mb] = (mb & 1) ? 1 : 0; /* same 50/50 mix as bit-exact test */ } size_t y_size = (size_t) width * height; size_t uv_size = (size_t) width * height / 2; uint8_t *out_y = malloc(y_size); uint8_t *out_uv = malloc(uv_size); if (!out_y || !out_uv) { fprintf(stderr, "alloc fail\n"); return 1; } /* Sample buffer for per-iteration timings (post-warmup). */ int sample_count = iters - warmup; double *samples = malloc((size_t) sample_count * sizeof(double)); if (!samples) return 1; for (int it = 0; it < iters; it++) { /* Re-append all MBs for the frame. flush_frame resets * mbs_appended to 0 internally on completion, so this loop * is exactly the cost we'd pay per real frame. */ struct daedalus_decoder_mb_input mb = {0}; for (int my = 0; my < mb_h; my++) { for (int mx = 0; mx < mb_w; mx++) { int idx = my * mb_w + mx; mb.mb_x = (uint16_t) mx; mb.mb_y = (uint16_t) my; mb.coeffs = per_mb[idx]; mb.transform_8x8 = mb_8x8[idx]; if (daedalus_decoder_append_mb(dec, &mb) != 0) { fprintf(stderr, "append fail iter=%d idx=%d\n", it, idx); return 1; } } } double t0 = now_ms(); int frc = daedalus_decoder_flush_frame(dec, out_y, (size_t) width, out_uv, (size_t) width); double t1 = now_ms(); if (frc != 0) { fprintf(stderr, "flush_frame rc=%d iter=%d\n", frc, it); return 1; } if (it >= warmup) samples[it - warmup] = t1 - t0; } /* Stats. */ qsort(samples, (size_t) sample_count, sizeof(double), cmp_double); double sum = 0; for (int i = 0; i < sample_count; i++) sum += samples[i]; double mean = sum / sample_count; double median = samples[sample_count / 2]; double p99 = samples[(sample_count * 99) / 100]; double min_ = samples[0]; double max_ = samples[sample_count - 1]; printf("\nflush_frame (post-warmup, %d samples):\n", sample_count); printf(" min = %7.3f ms\n", min_); printf(" median = %7.3f ms\n", median); printf(" mean = %7.3f ms\n", mean); printf(" p99 = %7.3f ms\n", p99); printf(" max = %7.3f ms\n", max_); double fps_mean = 1000.0 / mean; double fps_median = 1000.0 / median; printf("\nthroughput (steady-state, IDCT only — NO intra/MC/deblock):\n"); printf(" mean = %.1f fps\n", fps_mean); printf(" median = %.1f fps\n", fps_median); printf(" target = 30.0 fps (project_30fps_floor_is_fine.md)\n"); if (fps_median >= 30.0) printf(" status = MEETS target (with %.1fx headroom for " "intra/MC/deblock)\n", fps_median / 30.0); else printf(" status = BELOW target (need %.1fx speedup just at IDCT)\n", 30.0 / fps_median); free(samples); free(out_uv); free(out_y); free(mb_8x8); free(per_mb); daedalus_decoder_destroy(dec); return 0; }