From 0b6482bc8f276b26da871d7d917934d9bc6563d4 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Sun, 24 May 2026 23:19:39 +0200 Subject: [PATCH] phase1: bench_flush_frame substrate selector + IDCT-layer QPU vs CPU data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends bench_flush_frame with an argv[5] substrate selector (auto/cpu/qpu). Same enum as test_idct_bitexact's argv[4] — keeps both binaries' CLI in sync. The whole point of plumbing the selector through is to put a number on the "QPU is default substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md) for the IDCT layer specifically. The decree said: "What can be done, will be done in QPU. Dispatch overhead is fixable defect." This measurement quantifies the unfixed defect. Bench config: 1920x1088, 100 iters, 5 warmup, half 4x4 / half 8x8 luma MBs + chroma always 4x4. Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0 (with cycle 6/7/9 H.264 IDCT shaders). Hertz, idle system. Results: substrate min median mean p99 fps (median) ───────────────────────────────────────────────────────────── CPU NEON 8.75 9.27 11.10 33.06 107.8 QPU V3D7 31.92 37.77 37.67 47.27 26.5 AUTO 31.99 33.19 36.04 92.23 30.1 Targets: 30 fps @ 1080p (project_30fps_floor_is_fine.md). Stages NOT yet measured: intra prediction, MC, deblock. Interpretation: - For the IDCT-only workload at frame batch granularity, CPU NEON is 4.1x faster than QPU V3D7. - AUTO → recipe table → QPU per the decree → 26.5 fps (forced QPU) to 30.1 fps (AUTO) median, i.e. at or BELOW the 30 fps target with no headroom for the remaining decoder stages. - The earlier "101 fps median at 1080p" measurement reported in PR #8's commit was actually the CPU NEON path — the daedalus-fourier install on hertz at the time predated the cycle 6 H.264 QPU shader, so recipe AUTO silently fell back to CPU NEON. PR #8's "Path C is viable" conclusion stands, but the substrate label was wrong. Apologies for the misleading number. 
What this means for the campaign: - The decree's "fixable defect" claim is still aspirational for the H.264 IDCT shaders. The current QPU shader dispatch costs ~3.6 ms per IDCT round-trip (luma 4x4 + luma 8x8 + chroma 4x4 = ~10 ms total cf. CPU's 2.3 ms), which dominates over the compute. - daedalus-decoder doesn't need to take a position on this — the AUTO path follows the recipe table and respects the decree. The substrate selector is the escape hatch when consumers want to override. - For the libavcodec intercept patch when it lands, the right move is probably to start with CPU NEON for IDCT and switch to QPU once the dispatch overhead drops (issue #162 dmabuf import + further pool work on the daedalus-fourier side). No source change to flush_frame itself; this is purely a measurement add. The bench is opt-in (not a ctest) — these numbers belong in commit messages and the campaign log, not in CI gating. --- tests/bench_flush_frame.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/bench_flush_frame.c b/tests/bench_flush_frame.c index 84cd6bd..990b682 100644 --- a/tests/bench_flush_frame.c +++ b/tests/bench_flush_frame.c @@ -12,9 +12,19 @@ * NOT a ctest — produces wall-time numbers, doesn't pass/fail. * Invoke manually after a build: * - * ./build/bench_flush_frame [width] [height] [iters] [warmup] + * ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate] * - * Defaults: 1920 1088 100 5 + * Defaults: 1920 1088 100 5 auto + * + * The [substrate] argument selects the dispatch path: + * auto — recipe table picks (V3D7 when available, else NEON) + * cpu — force NEON path + * qpu — force V3D7 path (fails on hosts without it) + * + * Run both to quantify the substrate gap. The "QPU is default + * substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md) + * is a policy claim; this bench is how we measure whether the policy + * pays off for the IDCT layer specifically. 
* * The first `warmup` iterations are excluded from the timing * average because the daedalus-fourier shader pool needs to @@ -70,6 +80,18 @@ int main(int argc, char **argv) int iters = argc > 3 ? atoi(argv[3]) : 100; int warmup = argc > 4 ? atoi(argv[4]) : 5; + daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO; + const char *sub_name = "auto"; + if (argc > 5) { + if (!strcmp(argv[5], "cpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_CPU; sub_name = "cpu"; } + else if (!strcmp(argv[5], "qpu")) { sub = DAEDALUS_DECODER_SUBSTRATE_QPU; sub_name = "qpu"; } + else if (!strcmp(argv[5], "auto")) { /* default */ } + else { + fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]); + return 1; + } + } + if (warmup >= iters) { fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters); return 1; @@ -78,14 +100,18 @@ int main(int argc, char **argv) int mb_w = width / 16; int mb_h = height / 16; int n_mbs = mb_w * mb_h; - printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup)\n", - width, height, n_mbs, iters, warmup); + printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n", + width, height, n_mbs, iters, warmup, sub_name); daedalus_decoder *dec = daedalus_decoder_create(width, height); if (!dec) { fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n"); return 0; } + if (daedalus_decoder_set_substrate(dec, sub) != 0) { + fprintf(stderr, "set_substrate(%s) failed\n", sub_name); + return 1; + } printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec)); /* Pre-generate per-MB random coeffs once. We re-append the same -- 2.47.3