From 0b6482bc8f276b26da871d7d917934d9bc6563d4 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Sun, 24 May 2026 23:19:39 +0200
Subject: [PATCH] phase1: bench_flush_frame substrate selector + IDCT-layer QPU
 vs CPU data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends bench_flush_frame with an argv[5] substrate selector
(auto/cpu/qpu).  Same enum as test_idct_bitexact's argv[4] — keeps
both binaries' CLI in sync.

The whole point of plumbing the selector through is to put a number
on the "QPU is default substrate" decree (2026-05-23,
feedback_qpu_is_default_substrate.md) for the IDCT layer
specifically.  The decree said: "What can be done, will be done in
QPU.  Dispatch overhead is fixable defect."  This measurement
quantifies the unfixed defect.

Bench config: 1920x1088, 100 iters, 5 warmup, half 4x4 / half 8x8
luma MBs + chroma always 4x4.  Pi 5 / V3D 7.1 / daedalus-fourier
0.1.0 (with cycle 6/7/9 H.264 IDCT shaders).  Host: hertz, idle system.

Results (wall time per iteration in ms; fps = 1000 / median):

  substrate   min     median   mean    p99      fps (median)
  ─────────────────────────────────────────────────────────────
  CPU NEON    8.75    9.27     11.10   33.06    107.8
  QPU V3D7    31.92   37.77    37.67   47.27    26.5
  AUTO        31.99   33.19    36.04   92.23    30.1

  Targets: 30 fps @ 1080p (project_30fps_floor_is_fine.md).
  Stages NOT yet measured: intra prediction, MC, deblock.

Interpretation:

  - For the IDCT-only workload at frame batch granularity, CPU NEON
    is 4.1x faster than QPU V3D7 by median (9.27 ms vs 37.77 ms).
  - AUTO → recipe table → QPU per the decree → at or below the 30 fps
    target by median (30.1 fps via AUTO, 26.5 fps with QPU forced),
    with no headroom for the remaining decoder stages.
  - The earlier "101 fps median at 1080p" measurement reported in
    PR #8's commit was actually the CPU NEON path — the daedalus-
    fourier install on hertz at the time predated the cycle 6 H.264
    QPU shader, so recipe AUTO silently fell back to CPU NEON.
    PR #8's "Path C is viable" conclusion stands, but the substrate
    label was wrong.  Apologies for the misleading number.

What this means for the campaign:

  - The decree's "fixable defect" claim is still aspirational for
    the H.264 IDCT shaders.  The current QPU shader dispatch costs
    ~3.6 ms per IDCT round-trip (luma 4x4 + luma 8x8 + chroma 4x4 =
    ~10 ms total cf. CPU's 2.3 ms), which dominates over the compute.
  - daedalus-decoder doesn't need to take a position on this — the
    AUTO path follows the recipe table and respects the decree.
    The substrate selector is the escape hatch when consumers want
    to override.
  - For the libavcodec intercept patch when it lands, the right
    move is probably to start with CPU NEON for IDCT and switch to
    QPU once the dispatch overhead drops (issue #162 dmabuf import
    + further pool work on the daedalus-fourier side).

No source change to flush_frame itself; this is purely a measurement
addition.  The bench is opt-in (not a ctest) — these numbers belong in
commit messages and the campaign log, not in CI gating.
---
 tests/bench_flush_frame.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/tests/bench_flush_frame.c b/tests/bench_flush_frame.c
index 84cd6bd..990b682 100644
--- a/tests/bench_flush_frame.c
+++ b/tests/bench_flush_frame.c
@@ -12,9 +12,19 @@
  * NOT a ctest — produces wall-time numbers, doesn't pass/fail.
  * Invoke manually after a build:
  *
- *   ./build/bench_flush_frame [width] [height] [iters] [warmup]
+ *   ./build/bench_flush_frame [width] [height] [iters] [warmup] [substrate]
  *
- * Defaults: 1920 1088 100 5
+ * Defaults: 1920 1088 100 5 auto
+ *
+ * The [substrate] argument selects the dispatch path:
+ *   auto — recipe table picks (V3D7 when available, else NEON)
+ *   cpu  — force NEON path
+ *   qpu  — force V3D7 path (fails on hosts without it)
+ *
+ * Run both cpu and qpu to quantify the substrate gap.  The "QPU is default
+ * substrate" decree (2026-05-23, feedback_qpu_is_default_substrate.md)
+ * is a policy claim; this bench is how we measure whether the policy
+ * pays off for the IDCT layer specifically.
  *
  * The first `warmup` iterations are excluded from the timing
  * average because the daedalus-fourier shader pool needs to
@@ -70,6 +80,18 @@ int main(int argc, char **argv)
     int iters   = argc > 3 ? atoi(argv[3]) : 100;
     int warmup  = argc > 4 ? atoi(argv[4]) : 5;
 
+    daedalus_decoder_substrate sub = DAEDALUS_DECODER_SUBSTRATE_AUTO;
+    const char *sub_name = "auto";
+    if (argc > 5) {
+        if      (!strcmp(argv[5], "cpu"))  { sub = DAEDALUS_DECODER_SUBSTRATE_CPU;  sub_name = "cpu"; }
+        else if (!strcmp(argv[5], "qpu"))  { sub = DAEDALUS_DECODER_SUBSTRATE_QPU;  sub_name = "qpu"; }
+        else if (!strcmp(argv[5], "auto")) { /* default */ }
+        else {
+            fprintf(stderr, "unknown substrate '%s' (want auto/cpu/qpu)\n", argv[5]);
+            return 1;
+        }
+    }
+
     if (warmup >= iters) {
         fprintf(stderr, "warmup (%d) must be < iters (%d)\n", warmup, iters);
         return 1;
@@ -78,14 +100,18 @@ int main(int argc, char **argv)
     int mb_w = width  / 16;
     int mb_h = height / 16;
     int n_mbs = mb_w * mb_h;
-    printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup)\n",
-           width, height, n_mbs, iters, warmup);
+    printf("bench_flush_frame: %dx%d (%d MBs), %d iters (%d warmup), substrate=%s\n",
+           width, height, n_mbs, iters, warmup, sub_name);
 
     daedalus_decoder *dec = daedalus_decoder_create(width, height);
     if (!dec) {
         fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
         return 0;
     }
+    if (daedalus_decoder_set_substrate(dec, sub) != 0) {
+        fprintf(stderr, "set_substrate(%s) failed\n", sub_name);
+        return 1;
+    }
     printf("ctx has_qpu=%d\n", daedalus_decoder_has_qpu(dec));
 
     /* Pre-generate per-MB random coeffs once.  We re-append the same
-- 
2.47.3