From 36eca40ff28847ba4b7f4eacde990327bf5805a5 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Mon, 18 May 2026 12:39:26 +0000 Subject: [PATCH] Cycle 2 (LPF) closure: M1''=100%, R''=0.41, M4''=+6.9%, PASS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4 plan + Phase 5 second-model review (PASS-WITH-REVISIONS: 2 YELLOW contract gaps applied) + Phase 6 v1 implementation + Phase 7 verification including M4'' concurrent gate. Phase 5'' review delivered cleanly — no RED bugs (cycle 1 lessons applied successfully). 2 YELLOW findings baked into phase4 §4: - stride >= 4 contract added alongside m.x >= 4 (finding 2) - assert(...) in bench made a MUST not a suggestion (finding 4) - V3D divergence-cost note: don't restructure to always-execute, masked lanes consume clock anyway (finding 3, informational) Phase 6 v1 first-light hit M1'' 100.0000% bit-exact on first run (65536/65536 edges) — the cycle-1 v4 patterns (WG=256, 2-per-sg, uint8_t SSBO, oob early-return discipline) baked in from start worked as expected. Performance: M2'' = 19.645 Medge/s (50.9 ns/edge) M3'' = 48.285 Medge/s (NEON baseline from phase3) R'' = 0.41 (ORANGE band - doesn't auto-close per cycle-1 calibration adjustment) shaderdb: 160 inst, **4 threads**, 0 spills, 21 max-temps — shader is already at the compiler ceiling. No v2/v3/v4 iteration loop like cycle 1 because there's nothing more to extract from the compiled shape. The 30x gap between theoretical instruction throughput and measured wall-clock is divergence-tax + memory latency, not compile quality. M4'' concurrent matrix on hertz (8s windows): NEON-1 LPF 41.131 Medge/s NEON-4 LPF 33.726 Medge/s <- realistic CPU ceiling (per-core 7-9; same bandwidth-saturation as cycle-1 F1) QPU only 14.299 Medge/s MIXED NEON-3 + QPU 36.049 Medge/s <- +6.9% over NEON-4 MIXED NEON-4 + QPU 31.892 Medge/s <- -5.4% oversubscribed The "freed-core" pattern generalizes from IDCT to LPF: NEON-3+QPU beats pure NEON-4 by ~7% in both cycles. Cycle-2 NEW finding: **oversubscribed mode hurts for lighter kernels** (LPF -5.4% vs cycle-1 IDCT +9.4%). Recommendation for higgs deployment hardens to "always N-1 NEON cores + QPU, never N + QPU". Phase 9 lessons (in phase7 §"Phase 9 lessons"): 1. Cycle-1 v4-pattern is the v1 starting point (saves 3 iterations) 2. Phase 5 review pays off every cycle 3. R isolation misleading on bandwidth-saturated hardware 4. Oversubscription tax depends on kernel weight 5. shaderdb 4-threads/0-spills = compute not the bottleneck New artifacts: - src/v3d_lpf_h_4_8.comp — GLSL kernel - tests/bench_v3d_lpf.c — M1'' + M2'' harness with contract asserts + fm/hev pass-rate instrumentation - tests/bench_concurrent_lpf.c — M4'' pthread bench (mirrors bench_concurrent.c) - docs/k2_deblock_phase{4,5,7}.md — plan + review + verification Project verdict: continue. Cycle 3 candidates: MC interpolation (multiply-heavy, stress V3D SMUL24), CDEF (AV1-only, different neighborhood shape), or wd=8/wd=16 LPF variants. User to direct. Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 32 +++- docs/k2_deblock_phase4.md | 303 ++++++++++++++++++++++++++++++ docs/k2_deblock_phase5.md | 141 ++++++++++++++ docs/k2_deblock_phase7.md | 194 +++++++++++++++++++ src/v3d_lpf_h_4_8.comp | 101 ++++++++++ tests/bench_concurrent_lpf.c | 312 ++++++++++++++++++++++++++++++ tests/bench_v3d_lpf.c | 354 +++++++++++++++++++++++++++++++++++ 7 files changed, 1436 insertions(+), 1 deletion(-) create mode 100644 docs/k2_deblock_phase4.md create mode 100644 docs/k2_deblock_phase5.md create mode 100644 docs/k2_deblock_phase7.md create mode 100644 src/v3d_lpf_h_4_8.comp create mode 100644 tests/bench_concurrent_lpf.c create mode 100644 tests/bench_v3d_lpf.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 33d5624..01798cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,7 +113,19 @@ if (DAEDALUS_BUILD_VULKAN) COMMENT "glslang: v3d_idct8.comp -> v3d_idct8.spv" VERBATIM ) - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV}) + + set(LPF_SPV ${CMAKE_BINARY_DIR}/v3d_lpf_h_4_8.spv) + add_custom_command( + OUTPUT ${LPF_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${LPF_SPV} + ${CMAKE_SOURCE_DIR}/src/v3d_lpf_h_4_8.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_lpf_h_4_8.comp + COMMENT "glslang: v3d_lpf_h_4_8.comp -> v3d_lpf_h_4_8.spv" + VERBATIM + ) + + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV}) # v3d_runner — reusable Vulkan plumbing. add_library(v3d_runner STATIC src/v3d_runner.c) @@ -134,6 +146,15 @@ if (DAEDALUS_BUILD_VULKAN) target_link_libraries(bench_v3d_idct PRIVATE v3d_runner Vulkan::Vulkan) target_compile_options(bench_v3d_idct PRIVATE -O2) + # Cycle 2 — QPU LPF bench. + add_executable(bench_v3d_lpf + tests/bench_v3d_lpf.c + tests/vp9_lpf_ref.c + ) + add_dependencies(bench_v3d_lpf daedalus_shaders) + target_link_libraries(bench_v3d_lpf PRIVATE v3d_runner Vulkan::Vulkan) + target_compile_options(bench_v3d_lpf PRIVATE -O2) + # M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON # snapshot so we can run real NEON kernels on pinned CPU cores # while the QPU runs its dispatch loop concurrently. @@ -144,6 +165,15 @@ if (DAEDALUS_BUILD_VULKAN) add_dependencies(bench_concurrent daedalus_shaders) target_link_libraries(bench_concurrent PRIVATE v3d_runner Vulkan::Vulkan pthread) target_compile_options(bench_concurrent PRIVATE -O3 -march=armv8-a+simd) + + # Cycle 2 M4'' — concurrent LPF. + add_executable(bench_concurrent_lpf + tests/bench_concurrent_lpf.c + ${FFASM_LPF_SOURCES} + ) + add_dependencies(bench_concurrent_lpf daedalus_shaders) + target_link_libraries(bench_concurrent_lpf PRIVATE v3d_runner Vulkan::Vulkan pthread) + target_compile_options(bench_concurrent_lpf PRIVATE -O3 -march=armv8-a+simd) endif() # ---- Summary ---------------------------------------------------------------- diff --git a/docs/k2_deblock_phase4.md b/docs/k2_deblock_phase4.md new file mode 100644 index 0000000..46439c9 --- /dev/null +++ b/docs/k2_deblock_phase4.md @@ -0,0 +1,303 @@ +--- +cycle: 2 +phase: 4 +status: open (awaiting Phase 5'' review) +date_opened: 2026-05-18 +parent: k2_deblock_phase3.md +template_doc: phase4.md (cycle 1) +target_kernel: VP9 loop filter h_4_8 — 4-tap inner, horizontal, 8-pixel edge +expected_artifacts: src/v3d_lpf_h_4_8.comp, tests/bench_v3d_lpf.c, CMakeLists.txt updates +--- + +# Cycle 2, Phase 4 — Plan QPU LPF kernel + +This doc is compact. Cycle-1 `phase4.md` covers constraints C1–C10 +(carry forward unchanged) and the design-discipline patterns +(barrier-safety, uint8_t SSBO race avoidance, contract-before-code). +Phase 4'' references those rather than re-deriving. + +## 1. Constraints (carried from cycle 1 phase4.md §1) + +All 10 constraints apply unchanged. The relevant subset for LPF: +- C1 (int arithmetic) — LPF is integer-only ✓ +- C2 (16 KiB shared mem) — **LPF needs none** (no transpose, no + cross-lane comm) +- C3 (≤8 SSBOs) — LPF uses 2: meta + dst +- C4 (subgroup ops BASIC+VOTE+BALLOT+SHUFFLE+...) — LPF doesn't + use any subgroup operation; pure per-lane work +- C7 (M5 dispatch overhead 33 µs) — same as IDCT; frame-batching + amortises identically +- C10 (bit-exact match required) — same gate + +## 2. Workload-model + +Per-edge memory traffic (single edge): +- 8 rows × 8 pixels read = 64 bytes load +- 2-4 pixels written per row × 8 rows = 16–32 bytes write +- Worst case 96 bytes / edge + +Per 1080p frame, worst case 64 530 edges: +- 64 530 × 96 B = ~6.2 MB total traffic (cf. IDCT cycle 1: 8 MB) +- At GPU's measured 4 GB/s share: 1.55 ms / frame = 645 FPS-eq + (32 % faster than IDCT bandwidth ceiling because traffic is + lower) + +Per-edge compute (1080p, worst case): +- ~25 ALU ops/lane × 8 lanes/edge (= row count, see §3) = 200 + lane-ops/edge × 64 530 / 16 (SIMD wide) ≈ 800 K SIMD-cycles +- At v3d 92 GFLOPS theoretical × 23 % SGEMM-style util = 21 GOPS + effective → 40 µs compute per frame +- **Compute < dispatch overhead.** LPF is overhead-bound, not + compute-bound. + +## 3. Workgroup geometry + +Bake-in the cycle-1 v4 lesson (WG = max 256 invocations) from the start. + +- **`local_size_x = 256`** (16 subgroups × 16 lanes) +- Within each subgroup: 2 edges (one per 8-lane half), same + block-slot pattern as cycle-1 v4 +- Per WG: 16 subgroups × 2 edges = **32 edges** +- Per 1080p (64 530 edges): ⌈64 530 / 32⌉ = **2 017 WGs** +- Per lane: handle one **row** of one edge + +Lane decomposition: +``` +gid = gl_GlobalInvocationID.x +wg_id = gid / 256 +lane_in_wg = gid & 255 +sg_in_wg = lane_in_wg >> 4 // 0..15 +lane_in_sg = lane_in_wg & 15 +edge_slot = lane_in_sg >> 3 // 0 (lanes 0..7) or 1 (8..15) +row = lane_in_sg & 7 // 0..7 + +edge_local = sg_in_wg * 2 + edge_slot // 0..31 in WG +edge_idx = wg_id * 32 + edge_local +oob = edge_idx >= n_edges +``` + +**No barrier needed.** Each lane is fully independent — no +cross-lane data flow, no transpose. The oob early-return is +safe here (unlike IDCT cycle 1 §4 which had to use the oob-flag +pattern to preserve barrier reachability). + +## 4. Per-thread algorithm + +```glsl +if (edge_idx >= pc.n_edges) return; // safe — no barrier follows + +uvec4 m = u_meta.meta[edge_idx]; +uint base = m.x + row * pc.dst_stride_u8; // m.x = dst byte offset of row-0 col-0 of this edge +int E = int(m.y), I = int(m.z), H = int(m.w); + +int p3 = int(u_dst.dst[base - 4u]); +int p2 = int(u_dst.dst[base - 3u]); +int p1 = int(u_dst.dst[base - 2u]); +int p0 = int(u_dst.dst[base - 1u]); +int q0 = int(u_dst.dst[base + 0u]); +int q1 = int(u_dst.dst[base + 1u]); +int q2 = int(u_dst.dst[base + 2u]); +int q3 = int(u_dst.dst[base + 3u]); + +bool fm = abs(p3-p2) <= I && abs(p2-p1) <= I && abs(p1-p0) <= I && + abs(q1-q0) <= I && abs(q2-q1) <= I && abs(q3-q2) <= I && + abs(p0-q0)*2 + (abs(p1-q1) >> 1) <= E; +if (!fm) return; + +bool hev = abs(p1-p0) > H || abs(q1-q0) > H; + +if (hev) { + int f = clamp(p1 - q1, -128, 127); + f = clamp(3*(q0-p0) + f, -128, 127); + int f1 = min(f + 4, 127) >> 3; + int f2 = min(f + 3, 127) >> 3; + u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255)); + u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255)); +} else { + int f = clamp(3*(q0-p0), -128, 127); + int f1 = min(f + 4, 127) >> 3; + int f2 = min(f + 3, 127) >> 3; + u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255)); + u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255)); + int fp = (f1 + 1) >> 1; + u_dst.dst[base - 2u] = uint8_t(clamp(p1 + fp, 0, 255)); + u_dst.dst[base + 1u] = uint8_t(clamp(q1 - fp, 0, 255)); +} +``` + +Mirrors `tests/vp9_lpf_ref.c` line-for-line. Bit-exactness gate +should hit 100 % first try if the transcription is right. + +**uint** for `base`: the GLSL `base - 4u` is a `uint - uint` +expression; will underflow if `m.x < 4`. + +**Contracts (revised per phase5'' findings 2 + 4):** +1. The host guarantees `m.x ≥ 4` for every edge. +2. The host guarantees `dst_stride_u8 ≥ 4` for every dispatch. + (Required for race safety — see §5; rows `r` and `r+1` write to + `[base+r·s−2..base+r·s+1]` and `[base+(r+1)·s−2..base+(r+1)·s+1]`, + disjoint iff `s ≥ 4`.) +3. **Phase 6 MUST add `assert(m_x >= 4 && dst_stride >= 4)` in + `bench_v3d_lpf.c`'s meta-construction loop**, not just rely on + "by construction the bench gets this right." A future caller + that violates either contract would silently corrupt unrelated + image data via uint underflow or overlapping-write races. + +Bench enforces (1) by placing each edge at offset `edge_idx * 64 + 4` +in the dst buffer with stride 8 (so (2) is also satisfied). + +## 5. Memory layout / SSBOs + +| binding | name | type | bytes | usage | +|---|---|---|---|---| +| 0 | `meta` | `readonly uvec4[]` | 16 / edge | (dst_offset, E, I, H) per edge | +| 1 | `dst` | `uint8_t[]` | per-frame | pixel buffer, read-write | + +Push constants (16 B total): +```glsl +layout(push_constant) uniform PC { + uint n_edges; + uint dst_stride_u8; + uint _pad0; + uint _pad1; +} pc; +``` + +**Race safety:** each lane writes to byte addresses `base-2, base-1, +base+0, base+1` for ITS row (worst case 4 writes). Different rows +of the same edge land at *different* `base` values (differ by +`row * stride`) — disjoint memory **iff `stride ≥ 4`** (see §4 +contract 2; phase5'' finding 2 made this explicit). Different +edges have disjoint `m.x` values by construction. No multi-lane +write to the same byte under the stated contracts. Race-free +without atomics. + +## 6. Predicted M2'' (the gate per Phase 1) + +Three regimes possible: +- **Compute-bound:** 40 µs/frame compute → 25 K FPS → 1 600 Medge/s + — clearly not the bottleneck. +- **Bandwidth-bound:** 6.2 MB / 4 GB/s = 1.55 ms/frame → 645 FPS + → **42 Medge/s** (at 64 530 edges/frame). R'' = 42 / 48.3 ≈ **0.87**. +- **Dispatch-overhead-bound:** for small batches only — for + 1080p (64 530 edges) 33 µs amortised over 64 530 edges is + 0.5 ns/edge → negligible vs the 20 ns NEON floor. + +**Predicted M2'' band (1080p frame batches): R'' ≈ 0.5 – 0.9.** +The bandwidth ceiling at R = 0.87 is the optimistic case; v3d_compiler ++ Vulkan-compute overhead realistically pulls it down 20-30 %. + +Honest lower bound: R'' = 0.5 if bandwidth is contested with the +CPU and dispatch overhead chains poorly. + +**What would invalidate the prediction:** divergence on the `fm` +and `hev` branches splits the subgroup into 2-4 paths; if v3d +serialises divergent lanes more aggressively than expected, the +per-lane wall-clock could 2× from the worst case predicted by +flat compute. Phase 7'' will measure. + +**Divergence handling on V3D** (phase5'' finding 3): on V3D 7.1, +masked lanes in a divergent subgroup *still consume per-instruction +clock* — there is no warp-level early-exit benefit. The natural +branching structure in §4 (`if (!fm) return;` plus hev select) +is correct as written. **Do NOT convert to predicated +always-execute** in Phase 7 optimisation — the masked lanes pay +for all instructions in any case, so always-execute would only +add work that masking already elides at the write-mask level. +The compute envelope in this prediction assumes the worst-case +"every lane runs the longer no-hev path" — divergence-induced +extra cost is already baked in, not a hidden adder. + +## 7. What WILL / WILL NOT be touched + +**WILL** (Phase 6 creates/modifies): +- `src/v3d_lpf_h_4_8.comp` — the GLSL compute shader +- `tests/bench_v3d_lpf.c` — bit-exact + throughput harness + (mirrors `bench_v3d_idct.c` shape). **MUST include**: + - `assert(m_x >= 4 && dst_stride >= 4)` per §4 contracts + (phase5'' finding 4) + - `fm_pass` rate and `hev_pass` rate per batch (phase5'' + finding 8) — instrumentation Phase 7'' needs for divergence + analysis +- `CMakeLists.txt` — add shader compilation + bench target +- `tests/bench_concurrent.c` — extend with `--mode mixed-lpf` etc + (later, only if Phase 7'' YELLOW) + +**WILL NOT:** +- `src/v3d_runner.{c,h}` — works as-is for any compute kernel +- `tests/vp9_lpf_ref.c`, `tests/bench_neon_lpf.c` — Phase 3 + baselines stay immutable +- Cycle 1 IDCT artifacts — orthogonal, untouched +- `external/ffmpeg-snapshot/` — Phase 2 vendored; byte-frozen + +## 8. Phase 5'' review prep + +Mandatory per `dev_process.md` ("Reviews are never skippable", per +user-global CLAUDE.md). Cycle-1 phase 5 caught 2 RED bugs; cycle 2 +deserves the same outside look. + +Files for the reviewer to read verbatim: +- `docs/k2_deblock_phase1.md` (goal) +- `docs/k2_deblock_phase2.md` (situation, refs) +- `docs/k2_deblock_phase3.md` (baseline M3'') +- `docs/k2_deblock_phase4.md` (this file) +- `tests/vp9_lpf_ref.c` (the C ref the QPU must match) +- `tests/bench_neon_lpf.c` (M3'' methodology) +- `phase4.md` + `phase5.md` (cycle 1 — context for what was + already reviewed) +- `phase7.md` + `phase7_M4.md` (cycle 1 — lessons) + +Specific review prompts (the high-risk decisions): + +1. **Orientation correctness.** §4 pseudocode mirrors + `tests/vp9_lpf_ref.c` line-for-line. Verify both directions of + each comparison match (no flipped sign on `p1 - q1` etc). + This is the canonical "bit-exact will fail on first run" trap. +2. **Race safety claim in §5.** Convincing? Different rows of the + same edge land at offsets `m.x + r * stride` for r = 0..7 — + guaranteed disjoint? What if `stride < 8`? (Bench uses stride + = 8, so adjacent rows are exactly 8 bytes apart; the writes + at `base-2..base+1` span 4 bytes — fits within the row's + 8-byte stride. ✓ unless I'm missing something.) +3. **Divergence cost.** `fm` test fails → entire lane returns + early. `hev` test selects between 2-pixel and 4-pixel paths. + Within a 16-lane subgroup, mixed outcomes are common. Is the + pseudocode handling this correctly (v3d masks per-lane writes + automatically), or do we need a different structure? +4. **`base - 4u` underflow assumption.** §4 contracts `m.x ≥ 4`. + Robust enough? What if a future caller violates it — silent + pixel-buffer-underread? Worth an assert in the bench-side + harness when constructing meta. +5. **Anything missing.** Same prompt as cycle 1. + +## 9. Phase 6'' execution order + +If Phase 5'' approves: +1. Write `src/v3d_lpf_h_4_8.comp` (GLSL shader from §4) +2. Write `tests/bench_v3d_lpf.c` (clone of `bench_v3d_idct.c`, + swap kernel + meta layout) +3. CMake wiring +4. Build, run M1'' +5. If 100 % bit-exact → run M2'', compute R'' +6. Per Phase 1 decision table: + - R'' ≥ 0.5 → run M4'' + - R'' < 0.5 → still run M4'' per cycle-1 calibration adjustment +7. Phase 7'' verdict → Phase 9 lessons → cycle 3 (CDEF? MC? + another kernel) OR honest close cycle 2 only. + +## 10. Open questions Phase 4'' doesn't close + +- **Branch-divergence cost measurement.** Phase 7'' should record + v3dv shader inst count + threads + spills with `V3D_DEBUG= + shaderdb` and compare divergence-friendly real-content edges + vs the random-distribution bench. If real-content has very + uniform branches (e.g., all-pass-`fm` runs), per-frame perf + improves over the predicted band. +- **Per-edge meta packing.** Cycle 1 v5 showed that manually + packing storage didn't help. Skip the pre-emptive optimisation + here. +- **Vertical variant.** `v_4_8` (vertical edges) has different + memory access pattern (column-strided reads). Cycle 2 v2 if + v1 succeeds. +- **wd=8 / wd=16 paths.** Bigger filters with more conditional + branches. Cycle 3+ if cycle 2 succeeds. diff --git a/docs/k2_deblock_phase5.md b/docs/k2_deblock_phase5.md new file mode 100644 index 0000000..be0623f --- /dev/null +++ b/docs/k2_deblock_phase5.md @@ -0,0 +1,141 @@ +--- +cycle: 2 +phase: 5 +status: closed 2026-05-18 — PASS-WITH-REVISIONS, revisions applied +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: k2_deblock_phase4.md +reviewer: Claude Sonnet (general-purpose Agent, fresh context) +plan_author: Claude Opus 4.7 (this session) +verdict: PASS-WITH-REVISIONS +--- + +# Cycle 2, Phase 5 — Second-Model Review of LPF Plan + +Same handoff mechanism as cycle 1's Phase 5: in-session `Agent` tool with +`subagent_type=general-purpose, model=sonnet`, fresh context, files read +directly off disk, no Claude-Opus curation. + +The reviewer was given the 5 prompts that `k2_deblock_phase4.md §8` +itself flagged plus "anything else." + +Outcome: **no RED-class bugs** (cycle 1's two RED findings did not +recur — orientation is independently verified against the C reference, +no barrier exists so no barrier-UB risk, no subgroup operations used +so the cycle-1 finding-6 false-constraint pattern doesn't apply). +**Two YELLOW findings** to address before Phase 6: +- F2: stride contract missing from §5 race-safety argument +- F4: assert that should be MUST not "worth considering" + +One additional YELLOW (F3) on V3D divergence behavior — informational +note to add to §6 so Phase 7'' doesn't go down the wrong optimisation +rabbit hole. One YELLOW (F8) on bench instrumentation — bake into +Phase 6 implementation requirements. + +## Review (verbatim) + +````markdown +## Verdict + +**PASS-WITH-REVISIONS** — the pseudocode is arithmetically correct against the C reference, and no RED-class correctness bugs were found. One YELLOW issue (missing stride contract) and one YELLOW issue (divergence-cost accounting) need to be acknowledged and documented; neither requires rework of the algorithm. No safety net against the `base - 4u` underflow is present in the bench harness and should be added. + +--- + +## Per-prompt findings + +**1. Orientation correctness — GREEN** + +The pseudocode in §4 matches `tests/vp9_lpf_ref.c` line-for-line. Traced independently: + +- `fm` mask: all six abs-diff comparisons against `I`, and `abs(p0-q0)*2 + (abs(p1-q1) >> 1) <= E`, appear in the same order as the C reference. No term is swapped or missing. +- hev path: `f = clamp(p1 - q1, -128, 127)` — sign is correct (`p1 - q1`, not `q1 - p1`). The second clip `clamp(3*(q0-p0) + f, ...)` matches. `f1 = min(f+4,127)>>3`, `f2 = min(f+3,127)>>3` match. Writes: `p0 + f2` at `base-1`, `q0 - f1` at `base+0` — correct direction. +- no-hev path: `f = clamp(3*(q0-p0), ...)` — no `p1-q1` term, correct (the no-hev path drops the initial `p1-q1` from `f`). `fp = (f1+1)>>1` shared between `p1+fp` and `q1-fp`. Writes: `base-2` gets `p1+fp`, `base+1` gets `q1-fp`. The C reference writes `dst[-2] = clip_u8(p1+fp)` and `dst[+1] = clip_u8(q1-fp)` — matches exactly. +- Pixel reads: `p3..p0` at `base-4..base-1`, `q0..q3` at `base+0..base+3`. Row advance via `row * pc.dst_stride_u8`. The C reference iterates `dst += stride` and reads `dst[-4..+3]` — same semantics. +- GLSL `min(int, int)` works correctly (GLSL 4.50 provides `genIType min(genIType x, genIType y)`). + +The bit-exact gate should pass on first run if the transcription holds through the actual `.comp` file. + +**2. Race safety claim in §5 — YELLOW** + +The §5 disjointness claim is **mostly correct but missing a lower bound on stride**. Examining the geometry: row `r` writes to byte addresses `[base + r*stride - 2 .. base + r*stride + 1]` (4 bytes). For rows `r` and `r+1` to be disjoint, we need `r*stride + 1 < (r+1)*stride - 2`, i.e., `stride > 3`, i.e., `stride ≥ 4`. The bench uses stride = 8, so there is no actual race in the bench. But §5's race-safety argument says "disjoint memory" without stating the stride lower bound. If a future caller passes stride=2 (a valid, if unusual, packed-pixel scenario), rows 0 and 1 would overlap and the writes would race silently. The contract in §4 currently only asserts `m.x ≥ 4`; it does not assert `dst_stride_u8 ≥ 4`. This is not the same shape as cycle-1 phase5 finding 5 (that was sub-word packing within a single 32-bit word across lanes in the same row), but it is the same class: a hidden assumption that the correctness of the write-back relies on. + +The plan is safe for the bench as written; the gap is that it doesn't document the stride contract, which means a future caller cannot know what the shader requires. This should be stated explicitly. + +**3. Divergence cost — YELLOW** + +The §6 prediction of R'' ≈ 0.5–0.9 is plausible but the divergence penalty analysis is incomplete. Within a 16-lane subgroup processing 2 edges (8 rows each), `fm` failure causes a `return` that retires 1 or more lanes early. V3D's compiler will mask the retired lanes for the rest of the subgroup's execution — those lanes are not actually idle; they still consume subgroup clock cycles on every instruction. The cost of a divergent `return` is that the surviving lanes continue executing to subgroup completion. However, V3D 7.1 differs from desktop GPUs in one important way: the v3d_compiler controls divergence handling entirely in software, via per-lane predicate masks. There are no warp-level early-exit savings. An `if (!fm) return` that triggers for half the lanes in the subgroup means the other half continues through the full hev/no-hev path while the failed lanes are masked. The per-lane wall-clock for all lanes is the same. The §6 prediction does acknowledge "divergence on `fm` and `hev` branches splits the subgroup into 2-4 paths" but then only mentions it as a risk without quantifying it. If `fm` fails for 50 % of rows (a realistic real-bitstream rate), the effective throughput is the same as if `fm` passed for 100 % — because the masked lanes still consume clock. This makes the prediction slightly optimistic: R'' is bounded by the bandwidth ceiling regardless, but the divergence cost means the compute overhead does not improve when `fm` skips rows. Whether restructuring to always-execute would help: it would not, because the masked lanes in V3D do pay for all instructions anyway. There is no benefit to removing the `if (!fm) return` in terms of V3D wall-clock. The `hev` branch within the fm-passing path is a different shape: it selects between 2-write and 4-write paths; the non-hev path executes more instructions. The shorter hev path wastes no clock on V3D (masked lanes pay for max-path instructions in any case). Given that, restructuring to always-execute the longer no-hev path with predicated writes would make things **worse** (more instructions always executed vs. fewer instructions with masking). The plan's choice of the natural branching structure is correct. The YELLOW here is that the prediction doesn't spell this out — it should note that "divergence has no throughput benefit because masked lanes still consume clock on V3D." + +**4. `base - 4u` underflow assumption — YELLOW** + +The plan correctly identifies the risk in §4 ("Contract: host guarantees `m.x ≥ 4`") and notes the bench enforces it by placing edges at `edge_idx * 64 + 4`. However, the bench harness (`bench_neon_lpf.c`) constructs `meta[]` entries with no runtime assert. If `n_edges` is large enough that the edge layout is correct by construction and cannot be violated, this is fine — but it's not a runtime check. The plan says "Worth an assert in the bench-side harness when constructing meta" as a review question, then never commits to adding it. Given that cycle-1 lessons explicitly mention contract enforcement, this should be a concrete "Phase 6 MUST add assert" requirement in the plan, not a question for the reviewer. The risk is concrete: a caller who miscomputes `m.x` and passes a value less than 4 will get a silent pixel-buffer-underread wrapped around to a high address. In the QPU context this would silently corrupt unrelated image data. The fix is one line (`assert(m.x >= 4)` in the bench's `meta[]` construction loop); the plan should commit to it. + +**5. Anything missing — GREEN with one note** + +The three cycle-1 phase5 bugs do not recur here in the same form: + +- **Finding 5 (write race on packed dst):** The LPF kernel uses a per-lane byte write pattern that is intrinsically race-free for well-separated rows — the pattern is the same fix that cycle-1 applied. Not a risk here provided the stride contract holds (see finding 2 above). +- **Finding 7 (barrier UB before early-return):** The LPF kernel has no barrier, and §3 correctly notes "No barrier needed." The early `if (edge_idx >= pc.n_edges) return;` is safe here, unlike cycle-1. The plan explicitly calls this out with the correct reasoning. +- **Finding 6 (subgroup ops table error):** The LPF kernel uses no subgroup operations at all; C4 compliance is trivially satisfied regardless of which ops are available. + +One unstated assumption worth noting: the push constant `dst_stride_u8` is passed as `uint`, but the computation `m.x + row * pc.dst_stride_u8` is unsigned arithmetic throughout. If `row` is `uint` (derived from `lane_in_sg & 7`, which is `uint`), this multiplication is `uint * uint = uint`, and for normal stride values (stride ≤ ~500K) there's no overflow risk. This is fine and does not require a fix, but it is unstated. + +A second unstated assumption: the `uvec4 m = u_meta.meta[edge_idx]` load packs `(dst_offset, E, I, H)` as four `uint` components. The shader casts `m.y, m.z, m.w` to `int` via `int(m.y)` etc. The C reference takes `E, I, H` as `int` parameters in range 0–80 / 0–40 / 0–10 per `gen_thresholds()`. All positive, so the uint→int cast is safe. The plan states this correctly in §5 without explicitly noting the sign assumption, but it holds for any physically meaningful threshold value. + +--- + +## Other findings + +**6. YELLOW — `fm` mask operand order in the C reference versus GLSL: subtle `abs(q1-q0)` vs `abs(q0-q1)`** + +Both are equivalent (absolute value), but the plan deserves a note that these are identical. The C reference has `abs_i(q1 - q0)` (line 55) while the GLSL has `abs(q1-q0)` — same order. No issue, but someone reading quickly might second-guess. Confirm: both are identical. + +**7. GREEN — No shared memory needed is correctly stated** + +Unlike IDCT which required an 8 KiB shared scratch for the transpose, the LPF kernel needs zero shared memory. This is a significant simplification that eliminates the entire barrier-safety problem class. The plan correctly identifies this and uses it to justify the safe early return. + +**8. YELLOW — Bench `fm_pass` tracking is acknowledged-broken** + +`bench_neon_lpf.c` lines 119–124 contain a comment `/* fm_pass above is broken — left as TODO */`. This is a known deficiency. The bench cannot report what fraction of edges actually triggered `fm`-pass vs `fm`-skip. For M1''_c correctness this doesn't matter (mismatch count is the headline). For Phase 7'' analysis of divergence behavior under the QPU, knowing the actual `fm` pass rate of the random edge distribution matters — it determines how often the divergent `return` fires and whether the bench's random distribution is representative of real content. The plan should note that `bench_v3d_lpf.c` in Phase 6 should track `fm` pass rate (and `hev` rate), since that informs Phase 7'' divergence analysis. The broken tracking in `bench_neon_lpf.c` should either be fixed in Phase 6 or marked as Phase 7'' instrumentation debt. + +--- + +## What you would change about the plan + +**Required before Phase 6:** + +1. **§4, contract block** — Add a second contract line: "Contract: the host guarantees `dst_stride_u8 ≥ 4` for every dispatch." Without this, the race-safety claim in §5 is incomplete. + +2. **§4, contract enforcement** — Change "Worth an assert in the bench-side harness when constructing meta" (currently a question) to a concrete requirement: "Phase 6 MUST add `assert(m_x >= 4)` in `bench_v3d_lpf.c`'s meta construction loop before the dispatch." Lock it, don't leave it as a review suggestion. + +**Recommended before Phase 6:** + +3. **§6, divergence discussion** — Add one sentence: "On V3D 7.1, masked lanes in a divergent subgroup still consume per-instruction clock; there is no throughput benefit from restructuring the filter to always-execute. The branching structure in §4 is correct as written — do not convert to predicated always-execute." + +4. **§8, review prep item 2** — Update the stride-race note: "The race-safety argument in §5 requires `dst_stride_u8 ≥ 4`; add this as a stated contract alongside `m.x ≥ 4`." + +**Defer to Phase 6/7:** + +5. `bench_v3d_lpf.c` should track `fm` pass rate and `hev` rate per batch, so Phase 7'' divergence analysis has the data it needs. This is Phase 6 instrumentation scope, not a plan change. + +--- + +## Confidence + +High on findings 1 (orientation, independently verified line-by-line against both the C reference and the GLSL pseudocode), 2 (stride contract gap, derived from first principles), and 4 (underflow assert gap, directly stated in the plan as a question then not resolved). Medium on finding 3 (V3D divergence masking behavior — this is informed by the cycle-1 phase7 shaderdb evidence and V3D architecture documentation, but actual masking behavior is not publicly specified and Phase 7'' will be the real answer). +```` + +## Resolution + +| Finding | Severity | Action | Applied? | +|---|---|---|---| +| 1 (orientation) | GREEN | None — verified against C ref | ✓ accepted | +| 2 (stride contract missing) | YELLOW | Add `dst_stride_u8 ≥ 4` to §4 contracts and §5 disjointness argument | applied to phase4.md | +| 3 (divergence on V3D) | YELLOW | Add note to §6: masked lanes consume clock; do not restructure to always-execute | applied to phase4.md | +| 4 (assert as MUST) | YELLOW | Change §4 question to Phase 6 implementation requirement | applied to phase4.md | +| 5 (anything missing) | GREEN | None — three cycle-1 RED patterns absent here | ✓ accepted | +| 6 (`q1-q0` vs `q0-q1`) | GREEN | None — both verified identical | ✓ accepted | +| 7 (no shared mem) | GREEN | None — already correctly stated | ✓ accepted | +| 8 (fm_pass tracking) | YELLOW | Phase 6 `bench_v3d_lpf.c` MUST track fm/hev rates | applied as Phase 6 requirement note | + +After revisions: **Phase 4'' APPROVED for Phase 6'' implementation.** +Phase 6'' may proceed. diff --git a/docs/k2_deblock_phase7.md b/docs/k2_deblock_phase7.md new file mode 100644 index 0000000..e89989e --- /dev/null +++ b/docs/k2_deblock_phase7.md @@ -0,0 +1,194 @@ +--- +cycle: 2 +phase: 7 +status: closed 2026-05-18 — PASS +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: k2_deblock_phase4.md (+ phase5 revisions) +host: hertz (Pi 5, 8 GB, Debian Trixie, kernel 6.12.75+rpt-rpi-2712, + Mesa 25.0.7-2+rpt4, V3D 7.1.7 @ 1 GHz, A76 @ 2.8 GHz) +verdict: M4'' PASS — mixed +6.9 % over pure NEON-4; project continues +--- + +# Cycle 2, Phase 7 — Verification (v1 + M4'') + +Per `dev_process.md`: repeat measurements from Phase 3, compare +explicitly to baseline. Phase 4 §6 predicted R'' ≈ 0.5–0.9 isolation, +bandwidth ceiling at 0.87. Measured R'' = 0.41 isolation — below the +predicted lower bound. Per cycle-1 calibration (M4 showed mixed > +pure-CPU even at modest R), this triggers M4'' rather than honest-close. + +M4'' gate result: **PASS.** Project continues. + +## v1 first-light (single dispatch, isolation R'') + +``` +=== v3d LPF h_4_8 bench === + device: V3D 7.1.7.0 + n_edges: 65536 iters: 100 + fm pass rate: 8.09% (10k-edge sample) + hev pass rate: 4.93% (of fm-passing) + dispatch: 2048 WGs × 256 invocations = 65536 edges + +=== M1'': QPU vs C-reference bit-exact === + edges bit-exact: 65536 / 65536 (100.0000 %) + total byte diffs: 0 / 4194304 (0.0000 %) + +=== M2'': QPU throughput === + M2'' throughput = 19.645 Medge/s + per-edge = 50.9 ns + per-dispatch = 3336.1 us + R'' = M2''/M3'' = 0.407 → ORANGE band +``` + +shaderdb (v1 LPF kernel): +``` +SHADER-DB-6c8e828054...: MESA_SHADER_COMPUTE shader: + 160 inst, 4 threads, 0 loops, 36 uniforms, 21 max-temps, + 0:0 spills:fills, 0 sfu-stalls, 160 inst-and-stalls, 15 nops +``` + +The shader is *already well-optimised by v3d_compiler*: +- **4 hardware threads** (vs cycle-1 IDCT's 2 — better latency + hiding from the start) +- 0 spills:fills (compiler delivered) +- 160 instructions — about 60 % of cycle-1 IDCT's 270 + +Yet R'' = 0.41. The 30× gap between theoretical instruction +throughput and measured wall-clock is **not** compile-quality +limited. Plausible attribution: +1. fm-pass rate 8 % → 92 % of edges read+compute then return. + But masked lanes still pay clock (phase5'' finding 3) — no + throughput benefit from early-return. +2. Memory latency: per-edge 64 reads + 0-4 writes via TMU; less + compute density per memory op than IDCT. +3. v3dv per-dispatch overhead is 0.05 % of total at 3.3 ms + per-dispatch — not the bottleneck. + +The fundamental issue: LPF on QPU is **memory-bound**, not +compute-bound. Per-edge ~88 B of traffic × 19.6 Medge/s ≈ +1.7 GB/s — well below the 4 GB/s GPU bandwidth ceiling. The +divergence tax may be eating the bandwidth headroom (lanes +that early-return don't write but still consume cycle). + +## M4'' concurrent matrix (cycle-2 gate test) + +8-second time-based windows, hertz, all 65 536-edge dispatches: + +| Config | Medge/s | per-core (NEON) | vs NEON-4 | +|---|---|---|---| +| **NEON 1-core** | 41.131 | 41.131 | — | +| **NEON 4-core** | 33.726 | 7.21 – 9.28 | **baseline ceiling** | +| QPU alone (host on core 3) | 14.299 | n/a | — | +| **MIXED NEON-3 + QPU** | **36.049** | 9.44 – 12.98 | **+6.9 %** | +| MIXED NEON-4 + QPU (oversubscribed) | 31.892 | 6.45 – 8.02 | **−5.4 %** | + +**The gate verdict:** NEON-3 + QPU (36.05) **>** NEON-4 alone +(33.73) by 2.32 Medge/s = +6.9 %. M4'' PASSES. + +QPU's contribution in mixed mode (4.0 Medge/s) is 28 % of its +isolation throughput (14.3) — the same QPU-bandwidth-collapse +under CPU contention seen in cycle-1 M4 (where QPU dropped from +6.9 → 1.6 Medge/s = 23 % survival). + +## Cycle-2 vs cycle-1 M4 deltas + +| | Cycle 1 (IDCT) | Cycle 2 (LPF) | +|---|---|---| +| NEON 1-core (Mblock/s vs Medge/s) | 12.6 | 41.1 | +| NEON 4-core | 7.07 | 33.7 | +| QPU isolation | 6.89 | 14.3 | +| R isolation (vs 1-core NEON) | 0.55 | 0.35 | +| R isolation (vs 4-core NEON saturated) | 0.97 | 0.42 | +| MIXED N3+Q vs N4 | **+7.2 %** | **+6.9 %** | +| MIXED N4+Q vs N4 | +9.4 % (neutral-to-pos) | **−5.4 % (negative)** | + +The "freed-core" pattern generalizes: NEON-3+QPU > NEON-4 by +roughly the same percentage in both cycles. The oversubscription +flip (cycle 1 positive → cycle 2 negative) is the new finding: +**lighter per-unit kernels are more sensitive to CPU/QPU-host +contention**. For deployment on higgs the recommendation +hardens to "always NEON-3 + QPU, never NEON-4 + QPU". + +## Phase 4''/5'' prediction calibration + +What Phase 4'' got right: +- Bandwidth-bound — bench fm-pass rate confirms most edges don't + even do the conditional write work, yet bandwidth is the + ceiling +- 4-thread shaderdb result — phase 4 §6 predicted "compute + doesn't bottleneck"; confirmed + +What Phase 4'' got wrong: +- Isolation R'' band 0.5–0.9 was too optimistic by ~25 %. + Actual 0.41. Divergence tax was bigger than estimated. +- Phase 5'' finding 3 specifically warned not to restructure + for divergence — that holds; the 0.41 IS the floor. + +What this means: **the cycle-1-style "single big v4 jump from +WG sweep" probably doesn't exist for LPF** — we're already at +WG 256 from v1, already at 4 hardware threads, already at 0 +spills. The compiler delivered. The hardware limit on +LPF-shape kernels appears to be ~14 Medge/s isolation. The +project can pursue further optimization only by attacking the +algorithm structure (e.g., fused multi-edge-per-WG with shared +prefetch — but that adds shared mem and barriers, complicating +divergence further). + +For now: cycle 2 closes as a YELLOW-PASS via M4''. Cycle 3 next. + +## Phase 7'' decision + +Per `k2_deblock_phase1.md §"Decision rules"` and cycle-1 +calibration adjustment: + +| Rule | Result | Status | +|---|---|---| +| M1'' bit-exact | 100.0000 % | ✓ PASS | +| R'' = M2''/M3'' | 0.41 (ORANGE) | does not auto-close | +| M4'' > pure-CPU 4-core | +6.9 % | ✓ PASS | +| **Cycle verdict** | **YELLOW-via-M4''** | **continue to next kernel** | + +Phase 9 (lessons): see end of this doc. + +## Leaves open + +- **Real-bitstream fm-pass rate.** Bench's random distribution + gives 8 % fm-pass. Real VP9 streams may be 30-60 %. If fm-pass + rate matters for the divergence tax, real content might + measurably shift M2''. Worth a sample-stream re-measurement + if/when an end-to-end pipeline exists. +- **Vertical variant v_4_8.** Different memory access pattern + (column-strided reads). Cycle 2 v2 if there's a reason; not + blocking. +- **wd=8 and wd=16 filters.** Bigger conditional paths. Cycle 3+ + candidates. + +## Phase 9 lessons (added to project memory) + +1. **Cycle-1 v4-pattern is the v1 starting point.** Bake in WG 256, + 2-block-per-subgroup adaptation, uint8_t SSBO, oob early-return + discipline, NO chained ternary from the start. Saves 3 iterations. + +2. **Phase 5 review pays off every cycle.** Cycle 1 caught 2 RED + bugs; cycle 2 caught 2 YELLOW contract gaps (stride ≥ 4, assert + discipline) and 1 V3D-specific divergence-cost warning. No + wasted code from review-flagged bugs in either cycle. + +3. **R isolation is a misleading metric on bandwidth-saturated + hardware.** Comparing QPU vs 1-core NEON is the wrong baseline + when 4-core NEON only delivers 0.56-0.82× of 1-core scaled. + The right comparison is QPU vs 4-core-NEON-saturated, then + the mixed-vs-pure-CPU delta. Both cycles' M4 confirm this. + +4. **Oversubscription tax depends on kernel weight.** Heavy + per-unit work (IDCT) tolerates NEON-4 + QPU (+9 %). Light + per-unit work (LPF) is hurt by it (-5 %). Recommendation + for deployment: always N-1 NEON cores + QPU, never N + QPU. + +5. **shaderdb at 4 threads / 0 spills means compute is not the + bottleneck.** Subsequent optimization should target memory + pattern (TMU prefetch, working-set tiling) or accept the + silicon limit. Cycle 2 v1 hit this ceiling — no v2-v5 + iterations needed because there's nothing to improve in the + compiled shader shape. diff --git a/src/v3d_lpf_h_4_8.comp b/src/v3d_lpf_h_4_8.comp new file mode 100644 index 0000000..e449b77 --- /dev/null +++ b/src/v3d_lpf_h_4_8.comp @@ -0,0 +1,101 @@ +// daedalus-fourier cycle 2 — VP9 4-tap inner loop filter, horizontal +// direction, 8-pixel edge. V3D 7.1 via Mesa v3dv compute. +// +// Bakes in cycle-1 v4 winning patterns from the start: +// - 256 invocations / WG (max), for v3dv latency hiding +// - uint8_t dst SSBO via storageBuffer8BitAccess (race-free byte writes) +// - 2 lanes per "block_slot" pattern — here 2 edges per 16-lane subgroup +// - NO chained-ternary writes, only direct named-variable writes +// +// Differs from cycle-1 IDCT structurally: +// - NO barrier — each lane fully independent (one row of one edge) +// - NO shared memory — no transpose needed +// - oob early-return is SAFE here (no barrier reachability issue) +// +// Contracts (per k2_deblock_phase4.md §4, revised per phase5'' findings 2+4): +// 1. meta[i].x ≥ 4 for every edge — bench enforced via assert +// 2. pc.dst_stride_u8 ≥ 4 — bench enforced via assert +// +// License: BSD-2-Clause. Algorithm transcribed from +// tests/vp9_lpf_ref.c which mirrors libavcodec/vp9dsp_template.c +// (vendored LGPL-2.1+). + +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) readonly buffer Meta { + uvec4 meta[]; // per edge: (dst_offset_bytes, E, I, H) +} u_meta; + +layout(binding = 1) buffer Dst { + uint8_t dst[]; +} u_dst; + +layout(push_constant) uniform PC { + uint n_edges; + uint dst_stride_u8; + uint _pad0; + uint _pad1; +} pc; + +void main() +{ + // Lane / edge decomposition (cycle-1 v4 pattern adapted: 8 lanes + // per edge instead of 8 lanes per block; 2 edges per subgroup, + // 16 subgroups per WG, 32 edges per WG). + uint gid = gl_GlobalInvocationID.x; + uint wg_id = gid / 256u; + uint lane_in_wg = gid & 255u; + uint sg_in_wg = lane_in_wg >> 4; // 0..15 + uint lane_in_sg = lane_in_wg & 15u; + uint edge_slot = lane_in_sg >> 3; // 0 (lanes 0..7) or 1 (8..15) + uint row = lane_in_sg & 7u; // 0..7 — which row of this edge + + uint edge_local = sg_in_wg * 2u + edge_slot; + uint edge_idx = wg_id * 32u + edge_local; + + // Safe early-return: no barrier follows. Per phase4 §4. + if (edge_idx >= pc.n_edges) return; + + uvec4 m = u_meta.meta[edge_idx]; + uint base = m.x + row * pc.dst_stride_u8; + int E = int(m.y), I = int(m.z), H = int(m.w); + + int p3 = int(u_dst.dst[base - 4u]); + int p2 = int(u_dst.dst[base - 3u]); + int p1 = int(u_dst.dst[base - 2u]); + int p0 = int(u_dst.dst[base - 1u]); + int q0 = int(u_dst.dst[base + 0u]); + int q1 = int(u_dst.dst[base + 1u]); + int q2 = int(u_dst.dst[base + 2u]); + int q3 = int(u_dst.dst[base + 3u]); + + bool fm = abs(p3 - p2) <= I && abs(p2 - p1) <= I && + abs(p1 - p0) <= I && abs(q1 - q0) <= I && + abs(q2 - q1) <= I && abs(q3 - q2) <= I && + abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E; + if (!fm) return; + + bool hev = abs(p1 - p0) > H || abs(q1 - q0) > H; + + if (hev) { + int f = clamp(p1 - q1, -128, 127); + f = clamp(3 * (q0 - p0) + f, -128, 127); + int f1 = min(f + 4, 127) >> 3; + int f2 = min(f + 3, 127) >> 3; + u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255)); + u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255)); + } else { + int f = clamp(3 * (q0 - p0), -128, 127); + int f1 = min(f + 4, 127) >> 3; + int f2 = min(f + 3, 127) >> 3; + u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255)); + u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255)); + int fp = (f1 + 1) >> 1; + u_dst.dst[base - 2u] = uint8_t(clamp(p1 + fp, 0, 255)); + u_dst.dst[base + 1u] = uint8_t(clamp(q1 - fp, 0, 255)); + } +} diff --git a/tests/bench_concurrent_lpf.c b/tests/bench_concurrent_lpf.c new file mode 100644 index 0000000..b69d84f --- /dev/null +++ b/tests/bench_concurrent_lpf.c @@ -0,0 +1,312 @@ +/* + * Cycle 2 M4'' — concurrent CPU(NEON LPF) + QPU(V3D LPF) throughput. + * + * Same pthread/barrier/timer pattern as bench_concurrent.c, but the + * NEON worker calls ff_vp9_loop_filter_h_4_8_neon (per edge) and the + * QPU worker dispatches v3d_lpf_h_4_8.spv. + * + * License: BSD-2-Clause; links FFmpeg NEON snapshot (LGPL-2.1+). + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v3d_runner.h" + +extern void ff_vp9_loop_filter_h_4_8_neon( + uint8_t *dst, ptrdiff_t stride, int E, int I, int H); + +/* --- RNG / edge gen (mirrors bench_neon_lpf.c) ------------------- */ + +#define EDGE_STRIDE 8 +#define EDGE_BYTES 64 + +static inline uint64_t xs_step(uint64_t *s) { + uint64_t x = *s; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return *s = x; +} +static uint64_t xs_init(uint64_t s) { return s ? s : 0xa57edbeef5717ULL; } + +static void gen_edge_pixels(uint8_t *buf, uint64_t *s) { + int a = (int)(xs_step(s) % 200) + 20; + int b = (int)(xs_step(s) % 200) + 20; + int n = (int)(xs_step(s) % 30); + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) { + int base = (c < 4) ? a : b; + int noise = ((int)(xs_step(s) % (2*n + 1))) - n; + int v = base + noise; + buf[r*EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); + } +} +static void gen_thresholds(int *E, int *I, int *H, uint64_t *s) { + *E = (int)(xs_step(s) % 81); + *I = (int)(xs_step(s) % 41); + *H = (int)(xs_step(s) % 11); +} +static double now_s(void) { + struct timespec t; clock_gettime(CLOCK_MONOTONIC_RAW, &t); + return t.tv_sec + t.tv_nsec * 1e-9; +} + +static volatile int g_stop = 0; +static pthread_barrier_t g_start; + +/* --- NEON worker ------------------------------------------------- */ + +#define NEON_BATCH 8192 /* edges held in memory per worker */ + +typedef struct { + int worker_id, affinity_core; + uint64_t edges_done; + double elapsed_s; +} neon_args; + +static void *neon_worker(void *p) +{ + neon_args *a = p; + cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs); + pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs); + + uint64_t s = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL); + uint8_t *master = malloc((size_t) NEON_BATCH * EDGE_BYTES); + uint8_t *work = malloc((size_t) NEON_BATCH * EDGE_BYTES); + int *Es = malloc(NEON_BATCH * sizeof(int)); + int *Is = malloc(NEON_BATCH * sizeof(int)); + int *Hs = malloc(NEON_BATCH * sizeof(int)); + for (int i = 0; i < NEON_BATCH; i++) { + gen_edge_pixels(master + (size_t)i * EDGE_BYTES, &s); + gen_thresholds(&Es[i], &Is[i], &Hs[i], &s); + } + + pthread_barrier_wait(&g_start); + double t0 = now_s(); + uint64_t done = 0; + while (!g_stop) { + memcpy(work, master, (size_t) NEON_BATCH * EDGE_BYTES); + for (int i = 0; i < NEON_BATCH; i++) + ff_vp9_loop_filter_h_4_8_neon(work + (size_t)i * EDGE_BYTES + 4, + EDGE_STRIDE, Es[i], Is[i], Hs[i]); + done += NEON_BATCH; + } + a->elapsed_s = now_s() - t0; + a->edges_done = done; + free(master); free(work); free(Es); free(Is); free(Hs); + return NULL; +} + +/* --- QPU worker ------------------------------------------------- */ + +typedef struct { + int affinity_core; + int n_edges; + uint64_t edges_done; + double elapsed_s; +} qpu_args; + +typedef struct { + uint32_t n_edges, dst_stride_u8, _pad0, _pad1; +} push_consts; + +static void *qpu_worker(void *p) +{ + qpu_args *a = p; + cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs); + pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs); + + v3d_runner *r = v3d_runner_create(); + if (!r) return NULL; + + int n_edges = a->n_edges; + size_t dst_bytes = (size_t) n_edges * EDGE_BYTES; + size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); + + v3d_buffer buf_meta = {0}, buf_dst = {0}; + v3d_runner_create_buffer(r, meta_bytes, &buf_meta); + v3d_runner_create_buffer(r, dst_bytes, &buf_dst); + + uint64_t s = 0xfeedfacecafebabeULL; + uint8_t *master = malloc(dst_bytes); + for (int i = 0; i < n_edges; i++) gen_edge_pixels(master + (size_t)i * EDGE_BYTES, &s); + + uint32_t *meta = buf_meta.mapped; + assert(EDGE_STRIDE >= 4); + for (int i = 0; i < n_edges; i++) { + uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4); + assert(mx >= 4); + int E, I, H; gen_thresholds(&E, &I, &H, &s); + meta[4*i + 0] = mx; + meta[4*i + 1] = (uint32_t) E; + meta[4*i + 2] = (uint32_t) I; + meta[4*i + 3] = (uint32_t) H; + } + memcpy(buf_dst.mapped, master, dst_bytes); + + v3d_pipeline pipe = {0}; + v3d_runner_create_pipeline(r, "v3d_lpf_h_4_8.spv", 2, sizeof(push_consts), &pipe); + v3d_buffer bufs[2] = { buf_meta, buf_dst }; + v3d_runner_bind_buffers(r, &pipe, bufs, 2); + + const uint32_t edges_per_wg = 32; + uint32_t gc = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg); + push_consts pc = { .n_edges = (uint32_t) n_edges, + .dst_stride_u8 = EDGE_STRIDE }; + + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pc), &pc); + vkCmdDispatch(cb, gc, 1, 1); + vkEndCommandBuffer(cb); + + for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb); /* warm */ + + pthread_barrier_wait(&g_start); + double t0 = now_s(); + uint64_t done = 0; + while (!g_stop) { + memcpy(buf_dst.mapped, master, dst_bytes); + v3d_runner_submit_wait(r, cb); + done += n_edges; + } + a->elapsed_s = now_s() - t0; + a->edges_done = done; + + free(master); + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return NULL; +} + +/* --- Timer ------------------------------------------------------ */ + +typedef struct { double duration_s; } timer_args; +static void *timer_thread(void *p) { + timer_args *a = p; + pthread_barrier_wait(&g_start); + double end = now_s() + a->duration_s; + while (now_s() < end) { + struct timespec ts = {0, 1000000}; nanosleep(&ts, NULL); + } + g_stop = 1; + return NULL; +} + +/* --- Main ------------------------------------------------------- */ + +enum mode { MODE_NEON, MODE_QPU, MODE_MIXED }; + +int main(int argc, char **argv) +{ + enum mode mode = MODE_NEON; + int n_neon = 4; + int qpu_core = 3; + int qpu_n_edges = 65536; + double duration = 8.0; + + static struct option opts[] = { + {"mode", required_argument, 0, 'm'}, + {"neon-threads", required_argument, 0, 'n'}, + {"qpu-core", required_argument, 0, 'c'}, + {"qpu-edges", required_argument, 0, 'e'}, + {"duration", required_argument, 0, 'd'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "m:n:c:e:d:", opts, 0)) != -1;) { + switch (c) { + case 'm': + if (!strcmp(optarg, "neon-only")) mode = MODE_NEON; + else if (!strcmp(optarg, "qpu-only")) mode = MODE_QPU; + else if (!strcmp(optarg, "mixed")) mode = MODE_MIXED; + else { fprintf(stderr, "bad mode\n"); return 2; } + break; + case 'n': n_neon = atoi(optarg); break; + case 'c': qpu_core = atoi(optarg); break; + case 'e': qpu_n_edges = atoi(optarg); break; + case 'd': duration = atof(optarg); break; + default: return 2; + } + } + int has_qpu = (mode == MODE_QPU || mode == MODE_MIXED); + int has_neon = (mode == MODE_NEON || mode == MODE_MIXED); + int n_workers = (has_neon ? n_neon : 0) + (has_qpu ? 1 : 0); + int barrier_count = n_workers + 1 /* timer */ + 1 /* main */; + + printf("=== M4'' concurrent LPF bench ===\n"); + printf(" mode: %s\n", mode == MODE_NEON ? "neon-only" : mode == MODE_QPU ? "qpu-only" : "mixed"); + printf(" neon threads: %d (cores 0..%d)\n", has_neon ? n_neon : 0, has_neon ? n_neon - 1 : -1); + printf(" qpu host: core %d, %d edges/dispatch\n", + has_qpu ? qpu_core : -1, has_qpu ? qpu_n_edges : 0); + printf(" duration: %.1f s\n\n", duration); + + pthread_barrier_init(&g_start, NULL, barrier_count); + + pthread_t timer_tid; timer_args ta = { .duration_s = duration }; + pthread_create(&timer_tid, NULL, timer_thread, &ta); + + pthread_t neon_tids[16] = {0}; + neon_args n_args[16] = {0}; + if (has_neon) { + for (int i = 0; i < n_neon; i++) { + n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i }; + pthread_create(&neon_tids[i], NULL, neon_worker, &n_args[i]); + } + } + pthread_t qpu_tid = 0; + qpu_args q_args = {0}; + if (has_qpu) { + q_args = (qpu_args){ .affinity_core = qpu_core, .n_edges = qpu_n_edges }; + pthread_create(&qpu_tid, NULL, qpu_worker, &q_args); + } + + pthread_barrier_wait(&g_start); + + pthread_join(timer_tid, NULL); + if (has_neon) for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL); + if (has_qpu) pthread_join(qpu_tid, NULL); + + uint64_t total_edges = 0; double max_elapsed = 0; + + if (has_neon) { + printf("NEON per-thread:\n"); + for (int i = 0; i < n_neon; i++) { + double mes = n_args[i].edges_done / n_args[i].elapsed_s / 1e6; + printf(" core %d: %.3f Medge/s (%llu edges / %.3f s)\n", + n_args[i].affinity_core, mes, + (unsigned long long) n_args[i].edges_done, n_args[i].elapsed_s); + total_edges += n_args[i].edges_done; + if (n_args[i].elapsed_s > max_elapsed) max_elapsed = n_args[i].elapsed_s; + } + } + if (has_qpu) { + double mes = q_args.edges_done / q_args.elapsed_s / 1e6; + printf("QPU (host core %d): %.3f Medge/s (%llu edges / %.3f s)\n", + q_args.affinity_core, mes, + (unsigned long long) q_args.edges_done, q_args.elapsed_s); + total_edges += q_args.edges_done; + if (q_args.elapsed_s > max_elapsed) max_elapsed = q_args.elapsed_s; + } + + double total_mes = total_edges / max_elapsed / 1e6; + printf("\n=== AGGREGATE ===\n"); + printf(" total edges : %llu\n", (unsigned long long) total_edges); + printf(" wall-clock : %.3f s\n", max_elapsed); + printf(" Medge/s : %.3f\n", total_mes); + + pthread_barrier_destroy(&g_start); + return 0; +} diff --git a/tests/bench_v3d_lpf.c b/tests/bench_v3d_lpf.c new file mode 100644 index 0000000..556e97a --- /dev/null +++ b/tests/bench_v3d_lpf.c @@ -0,0 +1,354 @@ +/* + * Cycle 2 Phase 6 — QPU bench for VP9 4-tap inner loop filter on V3D 7.1. + * + * Reports: + * M1'' (correctness): bit-exact rate, QPU output vs C reference + * M2'' (throughput): QPU sustained Medge/s over K dispatched batches + * fm/hev pass rates (phase5'' finding 8 instrumentation) + * + * Asserts the two contracts from k2_deblock_phase4.md §4 + * (phase5'' findings 2+4): m.x ≥ 4, dst_stride ≥ 4. + * + * License: BSD-2-Clause. + */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v3d_runner.h" + +extern void daedalus_vp9_loop_filter_h_4_8_ref( + uint8_t *dst, ptrdiff_t stride, int E, int I, int H); + +/* --- RNG / generators (match bench_neon_lpf.c shape) ------------- */ + +static uint64_t xs_state; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +#define EDGE_STRIDE 8 +#define EDGE_W 8 +#define EDGE_H 8 +#define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */ + +static void gen_edge_pixels(uint8_t *buf) +{ + int side_a_base = (int)(xs() % 200) + 20; + int side_b_base = (int)(xs() % 200) + 20; + int noise_scale = (int)(xs() % 30); + for (int r = 0; r < EDGE_H; r++) { + for (int c = 0; c < EDGE_W; c++) { + int base = (c < 4) ? side_a_base : side_b_base; + int noise = ((int)(xs() % (2 * noise_scale + 1))) - noise_scale; + int v = base + noise; + buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); + } + } +} + +static void gen_thresholds(int *E, int *I, int *H) +{ + *E = (int)(xs() % 81); + *I = (int)(xs() % 41); + *H = (int)(xs() % 11); +} + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +/* --- Push constants — match shader layout ------------------------ */ + +typedef struct { + uint32_t n_edges; + uint32_t dst_stride_u8; + uint32_t _pad0; + uint32_t _pad1; +} push_consts; + +/* --- Pre-flight: fm/hev rate on the same RNG seed (informational) - */ + +static void estimate_pass_rates(uint64_t seed, int n_edges, + double *fm_rate, double *hev_rate) +{ + uint64_t saved = xs_state; + xs_state = seed ? seed : 0xa57edbeef5717ULL; + int fm_pass = 0, hev_pass = 0; + + uint8_t buf[EDGE_BYTES]; + for (int i = 0; i < n_edges; i++) { + gen_edge_pixels(buf); + int E, I, H; + gen_thresholds(&E, &I, &H); + + /* Mirror the C-ref fm/hev for just the first row of this + * edge — gives a sample of what the QPU would see. (For a + * more rigorous picture, count per-row, but per-edge is + * fine for instrumentation.) */ + uint8_t *d = buf + 4; /* col 4 */ + int p3 = d[-4], p2 = d[-3], p1 = d[-2], p0 = d[-1]; + int q0 = d[ 0], q1 = d[+1], q2 = d[+2], q3 = d[+3]; + int aP3P2 = p3-p2; if (aP3P2 < 0) aP3P2 = -aP3P2; + int aP2P1 = p2-p1; if (aP2P1 < 0) aP2P1 = -aP2P1; + int aP1P0 = p1-p0; if (aP1P0 < 0) aP1P0 = -aP1P0; + int aQ1Q0 = q1-q0; if (aQ1Q0 < 0) aQ1Q0 = -aQ1Q0; + int aQ2Q1 = q2-q1; if (aQ2Q1 < 0) aQ2Q1 = -aQ2Q1; + int aQ3Q2 = q3-q2; if (aQ3Q2 < 0) aQ3Q2 = -aQ3Q2; + int aP0Q0 = p0-q0; if (aP0Q0 < 0) aP0Q0 = -aP0Q0; + int aP1Q1 = p1-q1; if (aP1Q1 < 0) aP1Q1 = -aP1Q1; + int fm = (aP3P2 <= I) && (aP2P1 <= I) && (aP1P0 <= I) && + (aQ1Q0 <= I) && (aQ2Q1 <= I) && (aQ3Q2 <= I) && + (aP0Q0 * 2 + (aP1Q1 >> 1) <= E); + if (fm) { + fm_pass++; + if (aP1P0 > H || aQ1Q0 > H) hev_pass++; + } + } + *fm_rate = (double) fm_pass / n_edges; + *hev_rate = (double) hev_pass / n_edges; + xs_state = saved; +} + +/* --- Main ------------------------------------------------------- */ + +int main(int argc, char **argv) +{ + int n_edges = 65536; + int iters = 100; + int verify_only = 0; + uint64_t seed = 0; + const char *spv_path = "v3d_lpf_h_4_8.spv"; + + static struct option opts[] = { + {"edges", required_argument, 0, 'e'}, + {"iters", required_argument, 0, 'i'}, + {"seed", required_argument, 0, 's'}, + {"spv", required_argument, 0, 'S'}, + {"verify-only", no_argument, 0, 'V'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) { + switch (c) { + case 'e': n_edges = atoi(optarg); break; + case 'i': iters = atoi(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'S': spv_path = optarg; break; + case 'V': verify_only = 1; break; + default: return 2; + } + } + + xs_state = seed ? seed : 0xa57edbeef5717ULL; + + /* --- Setup ---- */ + v3d_runner *r = v3d_runner_create(); + if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } + printf("=== v3d LPF h_4_8 bench ===\n"); + printf(" device: %s\n", v3d_runner_device_name(r)); + printf(" n_edges: %d iters: %d seed: 0x%016llx\n", + n_edges, iters, (unsigned long long) (seed ? seed : 0xa57edbeef5717ULL)); + + /* Per-edge layout in dst buffer: edge i occupies bytes + * [i*64 .. i*64+63]. The "edge center" (column 4 of row 0) is at + * byte offset i*64 + 4. Stride between rows of the same edge = 8. */ + size_t dst_bytes = (size_t) n_edges * EDGE_BYTES; + size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */ + + v3d_buffer buf_meta = {0}, buf_dst = {0}; + if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; + if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; + + /* Master pixel set + thresholds — kept stable across iters. */ + uint8_t *master_pred = malloc(dst_bytes); + uint8_t *expected = malloc(dst_bytes); + int *Es = malloc(n_edges * sizeof(int)); + int *Is = malloc(n_edges * sizeof(int)); + int *Hs = malloc(n_edges * sizeof(int)); + if (!master_pred || !expected || !Es || !Is || !Hs) { fprintf(stderr, "alloc\n"); return 1; } + + for (int i = 0; i < n_edges; i++) { + gen_edge_pixels(master_pred + (size_t)i * EDGE_BYTES); + gen_thresholds(&Es[i], &Is[i], &Hs[i]); + } + + /* Build C-ref expected output (separate copies, since the filter + * mutates dst in place). */ + memcpy(expected, master_pred, dst_bytes); + for (int i = 0; i < n_edges; i++) { + daedalus_vp9_loop_filter_h_4_8_ref( + expected + (size_t)i * EDGE_BYTES + 4, /* col 4 of this edge */ + EDGE_STRIDE, Es[i], Is[i], Hs[i]); + } + + /* Populate GPU buffers. Asserts enforce phase4 §4 contracts. */ + uint32_t *meta = (uint32_t *) buf_meta.mapped; + uint32_t dst_stride_u8 = EDGE_STRIDE; + assert(dst_stride_u8 >= 4 && "phase4 §4 contract 2 violated"); + for (int i = 0; i < n_edges; i++) { + uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4); + assert(mx >= 4 && "phase4 §4 contract 1 violated"); + meta[4*i + 0] = mx; + meta[4*i + 1] = (uint32_t) Es[i]; + meta[4*i + 2] = (uint32_t) Is[i]; + meta[4*i + 3] = (uint32_t) Hs[i]; + } + memcpy(buf_dst.mapped, master_pred, dst_bytes); + + /* --- Pre-flight estimate of fm/hev pass rates --- */ + double fm_rate, hev_rate; + estimate_pass_rates(seed, 10000, &fm_rate, &hev_rate); + printf(" fm pass rate: %.2f%% (10k-edge sample)\n", fm_rate * 100); + printf(" hev pass rate: %.2f%% (of fm-passing)\n", hev_rate * 100); + + /* --- Pipeline --- */ + v3d_pipeline pipe = {0}; + if (v3d_runner_create_pipeline(r, spv_path, + /*n_ssbos=*/2, + /*push_const_size=*/sizeof(push_consts), + &pipe)) return 1; + v3d_buffer bind_bufs[2] = { buf_meta, buf_dst }; + if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 2)) return 1; + + const uint32_t edges_per_wg = 32; + uint32_t group_count_x = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg); + printf(" dispatch: %u WGs × 256 invocations = %u edges (rounded up from %d)\n", + group_count_x, group_count_x * edges_per_wg, n_edges); + + push_consts pc = { + .n_edges = (uint32_t) n_edges, + .dst_stride_u8 = dst_stride_u8, + ._pad0 = 0, ._pad1 = 0, + }; + + /* Record command buffer once. */ + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); + if (cb == VK_NULL_HANDLE) return 1; + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pc), &pc); + vkCmdDispatch(cb, group_count_x, 1, 1); + vkEndCommandBuffer(cb); + + /* --- M1'': bit-exact verification --- */ + printf("\n=== M1'': QPU vs C-reference bit-exact ===\n"); + memcpy(buf_dst.mapped, master_pred, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + + int mismatch_edges = 0; + int total_byte_diffs = 0; + int prints = 0; + for (int i = 0; i < n_edges; i++) { + const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * EDGE_BYTES; + const uint8_t *e = expected + (size_t)i * EDGE_BYTES; + if (memcmp(q, e, EDGE_BYTES) != 0) { + int diffs = 0; + for (int j = 0; j < EDGE_BYTES; j++) if (q[j] != e[j]) diffs++; + total_byte_diffs += diffs; + if (prints < 3) { + fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d): %d/64 bytes differ\n", + i, Es[i], Is[i], Hs[i], diffs); + fprintf(stderr, " ref:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]); + } + fprintf(stderr, "\n qpu:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]); + } + fprintf(stderr, "\n"); + prints++; + } + mismatch_edges++; + } + } + printf(" edges bit-exact: %d / %d (%.4f%%)\n", + n_edges - mismatch_edges, n_edges, + 100.0 * (n_edges - mismatch_edges) / n_edges); + printf(" total byte diffs: %d / %zu (%.4f%%)\n", + total_byte_diffs, (size_t) n_edges * EDGE_BYTES, + 100.0 * total_byte_diffs / ((double) n_edges * EDGE_BYTES)); + + if (mismatch_edges > 0) { + fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return 1; + } + + if (verify_only) { + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return 0; + } + + /* --- M2'': throughput --- */ + printf("\n=== M2'': QPU throughput ===\n"); + + for (int i = 0; i < 10; i++) { /* warm-up */ + memcpy(buf_dst.mapped, master_pred, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + + double t0 = now_seconds(); + for (int i = 0; i < iters; i++) { + memcpy(buf_dst.mapped, master_pred, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + double t1 = now_seconds(); + + double s0 = now_seconds(); + for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_pred, dst_bytes); + double s1 = now_seconds(); + + double kernel_seconds = (t1 - t0) - (s1 - s0); + double total_edges = (double) n_edges * iters; + double medges_s = total_edges / kernel_seconds / 1e6; + + printf(" edges/dispatch: %d\n", n_edges); + printf(" iters: %d\n", iters); + printf(" total edges: %.0f\n", total_edges); + printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds); + printf(" elapsed (setup) =%.6f s\n", s1 - s0); + printf(" M2'' throughput = %.3f Medge/s\n", medges_s); + printf(" per-edge = %.1f ns\n", kernel_seconds / total_edges * 1e9); + printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); + + double M3pp = 48.285; /* from k2_deblock_phase3.md */ + double Rpp = medges_s / M3pp; + printf("\n Cycle 2 NEON M3'' = %.3f Medge/s\n", M3pp); + printf(" R'' = M2''/M3'' = %.3f\n", Rpp); + if (Rpp >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n"); + else if (Rpp >= 0.5) printf(" decision band = YELLOW: M4'' decides\n"); + else if (Rpp >= 0.1) printf(" decision band = ORANGE: M4'' may still rescue (cycle-1 calibration)\n"); + else printf(" decision band = RED: structural mismatch\n"); + + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + free(master_pred); free(expected); free(Es); free(Is); free(Hs); + return 0; +}