diff --git a/CMakeLists.txt b/CMakeLists.txt index a6b5125..9371a2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,12 +86,37 @@ if (DAEDALUS_BUILD_VULKAN) COMMENT "glslang: noop.comp -> noop.spv" VERBATIM ) - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV}) + + set(IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_idct8.spv) + add_custom_command( + OUTPUT ${IDCT8_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${IDCT8_SPV} + ${CMAKE_SOURCE_DIR}/src/v3d_idct8.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_idct8.comp + COMMENT "glslang: v3d_idct8.comp -> v3d_idct8.spv" + VERBATIM + ) + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV}) + + # v3d_runner — reusable Vulkan plumbing. + add_library(v3d_runner STATIC src/v3d_runner.c) + target_include_directories(v3d_runner PUBLIC src) + target_link_libraries(v3d_runner PUBLIC Vulkan::Vulkan) + target_compile_options(v3d_runner PRIVATE -O2) add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c) add_dependencies(bench_vulkan_dispatch daedalus_shaders) target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan) target_compile_options(bench_vulkan_dispatch PRIVATE -O2) + + add_executable(bench_v3d_idct + tests/bench_v3d_idct.c + tests/vp9_idct8_ref.c + ) + add_dependencies(bench_v3d_idct daedalus_shaders) + target_link_libraries(bench_v3d_idct PRIVATE v3d_runner Vulkan::Vulkan) + target_compile_options(bench_v3d_idct PRIVATE -O2) endif() # ---- Summary ---------------------------------------------------------------- diff --git a/docs/phase7.md b/docs/phase7.md new file mode 100644 index 0000000..3c335e1 --- /dev/null +++ b/docs/phase7.md @@ -0,0 +1,159 @@ +--- +phase: 7 +status: closed 2026-05-18 +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: phase6 → phase4' (loopback) → phase6 (iter 2..5) +host: hertz +result_v1: R = 0.230 (ORANGE) +result_v4: R = 0.918 ± 0.033 N=3 (YELLOW, at GREEN boundary) +--- + +# Phase 7 — Verification, with two Phase 4' 
loopbacks + +Per `dev_process.md`: + +> Repeat measurements from Phase 3. Compare explicitly against baseline. +> If the delta does not match Phase 4's prediction → loop back to Phase 4. + +Phase 6 v1 measurement (R = 0.230) did not match Phase 4's prediction +(R = 2.0 predicted, R = 1.0 worst-case honest lower bound). Loop +back triggered. Phase 7 captures the full iteration record from v1 +through v5 and ends at v4 (production) with R ≈ 0.92 on 1080p luma. + +The Sonnet "v3d perf tricks" web-research (`docs/phase4_v3d_research` +referenced in session transcript) provided the three candidate +optimizations that drove iterations v2 / v3 / v5; the v4 jump came +from a fourth lever (workgroup-size sweep) that the research only +implicitly flagged. + +## Iteration table + +All R values on hertz, 1920×1088 luma (32 640 blocks/dispatch). +M3 baseline = 8.171 Mblock/s (Phase 3, NEON `ff_vp9_idct_idct_8x8_add_neon`). + +| ver | change | bit-exact | M2 Mblock/s | ns/block | R | shaderdb inst / threads / temps / spills | +|---|---|---|---|---|---|---| +| v1 | first-light (4 blocks/WG, lane 0-7 col / 8-15 row, chained ternary in row pass, uint8 dst SSBO) | 100.00% | 1.878 | 532.6 | 0.230 | (not captured) | +| v2 | **Opt 1+2**: kill chained ternary (unrolled 8 writes), 2 blocks/subgroup (no idle lanes, every lane does both passes) — 8 blocks/WG | 100.00% | 3.877 | 258.0 | **0.474** | 268 / 2 / 20 / 0:0 | +| v3 | Opt 4 (sibling): scope `oN` per pass | 100.00% | 3.930 | 254.5 | 0.481 | 268 / 2 / 20 / 0:0 (identical — compiler had already coalesced) | +| v4 | **WG sweep**: 64 → 256 invocations (32 blocks/WG, 16 subgroups, shared mem grows 2 → 8 KiB) | 100.00% | 7.734 | 129.3 | **0.947** | 270 / 2 / 21 / 0:0 | +| v5 | Opt 3 (research): packed uint32 coeff reads with manual unpack | 100.00% | 7.663 | 130.5 | 0.938 | 255 / 2 / 21 / 0:0 (fewer inst, no perf gain — reverted) | + +**Final production kernel: v4.** N=3 repeat on 1080p: +R = 0.931, 0.944, 0.879 → mean **0.918 ± 0.033** 
(range; third run +likely caught LXD-container interference on hertz). + +## What worked (and how surprising it was) + +**v2 (predicted 3× win, got 2.07×):** Phase 4' attribution split was +wrong. Phase 5 finding 3 (2-blocks-per-subgroup) and the perf +research's "kill the chained ternary" were both bet on. The +shaderdb showed **zero spills already** — the chained ternary +wasn't actually inflating registers as the research model +predicted. So the 2.07× win came almost entirely from lane +occupancy (Opt 2), not register pressure (Opt 1). + +**v4 (the actual jump):** going from 64 to 256 invocations/WG +gave the v3dv scheduler 4× more in-flight work per WG to hide +TMU latency over. Doubled throughput. The shader compiled to the +*same* code shape (270 inst, 2 threads, 21 max-temps) — pure +scheduler benefit from a bigger work pool. This wasn't in the +v3d perf research's "top 3" list but follows directly from the +report's structural framing ("the v3d_compiler tries to spread +loads away from their consumers but is latency-hiding-limited +with small WG sizes"). + +The general lesson: **when measured behaviour disagrees with +predicted attribution, run the diagnostic (V3D_DEBUG=shaderdb) +before iterating further.** v3 (Opt 4) cost effectively nothing +to try and confirmed Opt 1 wasn't the lever. v4's WG-size sweep +was the actual win, and it came from looking at the shaderdb +output (which showed "2 threads" forced by register pressure but +0 spills, hinting that more in-flight work per WG was the +remaining lever). + +## What didn't work + +**v3 (per-pass scoping of `oN`):** zero perf delta. Compiler had +already coalesced `oN` lifetime across the barrier. Kept the +change in v4 — it's strictly cleaner code, just not faster. + +**v5 (packed uint32 coeff reads):** 0.947 → 0.938, within +noise. 
Plausible reasons: (a) coeff reads weren't the bottleneck +(TMU was already efficient for the 4 MB/frame coeff stream); (b) +the per-lane unpack branch (`hi = (k&1)==1`) introduced subgroup +divergence; (c) v3d_compiler internally treats int16 storage +exactly like packed uint32 storage anyway. Reverted in +production kernel for simplicity. + +## Predictions vs measurements summary + +| | predicted | measured | delta | +|---|---|---|---| +| Phase 4 R (v1) | 2.0 (envelope) / 1.0 (lower) | 0.230 | ≈4.4× worse than lower bound — **loopback trigger** | +| Phase 4' R after Opt 1+2 (v2) | "3× of 4.4× gap" → R ≈ 0.7 | 0.474 | ≈1.5× worse than predicted (the 2-blocks-per-subgroup attribution was right but Opt 1 wasn't load-bearing) | +| Phase 4' R after WG sweep (v4) | not predicted | 0.947 | new finding, biggest single iteration win | +| Phase 4' R after Opt 3 (v5) | "+20-40%" → R ≈ 1.1-1.3 | 0.938 | no gain, reverted | + +The single best predictor turned out to be the diagnostic that the +research suggested (V3D_DEBUG=shaderdb) rather than any of the +specific top-3 optimizations. The "more in-flight work hides +latency" finding came from looking at "2 threads instead of 4" +in the shaderdb output and inferring that latency-hiding capacity +was bottlenecked. + +## Decision per Phase 1 rules + +`phase1.md §"Decision rules"`: + +| R | Interpretation | Next step | +|---|---|---| +| ≥ 1.0 | QPU beats NEON. | Phase 9 → Phase 1 of next kernel | +| **0.5 ≤ R < 1.0** | **YELLOW: hybrid concurrent-work hypothesis viable** | **Add M4: combined CPU+QPU throughput; decide based on that** | +| 0.1 ≤ R < 0.5 | ORANGE: honest close | Phase 9 documents negative result | +| < 0.1 | RED: structural mismatch | Honest close | + +**Verdict: YELLOW, near the top of the band (R = 0.92, 0.08 short of the +GREEN threshold R ≥ 1.0).** The Phase 1 rule for YELLOW says: add M4 (concurrent +CPU + QPU throughput) and decide based on whether combined +delivery exceeds pure-CPU baseline. 
+ +M4 is the next measurement, not more shader tuning. The R = 0.92 +result with 4 NEON cores still 100% free for other work is +*much better* than running NEON at 1× core with the other 3 +busy. If we can run the QPU kernel concurrently with the NEON +path doing other things (entropy decode, the rest of the system, +the LXD spine), the total system throughput goes up by close to +1.0 / (1.0 - QPU_fraction_of_time), even at R < 1. + +## What Phase 7 leaves open (M4 / future) + +- **M4: concurrent CPU + QPU.** Run the bench_v3d_idct dispatch + loop while a parallel thread is running `bench_neon_idct` on a + pinned CPU core. Measure: does combined Mblock/s exceed + `bench_neon_idct -t 4` (4-core NEON)? If yes, GPU offload is a + net win for the system; if no, the bandwidth contention or + thermal coupling neutralises the gain. +- **M6: WG size sweep (Phase 1 secondary).** v4 is at 256 + invocations (max). Smaller sweeps (16, 32, 128) would + characterise the latency-hiding curve but won't change v4's + status as the production kernel. +- **M7: power delta via Himbeere plug.** Most relevant for the + higgs (battery) deployment, not hertz. +- **Thermal headroom under sustained mixed load.** With QPU + running flat-out (1.9 GB/s memory traffic) + 4-core NEON busy, + hertz may throttle. Not yet measured. + +## Production artifact + +- `src/v3d_idct8.comp` — v4 production shader, 270 inst, R = 0.92 +- `src/v3d_runner.{c,h}` — Vulkan plumbing (unchanged since Phase 6) +- `tests/bench_v3d_idct.c` — bench harness, blocks_per_wg = 32 + +Spec contract: still VP9 8×8 DCT_DCT inverse transform + add, +8-bit pixels, bit-exact against `ff_vp9_idct_idct_8x8_add_neon` +and `daedalus_vp9_idct_idct_8x8_add_ref`. Output orientation +matches FFmpeg's transposed column-pass / columnar dst-write +pattern (Phase 5 finding 1 verified independently in 100% of +~30 000 random blocks per run). 
diff --git a/src/v3d_idct8.comp b/src/v3d_idct8.comp new file mode 100644 index 0000000..532ec29 --- /dev/null +++ b/src/v3d_idct8.comp @@ -0,0 +1,217 @@ +// daedalus-fourier — VP9 8×8 DCT_DCT inverse-transform-add, V3D 7.1. +// v2: post-Phase-7 loopback. Phase 4' iteration 1. +// +// Changes from v1 (per phase47 iteration 1 + Sonnet v3d perf research): +// +// Opt 1 — kill the chained ternary. v1's row-pass write had +// `(r==0)?o0:(r==1)?o1:...` inside a `for r` loop; that +// kept all 8 oN scalars live across 7 phi nodes and almost +// certainly forced register spills (Iago Toral 2021, +// blogs.igalia.com/itoral). v2 unrolls the 8 writes +// completely — each oN is used exactly once. +// +// Opt 2 — 2 blocks per subgroup. v1 had 1 block per 16-lane +// subgroup with 8 lanes idle per phase. v2 packs 2 blocks +// per subgroup (one in lanes 0..7, one in lanes 8..15), +// and every lane runs both passes for its own block. +// Eliminates idle lanes AND removes the col_pass/row_pass +// branch divergence. 8 blocks per WG (vs 4 before), +// dispatch count halves from 8160 to 4080 on 1080p. +// Shared-mem footprint doubles to 2 KiB (still « 16 KiB). +// +// (Opt 3 — packed uint32 storage — deferred; do it if Opt 1+2 +// don't get us into the GREEN/YELLOW decision band.) +// +// License: BSD-2-Clause. + +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +// v4: local_size 256 (was 64) — 16 subgroups × 16 lanes = 32 blocks/WG. +// More in-flight work per WG = more latency hiding for v3d's TMU. +// shared = 32 × 64 × 4 B = 8 KiB (still under 16 KiB). +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) readonly buffer Coeffs { + int16_t coeffs[]; // N × 64 packed +} u_coeffs; +// (v5 tried uint32-packed reads with manual unpack — no measurable +// perf change vs int16, added code complexity; reverted.) 
+ +layout(binding = 1) buffer Dst { + uint8_t dst[]; // H × stride bytes +} u_dst; + +layout(binding = 2) readonly buffer Meta { + uvec2 meta[]; // per-block (block_x_8, block_y_8) +} u_meta; + +layout(push_constant) uniform PC { + uint n_blocks; + uint blocks_per_row; // unused (meta drives position) + uint dst_stride_u8; + uint _pad; +} pc; + +// 32 blocks per WG × 64 i32 per block × 4 B = 8192 B shared. +shared int tmp_shared[32 * 64]; + +// VP9 Q14 trig constants (spec §8.7.1.4). +const int COSPI_16 = 11585; +const int COSPI_24 = 6270; +const int COSPI_08 = 15137; +const int COSPI_28 = 3196; +const int COSPI_04 = 16069; +const int COSPI_20 = 9102; +const int COSPI_12 = 13623; + +int qround14(int x) { return (x + (1 << 13)) >> 14; } + +void idct8_1d(int i0, int i1, int i2, int i3, + int i4, int i5, int i6, int i7, + out int o0, out int o1, out int o2, out int o3, + out int o4, out int o5, out int o6, out int o7) +{ + int t0a = qround14((i0 + i4) * COSPI_16); + int t1a = qround14((i0 - i4) * COSPI_16); + int t2a = qround14(i2 * COSPI_24 - i6 * COSPI_08); + int t3a = qround14(i2 * COSPI_08 + i6 * COSPI_24); + int t4a = qround14(i1 * COSPI_28 - i7 * COSPI_04); + int t5a = qround14(i5 * COSPI_12 - i3 * COSPI_20); + int t6a = qround14(i5 * COSPI_20 + i3 * COSPI_12); + int t7a = qround14(i1 * COSPI_04 + i7 * COSPI_28); + + int t0 = t0a + t3a, t1 = t1a + t2a; + int t2 = t1a - t2a, t3 = t0a - t3a; + int t4 = t4a + t5a; + int t5p = t4a - t5a; + int t7 = t7a + t6a; + int t6p = t7a - t6a; + + int t5 = qround14((t6p - t5p) * COSPI_16); + int t6 = qround14((t6p + t5p) * COSPI_16); + + o0 = t0 + t7; o1 = t1 + t6; + o2 = t2 + t5; o3 = t3 + t4; + o4 = t3 - t4; o5 = t2 - t5; + o6 = t1 - t6; o7 = t0 - t7; +} + +void main() +{ + // ---- Lane / block decomposition -------------------------------- + // 256 invocations/WG = 16 subgroups × 16 lanes/subgroup. + // Each subgroup packs 2 blocks (one in lanes 0..7, one in lanes 8..15). + // 32 blocks per WG total. 
+ // + // Every lane runs both column and row pass for its own block — + // no idle lanes, no col_pass/row_pass branch divergence. + + uint gid = gl_GlobalInvocationID.x; + uint wg_id = gid / 256u; + uint lane_in_wg = gid & 255u; + uint sg_in_wg = lane_in_wg >> 4; // 0..15 + uint lane_in_sg = lane_in_wg & 15u; + uint block_slot = lane_in_sg >> 3; // 0 (lanes 0..7) or 1 (lanes 8..15) + uint k = lane_in_sg & 7u; // 0..7 + + uint block_local = sg_in_wg * 2u + block_slot; // 0..31 within WG + uint block_idx = wg_id * 32u + block_local; + + // OOB flag — gates work bodies, but barrier() is reached by all. + // Per phase5.md finding 7. + bool oob = (block_idx >= pc.n_blocks); + + // ---- Column pass ---------------------------------------------- + // v3 (Opt 4): scope oN inside each pass so they're dead at the + // barrier. The hope was fewer max-temps → more hardware threads + // (v2 shaderdb: 20 max-temps / 2 threads), but docs/phase7.md + // measured no change — the compiler had already coalesced the + // lifetimes. Kept only because the per-pass scoping is cleaner. + if (!oob) { + uint base = block_idx * 64u; + int c0 = int(u_coeffs.coeffs[base + 0u * 8u + k]); + int c1 = int(u_coeffs.coeffs[base + 1u * 8u + k]); + int c2 = int(u_coeffs.coeffs[base + 2u * 8u + k]); + int c3 = int(u_coeffs.coeffs[base + 3u * 8u + k]); + int c4 = int(u_coeffs.coeffs[base + 4u * 8u + k]); + int c5 = int(u_coeffs.coeffs[base + 5u * 8u + k]); + int c6 = int(u_coeffs.coeffs[base + 6u * 8u + k]); + int c7 = int(u_coeffs.coeffs[base + 7u * 8u + k]); + + int o0, o1, o2, o3, o4, o5, o6, o7; + idct8_1d(c0, c1, c2, c3, c4, c5, c6, c7, + o0, o1, o2, o3, o4, o5, o6, o7); + + // Transposed write: row k of tmp_shared[block_local]. 
+ uint tbase = block_local * 64u + k * 8u; + tmp_shared[tbase + 0u] = o0; + tmp_shared[tbase + 1u] = o1; + tmp_shared[tbase + 2u] = o2; + tmp_shared[tbase + 3u] = o3; + tmp_shared[tbase + 4u] = o4; + tmp_shared[tbase + 5u] = o5; + tmp_shared[tbase + 6u] = o6; + tmp_shared[tbase + 7u] = o7; + } + + barrier(); // unconditional — every lane in the WG reaches this + + // ---- Row pass -------------------------------------------------- + if (!oob) { + // Read column k of tmp_shared[block_local]. + uint tbase = block_local * 64u; + int s0 = tmp_shared[tbase + 0u * 8u + k]; + int s1 = tmp_shared[tbase + 1u * 8u + k]; + int s2 = tmp_shared[tbase + 2u * 8u + k]; + int s3 = tmp_shared[tbase + 3u * 8u + k]; + int s4 = tmp_shared[tbase + 4u * 8u + k]; + int s5 = tmp_shared[tbase + 5u * 8u + k]; + int s6 = tmp_shared[tbase + 6u * 8u + k]; + int s7 = tmp_shared[tbase + 7u * 8u + k]; + + int o0, o1, o2, o3, o4, o5, o6, o7; + idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7, + o0, o1, o2, o3, o4, o5, o6, o7); + + // Columnar write into dst. Each lane owns column k of its block. + // Block position in dst from meta. + uvec2 bp = u_meta.meta[block_idx]; + uint block_x = bp.x; + uint block_y = bp.y; + uint dx = block_x * 8u + k; + uint dy0 = block_y * 8u; + uint stride = pc.dst_stride_u8; + + // Opt 1: 8 fully-unrolled writes — each o_i used exactly once. + // No chained ternary, no loop with runtime-variable index. 
+ uint a0 = (dy0 + 0u) * stride + dx; + uint a1 = (dy0 + 1u) * stride + dx; + uint a2 = (dy0 + 2u) * stride + dx; + uint a3 = (dy0 + 3u) * stride + dx; + uint a4 = (dy0 + 4u) * stride + dx; + uint a5 = (dy0 + 5u) * stride + dx; + uint a6 = (dy0 + 6u) * stride + dx; + uint a7 = (dy0 + 7u) * stride + dx; + + int p0 = int(u_dst.dst[a0]); + int p1 = int(u_dst.dst[a1]); + int p2 = int(u_dst.dst[a2]); + int p3 = int(u_dst.dst[a3]); + int p4 = int(u_dst.dst[a4]); + int p5 = int(u_dst.dst[a5]); + int p6 = int(u_dst.dst[a6]); + int p7 = int(u_dst.dst[a7]); + + u_dst.dst[a0] = uint8_t(clamp(p0 + ((o0 + 16) >> 5), 0, 255)); + u_dst.dst[a1] = uint8_t(clamp(p1 + ((o1 + 16) >> 5), 0, 255)); + u_dst.dst[a2] = uint8_t(clamp(p2 + ((o2 + 16) >> 5), 0, 255)); + u_dst.dst[a3] = uint8_t(clamp(p3 + ((o3 + 16) >> 5), 0, 255)); + u_dst.dst[a4] = uint8_t(clamp(p4 + ((o4 + 16) >> 5), 0, 255)); + u_dst.dst[a5] = uint8_t(clamp(p5 + ((o5 + 16) >> 5), 0, 255)); + u_dst.dst[a6] = uint8_t(clamp(p6 + ((o6 + 16) >> 5), 0, 255)); + u_dst.dst[a7] = uint8_t(clamp(p7 + ((o7 + 16) >> 5), 0, 255)); + } +} diff --git a/src/v3d_runner.c b/src/v3d_runner.c new file mode 100644 index 0000000..25d139b --- /dev/null +++ b/src/v3d_runner.c @@ -0,0 +1,435 @@ +/* + * v3d_runner — implementation. See v3d_runner.h. + * + * License: BSD-2-Clause. 
+ */ +#include "v3d_runner.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \ + fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \ + r__, __FILE__, __LINE__, #call); return -1; } } while (0) + +#define CHK_NULL(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \ + fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \ + r__, __FILE__, __LINE__, #call); return NULL; } } while (0) + +struct v3d_runner { + VkInstance instance; + VkPhysicalDevice phys; + VkDevice device; + VkQueue queue; + uint32_t queue_family; + VkCommandPool pool; + char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]; + VkPhysicalDeviceMemoryProperties mem_props; +}; + +static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out, + char name_out[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE]) +{ + uint32_t n = 0; + if (vkEnumeratePhysicalDevices(inst, &n, NULL) != VK_SUCCESS || n == 0) { + fprintf(stderr, "v3d_runner: no Vulkan physical devices\n"); + return -1; + } + VkPhysicalDevice *pds = malloc(n * sizeof(*pds)); + if (!pds) return -1; + vkEnumeratePhysicalDevices(inst, &n, pds); + + int picked = -1; + for (uint32_t i = 0; i < n; i++) { + VkPhysicalDeviceProperties p; + vkGetPhysicalDeviceProperties(pds[i], &p); + if (strstr(p.deviceName, "V3D") != NULL) { + *out = pds[i]; + memcpy(name_out, p.deviceName, sizeof(p.deviceName)); + picked = 0; + break; + } + } + free(pds); + if (picked != 0) + fprintf(stderr, "v3d_runner: no V3D device found (looked for " + "\"V3D\" substring in deviceName)\n"); + return picked; +} + +static uint32_t pick_compute_queue_family(VkPhysicalDevice phys) +{ + uint32_t n = 0; + vkGetPhysicalDeviceQueueFamilyProperties(phys, &n, NULL); + VkQueueFamilyProperties *q = malloc(n * sizeof(*q)); + if (!q) return UINT32_MAX; + vkGetPhysicalDeviceQueueFamilyProperties(phys, &n, q); + uint32_t out = UINT32_MAX; + for (uint32_t i = 0; i < n; i++) { + if (q[i].queueFlags & 
VK_QUEUE_COMPUTE_BIT) { out = i; break; } + } + free(q); + return out; +} + +v3d_runner *v3d_runner_create(void) +{ + v3d_runner *r = calloc(1, sizeof(*r)); + if (!r) return NULL; + + /* Instance — Vulkan 1.3 to inherit 1.2 promoted features. */ + VkApplicationInfo app = { + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pApplicationName = "daedalus-fourier", + .apiVersion = VK_API_VERSION_1_3, + }; + VkInstanceCreateInfo ici = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &app, + }; + CHK_NULL(vkCreateInstance(&ici, NULL, &r->instance)); + + if (pick_v3d_physical_device(r->instance, &r->phys, r->device_name) != 0) { + vkDestroyInstance(r->instance, NULL); + free(r); + return NULL; + } + + vkGetPhysicalDeviceMemoryProperties(r->phys, &r->mem_props); + + r->queue_family = pick_compute_queue_family(r->phys); + if (r->queue_family == UINT32_MAX) { + fprintf(stderr, "v3d_runner: no compute queue family\n"); + vkDestroyInstance(r->instance, NULL); + free(r); + return NULL; + } + + /* Enable 8-bit + 16-bit storage features. Both are exposed on + * V3D 7.1 per vulkaninfo_v3d_7_1_7_hertz.txt; the kernel + * declares storageBuffer8BitAccess (uint8_t dst[]) and + * storageBuffer16BitAccess (int16_t coeffs[]). 
+ */ + VkPhysicalDevice16BitStorageFeatures f16 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES, + .storageBuffer16BitAccess = VK_TRUE, + .uniformAndStorageBuffer16BitAccess = VK_TRUE, + }; + VkPhysicalDevice8BitStorageFeatures f8 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES, + .pNext = &f16, + .storageBuffer8BitAccess = VK_TRUE, + .uniformAndStorageBuffer8BitAccess = VK_TRUE, + }; + VkPhysicalDeviceFeatures2 f2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = &f8, + }; + + float qprio = 1.0f; + VkDeviceQueueCreateInfo dqci = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = r->queue_family, + .queueCount = 1, + .pQueuePriorities = &qprio, + }; + VkDeviceCreateInfo dci = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = &f2, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &dqci, + }; + if (vkCreateDevice(r->phys, &dci, NULL, &r->device) != VK_SUCCESS) { + fprintf(stderr, "v3d_runner: vkCreateDevice failed\n"); + vkDestroyInstance(r->instance, NULL); + free(r); + return NULL; + } + vkGetDeviceQueue(r->device, r->queue_family, 0, &r->queue); + + VkCommandPoolCreateInfo cpci = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = r->queue_family, + }; + if (vkCreateCommandPool(r->device, &cpci, NULL, &r->pool) != VK_SUCCESS) { + fprintf(stderr, "v3d_runner: vkCreateCommandPool failed\n"); + vkDestroyDevice(r->device, NULL); + vkDestroyInstance(r->instance, NULL); + free(r); + return NULL; + } + + return r; +} + +void v3d_runner_destroy(v3d_runner *r) +{ + if (!r) return; + if (r->device != VK_NULL_HANDLE) vkDeviceWaitIdle(r->device); + if (r->pool != VK_NULL_HANDLE) + vkDestroyCommandPool(r->device, r->pool, NULL); + if (r->device != VK_NULL_HANDLE) vkDestroyDevice(r->device, NULL); + if (r->instance != VK_NULL_HANDLE) vkDestroyInstance(r->instance, NULL); + free(r); 
+} + +VkDevice v3d_runner_device(v3d_runner *r) { return r->device; } +VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; } +uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; } +VkCommandPool v3d_runner_cmd_pool(v3d_runner *r) { return r->pool; } +const char *v3d_runner_device_name(v3d_runner *r) { return r->device_name; } + +/* ---- Buffers ---------------------------------------------------- */ + +static int find_memory_type(VkPhysicalDeviceMemoryProperties *p, + uint32_t type_bits, VkMemoryPropertyFlags wanted) +{ + for (uint32_t i = 0; i < p->memoryTypeCount; i++) { + if ((type_bits & (1u << i)) && + (p->memoryTypes[i].propertyFlags & wanted) == wanted) + return (int) i; + } + return -1; +} + +int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out) +{ + memset(out, 0, sizeof(*out)); + out->size = size; + + VkBufferCreateInfo bci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT + | VK_BUFFER_USAGE_TRANSFER_SRC_BIT + | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + CHK(vkCreateBuffer(r->device, &bci, NULL, &out->buffer)); + + VkMemoryRequirements req; + vkGetBufferMemoryRequirements(r->device, out->buffer, &req); + + /* HOST_VISIBLE | HOST_COHERENT is the unified-memory zero-copy + * path on Pi 5: CPU and GPU see the same LPDDR4x physical pages, + * no explicit flush/invalidate needed (the COHERENT bit asserts + * that). 
*/ + int mt = find_memory_type(&r->mem_props, req.memoryTypeBits, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT + | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + if (mt < 0) { + fprintf(stderr, "v3d_runner: no HOST_VISIBLE|COHERENT memory type\n"); + return -1; + } + + VkMemoryAllocateInfo mai = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = req.size, + .memoryTypeIndex = (uint32_t) mt, + }; + CHK(vkAllocateMemory(r->device, &mai, NULL, &out->memory)); + CHK(vkBindBufferMemory(r->device, out->buffer, out->memory, 0)); + CHK(vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0, &out->mapped)); + return 0; +} + +void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf) +{ + if (!buf || buf->buffer == VK_NULL_HANDLE) return; + if (buf->mapped) vkUnmapMemory(r->device, buf->memory); + vkDestroyBuffer(r->device, buf->buffer, NULL); + vkFreeMemory(r->device, buf->memory, NULL); + memset(buf, 0, sizeof(*buf)); +} + +/* ---- Pipelines -------------------------------------------------- */ + +static uint32_t *read_spv(const char *path, size_t *out_size) +{ + FILE *f = fopen(path, "rb"); + if (!f) { perror(path); return NULL; } + fseek(f, 0, SEEK_END); + long sz = ftell(f); + fseek(f, 0, SEEK_SET); + if (sz <= 0 || (sz & 3)) { + fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, sz); + fclose(f); return NULL; + } + uint32_t *buf = malloc(sz); + if (!buf || fread(buf, 1, sz, f) != (size_t)sz) { + perror("read"); fclose(f); free(buf); return NULL; + } + fclose(f); + *out_size = sz; + return buf; +} + +int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path, + uint32_t n_ssbos, uint32_t push_const_size, + v3d_pipeline *out) +{ + memset(out, 0, sizeof(*out)); + out->n_ssbos = n_ssbos; + out->push_const_size = push_const_size; + + /* Descriptor set layout: n_ssbos SSBO bindings, compute-only. 
*/ + VkDescriptorSetLayoutBinding *binds = calloc(n_ssbos, sizeof(*binds)); + if (!binds) return -1; + for (uint32_t i = 0; i < n_ssbos; i++) { + binds[i] = (VkDescriptorSetLayoutBinding){ + .binding = i, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }; + } + VkDescriptorSetLayoutCreateInfo dslci = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = n_ssbos, + .pBindings = binds, + }; + VkResult vr = vkCreateDescriptorSetLayout(r->device, &dslci, NULL, + &out->ds_layout); + free(binds); + if (vr != VK_SUCCESS) { + fprintf(stderr, "vkCreateDescriptorSetLayout = %d\n", vr); return -1; + } + + VkPushConstantRange pcr = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_const_size, + }; + VkPipelineLayoutCreateInfo plci = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &out->ds_layout, + .pushConstantRangeCount = push_const_size ? 1 : 0, + .pPushConstantRanges = push_const_size ? 
&pcr : NULL, + }; + CHK(vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout)); + + size_t spv_size = 0; + uint32_t *spv = read_spv(spv_path, &spv_size); + if (!spv) return -1; + VkShaderModuleCreateInfo smci = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .codeSize = spv_size, + .pCode = spv, + }; + VkShaderModule shader; + vr = vkCreateShaderModule(r->device, &smci, NULL, &shader); + free(spv); + if (vr != VK_SUCCESS) { + fprintf(stderr, "vkCreateShaderModule(%s) = %d\n", spv_path, vr); + return -1; + } + + VkComputePipelineCreateInfo cpci = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = shader, + .pName = "main", + }, + .layout = out->layout, + }; + vr = vkCreateComputePipelines(r->device, VK_NULL_HANDLE, 1, &cpci, NULL, + &out->pipeline); + vkDestroyShaderModule(r->device, shader, NULL); + if (vr != VK_SUCCESS) { + fprintf(stderr, "vkCreateComputePipelines = %d\n", vr); return -1; + } + + /* Single descriptor pool + set for this pipeline. 
*/ + VkDescriptorPoolSize ps = { + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = n_ssbos, + }; + VkDescriptorPoolCreateInfo dpci = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = 1, + .poolSizeCount = 1, + .pPoolSizes = &ps, + }; + CHK(vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool)); + + VkDescriptorSetAllocateInfo dsai = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = out->pool, + .descriptorSetCount = 1, + .pSetLayouts = &out->ds_layout, + }; + CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set)); + return 0; +} + +void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p) +{ + if (!p || p->pipeline == VK_NULL_HANDLE) return; + vkDestroyPipeline(r->device, p->pipeline, NULL); + vkDestroyPipelineLayout(r->device, p->layout, NULL); + vkDestroyDescriptorPool(r->device, p->pool, NULL); /* frees its set */ + vkDestroyDescriptorSetLayout(r->device, p->ds_layout, NULL); + memset(p, 0, sizeof(*p)); +} + +int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p, + const v3d_buffer *bufs, uint32_t n) +{ + if (n != p->n_ssbos) { + fprintf(stderr, "bind_buffers: n=%u != pipeline n_ssbos=%u\n", + n, p->n_ssbos); + return -1; + } + VkDescriptorBufferInfo *bi = calloc(n, sizeof(*bi)); + VkWriteDescriptorSet *wr = calloc(n, sizeof(*wr)); + if (!bi || !wr) { free(bi); free(wr); return -1; } + for (uint32_t i = 0; i < n; i++) { + bi[i].buffer = bufs[i].buffer; + bi[i].offset = 0; + bi[i].range = bufs[i].size; + wr[i] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = p->desc_set, + .dstBinding = i, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .pBufferInfo = &bi[i], + }; + } + vkUpdateDescriptorSets(r->device, n, wr, 0, NULL); + free(bi); free(wr); + return 0; +} + +/* ---- Command buffers ------------------------------------------- */ + +VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r) +{ + 
 VkCommandBufferAllocateInfo cbai = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = r->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + VkCommandBuffer cb = VK_NULL_HANDLE; + if (vkAllocateCommandBuffers(r->device, &cbai, &cb) != VK_SUCCESS) + return VK_NULL_HANDLE; + return cb; +} + +int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb) +{ + VkSubmitInfo si = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, + .pCommandBuffers = &cb, + }; + CHK(vkQueueSubmit(r->queue, 1, &si, VK_NULL_HANDLE)); + CHK(vkQueueWaitIdle(r->queue)); + return 0; +} diff --git a/src/v3d_runner.h b/src/v3d_runner.h new file mode 100644 index 0000000..b729995 --- /dev/null +++ b/src/v3d_runner.h @@ -0,0 +1,96 @@ +/* + * v3d_runner — minimal Vulkan compute plumbing for V3D 7.1 on Pi 5. + * + * Factored out of tests/bench_vulkan_dispatch.c so successive kernel + * benches can reuse the device/queue/buffer/pipeline machinery + * without copy-paste. Kept deliberately small and concrete — no + * generality beyond what daedalus-fourier needs. + * + * License: BSD-2-Clause. + */ +#ifndef DAEDALUS_V3D_RUNNER_H +#define DAEDALUS_V3D_RUNNER_H + +#include <stddef.h> +#include <stdint.h> +#include <vulkan/vulkan.h> + +typedef struct v3d_runner v3d_runner; + +/* Host-visible SSBO. .mapped is a CPU-side pointer to .size bytes. */ +typedef struct { + VkBuffer buffer; + VkDeviceMemory memory; + void *mapped; + size_t size; +} v3d_buffer; + +/* Compute pipeline + its descriptor set (one set per pipeline). */ +typedef struct { + VkPipeline pipeline; + VkPipelineLayout layout; + VkDescriptorSetLayout ds_layout; + VkDescriptorPool pool; + VkDescriptorSet desc_set; + uint32_t n_ssbos; + uint32_t push_const_size; +} v3d_pipeline; + +/* + * Create runner: Vulkan instance, V3D physical device, logical + * device with storageBuffer{8,16}BitAccess features enabled, + * compute queue, command pool. + * + * Returns NULL on failure (writes errors to stderr). 
+ */
+v3d_runner *v3d_runner_create(void);
+void v3d_runner_destroy(v3d_runner *r);
+
+/* Expose a few internals for code that wants direct vkCmd*. */
+VkDevice v3d_runner_device(v3d_runner *r);
+VkQueue v3d_runner_queue(v3d_runner *r);
+uint32_t v3d_runner_queue_family(v3d_runner *r);
+VkCommandPool v3d_runner_cmd_pool(v3d_runner *r);
+const char *v3d_runner_device_name(v3d_runner *r);
+
+/* Storage buffer, HOST_VISIBLE | HOST_COHERENT, mapped on the
+ * host side.  The mapping persists for the lifetime of the buffer.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
+void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
+
+/* Compute pipeline from a SPIR-V file path.  The descriptor-set
+ * layout exposes `n_ssbos` storage buffer bindings at binding
+ * indices 0..n_ssbos-1, all visible to the compute stage.  A push
+ * constant range of `push_const_size` bytes is added if non-zero.
+ *
+ * The single descriptor set is pre-allocated from a pool sized for
+ * exactly that one set; bind buffers via v3d_runner_bind_buffers().
+ */
+int v3d_runner_create_pipeline(v3d_runner *r,
+                               const char *spv_path,
+                               uint32_t n_ssbos,
+                               uint32_t push_const_size,
+                               v3d_pipeline *out);
+void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p); /* NULL `p` is a no-op; zeroes *p */
+
+/* Bind SSBOs to the pipeline's descriptor set.  `bufs` must have exactly
+ * `p->n_ssbos` entries, in binding order; returns 0, or -1 on failure.
+ * Idempotent — rebind freely between dispatches if buffers change.
+ * NOTE(review): command buffers recorded earlier may need re-recording. */
+int v3d_runner_bind_buffers(v3d_runner *r,
+                            v3d_pipeline *p,
+                            const v3d_buffer *bufs,
+                            uint32_t n);
+
+/* Allocate a primary command buffer from the runner's pool (VK_NULL_HANDLE on failure). */
+VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
+
+/* Submit `cb` to the queue and wait for completion. The classic
+ * timed operation. Returns 0 on success.
+ */
+int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb);
+
+#endif /* DAEDALUS_V3D_RUNNER_H */
diff --git a/tests/bench_v3d_idct.c b/tests/bench_v3d_idct.c
new file mode 100644
index 0000000..a5b7a1c
--- /dev/null
+++ b/tests/bench_v3d_idct.c
@@ -0,0 +1,370 @@
+/*
+ * Phase 6 — first-light QPU bench for VP9 8×8 DCT_DCT IDCT add on V3D 7.1.
+ *
+ * Reports:
+ *   M1' (correctness): bit-exact rate, QPU output vs C reference,
+ *                      across N synthetic blocks.
+ *   M2  (throughput):  QPU sustained Mblock/s over K dispatched frames.
+ *
+ * Compares against M3 (bench_neon_idct) to compute R = M2 / M3.
+ * Decision rules per docs/phase1.md §"Decision rules".
+ *
+ * License: BSD-2-Clause. Links statically against the LGPL-2.1+
+ * vp9_idct8_ref.c (a clean-room transcription from spec), so this
+ * binary distributes under BSD-2-Clause-or-later if separated; left
+ * as LGPL-2.1+ when linked together.
+ */
+#define _POSIX_C_SOURCE 200809L
+#include <getopt.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <vulkan/vulkan.h>
+
+#include "v3d_runner.h"
+
+/* C bit-exact reference from tests/vp9_idct8_ref.c. */
+extern void daedalus_vp9_idct_idct_8x8_add_ref(
+    uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+/* ---- RNG (matches bench_neon_idct.c shape for reproducibility) -- */
+
+static uint64_t xs64_state;
+
+/* xorshift64 step (shifts 13/7/17): advances and returns xs64_state.
+ * The state must be non-zero, otherwise every output is zero. */
+static inline uint64_t xs64(void)
+{
+    uint64_t x = xs64_state;
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs64_state = x;
+}
+
+/* Fill `block` with 1..16 random coefficients in [-4096, 4095] at random
+ * positions (collisions overwrite) and return eob = highest written
+ * index + 1. */
+static int gen_block(int16_t block[64])
+{
+    memset(block, 0, 64 * sizeof(*block));
+    int eob = 0;
+    int n_nonzero = 1 + (int)(xs64() % 16);
+    for (int i = 0; i < n_nonzero; i++) {
+        int pos = (int)(xs64() % 64);
+        int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096);
+        block[pos] = coef;
+        if (pos + 1 > eob) eob = pos + 1;
+    }
+    if (eob == 0) eob = 1; /* defensive: the loop always runs once */
+    return eob;
+}
+
+/* Monotonic time in seconds, read from CLOCK_MONOTONIC_RAW. */
+static double now_seconds(void)
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+    return ts.tv_sec + ts.tv_nsec * 1e-9;
+}
+
+/* ---- Push-constant layout — must match src/v3d_idct8.comp ------- */
+
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t blocks_per_row;
+    uint32_t dst_stride_u8;
+    uint32_t _pad;
+} push_consts;
+
+/* Print the 8×8 block at `px` (row pitch `stride` bytes) to stderr. */
+static void dump_rows(const uint8_t *px, int stride)
+{
+    for (int row = 0; row < 8; row++) {
+        fprintf(stderr, "\n r%d ", row);
+        for (int col = 0; col < 8; col++)
+            fprintf(stderr, "%3u ", px[row * stride + col]);
+    }
+}
+
+/* ---- Main ------------------------------------------------------- */
+
+/*
+ * Entry point.  Options: --width/--height (pixels, rounded down to a
+ * multiple of 8), --iters, --seed, --spv, --verify-only.
+ *
+ * Exit status: 0 on success, 1 on a runtime failure or a bit-exactness
+ * mismatch, 2 on bad command-line arguments.
+ */
+int main(int argc, char **argv)
+{
+    /* Default synthetic frame: 128×128 pixels = 16×16 blocks = 256
+     * blocks.  Small enough for fast bring-up; large enough that the
+     * 32-blocks/WG geometry spans several workgroups (8 WGs). */
+    int blocks_per_row = 16;
+    int rows_of_blocks = 16;
+    int iters = 100;
+    uint64_t seed = 0;
+    const char *spv_path = "v3d_idct8.spv";
+    int verify_only = 0;
+    int max_mismatch_print = 4;
+    int rc = 0;
+
+    static struct option opts[] = {
+        {"width", required_argument, 0, 'w'},
+        {"height", required_argument, 0, 'h'},
+        {"iters", required_argument, 0, 'i'},
+        {"seed", required_argument, 0, 's'},
+        {"spv", required_argument, 0, 'S'},
+        {"verify-only", no_argument, 0, 'V'},
+        {0,0,0,0}
+    };
+    for (int c; (c = getopt_long(argc, argv, "w:h:i:s:S:V", opts, 0)) != -1;) {
+        switch (c) {
+        case 'w': blocks_per_row = atoi(optarg) / 8; break;
+        case 'h': rows_of_blocks = atoi(optarg) / 8; break;
+        case 'i': iters = atoi(optarg); break;
+        case 's': seed = strtoull(optarg, 0, 0); break;
+        case 'S': spv_path = optarg; break;
+        case 'V': verify_only = 1; break;
+        default: return 2;
+        }
+    }
+
+    /* Reject degenerate geometry up front: zero blocks would mean
+     * zero-sized buffers and a division by zero in the statistics,
+     * and a non-positive iteration count makes M2 meaningless. */
+    if (blocks_per_row <= 0 || rows_of_blocks <= 0) {
+        fprintf(stderr, "--width and --height must each be at least 8\n");
+        return 2;
+    }
+    if (!verify_only && iters <= 0) {
+        fprintf(stderr, "--iters must be positive\n");
+        return 2;
+    }
+
+    int dst_width = blocks_per_row * 8;
+    int dst_height = rows_of_blocks * 8;
+    int dst_stride = dst_width; /* tightly packed */
+    size_t n_blocks = (size_t)blocks_per_row * rows_of_blocks;
+    size_t dst_bytes = (size_t)dst_height * dst_stride;
+
+    printf("=== v3d IDCT8 first-light ===\n");
+    printf(" frame: %dx%d (%dx%d blocks, %zu blocks total)\n",
+           dst_width, dst_height, blocks_per_row, rows_of_blocks, n_blocks);
+    printf(" spv: %s\n", spv_path);
+    printf(" iters: %d (for throughput phase)\n", iters);
+
+    xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;
+
+    /* ---- Init runner ---- */
+    /* Setup failures below return directly; the process is about to
+     * exit, so the OS reclaims whatever was created so far. */
+    v3d_runner *r = v3d_runner_create();
+    if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
+    printf(" device: %s\n", v3d_runner_device_name(r));
+
+    /* ---- Buffers ---- */
+    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
+    if (v3d_runner_create_buffer(r, n_blocks * 64 * sizeof(int16_t), &buf_coeffs)) return 1;
+    if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
+    if (v3d_runner_create_buffer(r, n_blocks * 2 * sizeof(uint32_t), &buf_meta)) return 1;
+
+    /* Fill master inputs — these stay constant across iterations. */
+    int16_t *master_coeffs = malloc(n_blocks * 64 * sizeof(int16_t));
+    uint8_t *master_pred = malloc(dst_bytes);
+    uint8_t *expected_dst = malloc(dst_bytes); /* C-reference output */
+    int *eobs = malloc(n_blocks * sizeof(int));
+    if (!master_coeffs || !master_pred || !expected_dst || !eobs) return 1;
+
+    for (size_t b = 0; b < n_blocks; b++)
+        eobs[b] = gen_block(master_coeffs + b * 64);
+    for (size_t i = 0; i < dst_bytes; i++)
+        master_pred[i] = (uint8_t)(xs64() & 0xff);
+
+    /* Build the expected (C-reference) output frame.  The C ref
+     * mutates its input block (zeros it after column pass), so we
+     * work on copies. */
+    memcpy(expected_dst, master_pred, dst_bytes);
+    int16_t scratch[64];
+    for (size_t b = 0; b < n_blocks; b++) {
+        int bx = (int)(b % blocks_per_row);
+        int by = (int)(b / blocks_per_row);
+        memcpy(scratch, master_coeffs + b * 64, sizeof(scratch));
+        daedalus_vp9_idct_idct_8x8_add_ref(
+            expected_dst + by * 8 * dst_stride + bx * 8,
+            dst_stride, scratch, eobs[b]);
+    }
+
+    /* Populate GPU buffers. */
+    memcpy(buf_coeffs.mapped, master_coeffs, buf_coeffs.size);
+    memcpy(buf_dst.mapped, master_pred, buf_dst.size);
+    uint32_t *meta = (uint32_t *) buf_meta.mapped;
+    for (size_t b = 0; b < n_blocks; b++) {
+        meta[2*b + 0] = (uint32_t)(b % blocks_per_row); /* block_x_8 */
+        meta[2*b + 1] = (uint32_t)(b / blocks_per_row); /* block_y_8 */
+    }
+
+    /* ---- Pipeline ---- */
+    v3d_pipeline pipe = {0};
+    if (v3d_runner_create_pipeline(r, spv_path,
+                                   /*n_ssbos=*/3,
+                                   /*push_const_size=*/sizeof(push_consts),
+                                   &pipe)) return 1;
+
+    v3d_buffer bind_bufs[3] = { buf_coeffs, buf_dst, buf_meta };
+    if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
+
+    /* ---- Dispatch geometry ---- */
+    /* v4: 32 blocks per WG (2 per 16-lane subgroup × 16 subgroups).
+     * 4× v2's count — more in-flight work per WG for latency hiding. */
+    const uint32_t blocks_per_wg = 32;
+    const uint32_t lanes_per_wg = 16 * 16; /* 16 subgroups × 16 lanes */
+    uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1)
+                                        / blocks_per_wg);
+    printf(" dispatch: %u WGs × %u invocations = %u blocks (rounded up from %zu)\n",
+           group_count_x, lanes_per_wg, group_count_x * blocks_per_wg, n_blocks);
+
+    push_consts pc = {
+        .n_blocks = (uint32_t)n_blocks,
+        .blocks_per_row = (uint32_t)blocks_per_row,
+        .dst_stride_u8 = (uint32_t)dst_stride,
+        ._pad = 0,
+    };
+
+    /* Record once, reuse for every iteration.
+     * NOTE(review): no barrier towards the host stage is recorded after
+     * the dispatch; the read-back below relies on vkQueueWaitIdle plus
+     * the coherent mapping — confirm that is sufficient. */
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
+    if (cb == VK_NULL_HANDLE) return 1;
+    VkCommandBufferBeginInfo cbbi = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+    };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) {
+        fprintf(stderr, "vkBeginCommandBuffer failed\n");
+        return 1;
+    }
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, group_count_x, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) {
+        fprintf(stderr, "vkEndCommandBuffer failed\n");
+        return 1;
+    }
+
+    /* ---- M1': bit-exact verification (first dispatch only) ---- */
+    printf("\n=== M1': QPU vs C-reference bit-exact ===\n");
+    memcpy(buf_dst.mapped, master_pred, buf_dst.size);
+    if (v3d_runner_submit_wait(r, cb)) { rc = 1; goto done; }
+
+    int mismatch_blocks = 0;
+    int total_byte_diffs = 0;
+    for (size_t b = 0; b < n_blocks; b++) {
+        int bx = (int)(b % blocks_per_row);
+        int by = (int)(b / blocks_per_row);
+        const uint8_t *qpu_block = (uint8_t *)buf_dst.mapped
+                                 + by * 8 * dst_stride + bx * 8;
+        const uint8_t *ref_block = expected_dst
+                                 + by * 8 * dst_stride + bx * 8;
+        int block_diffs = 0;
+        for (int r0 = 0; r0 < 8; r0++)
+            for (int c = 0; c < 8; c++)
+                if (qpu_block[r0 * dst_stride + c]
+                    != ref_block[r0 * dst_stride + c]) {
+                    block_diffs++;
+                    total_byte_diffs++;
+                }
+        if (block_diffs > 0 && mismatch_blocks < max_mismatch_print) {
+            fprintf(stderr,
+                    "MISMATCH block %zu @ (bx=%d by=%d) eob=%d: %d/64 bytes differ\n",
+                    b, bx, by, eobs[b], block_diffs);
+            fprintf(stderr, " ref:");
+            dump_rows(ref_block, dst_stride);
+            fprintf(stderr, "\n qpu:");
+            dump_rows(qpu_block, dst_stride);
+            fprintf(stderr, "\n");
+        }
+        if (block_diffs > 0) mismatch_blocks++;
+    }
+    printf(" blocks bit-exact: %zu / %zu (%.4f%%)\n",
+           n_blocks - mismatch_blocks, n_blocks,
+           100.0 * (n_blocks - mismatch_blocks) / n_blocks);
+    printf(" total byte diffs: %d / %zu (%.4f%%)\n",
+           total_byte_diffs, n_blocks * 64,
+           100.0 * total_byte_diffs / (n_blocks * 64));
+
+    if (mismatch_blocks > 0) {
+        fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
+        rc = 1;
+        goto done;
+    }
+    if (verify_only) goto done;
+
+    /* ---- M2: throughput ---- */
+    printf("\n=== M2: QPU throughput ===\n");
+
+    /* Warm-up. */
+    for (int i = 0; i < 10; i++) {
+        memcpy(buf_dst.mapped, master_pred, buf_dst.size);
+        if (v3d_runner_submit_wait(r, cb)) { rc = 1; goto done; }
+    }
+
+    double t0 = now_seconds();
+    for (int i = 0; i < iters; i++) {
+        memcpy(buf_dst.mapped, master_pred, buf_dst.size);
+        if (v3d_runner_submit_wait(r, cb)) { rc = 1; goto done; }
+    }
+    double t1 = now_seconds();
+
+    /* Setup-only timing for memcpy subtraction. */
+    double s0 = now_seconds();
+    for (int i = 0; i < iters; i++) {
+        memcpy(buf_dst.mapped, master_pred, buf_dst.size);
+    }
+    double s1 = now_seconds();
+
+    double total_seconds = (t1 - t0) - (s1 - s0);
+    double total_blocks = (double) n_blocks * iters;
+    if (total_seconds <= 0.0) {
+        fprintf(stderr, "measured kernel time is not positive; raise --iters\n");
+        rc = 1;
+        goto done;
+    }
+    double mblocks_s = total_blocks / total_seconds / 1e6;
+
+    printf(" blocks/dispatch: %zu\n", n_blocks);
+    printf(" iters: %d\n", iters);
+    printf(" total blocks: %.0f\n", total_blocks);
+    printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
+    printf(" elapsed (setup) =%.6f s\n", s1 - s0);
+    printf(" M2 throughput = %.3f Mblock/s\n", mblocks_s);
+    printf(" per-block = %.1f ns\n",
+           total_seconds / total_blocks * 1e9);
+    printf(" per-dispatch = %.1f us\n",
+           total_seconds / iters * 1e6);
+
+    /* R = M2 / M3 = M2 / 8.171 Mblock/s (Phase 3 baseline). */
+    double M3 = 8.171;
+    double R = mblocks_s / M3;
+    printf("\n Phase 3 NEON M3 = %.3f Mblock/s\n", M3);
+    printf(" R = M2 / M3 = %.3f\n", R);
+    if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
+    else if (R >= 0.5) printf(" decision band = YELLOW: concurrent-work hypothesis viable\n");
+    else if (R >= 0.1) printf(" decision band = ORANGE: material loss; honest close suggested\n");
+    else printf(" decision band = RED: structural mismatch\n");
+
+done:
+    /* Single teardown path shared by the success, mismatch and verify-only exits. */
+    v3d_runner_destroy_pipeline(r, &pipe);
+    v3d_runner_destroy_buffer(r, &buf_meta);
+    v3d_runner_destroy_buffer(r, &buf_dst);
+    v3d_runner_destroy_buffer(r, &buf_coeffs);
+    v3d_runner_destroy(r);
+    free(master_coeffs); free(master_pred); free(expected_dst); free(eobs);
+    return rc;
+}