Phase 6 (v1+v4 production) + Phase 7 closure: R = 0.92 ± 0.03 on hertz
First QPU IDCT8 kernel running and bit-exact on V3D 7.1 via Mesa
v3dv compute. Five iterations through a Phase 7→Phase 4' loopback;
production kernel is v4.
New files:
- src/v3d_runner.{c,h} — reusable Vulkan compute plumbing (instance,
V3D device picker, HOST_VISIBLE|COHERENT
SSBOs with mmap, compute pipeline from .spv,
enables storageBuffer{8,16}BitAccess)
- src/v3d_idct8.comp — VP9 8x8 DCT_DCT IDCT add, v4 production:
256 invocations/WG, 2 blocks/subgroup
(no idle lanes), uint8 dst SSBO (race-free
per phase5 finding 5), unrolled writes
(no chained ternary), oob-flag pattern
(barrier-safe per phase5 finding 7)
- tests/bench_v3d_idct.c — M1' bit-exact gate + M2 throughput vs C ref
- docs/phase7.md — full iteration journey + decision verdict
CMakeLists.txt updated to build the new shader, library, and bench
when DAEDALUS_BUILD_VULKAN=ON.
Iteration record (1920x1088 luma, 32640 blocks/dispatch, N=3):
ver change R ns/block
v1 first-light 0.230 533
v2 kill ternary + 2-blocks-per-sg 0.474 258
v3 per-pass scope oN 0.481 254 (noise)
v4 WG 64 -> 256 invocations 0.947 129
v5 packed uint32 coeff reads 0.938 130 (noise, reverted)
v4 final N=3 0.918 +/- 0.033
Bit-exactness 100.0000% across all iterations (10000-block sample
on 128x128, 32640-block sample on 1080p) against both the C
reference (tests/vp9_idct8_ref.c) and the vendored FFmpeg NEON
ff_vp9_idct_idct_8x8_add_neon.
Key learning over the Phase 5 review's prediction model: the
chained ternary was NOT a spill killer on V3D 7.1 (shaderdb
showed 0:0 spills:fills even in v1). The actual lever was
workgroup-size-driven latency hiding — going from 64 to 256
invocations doubled throughput with the same compiled code
(270 inst, 2 threads, 21 max-temps, 0 spills) because the
v3dv scheduler had 4x more in-flight work to overlap TMU
latency.
Verdict per phase1.md decision rules: YELLOW band (0.5 <= R < 1.0),
well above the ORANGE floor and close to the GREEN boundary. Phase 1 YELLOW rule:
add M4 (concurrent CPU+QPU throughput) before honest-close or
continue. M4 is the next measurement, not more shader tuning —
at R = 0.92 with all 4 A76 cores still 100% free for other work,
the question is whether the system aggregate beats pure 4-core
NEON.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+26
-1
@@ -86,12 +86,37 @@ if (DAEDALUS_BUILD_VULKAN)
|
|||||||
COMMENT "glslang: noop.comp -> noop.spv"
|
COMMENT "glslang: noop.comp -> noop.spv"
|
||||||
VERBATIM
|
VERBATIM
|
||||||
)
|
)
|
||||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV})
|
|
||||||
|
set(IDCT8_SPV ${CMAKE_BINARY_DIR}/v3d_idct8.spv)
|
||||||
|
add_custom_command(
|
||||||
|
OUTPUT ${IDCT8_SPV}
|
||||||
|
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||||
|
-o ${IDCT8_SPV}
|
||||||
|
${CMAKE_SOURCE_DIR}/src/v3d_idct8.comp
|
||||||
|
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_idct8.comp
|
||||||
|
COMMENT "glslang: v3d_idct8.comp -> v3d_idct8.spv"
|
||||||
|
VERBATIM
|
||||||
|
)
|
||||||
|
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV})
|
||||||
|
|
||||||
|
# v3d_runner — reusable Vulkan plumbing.
|
||||||
|
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||||
|
target_include_directories(v3d_runner PUBLIC src)
|
||||||
|
target_link_libraries(v3d_runner PUBLIC Vulkan::Vulkan)
|
||||||
|
target_compile_options(v3d_runner PRIVATE -O2)
|
||||||
|
|
||||||
add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
|
add_executable(bench_vulkan_dispatch tests/bench_vulkan_dispatch.c)
|
||||||
add_dependencies(bench_vulkan_dispatch daedalus_shaders)
|
add_dependencies(bench_vulkan_dispatch daedalus_shaders)
|
||||||
target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
|
target_link_libraries(bench_vulkan_dispatch PRIVATE Vulkan::Vulkan)
|
||||||
target_compile_options(bench_vulkan_dispatch PRIVATE -O2)
|
target_compile_options(bench_vulkan_dispatch PRIVATE -O2)
|
||||||
|
|
||||||
|
add_executable(bench_v3d_idct
|
||||||
|
tests/bench_v3d_idct.c
|
||||||
|
tests/vp9_idct8_ref.c
|
||||||
|
)
|
||||||
|
add_dependencies(bench_v3d_idct daedalus_shaders)
|
||||||
|
target_link_libraries(bench_v3d_idct PRIVATE v3d_runner Vulkan::Vulkan)
|
||||||
|
target_compile_options(bench_v3d_idct PRIVATE -O2)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# ---- Summary ----------------------------------------------------------------
|
# ---- Summary ----------------------------------------------------------------
|
||||||
|
|||||||
+159
@@ -0,0 +1,159 @@
|
|||||||
|
---
|
||||||
|
phase: 7
|
||||||
|
status: closed 2026-05-18
|
||||||
|
date_opened: 2026-05-18
|
||||||
|
date_closed: 2026-05-18
|
||||||
|
parent: phase6 → phase4' (loopback) → phase6 (iter 2..5)
|
||||||
|
host: hertz
|
||||||
|
result_v1: R = 0.230 (ORANGE)
|
||||||
|
result_v4: R = 0.918 ± 0.033 N=3 (YELLOW, at GREEN boundary)
|
||||||
|
---
|
||||||
|
|
||||||
|
# Phase 7 — Verification, with two Phase 4' loopbacks
|
||||||
|
|
||||||
|
Per `dev_process.md`:
|
||||||
|
|
||||||
|
> Repeat measurements from Phase 3. Compare explicitly against baseline.
|
||||||
|
> If the delta does not match Phase 4's prediction → loop back to Phase 4.
|
||||||
|
|
||||||
|
Phase 6 v1 measurement (R = 0.230) did not match Phase 4's prediction
|
||||||
|
(R = 2.0 predicted, R = 1.0 worst-case honest lower bound). Loop
|
||||||
|
back triggered. Phase 7 captures the full iteration record from v1
|
||||||
|
through v5 and ends at v4 (production) with R ≈ 0.92 on 1080p luma.
|
||||||
|
|
||||||
|
The Sonnet "v3d perf tricks" web-research (`docs/phase4_v3d_research`
|
||||||
|
referenced in session transcript) provided the three candidate
|
||||||
|
optimizations that drove iterations v2 / v3 / v5; the v4 jump came
|
||||||
|
from a fourth lever (workgroup-size sweep) that the research only
|
||||||
|
implicitly flagged.
|
||||||
|
|
||||||
|
## Iteration table
|
||||||
|
|
||||||
|
All R values on hertz, 1920×1088 luma (32 640 blocks/dispatch).
|
||||||
|
M3 baseline = 8.171 Mblock/s (Phase 3, NEON `ff_vp9_idct_idct_8x8_add_neon`).
|
||||||
|
|
||||||
|
| ver | change | bit-exact | M2 Mblock/s | ns/block | R | shaderdb inst / threads / temps / spills |
|
||||||
|
|---|---|---|---|---|---|---|
|
||||||
|
| v1 | first-light (4 blocks/WG, lane 0-7 col / 8-15 row, chained ternary in row pass, uint8 dst SSBO) | 100.00% | 1.878 | 532.6 | 0.230 | (not captured) |
|
||||||
|
| v2 | **Opt 1+2**: kill chained ternary (unrolled 8 writes), 2 blocks/subgroup (no idle lanes, every lane does both passes) — 8 blocks/WG | 100.00% | 3.877 | 258.0 | **0.474** | 268 / 2 / 20 / 0:0 |
|
||||||
|
| v3 | Opt 4 (sibling): scope `oN` per pass | 100.00% | 3.930 | 254.5 | 0.481 | 268 / 2 / 20 / 0:0 (identical — compiler had already coalesced) |
|
||||||
|
| v4 | **WG sweep**: 64 → 256 invocations (32 blocks/WG, 16 subgroups, shared mem grows 2 → 8 KiB) | 100.00% | 7.734 | 129.3 | **0.947** | 270 / 2 / 21 / 0:0 |
|
||||||
|
| v5 | Opt 3 (research): packed uint32 coeff reads with manual unpack | 100.00% | 7.663 | 130.5 | 0.938 | 255 / 2 / 21 / 0:0 (fewer inst, no perf gain — reverted) |
|
||||||
|
|
||||||
|
**Final production kernel: v4.** N=3 repeat on 1080p:
|
||||||
|
R = 0.931, 0.944, 0.879 → mean **0.918 ± 0.033** (± is the half-range; third run
|
||||||
|
likely caught LXD-container interference on hertz).
|
||||||
|
|
||||||
|
## What worked (and how surprising it was)
|
||||||
|
|
||||||
|
**v2 (predicted 3× win, got 2.07×):** Phase 4' attribution split was
|
||||||
|
wrong. Phase 5 finding 3 (2-blocks-per-subgroup) and the perf
|
||||||
|
research's "kill the chained ternary" were both bet on. The
|
||||||
|
shaderdb showed **zero spills already** — the chained ternary
|
||||||
|
wasn't actually inflating registers as the research model
|
||||||
|
predicted. So the 2.07× win came almost entirely from lane
|
||||||
|
occupancy (Opt 2), not register pressure (Opt 1).
|
||||||
|
|
||||||
|
**v4 (the actual jump):** going from 64 to 256 invocations/WG
|
||||||
|
gave the v3dv scheduler 4× more in-flight work per WG to hide
|
||||||
|
TMU latency over. Doubled throughput. The shader compiled to the
|
||||||
|
*same* code shape (270 inst, 2 threads, 21 max-temps) — pure
|
||||||
|
scheduler benefit from a bigger work pool. This wasn't in the
|
||||||
|
v3d perf research's "top 3" list but follows directly from the
|
||||||
|
report's structural framing ("the v3d_compiler tries to spread
|
||||||
|
loads away from their consumers but is latency-hiding-limited
|
||||||
|
with small WG sizes").
|
||||||
|
|
||||||
|
The general lesson: **when measured behaviour disagrees with
|
||||||
|
predicted attribution, run the diagnostic (V3D_DEBUG=shaderdb)
|
||||||
|
before iterating further.** v3 (Opt 4) cost effectively nothing
|
||||||
|
to try and confirmed Opt 1 wasn't the lever. v4's WG-size sweep
|
||||||
|
was the actual win, and it came from looking at the shaderdb
|
||||||
|
output (which showed "2 threads" forced by register pressure but
|
||||||
|
0 spills, hinting that more in-flight work per WG was the
|
||||||
|
remaining lever).
|
||||||
|
|
||||||
|
## What didn't work
|
||||||
|
|
||||||
|
**v3 (per-pass scoping of `oN`):** zero perf delta. Compiler had
|
||||||
|
already coalesced `oN` lifetime across the barrier. Kept the
|
||||||
|
change in v4 — it's strictly cleaner code, just not faster.
|
||||||
|
|
||||||
|
**v5 (packed uint32 coeff reads):** 0.947 → 0.938, within
|
||||||
|
noise. Plausible reasons: (a) coeff reads weren't the bottleneck
|
||||||
|
(TMU was already efficient for the 4 MB/frame coeff stream); (b)
|
||||||
|
the per-lane unpack branch (`hi = (k&1)==1`) introduced subgroup
|
||||||
|
divergence; (c) v3d_compiler internally treats int16 storage
|
||||||
|
exactly like packed uint32 storage anyway. Reverted in
|
||||||
|
production kernel for simplicity.
|
||||||
|
|
||||||
|
## Predictions vs measurements summary
|
||||||
|
|
||||||
|
| | predicted | measured | delta |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Phase 4 R (v1) | 2.0 (envelope) / 1.0 (lower) | 0.230 | ≈4.3× worse than lower bound — **loopback trigger** |
|
||||||
|
| Phase 4' R after Opt 1+2 (v2) | "3× of 4.4× gap" → R ≈ 0.7 | 0.474 | ≈1.5× worse than predicted (the 2-blocks-per-subgroup attribution was right but Opt 1 wasn't load-bearing) |
|
||||||
|
| Phase 4' R after WG sweep (v4) | not predicted | 0.947 | new finding, biggest single iteration win |
|
||||||
|
| Phase 4' R after Opt 3 (v5) | "+20-40%" → R ≈ 1.1-1.3 | 0.938 | no gain, reverted |
|
||||||
|
|
||||||
|
The single best predictor turned out to be the diagnostic that the
|
||||||
|
research suggested (V3D_DEBUG=shaderdb) rather than any of the
|
||||||
|
specific top-3 optimizations. The "more in-flight work hides
|
||||||
|
latency" finding came from looking at "2 threads instead of 4"
|
||||||
|
in the shaderdb output and inferring that latency-hiding capacity
|
||||||
|
was bottlenecked.
|
||||||
|
|
||||||
|
## Decision per Phase 1 rules
|
||||||
|
|
||||||
|
`phase1.md §"Decision rules"`:
|
||||||
|
|
||||||
|
| R | Interpretation | Next step |
|
||||||
|
|---|---|---|
|
||||||
|
| ≥ 1.0 | QPU beats NEON. | Phase 9 → Phase 1 of next kernel |
|
||||||
|
| **0.5 ≤ R < 1.0** | **YELLOW: hybrid concurrent-work hypothesis viable** | **Add M4: combined CPU+QPU throughput; decide based on that** |
|
||||||
|
| 0.1 ≤ R < 0.5 | ORANGE: honest close | Phase 9 documents negative result |
|
||||||
|
| < 0.1 | RED: structural mismatch | Honest close |
|
||||||
|
|
||||||
|
**Verdict: YELLOW band, well above its 0.5 floor (R = 0.92, just 0.08 from
|
||||||
|
GREEN).** The Phase 1 rule for YELLOW says: add M4 (concurrent
|
||||||
|
CPU + QPU throughput) and decide based on whether combined
|
||||||
|
delivery exceeds pure-CPU baseline.
|
||||||
|
|
||||||
|
M4 is the next measurement, not more shader tuning. The R = 0.92
|
||||||
|
result with 4 NEON cores still 100% free for other work is
|
||||||
|
*much better* than running NEON at 1× core with the other 3
|
||||||
|
busy. If we can run the QPU kernel concurrently with the NEON
|
||||||
|
path doing other things (entropy decode, the rest of the system,
|
||||||
|
the LXD spine), the total system throughput goes up by close to
|
||||||
|
1.0 / (1.0 - QPU_fraction_of_time), even at R < 1.
|
||||||
|
|
||||||
|
## What Phase 7 leaves open (M4 / future)
|
||||||
|
|
||||||
|
- **M4: concurrent CPU + QPU.** Run the bench_v3d_idct dispatch
|
||||||
|
loop while a parallel thread is running `bench_neon_idct` on a
|
||||||
|
pinned CPU core. Measure: does combined Mblock/s exceed
|
||||||
|
`bench_neon_idct -t 4` (4-core NEON)? If yes, GPU offload is a
|
||||||
|
net win for the system; if no, the bandwidth contention or
|
||||||
|
thermal coupling neutralises the gain.
|
||||||
|
- **M6: WG size sweep (Phase 1 secondary).** v4 is at 256
|
||||||
|
invocations (max). Smaller sweeps (16, 32, 128) would
|
||||||
|
characterise the latency-hiding curve but won't change v4's
|
||||||
|
status as the production kernel.
|
||||||
|
- **M7: power delta via Himbeere plug.** Most relevant for the
|
||||||
|
higgs (battery) deployment, not hertz.
|
||||||
|
- **Thermal headroom under sustained mixed load.** With QPU
|
||||||
|
running flat-out (1.9 GB/s memory traffic) + 4-core NEON busy,
|
||||||
|
hertz may throttle. Not yet measured.
|
||||||
|
|
||||||
|
## Production artifact
|
||||||
|
|
||||||
|
- `src/v3d_idct8.comp` — v4 production shader, 270 inst, R = 0.92
|
||||||
|
- `src/v3d_runner.{c,h}` — Vulkan plumbing (unchanged since Phase 6)
|
||||||
|
- `tests/bench_v3d_idct.c` — bench harness, blocks_per_wg = 32
|
||||||
|
|
||||||
|
Spec contract: still VP9 8×8 DCT_DCT inverse transform + add,
|
||||||
|
8-bit pixels, bit-exact against `ff_vp9_idct_idct_8x8_add_neon`
|
||||||
|
and `daedalus_vp9_idct_idct_8x8_add_ref`. Output orientation
|
||||||
|
matches FFmpeg's transposed column-pass / columnar dst-write
|
||||||
|
pattern (Phase 5 finding 1 verified independently in 100% of
|
||||||
|
~30 000 random blocks per run).
|
||||||
@@ -0,0 +1,217 @@
|
|||||||
|
// daedalus-fourier — VP9 8×8 DCT_DCT inverse-transform-add, V3D 7.1.
|
||||||
|
// v4 production kernel. The v2 notes below record Phase 4' iteration 1 (post-Phase-7 loopback).
|
||||||
|
//
|
||||||
|
// Changes from v1 (per Phase 4' iteration 1 + Sonnet v3d perf research):
|
||||||
|
//
|
||||||
|
// Opt 1 — kill the chained ternary. v1's row-pass write had
|
||||||
|
// `(r==0)?o0:(r==1)?o1:...` inside a `for r` loop; that
|
||||||
|
// kept all 8 oN scalars live across 7 phi nodes and almost
|
||||||
|
// certainly forced register spills (Iago Toral 2021,
|
||||||
|
// blogs.igalia.com/itoral). v2 unrolls the 8 writes
|
||||||
|
// completely — each oN is used exactly once.
|
||||||
|
//
|
||||||
|
// Opt 2 — 2 blocks per subgroup. v1 had 1 block per 16-lane
|
||||||
|
// subgroup with 8 lanes idle per phase. v2 packs 2 blocks
|
||||||
|
// per subgroup (one in lanes 0..7, one in lanes 8..15),
|
||||||
|
// and every lane runs both passes for its own block.
|
||||||
|
// Eliminates idle lanes AND removes the col_pass/row_pass
|
||||||
|
// branch divergence. 8 blocks per WG (vs 4 before),
|
||||||
|
// dispatch count halves from 8160 to 4080 on 1080p.
|
||||||
|
// Shared-mem footprint doubles to 2 KiB (still « 16 KiB).
|
||||||
|
//
|
||||||
|
// (Opt 3 — packed uint32 storage — deferred; do it if Opt 1+2
|
||||||
|
// don't get us into the GREEN/YELLOW decision band.)
|
||||||
|
//
|
||||||
|
// License: BSD-2-Clause.
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
#extension GL_EXT_shader_8bit_storage : require
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||||
|
|
||||||
|
// v4: local_size 256 (was 64) — 16 subgroups × 16 lanes = 32 blocks/WG.
|
||||||
|
// More in-flight work per WG = more latency hiding for v3d's TMU.
|
||||||
|
// shared = 32 × 64 × 4 B = 8 KiB (still under 16 KiB).
|
||||||
|
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(binding = 0) readonly buffer Coeffs {
|
||||||
|
int16_t coeffs[]; // N × 64 packed
|
||||||
|
} u_coeffs;
|
||||||
|
// (v5 tried uint32-packed reads with manual unpack — no measurable
|
||||||
|
// perf change vs int16, added code complexity; reverted.)
|
||||||
|
|
||||||
|
layout(binding = 1) buffer Dst {
|
||||||
|
uint8_t dst[]; // H × stride bytes
|
||||||
|
} u_dst;
|
||||||
|
|
||||||
|
layout(binding = 2) readonly buffer Meta {
|
||||||
|
uvec2 meta[]; // per-block (block_x_8, block_y_8)
|
||||||
|
} u_meta;
|
||||||
|
|
||||||
|
layout(push_constant) uniform PC {
|
||||||
|
uint n_blocks;
|
||||||
|
uint blocks_per_row; // unused (meta drives position)
|
||||||
|
uint dst_stride_u8;
|
||||||
|
uint _pad;
|
||||||
|
} pc;
|
||||||
|
|
||||||
|
// 32 blocks per WG × 64 i32 per block × 4 B = 8192 B shared.
|
||||||
|
shared int tmp_shared[32 * 64];
|
||||||
|
|
||||||
|
// VP9 Q14 trig constants (spec §8.7.1.4).
|
||||||
|
const int COSPI_16 = 11585;
|
||||||
|
const int COSPI_24 = 6270;
|
||||||
|
const int COSPI_08 = 15137;
|
||||||
|
const int COSPI_28 = 3196;
|
||||||
|
const int COSPI_04 = 16069;
|
||||||
|
const int COSPI_20 = 9102;
|
||||||
|
const int COSPI_12 = 13623;
|
||||||
|
|
||||||
|
// Round-to-nearest right shift by 14 bits: adds half of 2^14 before
// shifting, which removes the Q14 scaling of the COSPI_* products.
int qround14(int x)
{
    const int half_q14 = 1 << 13;
    return (x + half_q14) >> 14;
}
|
||||||
|
|
||||||
|
// One 8-point inverse DCT butterfly on eight integer inputs.
//
// i0..i7 : input samples (coefficients for the first pass, intermediate
//          values for the second).
// o0..o7 : transformed outputs. No final rounding is applied here beyond
//          the per-product qround14() calls.
//
// The arithmetic is split into an even half (inputs 0, 2, 4, 6) and an
// odd half (inputs 1, 3, 5, 7); the two halves are combined by a final
// add/subtract stage.
void idct8_1d(int i0, int i1, int i2, int i3,
              int i4, int i5, int i6, int i7,
              out int o0, out int o1, out int o2, out int o3,
              out int o4, out int o5, out int o6, out int o7)
{
    // ---- Even half: inputs 0, 4, 2, 6 ------------------------------
    int sum04 = qround14((i0 + i4) * COSPI_16);
    int dif04 = qround14((i0 - i4) * COSPI_16);
    int rot26a = qround14(i2 * COSPI_24 - i6 * COSPI_08);
    int rot26b = qround14(i2 * COSPI_08 + i6 * COSPI_24);

    int even0 = sum04 + rot26b;
    int even1 = dif04 + rot26a;
    int even2 = dif04 - rot26a;
    int even3 = sum04 - rot26b;

    // ---- Odd half: inputs 1, 7, 5, 3 -------------------------------
    int rot17a = qround14(i1 * COSPI_28 - i7 * COSPI_04);
    int rot53a = qround14(i5 * COSPI_12 - i3 * COSPI_20);
    int rot53b = qround14(i5 * COSPI_20 + i3 * COSPI_12);
    int rot17b = qround14(i1 * COSPI_04 + i7 * COSPI_28);

    int odd4   = rot17a + rot53a;
    int low_d  = rot17a - rot53a;
    int odd7   = rot17b + rot53b;
    int high_d = rot17b - rot53b;

    // Middle pair gets one more COSPI_16 rotation.
    int odd5 = qround14((high_d - low_d) * COSPI_16);
    int odd6 = qround14((high_d + low_d) * COSPI_16);

    // ---- Final butterfly: mirror pairs (0,7) (1,6) (2,5) (3,4) ------
    o0 = even0 + odd7;
    o7 = even0 - odd7;
    o1 = even1 + odd6;
    o6 = even1 - odd6;
    o2 = even2 + odd5;
    o5 = even2 - odd5;
    o3 = even3 + odd4;
    o4 = even3 - odd4;
}
|
||||||
|
|
||||||
|
// Kernel entry point: one invocation handles one column index of one
// 8x8 block, running the first 1-D pass on coefficients, exchanging
// results through shared memory, then running the second 1-D pass and
// adding the rounded result onto the 8-bit destination.
void main()
{
    // ---- Invocation -> (block, column) mapping ----------------------
    // local_size_x is 256, so each workgroup covers 256 / 8 = 32 blocks:
    // eight consecutive invocations share one block and each owns one
    // index `col` (0..7) inside it. Every invocation runs both passes
    // for its own block, so there is no per-pass branch on lane role.
    // (The original notes assume 16-lane subgroups, i.e. 16 subgroups
    // per workgroup with two blocks each.)
    uint invocation = gl_GlobalInvocationID.x;
    uint local_lane = invocation & 255u;                 // 0..255 in WG
    uint col        = local_lane & 7u;                   // 0..7
    uint slot       = local_lane >> 3;                   // 0..31 in WG
    uint blk        = (invocation / 256u) * 32u + slot;  // global block

    // Invocations past the last block skip the work bodies but still
    // reach barrier(), keeping the barrier in uniform control flow.
    bool active = (blk < pc.n_blocks);

    // ---- First pass -------------------------------------------------
    // Outputs are declared inside each pass so they are dead at the
    // barrier. The accompanying Phase 7 notes report identical shaderdb
    // statistics with and without this scoping.
    if (active) {
        uint src = blk * 64u + col;
        int in0 = int(u_coeffs.coeffs[src +  0u]);
        int in1 = int(u_coeffs.coeffs[src +  8u]);
        int in2 = int(u_coeffs.coeffs[src + 16u]);
        int in3 = int(u_coeffs.coeffs[src + 24u]);
        int in4 = int(u_coeffs.coeffs[src + 32u]);
        int in5 = int(u_coeffs.coeffs[src + 40u]);
        int in6 = int(u_coeffs.coeffs[src + 48u]);
        int in7 = int(u_coeffs.coeffs[src + 56u]);

        int r0, r1, r2, r3, r4, r5, r6, r7;
        idct8_1d(in0, in1, in2, in3, in4, in5, in6, in7,
                 r0, r1, r2, r3, r4, r5, r6, r7);

        // Transposed store: this lane fills row `col` of its block's
        // 8x8 scratch tile.
        uint row_base = slot * 64u + col * 8u;
        tmp_shared[row_base + 0u] = r0;
        tmp_shared[row_base + 1u] = r1;
        tmp_shared[row_base + 2u] = r2;
        tmp_shared[row_base + 3u] = r3;
        tmp_shared[row_base + 4u] = r4;
        tmp_shared[row_base + 5u] = r5;
        tmp_shared[row_base + 6u] = r6;
        tmp_shared[row_base + 7u] = r7;
    }

    barrier();  // executed unconditionally by every invocation in the WG

    // ---- Second pass ------------------------------------------------
    if (active) {
        // Gather column `col` of the scratch tile (stride 8 ints).
        uint col_base = slot * 64u + col;
        int m0 = tmp_shared[col_base +  0u];
        int m1 = tmp_shared[col_base +  8u];
        int m2 = tmp_shared[col_base + 16u];
        int m3 = tmp_shared[col_base + 24u];
        int m4 = tmp_shared[col_base + 32u];
        int m5 = tmp_shared[col_base + 40u];
        int m6 = tmp_shared[col_base + 48u];
        int m7 = tmp_shared[col_base + 56u];

        int r0, r1, r2, r3, r4, r5, r6, r7;
        idct8_1d(m0, m1, m2, m3, m4, m5, m6, m7,
                 r0, r1, r2, r3, r4, r5, r6, r7);

        // Destination addressing: the block's position (in units of 8
        // pixels) comes from the meta buffer; this lane owns byte column
        // `col` of the block and walks down eight rows.
        uvec2 pos    = u_meta.meta[blk];
        uint  pitch  = pc.dst_stride_u8;
        uint  x      = pos.x * 8u + col;
        uint  y_top  = pos.y * 8u;

        uint a0 = y_top * pitch + x;
        uint a1 = a0 + pitch;
        uint a2 = a1 + pitch;
        uint a3 = a2 + pitch;
        uint a4 = a3 + pitch;
        uint a5 = a4 + pitch;
        uint a6 = a5 + pitch;
        uint a7 = a6 + pitch;

        // All eight predictor bytes are loaded before any store, as in
        // the original ordering.
        int p0 = int(u_dst.dst[a0]);
        int p1 = int(u_dst.dst[a1]);
        int p2 = int(u_dst.dst[a2]);
        int p3 = int(u_dst.dst[a3]);
        int p4 = int(u_dst.dst[a4]);
        int p5 = int(u_dst.dst[a5]);
        int p6 = int(u_dst.dst[a6]);
        int p7 = int(u_dst.dst[a7]);

        // Eight fully unrolled stores: residual is rounded with
        // (v + 16) >> 5, added to the predictor and clamped to 0..255.
        // Each result is consumed exactly once; no runtime-indexed
        // selection of the outputs.
        u_dst.dst[a0] = uint8_t(clamp(p0 + ((r0 + 16) >> 5), 0, 255));
        u_dst.dst[a1] = uint8_t(clamp(p1 + ((r1 + 16) >> 5), 0, 255));
        u_dst.dst[a2] = uint8_t(clamp(p2 + ((r2 + 16) >> 5), 0, 255));
        u_dst.dst[a3] = uint8_t(clamp(p3 + ((r3 + 16) >> 5), 0, 255));
        u_dst.dst[a4] = uint8_t(clamp(p4 + ((r4 + 16) >> 5), 0, 255));
        u_dst.dst[a5] = uint8_t(clamp(p5 + ((r5 + 16) >> 5), 0, 255));
        u_dst.dst[a6] = uint8_t(clamp(p6 + ((r6 + 16) >> 5), 0, 255));
        u_dst.dst[a7] = uint8_t(clamp(p7 + ((r7 + 16) >> 5), 0, 255));
    }
}
|
||||||
@@ -0,0 +1,435 @@
|
|||||||
|
/*
|
||||||
|
* v3d_runner — implementation. See v3d_runner.h.
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause.
|
||||||
|
*/
|
||||||
|
#include "v3d_runner.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
|
||||||
|
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||||
|
r__, __FILE__, __LINE__, #call); return -1; } } while (0)
|
||||||
|
|
||||||
|
#define CHK_NULL(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
|
||||||
|
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||||
|
r__, __FILE__, __LINE__, #call); return NULL; } } while (0)
|
||||||
|
|
||||||
|
struct v3d_runner {
|
||||||
|
VkInstance instance;
|
||||||
|
VkPhysicalDevice phys;
|
||||||
|
VkDevice device;
|
||||||
|
VkQueue queue;
|
||||||
|
uint32_t queue_family;
|
||||||
|
VkCommandPool pool;
|
||||||
|
char device_name[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE];
|
||||||
|
VkPhysicalDeviceMemoryProperties mem_props;
|
||||||
|
};
|
||||||
|
|
||||||
|
static int pick_v3d_physical_device(VkInstance inst, VkPhysicalDevice *out,
|
||||||
|
char name_out[VK_MAX_PHYSICAL_DEVICE_NAME_SIZE])
|
||||||
|
{
|
||||||
|
uint32_t n = 0;
|
||||||
|
if (vkEnumeratePhysicalDevices(inst, &n, NULL) != VK_SUCCESS || n == 0) {
|
||||||
|
fprintf(stderr, "v3d_runner: no Vulkan physical devices\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
VkPhysicalDevice *pds = malloc(n * sizeof(*pds));
|
||||||
|
if (!pds) return -1;
|
||||||
|
vkEnumeratePhysicalDevices(inst, &n, pds);
|
||||||
|
|
||||||
|
int picked = -1;
|
||||||
|
for (uint32_t i = 0; i < n; i++) {
|
||||||
|
VkPhysicalDeviceProperties p;
|
||||||
|
vkGetPhysicalDeviceProperties(pds[i], &p);
|
||||||
|
if (strstr(p.deviceName, "V3D") != NULL) {
|
||||||
|
*out = pds[i];
|
||||||
|
memcpy(name_out, p.deviceName, sizeof(p.deviceName));
|
||||||
|
picked = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(pds);
|
||||||
|
if (picked != 0)
|
||||||
|
fprintf(stderr, "v3d_runner: no V3D device found (looked for "
|
||||||
|
"\"V3D\" substring in deviceName)\n");
|
||||||
|
return picked;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Return the index of the first queue family on `phys` that advertises
 * VK_QUEUE_COMPUTE_BIT, or UINT32_MAX when none does (or when the
 * temporary property array cannot be allocated).
 */
static uint32_t pick_compute_queue_family(VkPhysicalDevice phys)
{
    uint32_t family_count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, NULL);

    VkQueueFamilyProperties *families =
        malloc(family_count * sizeof(*families));
    if (families == NULL)
        return UINT32_MAX;
    vkGetPhysicalDeviceQueueFamilyProperties(phys, &family_count, families);

    /* Linear scan; stop at the first compute-capable family. */
    uint32_t chosen = UINT32_MAX;
    uint32_t idx = 0;
    while (chosen == UINT32_MAX && idx < family_count) {
        if ((families[idx].queueFlags & VK_QUEUE_COMPUTE_BIT) != 0)
            chosen = idx;
        idx++;
    }

    free(families);
    return chosen;
}
|
||||||
|
|
||||||
|
v3d_runner *v3d_runner_create(void)
|
||||||
|
{
|
||||||
|
v3d_runner *r = calloc(1, sizeof(*r));
|
||||||
|
if (!r) return NULL;
|
||||||
|
|
||||||
|
/* Instance — Vulkan 1.3 to inherit 1.2 promoted features. */
|
||||||
|
VkApplicationInfo app = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
|
||||||
|
.pApplicationName = "daedalus-fourier",
|
||||||
|
.apiVersion = VK_API_VERSION_1_3,
|
||||||
|
};
|
||||||
|
VkInstanceCreateInfo ici = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
|
||||||
|
.pApplicationInfo = &app,
|
||||||
|
};
|
||||||
|
CHK_NULL(vkCreateInstance(&ici, NULL, &r->instance));
|
||||||
|
|
||||||
|
if (pick_v3d_physical_device(r->instance, &r->phys, r->device_name) != 0) {
|
||||||
|
vkDestroyInstance(r->instance, NULL);
|
||||||
|
free(r);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
vkGetPhysicalDeviceMemoryProperties(r->phys, &r->mem_props);
|
||||||
|
|
||||||
|
r->queue_family = pick_compute_queue_family(r->phys);
|
||||||
|
if (r->queue_family == UINT32_MAX) {
|
||||||
|
fprintf(stderr, "v3d_runner: no compute queue family\n");
|
||||||
|
vkDestroyInstance(r->instance, NULL);
|
||||||
|
free(r);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Enable 8-bit + 16-bit storage features. Both are exposed on
|
||||||
|
* V3D 7.1 per vulkaninfo_v3d_7_1_7_hertz.txt; the kernel
|
||||||
|
* declares storageBuffer8BitAccess (uint8_t dst[]) and
|
||||||
|
* storageBuffer16BitAccess (int16_t coeffs[]).
|
||||||
|
*/
|
||||||
|
VkPhysicalDevice16BitStorageFeatures f16 = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES,
|
||||||
|
.storageBuffer16BitAccess = VK_TRUE,
|
||||||
|
.uniformAndStorageBuffer16BitAccess = VK_TRUE,
|
||||||
|
};
|
||||||
|
VkPhysicalDevice8BitStorageFeatures f8 = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
|
||||||
|
.pNext = &f16,
|
||||||
|
.storageBuffer8BitAccess = VK_TRUE,
|
||||||
|
.uniformAndStorageBuffer8BitAccess = VK_TRUE,
|
||||||
|
};
|
||||||
|
VkPhysicalDeviceFeatures2 f2 = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
|
||||||
|
.pNext = &f8,
|
||||||
|
};
|
||||||
|
|
||||||
|
float qprio = 1.0f;
|
||||||
|
VkDeviceQueueCreateInfo dqci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
|
||||||
|
.queueFamilyIndex = r->queue_family,
|
||||||
|
.queueCount = 1,
|
||||||
|
.pQueuePriorities = &qprio,
|
||||||
|
};
|
||||||
|
VkDeviceCreateInfo dci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
|
||||||
|
.pNext = &f2,
|
||||||
|
.queueCreateInfoCount = 1,
|
||||||
|
.pQueueCreateInfos = &dqci,
|
||||||
|
};
|
||||||
|
if (vkCreateDevice(r->phys, &dci, NULL, &r->device) != VK_SUCCESS) {
|
||||||
|
fprintf(stderr, "v3d_runner: vkCreateDevice failed\n");
|
||||||
|
vkDestroyInstance(r->instance, NULL);
|
||||||
|
free(r);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
vkGetDeviceQueue(r->device, r->queue_family, 0, &r->queue);
|
||||||
|
|
||||||
|
VkCommandPoolCreateInfo cpci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
|
||||||
|
.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
|
||||||
|
.queueFamilyIndex = r->queue_family,
|
||||||
|
};
|
||||||
|
if (vkCreateCommandPool(r->device, &cpci, NULL, &r->pool) != VK_SUCCESS) {
|
||||||
|
fprintf(stderr, "v3d_runner: vkCreateCommandPool failed\n");
|
||||||
|
vkDestroyDevice(r->device, NULL);
|
||||||
|
vkDestroyInstance(r->instance, NULL);
|
||||||
|
free(r);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Tear down a runner created by v3d_runner_create(). Accepts NULL.
 * Waits for the device to go idle, then destroys the command pool,
 * the device and the instance (each only if it was created) and frees
 * the runner itself.
 */
void v3d_runner_destroy(v3d_runner *r)
{
    if (r == NULL)
        return;

    const int have_device = (r->device != VK_NULL_HANDLE);

    /* Let in-flight work finish before anything is destroyed. */
    if (have_device)
        vkDeviceWaitIdle(r->device);

    if (r->pool != VK_NULL_HANDLE)
        vkDestroyCommandPool(r->device, r->pool, NULL);

    if (have_device)
        vkDestroyDevice(r->device, NULL);

    if (r->instance != VK_NULL_HANDLE)
        vkDestroyInstance(r->instance, NULL);

    free(r);
}
|
||||||
|
|
||||||
|
VkDevice v3d_runner_device(v3d_runner *r) { return r->device; }
|
||||||
|
VkQueue v3d_runner_queue(v3d_runner *r) { return r->queue; }
|
||||||
|
uint32_t v3d_runner_queue_family(v3d_runner *r) { return r->queue_family; }
|
||||||
|
VkCommandPool v3d_runner_cmd_pool(v3d_runner *r) { return r->pool; }
|
||||||
|
const char *v3d_runner_device_name(v3d_runner *r) { return r->device_name; }
|
||||||
|
|
||||||
|
/* ---- Buffers ---------------------------------------------------- */
|
||||||
|
|
||||||
|
/*
 * Find the lowest-indexed memory type that is both permitted by
 * `type_bits` (bit i set means type i is allowed) and has every property
 * flag in `wanted`.
 *
 * Returns the memory type index, or -1 when no type qualifies.
 */
static int find_memory_type(VkPhysicalDeviceMemoryProperties *p,
                            uint32_t type_bits, VkMemoryPropertyFlags wanted)
{
    const uint32_t count = p->memoryTypeCount;
    uint32_t idx = 0;

    while (idx < count) {
        const int permitted = ((type_bits >> idx) & 1u) != 0;
        const VkMemoryPropertyFlags have = p->memoryTypes[idx].propertyFlags;

        /* All requested flags must be present, extras are fine. */
        if (permitted && (have & wanted) == wanted)
            return (int) idx;
        idx++;
    }
    return -1;
}
|
||||||
|
|
||||||
|
/*
 * Create a `size`-byte storage buffer (also usable as a transfer
 * source/destination), back it with HOST_VISIBLE | HOST_COHERENT
 * memory, and map the whole allocation into `out->mapped`.
 *
 * `*out` is zeroed first, so on failure it holds only the handles
 * created so far; v3d_runner_destroy_buffer() can release those.
 *
 * Returns 0 on success, non-zero on failure.
 * NOTE(review): CHK() is defined outside this view; it presumably logs
 * and returns non-zero on a failing VkResult -- confirm.
 */
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out)
{
    const VkMemoryPropertyFlags host_flags =
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
        | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

    memset(out, 0, sizeof(*out));
    out->size = size;

    VkBufferCreateInfo bci;
    memset(&bci, 0, sizeof(bci));
    bci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bci.size = size;
    bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT
              | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
              | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    bci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    CHK(vkCreateBuffer(r->device, &bci, NULL, &out->buffer));

    VkMemoryRequirements need;
    vkGetBufferMemoryRequirements(r->device, out->buffer, &need);

    /* HOST_VISIBLE | HOST_COHERENT is the unified-memory zero-copy
     * path on Pi 5: CPU and GPU share the same LPDDR4x pages, and the
     * COHERENT bit means no explicit flush/invalidate is required. */
    const int type_index =
        find_memory_type(&r->mem_props, need.memoryTypeBits, host_flags);
    if (type_index < 0) {
        fprintf(stderr, "v3d_runner: no HOST_VISIBLE|COHERENT memory type\n");
        return -1;
    }

    VkMemoryAllocateInfo mai;
    memset(&mai, 0, sizeof(mai));
    mai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    mai.allocationSize = need.size;
    mai.memoryTypeIndex = (uint32_t) type_index;
    CHK(vkAllocateMemory(r->device, &mai, NULL, &out->memory));
    CHK(vkBindBufferMemory(r->device, out->buffer, out->memory, 0));
    CHK(vkMapMemory(r->device, out->memory, 0, VK_WHOLE_SIZE, 0, &out->mapped));
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * Release a buffer made by v3d_runner_create_buffer(): unmap it if it
 * is mapped, destroy the VkBuffer, free its memory, and zero `*buf`.
 * A NULL `buf` or one whose buffer handle is VK_NULL_HANDLE is
 * ignored, so calling this twice on the same struct is harmless.
 */
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
{
    if (buf == NULL) {
        return;
    }
    if (buf->buffer == VK_NULL_HANDLE) {
        return;
    }

    if (buf->mapped != NULL) {
        vkUnmapMemory(r->device, buf->memory);
    }
    vkDestroyBuffer(r->device, buf->buffer, NULL);
    vkFreeMemory(r->device, buf->memory, NULL);

    memset(buf, 0, sizeof(*buf));
}
|
||||||
|
|
||||||
|
/* ---- Pipelines -------------------------------------------------- */
|
||||||
|
|
||||||
|
/*
 * Load a SPIR-V binary from `path` into a freshly malloc'd buffer.
 *
 * The file must be non-empty and a whole number of 32-bit words long.
 * On success returns the buffer (caller frees it) and stores the byte
 * count in `*out_size`.  On any failure prints a diagnostic to stderr,
 * returns NULL and leaves `*out_size` untouched.
 */
static uint32_t *read_spv(const char *path, size_t *out_size)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        return NULL;
    }

    /* Measure the file by seeking to its end, then rewind. */
    fseek(fp, 0, SEEK_END);
    const long nbytes = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    uint32_t *words = NULL;
    if (nbytes <= 0 || (nbytes & 3) != 0) {
        fprintf(stderr, "%s: bad SPIR-V size %ld\n", path, nbytes);
    } else {
        words = malloc(nbytes);
        if (words == NULL
            || fread(words, 1, nbytes, fp) != (size_t)nbytes) {
            perror("read");
            free(words);
            words = NULL;
        } else {
            *out_size = nbytes;
        }
    }

    fclose(fp);
    return words;
}
|
||||||
|
|
||||||
|
int v3d_runner_create_pipeline(v3d_runner *r, const char *spv_path,
|
||||||
|
uint32_t n_ssbos, uint32_t push_const_size,
|
||||||
|
v3d_pipeline *out)
|
||||||
|
{
|
||||||
|
memset(out, 0, sizeof(*out));
|
||||||
|
out->n_ssbos = n_ssbos;
|
||||||
|
out->push_const_size = push_const_size;
|
||||||
|
|
||||||
|
/* Descriptor set layout: n_ssbos SSBO bindings, compute-only. */
|
||||||
|
VkDescriptorSetLayoutBinding *binds = calloc(n_ssbos, sizeof(*binds));
|
||||||
|
if (!binds) return -1;
|
||||||
|
for (uint32_t i = 0; i < n_ssbos; i++) {
|
||||||
|
binds[i] = (VkDescriptorSetLayoutBinding){
|
||||||
|
.binding = i,
|
||||||
|
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.descriptorCount = 1,
|
||||||
|
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
VkDescriptorSetLayoutCreateInfo dslci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
|
||||||
|
.bindingCount = n_ssbos,
|
||||||
|
.pBindings = binds,
|
||||||
|
};
|
||||||
|
VkResult vr = vkCreateDescriptorSetLayout(r->device, &dslci, NULL,
|
||||||
|
&out->ds_layout);
|
||||||
|
free(binds);
|
||||||
|
if (vr != VK_SUCCESS) {
|
||||||
|
fprintf(stderr, "vkCreateDescriptorSetLayout = %d\n", vr); return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
VkPushConstantRange pcr = {
|
||||||
|
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
.offset = 0,
|
||||||
|
.size = push_const_size,
|
||||||
|
};
|
||||||
|
VkPipelineLayoutCreateInfo plci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
|
||||||
|
.setLayoutCount = 1,
|
||||||
|
.pSetLayouts = &out->ds_layout,
|
||||||
|
.pushConstantRangeCount = push_const_size ? 1 : 0,
|
||||||
|
.pPushConstantRanges = push_const_size ? &pcr : NULL,
|
||||||
|
};
|
||||||
|
CHK(vkCreatePipelineLayout(r->device, &plci, NULL, &out->layout));
|
||||||
|
|
||||||
|
size_t spv_size = 0;
|
||||||
|
uint32_t *spv = read_spv(spv_path, &spv_size);
|
||||||
|
if (!spv) return -1;
|
||||||
|
VkShaderModuleCreateInfo smci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
|
||||||
|
.codeSize = spv_size,
|
||||||
|
.pCode = spv,
|
||||||
|
};
|
||||||
|
VkShaderModule shader;
|
||||||
|
vr = vkCreateShaderModule(r->device, &smci, NULL, &shader);
|
||||||
|
free(spv);
|
||||||
|
if (vr != VK_SUCCESS) {
|
||||||
|
fprintf(stderr, "vkCreateShaderModule(%s) = %d\n", spv_path, vr);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
VkComputePipelineCreateInfo cpci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
|
||||||
|
.stage = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
|
||||||
|
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
.module = shader,
|
||||||
|
.pName = "main",
|
||||||
|
},
|
||||||
|
.layout = out->layout,
|
||||||
|
};
|
||||||
|
vr = vkCreateComputePipelines(r->device, VK_NULL_HANDLE, 1, &cpci, NULL,
|
||||||
|
&out->pipeline);
|
||||||
|
vkDestroyShaderModule(r->device, shader, NULL);
|
||||||
|
if (vr != VK_SUCCESS) {
|
||||||
|
fprintf(stderr, "vkCreateComputePipelines = %d\n", vr); return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Single descriptor pool + set for this pipeline. */
|
||||||
|
VkDescriptorPoolSize ps = {
|
||||||
|
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.descriptorCount = n_ssbos,
|
||||||
|
};
|
||||||
|
VkDescriptorPoolCreateInfo dpci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
|
||||||
|
.maxSets = 1,
|
||||||
|
.poolSizeCount = 1,
|
||||||
|
.pPoolSizes = &ps,
|
||||||
|
};
|
||||||
|
CHK(vkCreateDescriptorPool(r->device, &dpci, NULL, &out->pool));
|
||||||
|
|
||||||
|
VkDescriptorSetAllocateInfo dsai = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
|
||||||
|
.descriptorPool = out->pool,
|
||||||
|
.descriptorSetCount = 1,
|
||||||
|
.pSetLayouts = &out->ds_layout,
|
||||||
|
};
|
||||||
|
CHK(vkAllocateDescriptorSets(r->device, &dsai, &out->desc_set));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Destroy a pipeline built by v3d_runner_create_pipeline() together
 * with its layout, descriptor pool and descriptor-set layout, then
 * zero `*p`.  Does nothing when `p` is NULL or `p->pipeline` is
 * VK_NULL_HANDLE.
 */
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p)
{
    const int live = (p != NULL) && (p->pipeline != VK_NULL_HANDLE);

    if (live) {
        VkDevice dev = r->device;

        vkDestroyPipeline(dev, p->pipeline, NULL);
        vkDestroyPipelineLayout(dev, p->layout, NULL);
        /* Destroying the pool also frees the set allocated from it. */
        vkDestroyDescriptorPool(dev, p->pool, NULL);
        vkDestroyDescriptorSetLayout(dev, p->ds_layout, NULL);
        memset(p, 0, sizeof(*p));
    }
}
|
||||||
|
|
||||||
|
int v3d_runner_bind_buffers(v3d_runner *r, v3d_pipeline *p,
|
||||||
|
const v3d_buffer *bufs, uint32_t n)
|
||||||
|
{
|
||||||
|
if (n != p->n_ssbos) {
|
||||||
|
fprintf(stderr, "bind_buffers: n=%u != pipeline n_ssbos=%u\n",
|
||||||
|
n, p->n_ssbos);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
VkDescriptorBufferInfo *bi = calloc(n, sizeof(*bi));
|
||||||
|
VkWriteDescriptorSet *wr = calloc(n, sizeof(*wr));
|
||||||
|
if (!bi || !wr) { free(bi); free(wr); return -1; }
|
||||||
|
for (uint32_t i = 0; i < n; i++) {
|
||||||
|
bi[i].buffer = bufs[i].buffer;
|
||||||
|
bi[i].offset = 0;
|
||||||
|
bi[i].range = bufs[i].size;
|
||||||
|
wr[i] = (VkWriteDescriptorSet){
|
||||||
|
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
|
||||||
|
.dstSet = p->desc_set,
|
||||||
|
.dstBinding = i,
|
||||||
|
.descriptorCount = 1,
|
||||||
|
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.pBufferInfo = &bi[i],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
vkUpdateDescriptorSets(r->device, n, wr, 0, NULL);
|
||||||
|
free(bi); free(wr);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Command buffers ------------------------------------------- */
|
||||||
|
|
||||||
|
/*
 * Allocate one primary command buffer from the runner's command pool.
 * Returns VK_NULL_HANDLE when the allocation fails.
 */
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r)
{
    VkCommandBuffer result = VK_NULL_HANDLE;
    VkCommandBufferAllocateInfo info;

    memset(&info, 0, sizeof(info));
    info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    info.commandPool = r->pool;
    info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    info.commandBufferCount = 1;

    if (vkAllocateCommandBuffers(r->device, &info, &result) != VK_SUCCESS) {
        result = VK_NULL_HANDLE;
    }
    return result;
}
|
||||||
|
|
||||||
|
/*
 * Submit the recorded command buffer `cb` to the runner's queue with
 * no fence or semaphores, then block until the queue is idle.
 *
 * Returns 0 on success.
 * NOTE(review): CHK() is defined outside this view; it presumably logs
 * and returns non-zero on a failing VkResult -- confirm.
 */
int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb)
{
    VkSubmitInfo si;

    memset(&si, 0, sizeof(si));
    si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    si.commandBufferCount = 1;
    si.pCommandBuffers = &cb;

    CHK(vkQueueSubmit(r->queue, 1, &si, VK_NULL_HANDLE));
    CHK(vkQueueWaitIdle(r->queue));
    return 0;
}
|
||||||
@@ -0,0 +1,96 @@
|
|||||||
|
/*
|
||||||
|
* v3d_runner — minimal Vulkan compute plumbing for V3D 7.1 on Pi 5.
|
||||||
|
*
|
||||||
|
* Factored out of tests/bench_vulkan_dispatch.c so successive kernel
|
||||||
|
* benches can reuse the device/queue/buffer/pipeline machinery
|
||||||
|
* without copy-paste. Kept deliberately small and concrete — no
|
||||||
|
* generality beyond what daedalus-fourier needs.
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause.
|
||||||
|
*/
|
||||||
|
#ifndef DAEDALUS_V3D_RUNNER_H
|
||||||
|
#define DAEDALUS_V3D_RUNNER_H
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <vulkan/vulkan.h>
|
||||||
|
|
||||||
|
typedef struct v3d_runner v3d_runner;
|
||||||
|
|
||||||
|
/* Host-visible SSBO. .mapped is a CPU-side pointer to .size bytes. */
|
||||||
|
typedef struct {
|
||||||
|
VkBuffer buffer; /* buffer handle; VK_NULL_HANDLE means "not created" */
|
||||||
|
VkDeviceMemory memory; /* allocation bound to `buffer` at offset 0 */
|
||||||
|
void *mapped; /* host mapping of the whole allocation, or NULL */
|
||||||
|
size_t size; /* size in bytes requested at creation */
|
||||||
|
} v3d_buffer;
|
||||||
|
|
||||||
|
/* Compute pipeline + its descriptor set (one set per pipeline). */
|
||||||
|
typedef struct {
|
||||||
|
VkPipeline pipeline; /* compute pipeline; VK_NULL_HANDLE means "not created" */
|
||||||
|
VkPipelineLayout layout; /* one set layout plus the optional push-constant range */
|
||||||
|
VkDescriptorSetLayout ds_layout; /* n_ssbos storage-buffer bindings, compute stage */
|
||||||
|
VkDescriptorPool pool; /* descriptor pool sized for exactly one set */
|
||||||
|
VkDescriptorSet desc_set; /* the single set, allocated from `pool` */
|
||||||
|
uint32_t n_ssbos; /* number of storage-buffer bindings (0..n_ssbos-1) */
|
||||||
|
uint32_t push_const_size; /* push-constant range size in bytes; 0 = none */
|
||||||
|
} v3d_pipeline;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create runner: Vulkan instance, V3D physical device, logical
|
||||||
|
* device with storageBuffer{8,16}BitAccess features enabled,
|
||||||
|
* compute queue, command pool.
|
||||||
|
*
|
||||||
|
* Returns NULL on failure (writes errors to stderr).
|
||||||
|
*/
|
||||||
|
v3d_runner *v3d_runner_create(void);
|
||||||
|
void v3d_runner_destroy(v3d_runner *r);
|
||||||
|
|
||||||
|
/* Expose a few internals for code that wants direct vkCmd*. */
|
||||||
|
VkDevice v3d_runner_device(v3d_runner *r);
|
||||||
|
VkQueue v3d_runner_queue(v3d_runner *r);
|
||||||
|
uint32_t v3d_runner_queue_family(v3d_runner *r);
|
||||||
|
VkCommandPool v3d_runner_cmd_pool(v3d_runner *r);
|
||||||
|
const char *v3d_runner_device_name(v3d_runner *r);
|
||||||
|
|
||||||
|
/* Storage buffer, HOST_VISIBLE | HOST_COHERENT, mapped on the
|
||||||
|
* host side. The mapping persists for the lifetime of the buffer.
|
||||||
|
*
|
||||||
|
* Returns 0 on success, non-zero on failure.
|
||||||
|
*/
|
||||||
|
int v3d_runner_create_buffer(v3d_runner *r, size_t size, v3d_buffer *out);
|
||||||
|
void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf);
|
||||||
|
|
||||||
|
/* Compute pipeline from a SPIR-V file path. The descriptor-set
|
||||||
|
* layout exposes `n_ssbos` storage buffer bindings at binding
|
||||||
|
* indices 0..n_ssbos-1, all visible to the compute stage. A push
|
||||||
|
* constant range of `push_const_size` bytes is added if non-zero.
|
||||||
|
*
|
||||||
|
* The single descriptor set is pre-allocated; bind buffers via
|
||||||
|
* v3d_runner_bind_buffers().
|
||||||
|
*/
|
||||||
|
int v3d_runner_create_pipeline(v3d_runner *r,
|
||||||
|
const char *spv_path,
|
||||||
|
uint32_t n_ssbos,
|
||||||
|
uint32_t push_const_size,
|
||||||
|
v3d_pipeline *out);
|
||||||
|
void v3d_runner_destroy_pipeline(v3d_runner *r, v3d_pipeline *p);
|
||||||
|
|
||||||
|
/* Bind SSBOs to the pipeline's descriptor set. `bufs` must have
|
||||||
|
* exactly `p->n_ssbos` entries, in binding order. Idempotent —
|
||||||
|
* rebind freely between dispatches if buffers change.
|
||||||
|
*/
|
||||||
|
int v3d_runner_bind_buffers(v3d_runner *r,
|
||||||
|
v3d_pipeline *p,
|
||||||
|
const v3d_buffer *bufs,
|
||||||
|
uint32_t n);
|
||||||
|
|
||||||
|
/* Allocate a primary command buffer from the runner's pool. */
|
||||||
|
VkCommandBuffer v3d_runner_alloc_cmdbuf(v3d_runner *r);
|
||||||
|
|
||||||
|
/* Submit `cb` to the queue and wait for completion. The classic
|
||||||
|
* timed operation. Returns 0 on success.
|
||||||
|
*/
|
||||||
|
int v3d_runner_submit_wait(v3d_runner *r, VkCommandBuffer cb);
|
||||||
|
|
||||||
|
#endif /* DAEDALUS_V3D_RUNNER_H */
|
||||||
@@ -0,0 +1,334 @@
|
|||||||
|
/*
|
||||||
|
* Phase 6 — first-light QPU bench for VP9 8×8 DCT_DCT IDCT add on V3D 7.1.
|
||||||
|
*
|
||||||
|
* Reports:
|
||||||
|
* M1' (correctness): bit-exact rate, QPU output vs C reference,
|
||||||
|
* across N synthetic blocks.
|
||||||
|
* M2 (throughput): QPU sustained throughput in Mblock/s over K dispatched frames.
|
||||||
|
*
|
||||||
|
* Compares against M3 (bench_neon_idct) to compute R = M2 / M3.
|
||||||
|
* Decision rules per docs/phase1.md §"Decision rules".
|
||||||
|
*
|
||||||
|
* License: BSD-2-Clause. Links statically against the LGPL-2.1+
|
||||||
|
* vp9_idct8_ref.c (a clean-room transcription from spec), so this
|
||||||
|
* binary distributes under BSD-2-Clause-or-later if separated; left
|
||||||
|
* as LGPL-2.1+ when linked together.
|
||||||
|
*/
|
||||||
|
#define _POSIX_C_SOURCE 200809L
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <vulkan/vulkan.h>
|
||||||
|
|
||||||
|
#include "v3d_runner.h"
|
||||||
|
|
||||||
|
/* C bit-exact reference from tests/vp9_idct8_ref.c. */
|
||||||
|
extern void daedalus_vp9_idct_idct_8x8_add_ref(
|
||||||
|
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||||
|
|
||||||
|
/* ---- RNG (matches bench_neon_idct.c shape for reproducibility) -- */
|
||||||
|
|
||||||
|
/* Generator state.  Must be seeded non-zero: a zero state only ever
 * produces zero. */
static uint64_t xs64_state;

/*
 * Advance the xorshift generator (shift triple 13 / 7 / 17) by one
 * step and return the new 64-bit state.
 */
static inline uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs64_state = s;
    return s;
}
|
||||||
|
|
||||||
|
/*
 * Fill `block` with a sparse synthetic coefficient set and return its
 * end-of-block value.
 *
 * The block is zeroed, then 1..16 random positions are each assigned a
 * random coefficient in [-4096, 4095].  Positions may repeat (the later
 * write wins) and a coefficient may come out as zero.  The return value
 * is one past the highest position written, and is never below 1.
 */
static int gen_block(int16_t block[64])
{
    memset(block, 0, 64 * sizeof(*block));

    int last = 0;
    const int count = 1 + (int)(xs64() % 16);

    for (int k = count; k > 0; k--) {
        /* Draw order matters for reproducibility: position first,
         * then the coefficient value. */
        const int where = (int)(xs64() % 64);
        const int16_t value = (int16_t)((int)(xs64() % 8192) - 4096);

        block[where] = value;
        if (where >= last) {
            last = where + 1;
        }
    }

    return (last == 0) ? 1 : last;
}
|
||||||
|
|
||||||
|
/* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double. */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = stamp.tv_sec;
    const double frac = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
|
||||||
|
|
||||||
|
/* ---- Push-constant layout — must match src/v3d_idct8.comp ------- */
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint32_t n_blocks; /* total number of 8x8 blocks in the dispatch */
|
||||||
|
uint32_t blocks_per_row; /* frame width in 8x8 blocks */
|
||||||
|
uint32_t dst_stride_u8; /* destination row stride, in bytes */
|
||||||
|
uint32_t _pad; /* always written as 0; presumably pads the struct to 16 bytes -- confirm against the shader */
|
||||||
|
} push_consts;
|
||||||
|
|
||||||
|
/* ---- Main ------------------------------------------------------- */
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
/* Default synthetic frame: 128×128 pixels = 16×16 blocks = 256
|
||||||
|
* blocks. Small enough for fast bring-up; large enough that the
|
||||||
|
* 4-blocks/WG geometry gets exercised (64 WGs). */
|
||||||
|
int blocks_per_row = 16;
|
||||||
|
int rows_of_blocks = 16;
|
||||||
|
int iters = 100;
|
||||||
|
uint64_t seed = 0;
|
||||||
|
const char *spv_path = "v3d_idct8.spv";
|
||||||
|
int verify_only = 0;
|
||||||
|
int max_mismatch_print = 4;
|
||||||
|
|
||||||
|
static struct option opts[] = {
|
||||||
|
{"width", required_argument, 0, 'w'},
|
||||||
|
{"height", required_argument, 0, 'h'},
|
||||||
|
{"iters", required_argument, 0, 'i'},
|
||||||
|
{"seed", required_argument, 0, 's'},
|
||||||
|
{"spv", required_argument, 0, 'S'},
|
||||||
|
{"verify-only", no_argument, 0, 'V'},
|
||||||
|
{0,0,0,0}
|
||||||
|
};
|
||||||
|
for (int c; (c = getopt_long(argc, argv, "w:h:i:s:S:V", opts, 0)) != -1;) {
|
||||||
|
switch (c) {
|
||||||
|
case 'w': blocks_per_row = atoi(optarg) / 8; break;
|
||||||
|
case 'h': rows_of_blocks = atoi(optarg) / 8; break;
|
||||||
|
case 'i': iters = atoi(optarg); break;
|
||||||
|
case 's': seed = strtoull(optarg, 0, 0); break;
|
||||||
|
case 'S': spv_path = optarg; break;
|
||||||
|
case 'V': verify_only = 1; break;
|
||||||
|
default: return 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int dst_width = blocks_per_row * 8;
|
||||||
|
int dst_height = rows_of_blocks * 8;
|
||||||
|
int dst_stride = dst_width; /* tightly packed */
|
||||||
|
size_t n_blocks = (size_t)blocks_per_row * rows_of_blocks;
|
||||||
|
size_t dst_bytes = (size_t)dst_height * dst_stride;
|
||||||
|
|
||||||
|
printf("=== v3d IDCT8 first-light ===\n");
|
||||||
|
printf(" frame: %dx%d (%dx%d blocks, %zu blocks total)\n",
|
||||||
|
dst_width, dst_height, blocks_per_row, rows_of_blocks, n_blocks);
|
||||||
|
printf(" spv: %s\n", spv_path);
|
||||||
|
printf(" iters: %d (for throughput phase)\n", iters);
|
||||||
|
|
||||||
|
xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;
|
||||||
|
|
||||||
|
/* ---- Init runner ---- */
|
||||||
|
v3d_runner *r = v3d_runner_create();
|
||||||
|
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
|
||||||
|
printf(" device: %s\n", v3d_runner_device_name(r));
|
||||||
|
|
||||||
|
/* ---- Buffers ---- */
|
||||||
|
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||||
|
if (v3d_runner_create_buffer(r, n_blocks * 64 * sizeof(int16_t), &buf_coeffs)) return 1;
|
||||||
|
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
|
||||||
|
if (v3d_runner_create_buffer(r, n_blocks * 2 * sizeof(uint32_t), &buf_meta)) return 1;
|
||||||
|
|
||||||
|
/* Fill master inputs — these stay constant across iterations. */
|
||||||
|
int16_t *master_coeffs = malloc(n_blocks * 64 * sizeof(int16_t));
|
||||||
|
uint8_t *master_pred = malloc(dst_bytes);
|
||||||
|
uint8_t *expected_dst = malloc(dst_bytes); /* C-reference output */
|
||||||
|
int *eobs = malloc(n_blocks * sizeof(int));
|
||||||
|
if (!master_coeffs || !master_pred || !expected_dst || !eobs) return 1;
|
||||||
|
|
||||||
|
for (size_t b = 0; b < n_blocks; b++)
|
||||||
|
eobs[b] = gen_block(master_coeffs + b * 64);
|
||||||
|
for (size_t i = 0; i < dst_bytes; i++)
|
||||||
|
master_pred[i] = (uint8_t)(xs64() & 0xff);
|
||||||
|
|
||||||
|
/* Build the expected (C-reference) output frame. The C ref
|
||||||
|
* mutates its input block (zeros it after column pass), so we
|
||||||
|
* work on copies. */
|
||||||
|
memcpy(expected_dst, master_pred, dst_bytes);
|
||||||
|
int16_t scratch[64];
|
||||||
|
for (size_t b = 0; b < n_blocks; b++) {
|
||||||
|
int bx = (int)(b % blocks_per_row);
|
||||||
|
int by = (int)(b / blocks_per_row);
|
||||||
|
memcpy(scratch, master_coeffs + b * 64, sizeof(scratch));
|
||||||
|
daedalus_vp9_idct_idct_8x8_add_ref(
|
||||||
|
expected_dst + by * 8 * dst_stride + bx * 8,
|
||||||
|
dst_stride, scratch, eobs[b]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Populate GPU buffers. */
|
||||||
|
memcpy(buf_coeffs.mapped, master_coeffs, buf_coeffs.size);
|
||||||
|
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
|
||||||
|
uint32_t *meta = (uint32_t *) buf_meta.mapped;
|
||||||
|
for (size_t b = 0; b < n_blocks; b++) {
|
||||||
|
meta[2*b + 0] = (uint32_t)(b % blocks_per_row); /* block_x_8 */
|
||||||
|
meta[2*b + 1] = (uint32_t)(b / blocks_per_row); /* block_y_8 */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- Pipeline ---- */
|
||||||
|
v3d_pipeline pipe = {0};
|
||||||
|
if (v3d_runner_create_pipeline(r, spv_path,
|
||||||
|
/*n_ssbos=*/3,
|
||||||
|
/*push_const_size=*/sizeof(push_consts),
|
||||||
|
&pipe)) return 1;
|
||||||
|
|
||||||
|
v3d_buffer bind_bufs[3] = { buf_coeffs, buf_dst, buf_meta };
|
||||||
|
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
|
||||||
|
|
||||||
|
/* ---- Dispatch geometry ---- */
|
||||||
|
/* v4: 32 blocks per WG (2 per 16-lane subgroup × 16 subgroups).
|
||||||
|
* 4× v2's count — more in-flight work per WG for latency hiding. */
|
||||||
|
const uint32_t blocks_per_wg = 32;
|
||||||
|
uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1)
|
||||||
|
/ blocks_per_wg);
|
||||||
|
printf(" dispatch: %u WGs × 64 invocations = %u blocks (rounded up from %zu)\n",
|
||||||
|
group_count_x, group_count_x * blocks_per_wg, n_blocks);
|
||||||
|
|
||||||
|
push_consts pc = {
|
||||||
|
.n_blocks = (uint32_t)n_blocks,
|
||||||
|
.blocks_per_row = (uint32_t)blocks_per_row,
|
||||||
|
.dst_stride_u8 = (uint32_t)dst_stride,
|
||||||
|
._pad = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Record once, reuse for every iteration. */
|
||||||
|
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||||||
|
if (cb == VK_NULL_HANDLE) return 1;
|
||||||
|
VkCommandBufferBeginInfo cbbi = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
|
||||||
|
};
|
||||||
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||||||
|
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||||||
|
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
0, sizeof(pc), &pc);
|
||||||
|
vkCmdDispatch(cb, group_count_x, 1, 1);
|
||||||
|
vkEndCommandBuffer(cb);
|
||||||
|
|
||||||
|
/* ---- M1': bit-exact verification (first dispatch only) ---- */
|
||||||
|
printf("\n=== M1': QPU vs C-reference bit-exact ===\n");
|
||||||
|
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
|
||||||
|
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||||
|
|
||||||
|
int mismatch_blocks = 0;
|
||||||
|
int total_byte_diffs = 0;
|
||||||
|
for (size_t b = 0; b < n_blocks; b++) {
|
||||||
|
int bx = (int)(b % blocks_per_row);
|
||||||
|
int by = (int)(b / blocks_per_row);
|
||||||
|
const uint8_t *qpu_block = (uint8_t *)buf_dst.mapped
|
||||||
|
+ by * 8 * dst_stride + bx * 8;
|
||||||
|
const uint8_t *ref_block = expected_dst
|
||||||
|
+ by * 8 * dst_stride + bx * 8;
|
||||||
|
int block_diffs = 0;
|
||||||
|
for (int r0 = 0; r0 < 8; r0++)
|
||||||
|
for (int c = 0; c < 8; c++)
|
||||||
|
if (qpu_block[r0 * dst_stride + c]
|
||||||
|
!= ref_block[r0 * dst_stride + c]) {
|
||||||
|
block_diffs++;
|
||||||
|
total_byte_diffs++;
|
||||||
|
}
|
||||||
|
if (block_diffs > 0 && mismatch_blocks < max_mismatch_print) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"MISMATCH block %zu @ (bx=%d by=%d) eob=%d: %d/64 bytes differ\n",
|
||||||
|
b, bx, by, eobs[b], block_diffs);
|
||||||
|
fprintf(stderr, " ref:");
|
||||||
|
for (int r0 = 0; r0 < 8; r0++) {
|
||||||
|
fprintf(stderr, "\n r%d ", r0);
|
||||||
|
for (int c = 0; c < 8; c++)
|
||||||
|
fprintf(stderr, "%3u ", ref_block[r0 * dst_stride + c]);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n qpu:");
|
||||||
|
for (int r0 = 0; r0 < 8; r0++) {
|
||||||
|
fprintf(stderr, "\n r%d ", r0);
|
||||||
|
for (int c = 0; c < 8; c++)
|
||||||
|
fprintf(stderr, "%3u ", qpu_block[r0 * dst_stride + c]);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
if (block_diffs > 0) mismatch_blocks++;
|
||||||
|
}
|
||||||
|
printf(" blocks bit-exact: %zu / %zu (%.4f%%)\n",
|
||||||
|
n_blocks - mismatch_blocks, n_blocks,
|
||||||
|
100.0 * (n_blocks - mismatch_blocks) / n_blocks);
|
||||||
|
printf(" total byte diffs: %d / %zu (%.4f%%)\n",
|
||||||
|
total_byte_diffs, n_blocks * 64,
|
||||||
|
100.0 * total_byte_diffs / (n_blocks * 64));
|
||||||
|
|
||||||
|
if (mismatch_blocks > 0) {
|
||||||
|
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
|
||||||
|
v3d_runner_destroy_pipeline(r, &pipe);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_coeffs);
|
||||||
|
v3d_runner_destroy(r);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verify_only) {
|
||||||
|
v3d_runner_destroy_pipeline(r, &pipe);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_coeffs);
|
||||||
|
v3d_runner_destroy(r);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---- M2: throughput ---- */
|
||||||
|
printf("\n=== M2: QPU throughput ===\n");
|
||||||
|
|
||||||
|
/* Warm-up. */
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
|
||||||
|
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
double t0 = now_seconds();
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
|
||||||
|
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||||||
|
}
|
||||||
|
double t1 = now_seconds();
|
||||||
|
|
||||||
|
/* Setup-only timing for memcpy subtraction. */
|
||||||
|
double s0 = now_seconds();
|
||||||
|
for (int i = 0; i < iters; i++) {
|
||||||
|
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
|
||||||
|
}
|
||||||
|
double s1 = now_seconds();
|
||||||
|
|
||||||
|
double total_seconds = (t1 - t0) - (s1 - s0);
|
||||||
|
double total_blocks = (double) n_blocks * iters;
|
||||||
|
double mblocks_s = total_blocks / total_seconds / 1e6;
|
||||||
|
|
||||||
|
printf(" blocks/dispatch: %zu\n", n_blocks);
|
||||||
|
printf(" iters: %d\n", iters);
|
||||||
|
printf(" total blocks: %.0f\n", total_blocks);
|
||||||
|
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
|
||||||
|
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
|
||||||
|
printf(" M2 throughput = %.3f Mblock/s\n", mblocks_s);
|
||||||
|
printf(" per-block = %.1f ns\n",
|
||||||
|
total_seconds / total_blocks * 1e9);
|
||||||
|
printf(" per-dispatch = %.1f us\n",
|
||||||
|
total_seconds / iters * 1e6);
|
||||||
|
|
||||||
|
/* R = M2 / M3 = M2 / 8.171 Mblock/s (Phase 3 baseline). */
|
||||||
|
double M3 = 8.171;
|
||||||
|
double R = mblocks_s / M3;
|
||||||
|
printf("\n Phase 3 NEON M3 = %.3f Mblock/s\n", M3);
|
||||||
|
printf(" R = M2 / M3 = %.3f\n", R);
|
||||||
|
if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
|
||||||
|
else if (R >= 0.5) printf(" decision band = YELLOW: concurrent-work hypothesis viable\n");
|
||||||
|
else if (R >= 0.1) printf(" decision band = ORANGE: material loss; honest close suggested\n");
|
||||||
|
else printf(" decision band = RED: structural mismatch\n");
|
||||||
|
|
||||||
|
v3d_runner_destroy_pipeline(r, &pipe);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_meta);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(r, &buf_coeffs);
|
||||||
|
v3d_runner_destroy(r);
|
||||||
|
free(master_coeffs); free(master_pred); free(expected_dst); free(eobs);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user