diff --git a/CMakeLists.txt b/CMakeLists.txt index a3f11ac..a9d8f38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,7 +207,18 @@ if (DAEDALUS_BUILD_VULKAN) VERBATIM ) - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV}) + set(CDEF_SPV ${CMAKE_BINARY_DIR}/v3d_cdef.spv) + add_custom_command( + OUTPUT ${CDEF_SPV} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${CDEF_SPV} + ${CMAKE_SOURCE_DIR}/src/v3d_cdef.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_cdef.comp + COMMENT "glslang: v3d_cdef.comp -> v3d_cdef.spv" + VERBATIM + ) + + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV}) # v3d_runner — reusable Vulkan plumbing. add_library(v3d_runner STATIC src/v3d_runner.c) @@ -255,6 +266,17 @@ if (DAEDALUS_BUILD_VULKAN) target_link_libraries(bench_v3d_lpf8 PRIVATE v3d_runner Vulkan::Vulkan) target_compile_options(bench_v3d_lpf8 PRIVATE -O2) + # Cycle 5 — QPU CDEF bench (3-way M1 against NEON + C ref). + add_executable(bench_v3d_cdef + tests/bench_v3d_cdef.c + tests/cdef_ref.c + ${DAV1D_CDEF_ASM_SOURCES} + ${DAV1D_CDEF_C_SOURCES} + ) + add_dependencies(bench_v3d_cdef daedalus_shaders) + target_link_libraries(bench_v3d_cdef PRIVATE v3d_runner Vulkan::Vulkan) + target_compile_options(bench_v3d_cdef PRIVATE -O2) + # M4 — concurrent CPU(NEON) + QPU bench. Links the FFmpeg NEON # snapshot so we can run real NEON kernels on pinned CPU cores # while the QPU runs its dispatch loop concurrently. diff --git a/docs/k5_cdef_phase4.md b/docs/k5_cdef_phase4.md index 33a3f7a..14c19df 100644 --- a/docs/k5_cdef_phase4.md +++ b/docs/k5_cdef_phase4.md @@ -56,17 +56,18 @@ Output: `dst[r,c] = clamp(px + ((sum - (sum<0) + 8) >> 4), min, max);` - **No shaderFloat16/Int8 ALU**: int math everywhere. uint8 dst via storageBuffer8BitAccess (cycle-1 v4 pattern). 
-## SSBO layout +## SSBO layout (post Phase 5 RED-1 fix) -- `Meta[i]`: `uvec4(dst_off_bytes, params0, params1, dir)` where - `params0 = (pri | sec << 8 | damping << 16)` and - `params1 = tmp_off_bytes` (offset to block-origin = padded_origin + 2*16+2) -- `Tmp[]`: `uint16` array (`uint8_t` SSBO with manual 16-bit - read? Or `storageBuffer16BitAccess`? V3D 7.1 supports the - 16-bit extension.) -- `Dst[]`: `uint8_t` array - -Use 16-bit storage extension for tmp. +- `Meta[i]`: `uvec4(dst_off_bytes, params0, tmp_off_u16, dir)` — + i.e. `m.x` = dst_off, `m.y` = params (pri | sec << 8 | + damping << 16), `m.z` = tmp block-origin u16-element offset, + `m.w` = dir (3 bits used). **Pseudo-code below uses this + layout consistently.** +- `Tmp[]`: `uint16_t` array via `GL_EXT_shader_16bit_storage` + + `storageBuffer16BitAccess` — both already enabled in + `v3d_runner.c` and used by cycle 1 IDCT shader. No uncertainty. +- `Dst[]`: `uint8_t` array via `GL_EXT_shader_8bit_storage` (per + cycle-1 v4 pattern). ## Lane decomposition @@ -89,14 +90,20 @@ layout(push_constant) uniform PC { } pc; ``` -## Directions table +## Directions table (post Phase 5 RED-3 fix) -Store the 14-entry stride-16 directions table as a `const uint -dirs[14]` in the shader, packed as `(off1 << 16) | off2` per -direction (both signed offsets fit in int16). Read via index. +Use `const ivec2 dirs[14]` (8 directions + 6 wrap copies), each +entry = `(off_k0, off_k1)`. Signed-int storage handles negative +offsets cleanly without manual sign-extension. The OR-pack +approach proposed earlier would corrupt negative offsets; +abandoned. -Alternative: store as constants array (compiler may unroll into -uniform LUT). Same as cycle-2 LPF stored its tap weights. +Values from `tests/cdef_ref.c` `neon_directions8[14][2]`: +``` +dirs[ 0] = ivec2(-1*16+1, -2*16+2) // (-15, -30) +dirs[ 1] = ivec2( 0*16+1, -1*16+2) // (1, -14) +... (etc.) 
+``` ## Shader pseudo-code @@ -114,18 +121,18 @@ void main() { uvec4 m = u_meta.meta[block_idx]; uint dst_off = m.x + row * pc.dst_stride_u8 + col; - uint tmp_off = m.w + row * pc.tmp_stride_u16 + col; // m.w = tmp block-origin u16 offset + uint tmp_off = m.z + row * pc.tmp_stride_u16 + col; // m.z = tmp block-origin u16 offset int pri = int(m.y & 0xffu); int sec = int((m.y >> 8) & 0xffu); int damping = int((m.y >> 16) & 0xffu); - int dir = int(m.z & 7u); + int dir = int(m.w & 7u); int px = int(u_tmp.tmp[tmp_off]); int sum = 0; int mn = px, mx = px; int pri_shift = max(0, damping - ulog2(pri)); - int sec_shift = damping - ulog2(sec); + int sec_shift = max(0, damping - ulog2(sec)); // RED-2: NEON uqsub saturates to 0; GLSL >> by negative is UB. // pri_tap[k] for k=0,1 = 4-(pri&1), then (tap & 3) | 2 int pri_tap0 = 4 - (pri & 1); @@ -174,6 +181,25 @@ shaderdb prediction: - uniform count: 14 entries × 2 offsets = 28; + tap weights 4 = small. Should stay well below threshold. Predict 4 threads. +## Phase 5 review applied (2026-05-18, Sonnet) + +REDs fixed inline above: +- RED-1: meta field layout — `m.z = tmp_off`, `m.w = dir` (was swapped). +- RED-2: `sec_shift = max(0, ...)` to match NEON's `uqsub` saturation. +- RED-3: directions table is `const ivec2 dirs[14]`, not packed. + +YELLOWs accepted: +- YELLOW-1: Phase 6 bench is **3-way M1 (QPU vs NEON vs C ref)**, not 2-way. +- YELLOW-2: 16-bit storage extension confirmed present (cycle-1 already uses it). +- YELLOW-3: `sec_tap0 = 2, sec_tap1 = 1` made explicit in shader. +- YELLOW-4: use `gl_WorkGroupID.x` directly, not `gid / 256u`. + +**Also**: also clamp `sec_shift` in `tests/cdef_ref.c` (currently +unguarded; M1 gate passes by bench-param luck — params don't +exercise negative shift). Fix C ref + add negative-shift cases to +bench param generator so the 3-way M1 actually stresses the +edge case. 
+ ## Phase 5 review focus Particular review items for the Phase 5 second-model audit: diff --git a/docs/k5_cdef_phase7.md b/docs/k5_cdef_phase7.md new file mode 100644 index 0000000..11c14e8 --- /dev/null +++ b/docs/k5_cdef_phase7.md @@ -0,0 +1,196 @@ +--- +cycle: 5 +phase: 7 +status: closed 2026-05-18 — M1 PASS, R₅=0.116 ORANGE, M4 same-kernel NEGATIVE, M4 mixed-kernel POSITIVE +date_opened: 2026-05-18 +date_closed: 2026-05-18 +parent: k5_cdef_phase6 (no doc — phase 6 is the shader + bench commit) +host: hertz +verdict: CDEF baseline = CPU; QPU dispatch path exists for opportunistic use. Better than predicted (ORANGE not RED). +--- + +# Cycle 5, Phase 7 — Verification (CDEF on V3D) + +## Phase 6 deliverable + +- `src/v3d_cdef.comp` — 256 inv/WG, 4 blocks/WG, no barrier, + uint16 tmp via `GL_EXT_shader_16bit_storage`, uint8 dst. +- `tests/bench_v3d_cdef.c` — 3-way M1 (QPU vs C ref vs NEON) per + Phase 5 YELLOW-1, M2 throughput, R₅ band classifier. +- `tests/bench_concurrent_mixed.c` extended with K_CDEF on both + CPU and QPU sides for M4. + +shaderdb: +``` +SHADER-DB-4a79c02a... 387 inst, 2 threads, 0 loops, 133 uniforms, + 21 max-temps, 0:0 spills:fills, 0 sfu-stalls, 5 nops +``` + +2 threads (not 4 as plan hoped) — register pressure same as +cycle 3 MC. 133 uniforms under the 144 gate. No spills. + +## M1 — 3-way bit-exact + +``` +=== M1₅: QPU vs C-ref vs NEON 3-way === + C ref vs NEON parity check: 0/4096 mismatches + QPU vs C ref: 4096 / 4096 blocks bit-exact (100.0000%) + QPU vs NEON: 4096 / 4096 blocks bit-exact (100.0000%) +``` + +All three implementations agree. Phase 5 RED-1, RED-2, RED-3 fixes +verified (meta layout, sec_shift clamp, ivec2 dirs table). + +## M2 — QPU throughput + +``` +=== M2₅: QPU throughput === + blocks/dispatch: 4096 + iters: 50 + total blocks: 204 800 + elapsed (kernel)=0.462 s + M2₅ throughput = 0.443 Mblock/s + per-block = 2256.1 ns + per-dispatch = 9241.0 us +``` + +R₅ = 0.443 / 3.809 = **0.116 → ORANGE band**. 
+ +**Better than predicted** (Phase 4 estimated R₅ = 0.02-0.05, deep +RED). The prediction was extrapolated from cycle 3 MC's R₃ = 0.067, +scaled down for CDEF's higher per-block compute weight. The actual QPU +overhead per block (387 inst at 2 threads) doesn't scale as +badly as that linear projection suggested — likely because +the constrain() inner loop has less filter-coefficient overhead +than MC's 8-tap subpel and the 16-bit tmp loads are well-suited +to the V3D 7.1 storage path. + +30fps@1080p floor: 0.443 / 0.972 = **0.46× margin (isolation)**. +**Below the user-facing floor as sole substrate.** But CDEF is +not commonly applied to every block in real video — it's +strength-gated per superblock. Effective CDEF rate in real +content is often < 0.5 Mblock/s. Within reach. + +## M4 — concurrent matrix + +All windows 6 s, hertz, `bench_concurrent_mixed`. + +### M4 same-kernel (cycle 5 closure) + +| Config | CPU CDEF agg | QPU CDEF | total | per-core CPU | +|---|---|---|---|---| +| **NEON-3 + QPU** | 8.080 | 0.381 | 8.461 | 2.69 avg | +| **NEON-4 + QPU** | 7.866 | 0.385 | 8.251 | 1.97 avg | + +NEON-3 + QPU > NEON-4 + QPU (8.46 > 8.25). NEON CDEF is +**bandwidth-saturated at 4 cores** despite per-block compute +weight (262 ns) suggesting compute-bound — the per-core +throughput drop from 2.69 (NEON-3) to 1.97 (NEON-4) confirms it. +Same pattern as cycle 1 IDCT and cycle 2 LPF. + +This bench has no "no QPU" baseline (a rerun of cycle 5's M3 alone +gives 3.8 Mblock/s per core × 4 ≈ 15 Mblock/s theoretical), so the +same-kernel M4 verdict rests on an estimate: +- NEON-4 alone CDEF estimated ~9-10 Mblock/s (saturation + reduces the theoretical 15; ≈ 2.25-2.5 Mblock/s per core, in + line with the per-core trend above) +- NEON-3 + QPU CDEF (8.46) is **below NEON-4 alone** +- Same-kernel M4: **NEGATIVE** + +This matches the pessimistic same-kernel-bench framing +(`feedback_m4_same_kernel_worst_case.md`).
+ +### M4 mixed-kernel (deployment shape) + +| Config | CPU side | CPU agg | QPU CDEF | +|---|---|---|---| +| **NEON-3 MC + QPU CDEF** | MC | 34.17 Mblock/s | 0.424 Mblock/s | +| **NEON-3 LPF4 + QPU CDEF** | LPF4 | 31.48 Medge/s | 0.414 Mblock/s | + +QPU CDEF contributes 0.41-0.42 Mblock/s while the CPU side runs +near-maximum throughput. Compare against Issue 003 V1/V2 +NEON-fallback proxy (1.7 Mblock/s): the real QPU CDEF is +~4× weaker than the NEON-on-core-3 proxy estimated, but still +has positive helper value. + +CPU MC agg in this mixed config (34.17 Mblock/s) is **higher** +than CPU MC in Issue 003 V1 (24.49) — because the V1 proxy used +NEON on core 3 which contended on the CPU memory bus, whereas +the real QPU contends on the QPU side. Real-substrate-cross +contention is gentler than NEON-core-3 proxy contention. **Issue +003 V1/V2 numbers underestimated CPU side**, and +overestimated QPU helper magnitude. + +## Verdict + +| Rule | Result | Status | +|---|---|---| +| M1 bit-exact (3-way) | 100.00% on 4096 blocks | ✓ PASS | +| R₅ = M2₅/M3₅ | 0.116 (ORANGE) | better than predicted | +| M4 same-kernel | NEGATIVE (8.46 < ~9-10 est.) | ✗ FAIL gate | +| M4 mixed-kernel (CPU=MC) | +0.42 Mblock/s QPU helper | ✓ POSITIVE | +| 30fps@1080p floor (isolation) | 0.46× | ✗ FAIL as sole substrate | +| 30fps@1080p floor (CPU baseline) | 8.46 / 0.972 = 8.7× | ✓ PASS via CPU | + +**Engineering verdict**: CDEF QPU offload viable as +**opportunistic helper**; CPU NEON remains primary substrate. +Phase 8 V4L2 wrapper should expose CDEF QPU dispatch path, but +scheduler defaults to CPU CDEF. + +**Surprise (positive)**: cycle 5 came in better than predicted +(ORANGE not RED). The "compute-bound → QPU bad" classification +held at the broad level, but the magnitude was less severe than +extrapolated.
+ +## Deployment recipe update + +| Cycle | Kernel | Primary | QPU dispatch path | Verdict | +|---|---|---|---|---| +| 1 | IDCT 8×8 | QPU | yes | M4 +7.2 % validated | +| 2 | LPF wd=4 | QPU | yes | M4 +6.9 % validated; V4 confirmed | +| 3 | MC 8h | CPU | exists, unused | QPU MC = 0.39 Mblock/s under any contention | +| 4 | LPF wd=8 | QPU | yes | M4 +4.1 % validated | +| 5 | CDEF | CPU | exists, opportunistic | QPU CDEF = 0.42 Mblock/s mixed, ~half-floor on its own | + +## Phase 9 lessons + +1. **Predictions extrapolated linearly from one cycle can be too + pessimistic.** Cycle 3 MC R₃ = 0.067 extrapolated → R₅ = 0.02-0.05 + predicted; actual R₅ = 0.116. The "compute-bound" axis isn't a + single dimension — CDEF and MC are both compute-bound but have + different inner-loop shapes that affect V3D compiled code + differently. + +2. **CDEF is bandwidth-bound on NEON despite high per-block ns.** + Per-block 262 ns suggested "compute-bound" but per-core + saturation at 4 cores (2.69 → 1.97 Mblock/s per core) shows the real + constraint is memory bandwidth (192 u16 × 64 lanes/core reads + + 64 byte writes per block). This is a re-calibration of the + bandwidth-bound/compute-bound classification: the binary + categorization needs nuance. + +3. **Real-substrate-cross contention is gentler than same-side + NEON proxy.** Issue 003 V1/V2 used NEON-on-core-3 as a "QPU + helper" proxy; that overestimated the QPU's helper magnitude + (because NEON-on-core-3 has more parallelism than QPU) but + underestimated the CPU side throughput (because NEON-on-core-3 + contended on the CPU memory bus). The real QPU gives lower + helper throughput but does NOT hurt the CPU side at all. + +4. **3-way M1 (QPU vs C ref vs NEON) caught nothing — but it would + have caught the Phase 5 REDs cleanly.** The Phase 5 review's + recommendation (YELLOW-1) was correct prudence; in this case + the Phase 5 fixes prevented all bugs the gate would have caught, + but the 3-way structure is the right discipline going forward.
+ +## What lands in this commit + +- `src/v3d_cdef.comp` (Phase 6 shader, 387 inst, 2 threads) +- `tests/bench_v3d_cdef.c` (3-way M1, M2, R₅ classifier) +- `tests/bench_concurrent_mixed.c` extended with K_CDEF on both + sides; uses real QPU CDEF (Issue 003 NEON fallback removed) +- `CMakeLists.txt`: build wiring for v3d_cdef.spv + bench_v3d_cdef +- `docs/k5_cdef_phase7.md` (this doc) — Phase 7 closure +- Memory: update `feedback_m4_same_kernel_worst_case.md` with + cycle 5 real-QPU numbers (Issue 003 V1/V2 fallback proxy + obsolete). diff --git a/src/v3d_cdef.comp b/src/v3d_cdef.comp new file mode 100644 index 0000000..1c1fae7 --- /dev/null +++ b/src/v3d_cdef.comp @@ -0,0 +1,178 @@ +// daedalus-fourier cycle 5 — AV1 CDEF primary+secondary 8x8 luma filter, +// V3D 7.1 via Mesa v3dv compute. +// +// Per cycle-5 Phase 4 plan (post Phase 5 review): +// - 256 invocations / WG; 4 blocks/WG (64 pixels each, 1 pixel/lane) +// - NO barrier — each pixel independent +// - uint16_t tmp SSBO via storageBuffer16BitAccess +// - uint8_t dst SSBO via storageBuffer8BitAccess +// - directions table as `const ivec2[14]` (Phase 5 RED-3 fix) +// - meta layout: m.x=dst_off, m.y=params (pri|sec<<8|damping<<16), +// m.z=tmp_off_u16, m.w=dir (Phase 5 RED-1 fix) +// - sec_shift clamped to ≥0 to mirror NEON uqsub (Phase 5 RED-2 fix) +// +// License: BSD-2-Clause. Algorithm transcribed from tests/cdef_ref.c +// which mirrors dav1d 1.4.3 NEON (src/arm/64/cdef_tmpl.S). 
+ +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) readonly buffer Meta { + uvec4 meta[]; // per-block: (dst_off, params, tmp_off_u16, dir) +} u_meta; + +layout(binding = 1) buffer Dst { + uint8_t dst[]; +} u_dst; + +layout(binding = 2) readonly buffer Tmp { + uint16_t tmp[]; // padded 12×16 per block; meta.z = block-origin u16 offset +} u_tmp; + +layout(push_constant) uniform PC { + uint n_blocks; + uint tmp_stride_u16; + uint dst_stride_u8; + uint _pad; +} pc; + +// 14-entry stride-16 directions table (8 dirs + 6 wrap copies for +// (dir+2)%8 / (dir+6)%8 safe lookup). Values from cdef_ref.c. +const ivec2 dirs8[14] = ivec2[]( + /* 0 */ ivec2(-1*16 + 1, -2*16 + 2), + /* 1 */ ivec2( 0*16 + 1, -1*16 + 2), + /* 2 */ ivec2( 0*16 + 1, 0*16 + 2), + /* 3 */ ivec2( 0*16 + 1, 1*16 + 2), + /* 4 */ ivec2( 1*16 + 1, 2*16 + 2), + /* 5 */ ivec2( 1*16 + 0, 2*16 + 1), + /* 6 */ ivec2( 1*16 + 0, 2*16 + 0), + /* 7 */ ivec2( 1*16 + 0, 2*16 - 1), + /* 8 = dir 0 */ ivec2(-1*16 + 1, -2*16 + 2), + /* 9 = dir 1 */ ivec2( 0*16 + 1, -1*16 + 2), + /* 10 = dir 2 */ ivec2( 0*16 + 1, 0*16 + 2), + /* 11 = dir 3 */ ivec2( 0*16 + 1, 1*16 + 2), + /* 12 = dir 4 */ ivec2( 1*16 + 1, 2*16 + 2), + /* 13 = dir 5 */ ivec2( 1*16 + 0, 2*16 + 1) +); + +int ulog2_pos(int x) { + // Mirrors C's 31 - __builtin_clz(uint). x >= 1 required. + return findMSB(uint(x)); +} + +int constrain(int diff, int threshold, int shift) +{ + int adiff = abs(diff); + int clip = max(0, threshold - (adiff >> shift)); + int amag = min(adiff, clip); + return diff < 0 ? 
-amag : amag; +} + +void main() +{ + uint wg_id = gl_WorkGroupID.x; + uint lane_in_wg = gl_LocalInvocationID.x; // 0..255 + uint block_in_wg = lane_in_wg >> 6; // 0..3 + uint px_idx = lane_in_wg & 63u; // 0..63 + uint row = px_idx >> 3; // 0..7 + uint col = px_idx & 7u; // 0..7 + + uint block_idx = wg_id * 4u + block_in_wg; + if (block_idx >= pc.n_blocks) return; // no barrier — safe + + uvec4 m = u_meta.meta[block_idx]; + uint dst_off = m.x + row * pc.dst_stride_u8 + col; + uint tmp_off = m.z + row * pc.tmp_stride_u16 + col; + int pri = int(m.y & 0xffu); + int sec = int((m.y >> 8) & 0xffu); + int damping = int((m.y >> 16) & 0xffu); + int dir = int(m.w & 7u); + + int px = int(u_tmp.tmp[tmp_off]); + int sum = 0; + int mn = px; + int mx = px; + + int pri_shift = max(0, damping - ulog2_pos(pri)); + int sec_shift = max(0, damping - ulog2_pos(sec)); // RED-2 fix + + int pri_tap0 = 4 - (pri & 1); + int pri_tap1 = (pri_tap0 & 3) | 2; + int sec_tap0 = 2; + int sec_tap1 = 1; + + int pri_idx = dir; + int sec1_idx = (dir + 2) & 7; + int sec2_idx = (dir + 6) & 7; // (dir - 2) % 8 + + // -- k = 0 -- + { + int o1 = dirs8[pri_idx ].x; + int o2 = dirs8[sec1_idx].x; + int o3 = dirs8[sec2_idx].x; + int p0 = int(u_tmp.tmp[uint(int(tmp_off) + o1)]); + int p1 = int(u_tmp.tmp[uint(int(tmp_off) - o1)]); + int s0 = int(u_tmp.tmp[uint(int(tmp_off) + o2)]); + int s1 = int(u_tmp.tmp[uint(int(tmp_off) - o2)]); + int s2 = int(u_tmp.tmp[uint(int(tmp_off) + o3)]); + int s3 = int(u_tmp.tmp[uint(int(tmp_off) - o3)]); + + sum += pri_tap0 * constrain(p0 - px, pri, pri_shift); + sum += pri_tap0 * constrain(p1 - px, pri, pri_shift); + sum += sec_tap0 * constrain(s0 - px, sec, sec_shift); + sum += sec_tap0 * constrain(s1 - px, sec, sec_shift); + sum += sec_tap0 * constrain(s2 - px, sec, sec_shift); + sum += sec_tap0 * constrain(s3 - px, sec, sec_shift); + + // min/max bookkeeping — NEON umin / smax semantics. + // Unsigned min: 0x8000 sentinel (32768u) > any 0..255 pixel. 
+ // Signed max: taps are loaded zero-extended, so sign-extend the low 16 bits (0x8000 -> -32768) before max. + mn = int(min(uint(mn), uint(p0))); + mn = int(min(uint(mn), uint(p1))); + mn = int(min(uint(mn), uint(s0))); + mn = int(min(uint(mn), uint(s1))); + mn = int(min(uint(mn), uint(s2))); + mn = int(min(uint(mn), uint(s3))); + mx = max(mx, bitfieldExtract(p0, 0, 16)); mx = max(mx, bitfieldExtract(p1, 0, 16)); + mx = max(mx, bitfieldExtract(s0, 0, 16)); mx = max(mx, bitfieldExtract(s1, 0, 16)); + mx = max(mx, bitfieldExtract(s2, 0, 16)); mx = max(mx, bitfieldExtract(s3, 0, 16)); + } + + // -- k = 1 -- + { + int o1 = dirs8[pri_idx ].y; + int o2 = dirs8[sec1_idx].y; + int o3 = dirs8[sec2_idx].y; + int p0 = int(u_tmp.tmp[uint(int(tmp_off) + o1)]); + int p1 = int(u_tmp.tmp[uint(int(tmp_off) - o1)]); + int s0 = int(u_tmp.tmp[uint(int(tmp_off) + o2)]); + int s1 = int(u_tmp.tmp[uint(int(tmp_off) - o2)]); + int s2 = int(u_tmp.tmp[uint(int(tmp_off) + o3)]); + int s3 = int(u_tmp.tmp[uint(int(tmp_off) - o3)]); + + sum += pri_tap1 * constrain(p0 - px, pri, pri_shift); + sum += pri_tap1 * constrain(p1 - px, pri, pri_shift); + sum += sec_tap1 * constrain(s0 - px, sec, sec_shift); + sum += sec_tap1 * constrain(s1 - px, sec, sec_shift); + sum += sec_tap1 * constrain(s2 - px, sec, sec_shift); + sum += sec_tap1 * constrain(s3 - px, sec, sec_shift); + + mn = int(min(uint(mn), uint(p0))); + mn = int(min(uint(mn), uint(p1))); + mn = int(min(uint(mn), uint(s0))); + mn = int(min(uint(mn), uint(s1))); + mn = int(min(uint(mn), uint(s2))); + mn = int(min(uint(mn), uint(s3))); + mx = max(mx, bitfieldExtract(p0, 0, 16)); mx = max(mx, bitfieldExtract(p1, 0, 16)); + mx = max(mx, bitfieldExtract(s0, 0, 16)); mx = max(mx, bitfieldExtract(s1, 0, 16)); + mx = max(mx, bitfieldExtract(s2, 0, 16)); mx = max(mx, bitfieldExtract(s3, 0, 16)); + } + + int adj = (sum - int(sum < 0) + 8) >> 4; + int outpx = clamp(px + adj, mn, mx); + u_dst.dst[dst_off] = uint8_t(outpx); +} diff --git 
a/tests/bench_concurrent_mixed.c b/tests/bench_concurrent_mixed.c index 49a362a..952e809 100644 --- a/tests/bench_concurrent_mixed.c +++ b/tests/bench_concurrent_mixed.c @@ -134,6 +134,31 @@ static void neon_run_lpf(uint64_t *seed, uint64_t *out_done, int wd_8) { free(master); free(work); free(Es); free(Is); free(Hs); } +static void neon_run_cdef(uint64_t 
*seed, uint64_t *out_done) { + int n = NEON_BATCH; + uint16_t *tmps = malloc((size_t) n * 192 * sizeof(uint16_t)); + uint8_t *dsts = malloc((size_t) n * 64); + int *pris = malloc(n*sizeof(int)), *secs = malloc(n*sizeof(int)); + int *dirs = malloc(n*sizeof(int)), *damps = malloc(n*sizeof(int)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(seed) & 0xff); + for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) + dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)]; + pris[i] = (int)(xs_step(seed) % 7) + 1; + secs[i] = (int)(xs_step(seed) % 4) + 1; + dirs[i] = (int)(xs_step(seed) & 7); + damps[i] = (int)(xs_step(seed) % 6) + 1; + } + while (!g_stop) { + for (int i = 0; i < n; i++) + dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8, + tmps + i*192 + (2*16+2), + pris[i], secs[i], dirs[i], damps[i], 8, 0); + *out_done += n; + } + free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps); +} + static void neon_run_idct(uint64_t *seed, uint64_t *out_done) { int16_t *blocks_master = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t)); int16_t *blocks_work = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t)); @@ -175,6 +200,7 @@ static void *neon_worker(void *p) { case K_LPF4: neon_run_lpf(&seed, &done, 0); break; case K_LPF8: neon_run_lpf(&seed, &done, 1); break; case K_IDCT: neon_run_idct(&seed, &done); break; + case K_CDEF: neon_run_cdef(&seed, &done); break; default: fprintf(stderr, "bad NEON kernel\n"); break; } a->elapsed_s = now_s() - t0; @@ -194,8 +220,8 @@ typedef struct { /* Each QPU kernel has its own push-constant layout. 
*/ typedef struct { uint32_t n, dst_stride_u8, _pad0, _pad1; } pc_lpf; typedef struct { uint32_t n, dst_stride_u8, src_stride_u8, _pad; } pc_mc; -/* IDCT: pc layout in v3d_idct8.comp = (n_blocks, blocks_per_row, dst_stride_u8, _pad) */ typedef struct { uint32_t n_blocks, blocks_per_row, dst_stride_u8, _pad; } pc_idct; +typedef struct { uint32_t n_blocks, tmp_stride_u16, dst_stride_u8, _pad; } pc_cdef; /* CDEF: not yet — QPU CDEF kernel not implemented. CDEF QPU mode uses * dav1d NEON via a single-thread NEON call on the QPU host core instead. * That's a degenerate "QPU helper" but matches the deferred state of @@ -296,8 +322,16 @@ static void *qpu_real_worker(void *p) case K_IDCT: spv = "v3d_idct8.spv"; dst_bytes = (size_t) n_units * 64; - src_bytes = (size_t) n_units * 64 * sizeof(int16_t); /* coeffs */ - meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t); /* per-block pos */ + src_bytes = (size_t) n_units * 64 * sizeof(int16_t); + meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t); + has_src = 1; + break; + case K_CDEF: + spv = "v3d_cdef.spv"; + bpw = 4; + dst_bytes = (size_t) n_units * 64; + src_bytes = (size_t) n_units * 192 * sizeof(uint16_t); + meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t); has_src = 1; break; default: @@ -334,22 +368,37 @@ static void *qpu_real_worker(void *p) ((uint8_t *) buf_src.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff); } else if (a->kernel == K_IDCT) { for (int i = 0; i < n_units; i++) { - meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */ - meta[4*i+1] = (uint32_t)((i * 64) / 64); /* coeff_off (in blocks) */ - meta[4*i+2] = 0; /* eob (not used by our shader) */ + meta[4*i+0] = (uint32_t)((size_t)i * 64); + meta[4*i+1] = (uint32_t)((i * 64) / 64); + meta[4*i+2] = 0; meta[4*i+3] = 0; } - /* Fill coeffs with random VP9-ish values. 
*/ int16_t *cf = (int16_t *) buf_src.mapped; size_t n_coefs = src_bytes / sizeof(int16_t); for (size_t i = 0; i < n_coefs; i++) cf[i] = (int16_t)((int)(xs_step(&seed) % 8192) - 4096); + } else if (a->kernel == K_CDEF) { + uint16_t *tmps = (uint16_t *) buf_src.mapped; + for (int i = 0; i < n_units; i++) { + uint32_t pri = (uint32_t)((xs_step(&seed) % 7) + 1); + uint32_t sec = (uint32_t)((xs_step(&seed) % 4) + 1); + uint32_t damping = (uint32_t)((xs_step(&seed) % 6) + 1); + meta[4*i+0] = (uint32_t)((size_t)i * 64); + meta[4*i+1] = pri | (sec << 8) | (damping << 16); + meta[4*i+2] = (uint32_t)((size_t)i * 192 + (2*16 + 2)); + meta[4*i+3] = (uint32_t)(xs_step(&seed) & 7); + for (int j = 0; j < 192; j++) + tmps[(size_t)i * 192 + j] = (uint16_t)(xs_step(&seed) & 0xff); + } + for (size_t i = 0; i < dst_bytes; i++) + ((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff); } v3d_pipeline pipe = {0}; int n_ssbos = has_src ? 3 : 2; size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) : - (a->kernel == K_IDCT) ? sizeof(pc_idct) : sizeof(pc_lpf); + (a->kernel == K_IDCT) ? sizeof(pc_idct) : + (a->kernel == K_CDEF) ? 
sizeof(pc_cdef) : sizeof(pc_lpf); v3d_runner_create_pipeline(r, spv, n_ssbos, pc_size, &pipe); v3d_buffer bind_bufs[3]; @@ -359,13 +408,15 @@ static void *qpu_real_worker(void *p) v3d_runner_bind_buffers(r, &pipe, bind_bufs, n_ssbos); uint32_t gc = (uint32_t)((n_units + bpw - 1) / bpw); - union { pc_lpf lpf; pc_mc mc; pc_idct idct; } pc = {0}; + union { pc_lpf lpf; pc_mc mc; pc_idct idct; pc_cdef cdef; } pc = {0}; if (a->kernel == K_LPF4 || a->kernel == K_LPF8) { pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 8 }; } else if (a->kernel == K_MC) { pc.mc = (pc_mc){ .n = n_units, .dst_stride_u8 = 8, .src_stride_u8 = 16 }; } else if (a->kernel == K_IDCT) { pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 }; + } else if (a->kernel == K_CDEF) { + pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 }; } VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); @@ -451,10 +502,10 @@ int main(int argc, char **argv) } } - /* CDEF on QPU side currently uses dav1d NEON fallback (cycle 5 - * Phase 6 not yet implemented). Real QPU CDEF would replace - * qpu_cdef_neon_fallback with qpu_real_worker. */ - int use_neon_fallback_for_cdef = (qpu_k == K_CDEF); + /* Cycle 5 Phase 6 landed — v3d_cdef.spv is M1-PASS. Use real + * QPU dispatch for CDEF too. The NEON-fallback worker remains + * compiled but is unselected. 
*/ + int use_neon_fallback_for_cdef = 0; int barrier_count = n_neon + 1 /* QPU */ + 1 /* timer */ + 1 /* main */; printf("=== Issue 003 mixed-kernel M4 bench ===\n"); diff --git a/tests/bench_neon_cdef.c b/tests/bench_neon_cdef.c index 5a04638..316eb00 100644 --- a/tests/bench_neon_cdef.c +++ b/tests/bench_neon_cdef.c @@ -79,12 +79,17 @@ static void gen_filter_params(int *pri, int *sec, int *dir, int *damping) * pri_strength: 1..7 (non-zero for combined path) * sec_strength: 1..4 * dir: 0..7 - * damping: 3..6 + * damping: 1..6 — extended down to 1 (was 3..6) per + * cycle 5 phase 5 RED-2: include cases where + * sec_shift = damping - ulog2(sec) goes negative + * (e.g. damping=1, sec=4 → sec_shift = -1). + * Both NEON (uqsub) and C ref (now max(0,...)) + * saturate to 0 here; the bench should exercise it. */ *pri = (int)(xs() % 7) + 1; *sec = (int)(xs() % 4) + 1; *dir = (int)(xs() & 7); - *damping = (int)(xs() % 4) + 3; + *damping = (int)(xs() % 6) + 1; } static double now_seconds(void) diff --git a/tests/bench_v3d_cdef.c b/tests/bench_v3d_cdef.c new file mode 100644 index 0000000..6b241c7 --- /dev/null +++ b/tests/bench_v3d_cdef.c @@ -0,0 +1,332 @@ +/* + * Cycle 5 Phase 6 — QPU bench for AV1 CDEF primary+secondary 8x8 + * luma filter on V3D 7.1. + * + * Reports: + * M1₅: 3-way bit-exact (QPU vs NEON vs C reference) per Phase 5 + * YELLOW-1. + * M2₅: QPU sustained Mblock/s over K dispatched batches + * + * License: BSD-2-Clause; links dav1d 1.4.3 NEON snapshot. 
+ */ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "v3d_runner.h" + +extern void daedalus_cdef_filter_8x8_pri_sec_ref( + uint8_t *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, + int pri_strength, int sec_strength, + int dir, int damping, int h); + +extern void dav1d_cdef_filter8_8bpc_neon( + uint8_t *dst, ptrdiff_t dst_stride, + const uint16_t *tmp, + int pri_strength, int sec_strength, + int dir, int damping, int h, size_t edges); + +#define TMP_W 16 +#define TMP_H 12 +#define TMP_INTS (TMP_W * TMP_H) /* 192 */ +#define DST_W 8 +#define DST_H 8 +#define DST_BYTES (DST_H * DST_W) /* 64 */ +#define BLOCK_ORIGIN_U16 (2 * TMP_W + 2) /* 34 */ + +static uint64_t xs_state; +static inline uint64_t xs(void) { + uint64_t x = xs_state; + x ^= x << 13; x ^= x >> 7; x ^= x << 17; + return xs_state = x; +} + +static void gen_tmp(uint16_t *tmp) +{ + for (int i = 0; i < TMP_INTS; i++) + tmp[i] = (uint16_t)(xs() & 0xff); +} + +static void tmp_center_to_dst(uint8_t *dst, const uint16_t *tmp) +{ + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) + dst[r * 8 + c] = (uint8_t) tmp[(r + 2) * TMP_W + (c + 2)]; +} + +static void gen_filter_params(int *pri, int *sec, int *dir, int *damping) +{ + *pri = (int)(xs() % 7) + 1; + *sec = (int)(xs() % 4) + 1; + *dir = (int)(xs() & 7); + *damping = (int)(xs() % 6) + 1; /* includes negative-sec_shift cases */ +} + +static double now_seconds(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec + ts.tv_nsec * 1e-9; +} + +typedef struct { + uint32_t n_blocks; + uint32_t tmp_stride_u16; + uint32_t dst_stride_u8; + uint32_t _pad; +} push_consts; + +int main(int argc, char **argv) +{ + int n_blocks = 16384; + int iters = 200; + int verify_only = 0; + uint64_t seed = 0; + const char *spv_path = "v3d_cdef.spv"; + + static struct option opts[] = { + {"blocks", required_argument, 0, 'b'}, + {"iters", required_argument, 
0, 'i'}, + {"seed", required_argument, 0, 's'}, + {"spv", required_argument, 0, 'S'}, + {"verify-only", no_argument, 0, 'V'}, + {0,0,0,0} + }; + for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) { + switch (c) { + case 'b': n_blocks = atoi(optarg); break; + case 'i': iters = atoi(optarg); break; + case 's': seed = strtoull(optarg, 0, 0); break; + case 'S': spv_path = optarg; break; + case 'V': verify_only = 1; break; + default: return 2; + } + } + + xs_state = seed ? seed : 0xc0defacedcafebebULL; + + v3d_runner *r = v3d_runner_create(); + if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } + printf("=== v3d CDEF bench ===\n"); + printf(" device: %s\n", v3d_runner_device_name(r)); + printf(" n_blocks: %d iters: %d seed: 0x%016llx\n", + n_blocks, iters, (unsigned long long) (seed ? seed : 0xc0defacedcafebebULL)); + + size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); /* uvec4 */ + size_t dst_bytes = (size_t) n_blocks * DST_BYTES; + size_t tmp_bytes = (size_t) n_blocks * TMP_INTS * sizeof(uint16_t); + + v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_tmp = {0}; + if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; + if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; + if (v3d_runner_create_buffer(r, tmp_bytes, &buf_tmp)) return 1; + + uint8_t *master_dst = malloc(dst_bytes); + uint8_t *expected_c = malloc(dst_bytes); + uint8_t *expected_n = malloc(dst_bytes); + int *pris = malloc(n_blocks * sizeof(int)); + int *secs = malloc(n_blocks * sizeof(int)); + int *dirs = malloc(n_blocks * sizeof(int)); + int *damps = malloc(n_blocks * sizeof(int)); + if (!master_dst || !expected_c || !expected_n || !pris || !secs || !dirs || !damps) { + fprintf(stderr, "alloc fail\n"); return 1; + } + + /* Generate tmp + params + initial dst (block center extracted). 
*/ + uint16_t *tmp_gpu = (uint16_t *) buf_tmp.mapped; + for (int i = 0; i < n_blocks; i++) { + uint16_t *tmp = tmp_gpu + (size_t)i * TMP_INTS; + gen_tmp(tmp); + tmp_center_to_dst(master_dst + (size_t)i * DST_BYTES, tmp); + gen_filter_params(&pris[i], &secs[i], &dirs[i], &damps[i]); + } + + /* Compute C-ref and NEON expected outputs (serial, on master_dst). */ + memcpy(expected_c, master_dst, dst_bytes); + memcpy(expected_n, master_dst, dst_bytes); + for (int i = 0; i < n_blocks; i++) { + daedalus_cdef_filter_8x8_pri_sec_ref( + expected_c + (size_t)i * DST_BYTES, DST_W, + tmp_gpu + (size_t)i * TMP_INTS, + pris[i], secs[i], dirs[i], damps[i], 8); + dav1d_cdef_filter8_8bpc_neon( + expected_n + (size_t)i * DST_BYTES, DST_W, + tmp_gpu + (size_t)i * TMP_INTS + BLOCK_ORIGIN_U16, + pris[i], secs[i], dirs[i], damps[i], 8, 0); + } + + /* Confirm 2-way C vs NEON parity (defence in depth — Phase 3 already + * passed this for 10000 blocks, but n_blocks may be larger here). */ + int cn_mis = 0; + for (int i = 0; i < n_blocks; i++) { + if (memcmp(expected_c + (size_t)i * DST_BYTES, + expected_n + (size_t)i * DST_BYTES, DST_BYTES) != 0) cn_mis++; + } + printf(" C ref vs NEON parity check: %d/%d mismatches\n", cn_mis, n_blocks); + if (cn_mis > 0) { + fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU even runs.\n"); + return 1; + } + + /* Populate meta SSBO (post Phase 5 RED-1 layout). */ + uint32_t *meta = (uint32_t *) buf_meta.mapped; + uint32_t dst_stride_u8 = DST_W; /* 8 */ + uint32_t tmp_stride_u16 = TMP_W; /* 16 */ + for (int i = 0; i < n_blocks; i++) { + uint32_t pri = (uint32_t) pris[i]; + uint32_t sec = (uint32_t) secs[i]; + uint32_t damping = (uint32_t) damps[i]; + meta[4*i + 0] = (uint32_t)((size_t)i * DST_BYTES); + meta[4*i + 1] = pri | (sec << 8) | (damping << 16); + meta[4*i + 2] = (uint32_t)((size_t)i * TMP_INTS + BLOCK_ORIGIN_U16); + meta[4*i + 3] = (uint32_t) dirs[i]; + } + + /* Pipeline (3 SSBOs). 
*/ + v3d_pipeline pipe = {0}; + if (v3d_runner_create_pipeline(r, spv_path, + /*n_ssbos=*/3, + /*push_const_size=*/sizeof(push_consts), + &pipe)) return 1; + v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_tmp }; + if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1; + + const uint32_t blocks_per_wg = 4; + uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg); + printf(" dispatch: %u WGs × 256 invocations = %u blocks\n", + group_count_x, group_count_x * blocks_per_wg); + + push_consts pc = { + .n_blocks = (uint32_t) n_blocks, + .tmp_stride_u16 = tmp_stride_u16, + .dst_stride_u8 = dst_stride_u8, + ._pad = 0, + }; + + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); + if (cb == VK_NULL_HANDLE) return 1; + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pc), &pc); + vkCmdDispatch(cb, group_count_x, 1, 1); + vkEndCommandBuffer(cb); + + /* --- M1: QPU vs C-ref vs NEON 3-way --- */ + printf("\n=== M1₅: QPU vs C-ref vs NEON 3-way ===\n"); + memcpy(buf_dst.mapped, master_dst, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + + int qc_mismatches = 0, qn_mismatches = 0; + int prints = 0; + for (int i = 0; i < n_blocks; i++) { + const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES; + const uint8_t *c = expected_c + (size_t)i * DST_BYTES; + const uint8_t *n = expected_n + (size_t)i * DST_BYTES; + int qc = memcmp(q, c, DST_BYTES); + int qn = memcmp(q, n, DST_BYTES); + if (qc) qc_mismatches++; + if (qn) qn_mismatches++; + if ((qc || qn) && prints < 3) { + fprintf(stderr, "MISMATCH block %d (pri=%d sec=%d dir=%d damp=%d):\n", + i, pris[i], secs[i], dirs[i], damps[i]); + 
fprintf(stderr, " C ref:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", c[r0*8+c0]); + } + fprintf(stderr, "\n QPU:"); + for (int r0 = 0; r0 < 8; r0++) { + fprintf(stderr, "\n r%d ", r0); + for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", q[r0*8+c0]); + } + fprintf(stderr, "\n"); + prints++; + } + } + printf(" QPU vs C ref: %d / %d blocks bit-exact (%.4f%%)\n", + n_blocks - qc_mismatches, n_blocks, + 100.0 * (n_blocks - qc_mismatches) / n_blocks); + printf(" QPU vs NEON: %d / %d blocks bit-exact (%.4f%%)\n", + n_blocks - qn_mismatches, n_blocks, + 100.0 * (n_blocks - qn_mismatches) / n_blocks); + + if (qc_mismatches > 0 || qn_mismatches > 0) { + fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); + return 1; + } + + if (verify_only) { + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_tmp); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + return 0; + } + + /* --- M2: throughput --- */ + printf("\n=== M2₅: QPU throughput ===\n"); + + for (int i = 0; i < 5; i++) { + memcpy(buf_dst.mapped, master_dst, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + + double t0 = now_seconds(); + for (int i = 0; i < iters; i++) { + memcpy(buf_dst.mapped, master_dst, dst_bytes); + if (v3d_runner_submit_wait(r, cb)) return 1; + } + double t1 = now_seconds(); + + double s0 = now_seconds(); + for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_dst, dst_bytes); + double s1 = now_seconds(); + + double kernel_seconds = (t1 - t0) - (s1 - s0); + double total_blocks = (double) n_blocks * iters; + double mbps = total_blocks / kernel_seconds / 1e6; + + printf(" blocks/dispatch: %d\n", n_blocks); + printf(" iters: %d\n", iters); + printf(" total blocks: %.0f\n", total_blocks); + printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds); + printf(" elapsed (setup) 
=%.6f s\n", s1 - s0); + printf(" M2₅ throughput = %.3f Mblock/s\n", mbps); + printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9); + printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); + + double M3_5 = 3.809; + double R5 = mbps / M3_5; + printf("\n Cycle 5 NEON M3₅ = %.3f Mblock/s\n", M3_5); + printf(" R₅ = M2₅/M3₅ = %.3f\n", R5); + if (R5 >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n"); + else if (R5 >= 0.5) printf(" decision band = YELLOW: M4 decides\n"); + else if (R5 >= 0.1) printf(" decision band = ORANGE: M4 may still rescue\n"); + else printf(" decision band = RED: structural mismatch (predicted)\n"); + + /* 30fps@1080p floor: 32400 blocks/frame × 30 fps = 0.972 Mblock/s */ + double floor_rate = 0.972; + printf(" 30fps@1080p floor: %.2fx margin (isolation)\n", mbps / floor_rate); + + v3d_runner_destroy_pipeline(r, &pipe); + v3d_runner_destroy_buffer(r, &buf_tmp); + v3d_runner_destroy_buffer(r, &buf_dst); + v3d_runner_destroy_buffer(r, &buf_meta); + v3d_runner_destroy(r); + free(master_dst); free(expected_c); free(expected_n); + free(pris); free(secs); free(dirs); free(damps); + return 0; +} diff --git a/tests/cdef_ref.c b/tests/cdef_ref.c index b7503c7..7eb9dcf 100644 --- a/tests/cdef_ref.c +++ b/tests/cdef_ref.c @@ -98,7 +98,10 @@ void daedalus_cdef_filter_8x8_pri_sec_ref( { const int pri_tap = 4 - (pri_strength & 1); const int pri_shift = imax(0, damping - ulog2((unsigned) pri_strength)); - const int sec_shift = damping - ulog2((unsigned) sec_strength); + /* Cycle 5 phase 5 RED-2: NEON `uqsub` saturates to 0. Mirror it + * here so the C ref is bit-exact against NEON for damping-light + * cases (which the original bench param gen didn't exercise). */ + const int sec_shift = imax(0, damping - ulog2((unsigned) sec_strength)); /* Walk into the center 8x8 region of the 12×16 padded buffer. */ tmp = tmp + 2 * TMP_STRIDE + 2;