d66f22f333
First QPU IDCT8 kernel running and bit-exact on V3D 7.1 via Mesa
v3dv compute. Five iterations through a Phase 7→Phase 4' loopback;
production kernel is v4.
New files:
  - src/v3d_runner.{c,h}    — reusable Vulkan compute plumbing (instance,
                              V3D device picker, HOST_VISIBLE|COHERENT
                              SSBOs with mmap, compute pipeline from .spv,
                              enables storageBuffer{8,16}BitAccess)
  - src/v3d_idct8.comp      — VP9 8x8 DCT_DCT IDCT add, v4 production:
                              256 invocations/WG, 2 blocks/subgroup
                              (no idle lanes), uint8 dst SSBO (race-free
                              per phase5 finding 5), unrolled writes
                              (no chained ternary), oob-flag pattern
                              (barrier-safe per phase5 finding 7)
  - tests/bench_v3d_idct.c  — M1' bit-exact gate + M2 throughput vs C ref
  - docs/phase7.md          — full iteration journey + decision verdict

CMakeLists.txt updated to build the new shader, library, and bench
when DAEDALUS_BUILD_VULKAN=ON.
Iteration record (1920x1088 luma, 32640 blocks/dispatch, N=3):

  ver  change                            R                ns/block
  v1   first-light                       0.230            533
  v2   kill ternary + 2-blocks-per-sg    0.474            258
  v3   per-pass scope oN                 0.481            254  (noise)
  v4   WG 64 -> 256 invocations          0.947            129
  v5   packed uint32 coeff reads         0.938            130  (noise, reverted)
  v4   final N=3                         0.918 +/- 0.033
Bit-exactness 100.0000% across all iterations (10000-block sample
on 128x128, 32640-block sample on 1080p) against both the C
reference (tests/vp9_idct8_ref.c) and the vendored FFmpeg NEON
ff_vp9_idct_idct_8x8_add_neon.
Key learning over the Phase 5 review's prediction model: the
chained ternary was NOT a spill killer on V3D 7.1 (shaderdb
showed 0:0 spills:fills even in v1). The actual lever was
workgroup-size-driven latency hiding — going from 64 to 256
invocations doubled throughput with the same compiled code
(270 inst, 2 threads, 21 max-temps, 0 spills) because the
v3dv scheduler had 4x more in-flight work to overlap TMU
latency.
Verdict per phase1.md decision rules: YELLOW band (0.5 <= R < 1.0)
by a wide margin, near GREEN boundary. Phase 1 YELLOW rule:
add M4 (concurrent CPU+QPU throughput) before honest-close or
continue. M4 is the next measurement, not more shader tuning —
at R = 0.92 with all 4 A76 cores still 100% free for other work,
the question is whether the system aggregate beats pure 4-core
NEON.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
218 lines
8.6 KiB
Plaintext
218 lines
8.6 KiB
Plaintext
// daedalus-fourier — VP9 8×8 DCT_DCT inverse-transform-add, V3D 7.1.
// v2: post-Phase-7 loopback. Phase 4' iteration 1.
// (The notes in this header record the v1 → v2 changes; the later v3 and
//  v4 changes are annotated inline next to the code they affect.)
//
// Changes from v1 (per phase47 iteration 1 + Sonnet v3d perf research):
//
//   Opt 1 — kill the chained ternary. v1's row-pass write had
//           `(r==0)?o0:(r==1)?o1:...` inside a `for r` loop; that
//           kept all 8 oN scalars live across 7 phi nodes and almost
//           certainly forced register spills (Iago Toral 2021,
//           blogs.igalia.com/itoral). v2 unrolls the 8 writes
//           completely — each oN is used exactly once.
//           (Later shaderdb runs reported 0 spills / 0 fills even for
//           v1, so the spill hypothesis did not hold on V3D 7.1.)
//
//   Opt 2 — 2 blocks per subgroup. v1 had 1 block per 16-lane
//           subgroup with 8 lanes idle per phase. v2 packs 2 blocks
//           per subgroup (one in lanes 0..7, one in lanes 8..15),
//           and every lane runs both passes for its own block.
//           Eliminates idle lanes AND removes the col_pass/row_pass
//           branch divergence. 8 blocks per WG (vs 4 before),
//           dispatch count halves from 8160 to 4080 on 1080p.
//           Shared-mem footprint doubles to 2 KiB (still well under
//           16 KiB).
//
//   (Opt 3 — packed uint32 storage — deferred; do it if Opt 1+2
//    don't get us into the GREEN/YELLOW decision band.)
//
// License: BSD-2-Clause.
|
||
|
||
#version 450
|
||
#extension GL_EXT_shader_8bit_storage : require
|
||
#extension GL_EXT_shader_16bit_storage : require
|
||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||
|
||
// v4: local_size 256 (was 64) — 16 subgroups × 16 lanes = 32 blocks/WG.
// More in-flight work per WG = more latency hiding for v3d's TMU.
// shared = 32 × 64 × 4 B = 8 KiB (still under 16 KiB).
// NOTE: main() hard-codes this size (32 blocks per workgroup, lane index
// split with >> 4 / & 15), so the two must be changed together.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||
|
||
// Input coefficients. Block b occupies the 64 consecutive int16 values
// starting at coeffs[b * 64]; main() reads element (r, k) of a block at
// offset r * 8 + k.
layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 64 packed
} u_coeffs;
|
||
// (v5 tried uint32-packed reads with manual unpack — no measurable
//  perf change vs int16, added code complexity; reverted.)
|
||
|
||
// Destination pixels, updated in place: main() reads each byte, adds the
// rounded residual and writes the sum back clamped to [0, 255].
// Byte (x, y) lives at dst[y * pc.dst_stride_u8 + x].
layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes
} u_dst;
|
||
|
||
// Per-block placement in dst. meta[b] = (block_x, block_y) in units of
// 8 pixels; main() multiplies both components by 8 to get the pixel origin.
layout(binding = 2) readonly buffer Meta {
    uvec2 meta[];       // per-block (block_x_8, block_y_8)
} u_meta;
|
||
|
||
// Per-dispatch parameters.
layout(push_constant) uniform PC {
    uint n_blocks;          // number of valid blocks; lanes whose block index
                            // is >= n_blocks skip all loads and stores
    uint blocks_per_row;    // unused (meta drives position)
    uint dst_stride_u8;     // distance in bytes between consecutive dst rows
    uint _pad;              // not read by this shader
} pc;
|
||
|
||
// Intermediate (post-column-pass) values, exchanged between the lanes of a
// block across the barrier in main().
// 32 blocks per WG × 64 i32 per block × 4 B = 8192 B shared.
// Block slot s of the workgroup owns tmp_shared[s * 64 .. s * 64 + 63].
shared int tmp_shared[32 * 64];
|
||
|
||
// VP9 Q14 trig constants (spec §8.7.1.4).
// COSPI_n = round(cos(n * pi / 64) * 2^14); products with these constants
// are brought back to integer scale with qround14().
const int COSPI_16 = 11585;
const int COSPI_24 = 6270;
const int COSPI_08 = 15137;
const int COSPI_28 = 3196;
const int COSPI_04 = 16069;
const int COSPI_20 = 9102;
const int COSPI_12 = 13623;
|
||
|
||
// Scale a Q14 fixed-point product back to integer range, rounding to
// nearest: add half of 2^14, then shift right by 14.
int qround14(int x)
{
    const int half_lsb = 1 << 13;
    return (x + half_lsb) >> 14;
}
|
||
|
||
// One 8-point inverse DCT over eight scalar inputs.
//
//   i0..i7  inputs
//   o0..o7  outputs
//
// Every rotation (multiply by COSPI_* constants) is rounded back from Q14
// with qround14(); the add/sub butterflies in between are exact integer
// operations. The operation order is what makes the result bit-exact, so
// do not re-associate or merge the rounding steps.
void idct8_1d(int i0, int i1, int i2, int i3,
              int i4, int i5, int i6, int i7,
              out int o0, out int o1, out int o2, out int o3,
              out int o4, out int o5, out int o6, out int o7)
{
    // Stage 1 — rotations. Even inputs (i0, i2, i4, i6) feed t0a..t3a,
    // odd inputs (i1, i3, i5, i7) feed t4a..t7a.
    int t0a = qround14((i0 + i4) * COSPI_16);
    int t1a = qround14((i0 - i4) * COSPI_16);
    int t2a = qround14(i2 * COSPI_24 - i6 * COSPI_08);
    int t3a = qround14(i2 * COSPI_08 + i6 * COSPI_24);
    int t4a = qround14(i1 * COSPI_28 - i7 * COSPI_04);
    int t5a = qround14(i5 * COSPI_12 - i3 * COSPI_20);
    int t6a = qround14(i5 * COSPI_20 + i3 * COSPI_12);
    int t7a = qround14(i1 * COSPI_04 + i7 * COSPI_28);

    // Stage 2 — add/sub butterflies within each half.
    int t0 = t0a + t3a, t1 = t1a + t2a;
    int t2 = t1a - t2a, t3 = t0a - t3a;
    int t4 = t4a + t5a;
    int t5p = t4a - t5a;
    int t7 = t7a + t6a;
    int t6p = t7a - t6a;

    // Stage 3 — one more COSPI_16 rotation on the middle odd pair.
    int t5 = qround14((t6p - t5p) * COSPI_16);
    int t6 = qround14((t6p + t5p) * COSPI_16);

    // Stage 4 — final butterflies: outputs n and 7-n are the sum and
    // difference of the same pair.
    o0 = t0 + t7; o1 = t1 + t6;
    o2 = t2 + t5; o3 = t3 + t4;
    o4 = t3 - t4; o5 = t2 - t5;
    o6 = t1 - t6; o7 = t0 - t7;
}
|
||
|
||
// Entry point. Eight consecutive lanes cooperate on one 8×8 block:
//   pass 1: lane k transforms the 8 coefficients at offsets r * 8 + k
//           (r = 0..7) and stores the result transposed in tmp_shared;
//   pass 2: after the barrier, lane k transforms the 8 intermediates at
//           offsets r * 8 + k, then adds the rounded result to the 8 dst
//           bytes of column k of the block.
void main()
{
    // ---- Lane / block decomposition --------------------------------
    // 256 invocations/WG = 16 subgroups × 16 lanes/subgroup.
    // Each subgroup packs 2 blocks (one in lanes 0..7, one in lanes 8..15).
    // 32 blocks per WG total.
    //
    // Every lane runs both column and row pass for its own block —
    // no idle lanes, no col_pass/row_pass branch divergence.

    // With local_size_x = 256 these built-ins are exactly
    // gl_GlobalInvocationID.x / 256 and gl_GlobalInvocationID.x & 255.
    uint wg_id      = gl_WorkGroupID.x;
    uint lane_in_wg = gl_LocalInvocationID.x;        // 0..255
    uint sg_in_wg   = lane_in_wg >> 4;               // 0..15
    uint lane_in_sg = lane_in_wg & 15u;
    uint block_slot = lane_in_sg >> 3;               // 0 (lanes 0..7) or 1 (lanes 8..15)
    uint k          = lane_in_sg & 7u;               // 0..7

    uint block_local = sg_in_wg * 2u + block_slot;   // 0..31 within WG
    uint block_idx   = wg_id * 32u + block_local;

    // OOB flag — gates work bodies, but barrier() is reached by all.
    // Per phase5.md finding 7.
    bool oob = (block_idx >= pc.n_blocks);

    // ---- Column pass ----------------------------------------------
    // v3 (Opt 4): scope oN inside each pass so they're dead at the
    // barrier — v2 had them function-scope which inflated max-temps
    // (shaderdb reported 20 max-temps / 2 threads instead of 4 threads
    // possible). Lower temps → more hardware threads → better
    // latency hiding.
    if (!oob) {
        uint base = block_idx * 64u;
        int c0 = int(u_coeffs.coeffs[base + 0u * 8u + k]);
        int c1 = int(u_coeffs.coeffs[base + 1u * 8u + k]);
        int c2 = int(u_coeffs.coeffs[base + 2u * 8u + k]);
        int c3 = int(u_coeffs.coeffs[base + 3u * 8u + k]);
        int c4 = int(u_coeffs.coeffs[base + 4u * 8u + k]);
        int c5 = int(u_coeffs.coeffs[base + 5u * 8u + k]);
        int c6 = int(u_coeffs.coeffs[base + 6u * 8u + k]);
        int c7 = int(u_coeffs.coeffs[base + 7u * 8u + k]);

        int o0, o1, o2, o3, o4, o5, o6, o7;
        idct8_1d(c0, c1, c2, c3, c4, c5, c6, c7,
                 o0, o1, o2, o3, o4, o5, o6, o7);

        // Transposed write: row k of tmp_shared[block_local].
        uint tbase = block_local * 64u + k * 8u;
        tmp_shared[tbase + 0u] = o0;
        tmp_shared[tbase + 1u] = o1;
        tmp_shared[tbase + 2u] = o2;
        tmp_shared[tbase + 3u] = o3;
        tmp_shared[tbase + 4u] = o4;
        tmp_shared[tbase + 5u] = o5;
        tmp_shared[tbase + 6u] = o6;
        tmp_shared[tbase + 7u] = o7;
    }

    // The row pass reads values written by the other 7 lanes of the block.
    barrier();  // unconditional — every lane in the WG reaches this

    // ---- Row pass --------------------------------------------------
    if (!oob) {
        // Read column k of tmp_shared[block_local].
        uint tbase = block_local * 64u;
        int s0 = tmp_shared[tbase + 0u * 8u + k];
        int s1 = tmp_shared[tbase + 1u * 8u + k];
        int s2 = tmp_shared[tbase + 2u * 8u + k];
        int s3 = tmp_shared[tbase + 3u * 8u + k];
        int s4 = tmp_shared[tbase + 4u * 8u + k];
        int s5 = tmp_shared[tbase + 5u * 8u + k];
        int s6 = tmp_shared[tbase + 6u * 8u + k];
        int s7 = tmp_shared[tbase + 7u * 8u + k];

        int o0, o1, o2, o3, o4, o5, o6, o7;
        idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7,
                 o0, o1, o2, o3, o4, o5, o6, o7);

        // Columnar write into dst. Each lane owns column k of its block.
        // Block position in dst from meta.
        uvec2 bp = u_meta.meta[block_idx];
        uint block_x = bp.x;
        uint block_y = bp.y;
        uint dx  = block_x * 8u + k;
        uint dy0 = block_y * 8u;
        uint stride = pc.dst_stride_u8;

        // Opt 1: 8 fully-unrolled writes — each o_i used exactly once.
        // No chained ternary, no loop with runtime-variable index.
        uint a0 = (dy0 + 0u) * stride + dx;
        uint a1 = (dy0 + 1u) * stride + dx;
        uint a2 = (dy0 + 2u) * stride + dx;
        uint a3 = (dy0 + 3u) * stride + dx;
        uint a4 = (dy0 + 4u) * stride + dx;
        uint a5 = (dy0 + 5u) * stride + dx;
        uint a6 = (dy0 + 6u) * stride + dx;
        uint a7 = (dy0 + 7u) * stride + dx;

        // Existing pixel values (the "add" half of transform-add).
        int p0 = int(u_dst.dst[a0]);
        int p1 = int(u_dst.dst[a1]);
        int p2 = int(u_dst.dst[a2]);
        int p3 = int(u_dst.dst[a3]);
        int p4 = int(u_dst.dst[a4]);
        int p5 = int(u_dst.dst[a5]);
        int p6 = int(u_dst.dst[a6]);
        int p7 = int(u_dst.dst[a7]);

        // Residual = (o + 16) >> 5 (round-to-nearest divide by 32), added
        // to the pixel and clamped to the uint8 range before the store.
        u_dst.dst[a0] = uint8_t(clamp(p0 + ((o0 + 16) >> 5), 0, 255));
        u_dst.dst[a1] = uint8_t(clamp(p1 + ((o1 + 16) >> 5), 0, 255));
        u_dst.dst[a2] = uint8_t(clamp(p2 + ((o2 + 16) >> 5), 0, 255));
        u_dst.dst[a3] = uint8_t(clamp(p3 + ((o3 + 16) >> 5), 0, 255));
        u_dst.dst[a4] = uint8_t(clamp(p4 + ((o4 + 16) >> 5), 0, 255));
        u_dst.dst[a5] = uint8_t(clamp(p5 + ((o5 + 16) >> 5), 0, 255));
        u_dst.dst[a6] = uint8_t(clamp(p6 + ((o6 + 16) >> 5), 0, 255));
        u_dst.dst[a7] = uint8_t(clamp(p7 + ((o7 + 16) >> 5), 0, 255));
    }
}
|