Cycle 5 closed: CDEF QPU R5=0.116 ORANGE, opportunistic helper

Phase 4 plan with 3 Phase-5 REDs applied inline:
  - meta layout: m.z=tmp_off, m.w=dir
  - sec_shift clamped to >=0 (NEON uqsub semantics)
  - directions table as const ivec2[14], not OR-packed

Phase 6 deliverable: v3d_cdef.comp (387 inst, 2 threads, no spills).
3-way M1 (QPU vs C ref vs NEON) PASS 4096/4096.

M2: 0.443 Mblock/s -> R5 = 0.116 ORANGE (predicted 0.02-0.05 RED).
M4 same-kernel: NEON-3+QPU 8.46 < NEON-4 alone ~10 (negative).
M4 mixed (NEON-3 MC + QPU CDEF): CPU 34.17 Mblock/s MC,
  QPU 0.42 Mblock/s CDEF helper. CPU side higher than the
  Issue 003 NEON-fallback proxy suggested - cross-substrate
  contention is gentler than same-side NEON contention.

Verdict: CDEF stays on CPU; QPU dispatch path exists for
opportunistic use. Deployment recipe table updated for all 5
cycles. Phase 9 lessons: linear extrapolation across cycles is
too pessimistic; CDEF is bandwidth-bound on NEON despite high
per-block ns; real-substrate-cross contention < NEON-proxy
contention.

- src/v3d_cdef.comp: cycle 5 QPU shader
- tests/bench_v3d_cdef.c: 3-way M1, M2 bench
- tests/bench_concurrent_mixed.c: K_CDEF on both sides
- tests/cdef_ref.c + bench_neon_cdef.c: sec_shift clamp +
  expanded damping range to exercise the edge case
- CMakeLists.txt: v3d_cdef.spv + bench_v3d_cdef wiring
- docs/k5_cdef_phase4.md updated with Phase 5 review applied
- docs/k5_cdef_phase7.md: closure doc with full verdict matrix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 13:52:46 +00:00
parent 1740e7c165
commit 5223d3cb3f
8 changed files with 849 additions and 36 deletions
+178
View File
@@ -0,0 +1,178 @@
// daedalus-fourier cycle 5 — AV1 CDEF primary+secondary 8x8 luma filter,
// V3D 7.1 via Mesa v3dv compute.
//
// Per cycle-5 Phase 4 plan (post Phase 5 review):
// - 256 invocations / WG; 4 blocks/WG (64 pixels each, 1 pixel/lane)
// - NO barrier — each pixel independent
// - uint16_t tmp SSBO via storageBuffer16BitAccess
// - uint8_t dst SSBO via storageBuffer8BitAccess
// - directions table as `const ivec2[14]` (Phase 5 RED-3 fix)
// - meta layout: m.x=dst_off, m.y=params (pri|sec<<8|damping<<16),
// m.z=tmp_off_u16, m.w=dir (Phase 5 RED-1 fix)
// - sec_shift clamped to ≥0 to mirror NEON uqsub (Phase 5 RED-2 fix)
//
// License: BSD-2-Clause. Algorithm transcribed from tests/cdef_ref.c
// which mirrors dav1d 1.4.3 NEON (src/arm/64/cdef_tmpl.S).
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
// Workgroup shape: 256 invocations along x. main() assigns 64 consecutive
// lanes to one 8x8 block, so a workgroup covers 4 blocks.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// Per-block descriptors, one uvec4 per block, indexed by the global block
// index. As consumed by main():
//   .x = offset of the block's top-left pixel in dst (u8 elements)
//   .y = packed parameters: bits 0..7 pri, bits 8..15 sec, bits 16..23 damping
//   .z = offset of the block's top-left pixel in tmp (u16 elements)
//   .w = direction; only the low 3 bits are used
layout(binding = 0) readonly buffer Meta {
uvec4 meta[]; // per-block: (dst_off, params, tmp_off_u16, dir)
} u_meta;
// Output pixels, 8 bits each. The visible code only writes to this buffer
// (one byte per invocation).
layout(binding = 1) buffer Dst {
uint8_t dst[];
} u_dst;
// Filter input, 16 bits per element. Neighbour taps are read at the offsets
// listed in dirs8, both added to and subtracted from the pixel's own index,
// so the host must provide padding around each block.
layout(binding = 2) readonly buffer Tmp {
uint16_t tmp[]; // padded 12×16 per block; meta.z = block-origin u16 offset
} u_tmp;
// Push constants.
//   n_blocks       : number of valid entries in meta; lanes whose block
//                    index is >= n_blocks exit immediately.
//   tmp_stride_u16 : row stride of tmp in u16 elements, used for the centre
//                    pixel address. NOTE(review): the dirs8 tap offsets
//                    hard-code a stride of 16, so this presumably must be 16
//                    for the two to agree — confirm against the host code.
//   dst_stride_u8  : row stride of dst in bytes.
//   _pad           : not read by the visible shader code.
layout(push_constant) uniform PC {
uint n_blocks;
uint tmp_stride_u16;
uint dst_stride_u8;
uint _pad;
} pc;
// Tap offsets for the CDEF directions, in tmp elements, for a tmp row
// stride of DIRS_ROW_STRIDE (16). For each entry, .x is the offset of the
// k = 0 tap and .y the offset of the k = 1 tap; main() reads the tap at
// both +offset and -offset from the centre pixel.
//
// Entries 0..7 are the eight directions. Entries 8..13 repeat directions
// 0..5 so that an unmasked dir + 2 / dir + 6 lookup would stay in range;
// main() currently masks its indices with & 7, so it never reads them.
// Values per the original note: taken from cdef_ref.c.
const int DIRS_ROW_STRIDE = 16;
const ivec2 dirs8[14] = ivec2[14](
    ivec2(-1 * DIRS_ROW_STRIDE + 1, -2 * DIRS_ROW_STRIDE + 2),  //  0
    ivec2( 0 * DIRS_ROW_STRIDE + 1, -1 * DIRS_ROW_STRIDE + 2),  //  1
    ivec2( 0 * DIRS_ROW_STRIDE + 1,  0 * DIRS_ROW_STRIDE + 2),  //  2
    ivec2( 0 * DIRS_ROW_STRIDE + 1,  1 * DIRS_ROW_STRIDE + 2),  //  3
    ivec2( 1 * DIRS_ROW_STRIDE + 1,  2 * DIRS_ROW_STRIDE + 2),  //  4
    ivec2( 1 * DIRS_ROW_STRIDE + 0,  2 * DIRS_ROW_STRIDE + 1),  //  5
    ivec2( 1 * DIRS_ROW_STRIDE + 0,  2 * DIRS_ROW_STRIDE + 0),  //  6
    ivec2( 1 * DIRS_ROW_STRIDE + 0,  2 * DIRS_ROW_STRIDE - 1),  //  7
    ivec2(-1 * DIRS_ROW_STRIDE + 1, -2 * DIRS_ROW_STRIDE + 2),  //  8 = dir 0
    ivec2( 0 * DIRS_ROW_STRIDE + 1, -1 * DIRS_ROW_STRIDE + 2),  //  9 = dir 1
    ivec2( 0 * DIRS_ROW_STRIDE + 1,  0 * DIRS_ROW_STRIDE + 2),  // 10 = dir 2
    ivec2( 0 * DIRS_ROW_STRIDE + 1,  1 * DIRS_ROW_STRIDE + 2),  // 11 = dir 3
    ivec2( 1 * DIRS_ROW_STRIDE + 1,  2 * DIRS_ROW_STRIDE + 2),  // 12 = dir 4
    ivec2( 1 * DIRS_ROW_STRIDE + 0,  2 * DIRS_ROW_STRIDE + 1)   // 13 = dir 5
);
// Index of the most significant set bit of x, with x reinterpreted as
// unsigned; for x >= 1 this equals floor(log2(x)), the same value as C's
// 31 - __builtin_clz((unsigned)x).
//
// x == 0 is well defined here: GLSL findMSB returns -1 for a zero input
// (unlike __builtin_clz(0), which is undefined in C). main() can pass 0,
// because pri and sec are unpacked from meta without a zero check.
int ulog2_pos(int x)
{
    uint bits = uint(x);
    return findMSB(bits);
}
// CDEF constrain step: limits the magnitude of a tap difference.
//
//   diff      : neighbour value minus centre pixel value
//   threshold : filter strength (pri or sec) for this tap
//   shift     : right shift applied to |diff| before it is subtracted
//               from the threshold
//
// Returns diff with its magnitude capped at
// max(0, threshold - (|diff| >> shift)); the sign of diff is kept.
// A threshold of 0 therefore always yields 0.
int constrain(int diff, int threshold, int shift)
{
    int magnitude = abs(diff);

    // The allowed magnitude shrinks as the difference grows, floored at 0.
    int limit = threshold - (magnitude >> shift);
    if (limit < 0) {
        limit = 0;
    }

    int bounded = (magnitude < limit) ? magnitude : limit;
    return (diff < 0) ? -bounded : bounded;
}
// Reads element idx of the tmp buffer and sign-extends it from 16 bits.
//
// Ordinary pixel values (< 0x8000) are returned unchanged. The 0x8000
// sentinel comes back as -32768, so that:
//   - the signed running max never selects it,
//   - the unsigned running min never selects it (it converts to
//     0xFFFF8000u, above any pixel value),
//   - constrain() sees a very large |diff| and contributes 0 for the
//     damping values the filter is meant to run with.
// A plain int(uint16_t) conversion zero-extends instead, which turns the
// sentinel into +32768 and lets it win the max, disabling the upper clamp.
int load_tmp_s16(uint idx)
{
    return bitfieldExtract(int(u_tmp.tmp[idx]), 0, 16);
}

// Entry point: one invocation filters one pixel of one 8x8 block.
//
// Lane layout inside a 256-wide workgroup: bits 6..7 of the local index
// select one of 4 blocks, bits 3..5 the row and bits 0..2 the column.
// Each invocation reads its block descriptor from u_meta, gathers 12
// neighbour taps from u_tmp (2 primary + 4 secondary, for k = 0 and k = 1),
// and writes a single byte to u_dst. Invocations do not share data, so no
// barrier is needed and the early return below is safe.
void main()
{
    uint wg_id       = gl_WorkGroupID.x;
    uint lane_in_wg  = gl_LocalInvocationID.x;   // 0..255
    uint block_in_wg = lane_in_wg >> 6;          // 0..3
    uint px_idx      = lane_in_wg & 63u;         // 0..63
    uint row         = px_idx >> 3;              // 0..7
    uint col         = px_idx & 7u;              // 0..7
    uint block_idx   = wg_id * 4u + block_in_wg;
    if (block_idx >= pc.n_blocks) return;        // tail workgroup: nothing to do

    uvec4 m = u_meta.meta[block_idx];
    uint dst_off = m.x + row * pc.dst_stride_u8 + col;
    uint tmp_off = m.z + row * pc.tmp_stride_u16 + col;
    int pri     = int(m.y & 0xffu);
    int sec     = int((m.y >> 8) & 0xffu);
    int damping = int((m.y >> 16) & 0xffu);
    int dir     = int(m.w & 7u);

    int base = int(tmp_off);
    int px   = load_tmp_s16(tmp_off);
    int sum  = 0;
    int mn   = px;
    int mx   = px;

    // Shifts are clamped to >= 0 (mirrors the NEON uqsub noted in the file
    // header). A strength of 0 gives ulog2_pos() == -1 and so a shift of
    // damping + 1; that is harmless because constrain() returns 0 whenever
    // its threshold is 0.
    int pri_shift = max(0, damping - ulog2_pos(pri));
    int sec_shift = max(0, damping - ulog2_pos(sec));

    // Primary tap weights: (4, 2) for even pri, (3, 3) for odd pri.
    int pri_tap0 = 4 - (pri & 1);
    int pri_tap1 = (pri_tap0 & 3) | 2;
    // Secondary tap weights: 2 for k = 0, 1 for k = 1.
    int sec_tap0 = 2;
    int sec_tap1 = 1;

    int pri_idx  = dir;
    int sec1_idx = (dir + 2) & 7;
    int sec2_idx = (dir + 6) & 7;                // (dir - 2) mod 8

    // -- k = 0 --
    {
        int o1 = dirs8[pri_idx ].x;
        int o2 = dirs8[sec1_idx].x;
        int o3 = dirs8[sec2_idx].x;
        int p0 = load_tmp_s16(uint(base + o1));
        int p1 = load_tmp_s16(uint(base - o1));
        int s0 = load_tmp_s16(uint(base + o2));
        int s1 = load_tmp_s16(uint(base - o2));
        int s2 = load_tmp_s16(uint(base + o3));
        int s3 = load_tmp_s16(uint(base - o3));
        sum += pri_tap0 * constrain(p0 - px, pri, pri_shift);
        sum += pri_tap0 * constrain(p1 - px, pri, pri_shift);
        sum += sec_tap0 * constrain(s0 - px, sec, sec_shift);
        sum += sec_tap0 * constrain(s1 - px, sec, sec_shift);
        sum += sec_tap0 * constrain(s2 - px, sec, sec_shift);
        sum += sec_tap0 * constrain(s3 - px, sec, sec_shift);
        // Min/max bookkeeping with NEON umin / smax semantics:
        //   unsigned min: the sentinel converts to a huge uint, never chosen;
        //   signed max:   the sentinel is -32768 after sign extension,
        //                 never chosen.
        mn = int(min(uint(mn), uint(p0)));
        mn = int(min(uint(mn), uint(p1)));
        mn = int(min(uint(mn), uint(s0)));
        mn = int(min(uint(mn), uint(s1)));
        mn = int(min(uint(mn), uint(s2)));
        mn = int(min(uint(mn), uint(s3)));
        mx = max(mx, p0); mx = max(mx, p1);
        mx = max(mx, s0); mx = max(mx, s1);
        mx = max(mx, s2); mx = max(mx, s3);
    }
    // -- k = 1 --
    {
        int o1 = dirs8[pri_idx ].y;
        int o2 = dirs8[sec1_idx].y;
        int o3 = dirs8[sec2_idx].y;
        int p0 = load_tmp_s16(uint(base + o1));
        int p1 = load_tmp_s16(uint(base - o1));
        int s0 = load_tmp_s16(uint(base + o2));
        int s1 = load_tmp_s16(uint(base - o2));
        int s2 = load_tmp_s16(uint(base + o3));
        int s3 = load_tmp_s16(uint(base - o3));
        sum += pri_tap1 * constrain(p0 - px, pri, pri_shift);
        sum += pri_tap1 * constrain(p1 - px, pri, pri_shift);
        sum += sec_tap1 * constrain(s0 - px, sec, sec_shift);
        sum += sec_tap1 * constrain(s1 - px, sec, sec_shift);
        sum += sec_tap1 * constrain(s2 - px, sec, sec_shift);
        sum += sec_tap1 * constrain(s3 - px, sec, sec_shift);
        mn = int(min(uint(mn), uint(p0)));
        mn = int(min(uint(mn), uint(p1)));
        mn = int(min(uint(mn), uint(s0)));
        mn = int(min(uint(mn), uint(s1)));
        mn = int(min(uint(mn), uint(s2)));
        mn = int(min(uint(mn), uint(s3)));
        mx = max(mx, p0); mx = max(mx, p1);
        mx = max(mx, s0); mx = max(mx, s1);
        mx = max(mx, s2); mx = max(mx, s3);
    }

    // Divide the weighted sum by 16, rounding to nearest with ties away
    // from zero (the "- int(sum < 0)" term), then keep the result inside
    // the [mn, mx] range of the centre pixel and its valid taps.
    int adj   = (sum - int(sum < 0) + 8) >> 4;
    int outpx = clamp(px + adj, mn, mx);
    u_dst.dst[dst_off] = uint8_t(outpx);
}