Cycle 5 closed: CDEF QPU R5=0.116 ORANGE, opportunistic helper
Phase 4 plan with 3 Phase-5 REDs applied inline: - meta layout: m.z=tmp_off, m.w=dir - sec_shift clamped to >=0 (NEON uqsub semantics) - directions table as const ivec2[14], not OR-packed Phase 6 deliverable: v3d_cdef.comp (387 inst, 2 threads, no spills). 3-way M1 (QPU vs C ref vs NEON) PASS 4096/4096. M2: 0.443 Mblock/s -> R5 = 0.116 ORANGE (predicted 0.02-0.05 RED). M4 same-kernel: NEON-3+QPU 8.46 < NEON-4 alone ~10 (negative). M4 mixed (NEON-3 MC + QPU CDEF): CPU 34.17 Mblock/s MC, QPU 0.42 Mblock/s CDEF helper. CPU side higher than the Issue 003 NEON-fallback proxy suggested - cross-substrate contention is gentler than same-side NEON contention. Verdict: CDEF stays on CPU; QPU dispatch path exists for opportunistic use. Deployment recipe table updated for all 5 cycles. Phase 9 lessons: linear extrapolation across cycles is too pessimistic; CDEF is bandwidth-bound on NEON despite high per-block ns; real-substrate-cross contention < NEON-proxy contention. - src/v3d_cdef.comp: cycle 5 QPU shader - tests/bench_v3d_cdef.c: 3-way M1, M2 bench - tests/bench_concurrent_mixed.c: K_CDEF on both sides - tests/cdef_ref.c + bench_neon_cdef.c: sec_shift clamp + expanded damping range to exercise the edge case - CMakeLists.txt: v3d_cdef.spv + bench_v3d_cdef wiring - docs/k5_cdef_phase4.md updated with Phase 5 review applied - docs/k5_cdef_phase7.md: closure doc with full verdict matrix Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
// daedalus-fourier cycle 5 — AV1 CDEF primary+secondary 8x8 luma filter,
|
||||
// V3D 7.1 via Mesa v3dv compute.
|
||||
//
|
||||
// Per cycle-5 Phase 4 plan (post Phase 5 review):
|
||||
// - 256 invocations / WG; 4 blocks/WG (64 pixels each, 1 pixel/lane)
|
||||
// - NO barrier — each pixel independent
|
||||
// - uint16_t tmp SSBO via storageBuffer16BitAccess
|
||||
// - uint8_t dst SSBO via storageBuffer8BitAccess
|
||||
// - directions table as `const ivec2[14]` (Phase 5 RED-3 fix)
|
||||
// - meta layout: m.x=dst_off, m.y=params (pri|sec<<8|damping<<16),
|
||||
// m.z=tmp_off_u16, m.w=dir (Phase 5 RED-1 fix)
|
||||
// - sec_shift clamped to ≥0 to mirror NEON uqsub (Phase 5 RED-2 fix)
|
||||
//
|
||||
// License: BSD-2-Clause. Algorithm transcribed from tests/cdef_ref.c
|
||||
// which mirrors dav1d 1.4.3 NEON (src/arm/64/cdef_tmpl.S).
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Meta {
|
||||
uvec4 meta[]; // per-block: (dst_off, params, tmp_off_u16, dir)
|
||||
} u_meta;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Tmp {
|
||||
uint16_t tmp[]; // padded 12×16 per block; meta.z = block-origin u16 offset
|
||||
} u_tmp;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint tmp_stride_u16;
|
||||
uint dst_stride_u8;
|
||||
uint _pad;
|
||||
} pc;
|
||||
|
||||
// 14-entry stride-16 directions table (8 dirs + 6 wrap copies for
|
||||
// (dir+2)%8 / (dir+6)%8 safe lookup). Values from cdef_ref.c.
|
||||
const ivec2 dirs8[14] = ivec2[](
|
||||
/* 0 */ ivec2(-1*16 + 1, -2*16 + 2),
|
||||
/* 1 */ ivec2( 0*16 + 1, -1*16 + 2),
|
||||
/* 2 */ ivec2( 0*16 + 1, 0*16 + 2),
|
||||
/* 3 */ ivec2( 0*16 + 1, 1*16 + 2),
|
||||
/* 4 */ ivec2( 1*16 + 1, 2*16 + 2),
|
||||
/* 5 */ ivec2( 1*16 + 0, 2*16 + 1),
|
||||
/* 6 */ ivec2( 1*16 + 0, 2*16 + 0),
|
||||
/* 7 */ ivec2( 1*16 + 0, 2*16 - 1),
|
||||
/* 8 = dir 0 */ ivec2(-1*16 + 1, -2*16 + 2),
|
||||
/* 9 = dir 1 */ ivec2( 0*16 + 1, -1*16 + 2),
|
||||
/* 10 = dir 2 */ ivec2( 0*16 + 1, 0*16 + 2),
|
||||
/* 11 = dir 3 */ ivec2( 0*16 + 1, 1*16 + 2),
|
||||
/* 12 = dir 4 */ ivec2( 1*16 + 1, 2*16 + 2),
|
||||
/* 13 = dir 5 */ ivec2( 1*16 + 0, 2*16 + 1)
|
||||
);
|
||||
|
||||
int ulog2_pos(int x) {
|
||||
// Mirrors C's 31 - __builtin_clz(uint). x >= 1 required.
|
||||
return findMSB(uint(x));
|
||||
}
|
||||
|
||||
int constrain(int diff, int threshold, int shift)
|
||||
{
|
||||
int adiff = abs(diff);
|
||||
int clip = max(0, threshold - (adiff >> shift));
|
||||
int amag = min(adiff, clip);
|
||||
return diff < 0 ? -amag : amag;
|
||||
}
|
||||
|
||||
void main()
|
||||
{
|
||||
uint wg_id = gl_WorkGroupID.x;
|
||||
uint lane_in_wg = gl_LocalInvocationID.x; // 0..255
|
||||
uint block_in_wg = lane_in_wg >> 6; // 0..3
|
||||
uint px_idx = lane_in_wg & 63u; // 0..63
|
||||
uint row = px_idx >> 3; // 0..7
|
||||
uint col = px_idx & 7u; // 0..7
|
||||
|
||||
uint block_idx = wg_id * 4u + block_in_wg;
|
||||
if (block_idx >= pc.n_blocks) return; // no barrier — safe
|
||||
|
||||
uvec4 m = u_meta.meta[block_idx];
|
||||
uint dst_off = m.x + row * pc.dst_stride_u8 + col;
|
||||
uint tmp_off = m.z + row * pc.tmp_stride_u16 + col;
|
||||
int pri = int(m.y & 0xffu);
|
||||
int sec = int((m.y >> 8) & 0xffu);
|
||||
int damping = int((m.y >> 16) & 0xffu);
|
||||
int dir = int(m.w & 7u);
|
||||
|
||||
int px = int(u_tmp.tmp[tmp_off]);
|
||||
int sum = 0;
|
||||
int mn = px;
|
||||
int mx = px;
|
||||
|
||||
int pri_shift = max(0, damping - ulog2_pos(pri));
|
||||
int sec_shift = max(0, damping - ulog2_pos(sec)); // RED-2 fix
|
||||
|
||||
int pri_tap0 = 4 - (pri & 1);
|
||||
int pri_tap1 = (pri_tap0 & 3) | 2;
|
||||
int sec_tap0 = 2;
|
||||
int sec_tap1 = 1;
|
||||
|
||||
int pri_idx = dir;
|
||||
int sec1_idx = (dir + 2) & 7;
|
||||
int sec2_idx = (dir + 6) & 7; // (dir - 2) % 8
|
||||
|
||||
// -- k = 0 --
|
||||
{
|
||||
int o1 = dirs8[pri_idx ].x;
|
||||
int o2 = dirs8[sec1_idx].x;
|
||||
int o3 = dirs8[sec2_idx].x;
|
||||
int p0 = int(u_tmp.tmp[uint(int(tmp_off) + o1)]);
|
||||
int p1 = int(u_tmp.tmp[uint(int(tmp_off) - o1)]);
|
||||
int s0 = int(u_tmp.tmp[uint(int(tmp_off) + o2)]);
|
||||
int s1 = int(u_tmp.tmp[uint(int(tmp_off) - o2)]);
|
||||
int s2 = int(u_tmp.tmp[uint(int(tmp_off) + o3)]);
|
||||
int s3 = int(u_tmp.tmp[uint(int(tmp_off) - o3)]);
|
||||
|
||||
sum += pri_tap0 * constrain(p0 - px, pri, pri_shift);
|
||||
sum += pri_tap0 * constrain(p1 - px, pri, pri_shift);
|
||||
sum += sec_tap0 * constrain(s0 - px, sec, sec_shift);
|
||||
sum += sec_tap0 * constrain(s1 - px, sec, sec_shift);
|
||||
sum += sec_tap0 * constrain(s2 - px, sec, sec_shift);
|
||||
sum += sec_tap0 * constrain(s3 - px, sec, sec_shift);
|
||||
|
||||
// min/max bookkeeping — NEON umin / smax semantics.
|
||||
// Unsigned min: 0x8000 sentinel (32768u) > any 0..255 pixel.
|
||||
// Signed max: 0x8000 = -32768 (signed) < any valid max.
|
||||
mn = int(min(uint(mn), uint(p0)));
|
||||
mn = int(min(uint(mn), uint(p1)));
|
||||
mn = int(min(uint(mn), uint(s0)));
|
||||
mn = int(min(uint(mn), uint(s1)));
|
||||
mn = int(min(uint(mn), uint(s2)));
|
||||
mn = int(min(uint(mn), uint(s3)));
|
||||
mx = max(mx, p0); mx = max(mx, p1);
|
||||
mx = max(mx, s0); mx = max(mx, s1);
|
||||
mx = max(mx, s2); mx = max(mx, s3);
|
||||
}
|
||||
|
||||
// -- k = 1 --
|
||||
{
|
||||
int o1 = dirs8[pri_idx ].y;
|
||||
int o2 = dirs8[sec1_idx].y;
|
||||
int o3 = dirs8[sec2_idx].y;
|
||||
int p0 = int(u_tmp.tmp[uint(int(tmp_off) + o1)]);
|
||||
int p1 = int(u_tmp.tmp[uint(int(tmp_off) - o1)]);
|
||||
int s0 = int(u_tmp.tmp[uint(int(tmp_off) + o2)]);
|
||||
int s1 = int(u_tmp.tmp[uint(int(tmp_off) - o2)]);
|
||||
int s2 = int(u_tmp.tmp[uint(int(tmp_off) + o3)]);
|
||||
int s3 = int(u_tmp.tmp[uint(int(tmp_off) - o3)]);
|
||||
|
||||
sum += pri_tap1 * constrain(p0 - px, pri, pri_shift);
|
||||
sum += pri_tap1 * constrain(p1 - px, pri, pri_shift);
|
||||
sum += sec_tap1 * constrain(s0 - px, sec, sec_shift);
|
||||
sum += sec_tap1 * constrain(s1 - px, sec, sec_shift);
|
||||
sum += sec_tap1 * constrain(s2 - px, sec, sec_shift);
|
||||
sum += sec_tap1 * constrain(s3 - px, sec, sec_shift);
|
||||
|
||||
mn = int(min(uint(mn), uint(p0)));
|
||||
mn = int(min(uint(mn), uint(p1)));
|
||||
mn = int(min(uint(mn), uint(s0)));
|
||||
mn = int(min(uint(mn), uint(s1)));
|
||||
mn = int(min(uint(mn), uint(s2)));
|
||||
mn = int(min(uint(mn), uint(s3)));
|
||||
mx = max(mx, p0); mx = max(mx, p1);
|
||||
mx = max(mx, s0); mx = max(mx, s1);
|
||||
mx = max(mx, s2); mx = max(mx, s3);
|
||||
}
|
||||
|
||||
int adj = (sum - int(sum < 0) + 8) >> 4;
|
||||
int outpx = clamp(px + adj, mn, mx);
|
||||
u_dst.dst[dst_off] = uint8_t(outpx);
|
||||
}
|
||||
Reference in New Issue
Block a user