Cycle 5 closed: CDEF QPU R5=0.116 ORANGE, opportunistic helper

Phase 4 plan with 3 Phase-5 REDs applied inline:
- meta layout: m.z=tmp_off, m.w=dir
- sec_shift clamped to >=0 (NEON uqsub semantics)
- directions table as const ivec2[14], not OR-packed

Phase 6 deliverable: v3d_cdef.comp (387 inst, 2 threads, no spills).
3-way M1 (QPU vs C ref vs NEON) PASS 4096/4096.
M2: 0.443 Mblock/s -> R5 = 0.116 ORANGE (predicted 0.02-0.05 RED).
M4 same-kernel: NEON-3+QPU 8.46 < NEON-4 alone ~10 (negative).
M4 mixed (NEON-3 MC + QPU CDEF): CPU 34.17 Mblock/s MC, QPU 0.42 Mblock/s CDEF helper. CPU side higher than the Issue 003 NEON-fallback proxy suggested - cross-substrate contention is gentler than same-side NEON contention.

Verdict: CDEF stays on CPU; QPU dispatch path exists for opportunistic use. Deployment recipe table updated for all 5 cycles.

Phase 9 lessons: linear extrapolation across cycles is too pessimistic; CDEF is bandwidth-bound on NEON despite high per-block ns; real-substrate-cross contention < NEON-proxy contention.

- src/v3d_cdef.comp: cycle 5 QPU shader
- tests/bench_v3d_cdef.c: 3-way M1, M2 bench
- tests/bench_concurrent_mixed.c: K_CDEF on both sides
- tests/cdef_ref.c + bench_neon_cdef.c: sec_shift clamp + expanded damping range to exercise the edge case
- CMakeLists.txt: v3d_cdef.spv + bench_v3d_cdef wiring
- docs/k5_cdef_phase4.md updated with Phase 5 review applied
- docs/k5_cdef_phase7.md: closure doc with full verdict matrix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:52:46 +00:00
parent 1740e7c165
commit 5223d3cb3f
8 changed files with 849 additions and 36 deletions
@@ -0,0 +1,178 @@
+// daedalus-fourier cycle 5 — AV1 CDEF primary+secondary 8x8 luma filter,
+// V3D 7.1 via Mesa v3dv compute.
+//
+// Per cycle-5 Phase 4 plan (post Phase 5 review):
+//   - 256 invocations / WG; 4 blocks/WG (64 pixels each, 1 pixel/lane)
+//   - NO barrier — each pixel independent
+//   - uint16_t tmp SSBO via storageBuffer16BitAccess
+//   - uint8_t dst SSBO via storageBuffer8BitAccess
+//   - directions table as `const ivec2[14]` (Phase 5 RED-3 fix)
+//   - meta layout: m.x=dst_off, m.y=params (pri|sec<<8|damping<<16),
+//                  m.z=tmp_off_u16, m.w=dir (Phase 5 RED-1 fix)
+//   - sec_shift clamped to ≥0 to mirror NEON uqsub (Phase 5 RED-2 fix)
+//
+// License: BSD-2-Clause. Algorithm transcribed from tests/cdef_ref.c
+// which mirrors dav1d 1.4.3 NEON (src/arm/64/cdef_tmpl.S).
+
+#version 450
+#extension GL_EXT_shader_8bit_storage             : require
+#extension GL_EXT_shader_16bit_storage            : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+// One workgroup is 256 invocations along x. main() splits the local index
+// into 4 groups of 64 lanes: one group per 8x8 block, one lane per pixel.
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+// Per-block descriptors, indexed by the global block number. As decoded by
+// main():
+//   .x  offset of the block's top-left pixel in u_dst (u8 elements)
+//   .y  packed parameters: pri in bits 0..7, sec in bits 8..15,
+//       damping in bits 16..23 (bits 24..31 are not read)
+//   .z  offset of the block's top-left sample in u_tmp (u16 elements)
+//   .w  filter direction; only the low 3 bits are used
+layout(binding = 0) readonly buffer Meta {
+    uvec4 meta[];      // per-block: (dst_off, params, tmp_off_u16, dir)
+} u_meta;
+
+// Filtered output pixels. Each invocation stores exactly one byte, at
+// meta.x + row * pc.dst_stride_u8 + col; this shader never reads the buffer.
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];
+} u_dst;
+
+// Input samples. For every pixel main() reads the centre sample plus twelve
+// neighbours at +/- the dirs8 offsets, so the border around each block must
+// be readable as well.
+// NOTE(review): the "12x16" padded layout and the 0x8000 marker for
+// unavailable samples come from comments in this file only — confirm
+// against the host-side code that fills the buffer.
+layout(binding = 2) readonly buffer Tmp {
+    uint16_t tmp[];    // padded 12×16 per block; meta.z = block-origin u16 offset
+} u_tmp;
+
+// Dispatch-wide constants.
+// NOTE(review): the dirs8 offsets are built with a fixed row pitch of 16,
+// while main() addresses the centre sample with tmp_stride_u16; the two only
+// agree when tmp_stride_u16 == 16 — confirm the host always passes 16.
+layout(push_constant) uniform PC {
+    uint n_blocks;         // number of valid meta[] entries; lanes of later blocks exit early
+    uint tmp_stride_u16;   // row pitch of u_tmp, in u16 elements
+    uint dst_stride_u8;    // row pitch of u_dst, in bytes
+    uint _pad;             // not referenced by this shader
+} pc;
+
+// Tap offsets for the eight CDEF directions, expressed in u16 elements of a
+// tmp buffer whose rows are TMP_PITCH elements apart. For each direction,
+// .x is the offset of the nearer tap (k = 0) and .y of the farther tap
+// (k = 1); the mirrored taps use the negated offsets.
+//
+// The table carries 14 entries: directions 0..7 followed by copies of
+// directions 0..5. main() masks its secondary indices with "& 7", so the six
+// trailing copies are not read by this shader; they are kept so the array
+// shape stays const ivec2[14]. Values from cdef_ref.c.
+const int TMP_PITCH = 16;
+
+// Builds one entry from (dy, dx) of the near tap and (dy, dx) of the far tap.
+#define CDEF_TAPS(dy0, dx0, dy1, dx1) \
+    ivec2((dy0) * TMP_PITCH + (dx0), (dy1) * TMP_PITCH + (dx1))
+
+const ivec2 dirs8[14] = ivec2[](
+    CDEF_TAPS(-1, 1, -2,  2),   //  0
+    CDEF_TAPS( 0, 1, -1,  2),   //  1
+    CDEF_TAPS( 0, 1,  0,  2),   //  2
+    CDEF_TAPS( 0, 1,  1,  2),   //  3
+    CDEF_TAPS( 1, 1,  2,  2),   //  4
+    CDEF_TAPS( 1, 0,  2,  1),   //  5
+    CDEF_TAPS( 1, 0,  2,  0),   //  6
+    CDEF_TAPS( 1, 0,  2, -1),   //  7
+    CDEF_TAPS(-1, 1, -2,  2),   //  8 = dir 0
+    CDEF_TAPS( 0, 1, -1,  2),   //  9 = dir 1
+    CDEF_TAPS( 0, 1,  0,  2),   // 10 = dir 2
+    CDEF_TAPS( 0, 1,  1,  2),   // 11 = dir 3
+    CDEF_TAPS( 1, 1,  2,  2),   // 12 = dir 4
+    CDEF_TAPS( 1, 0,  2,  1)    // 13 = dir 5
+);
+
+#undef CDEF_TAPS
+
+// Index of the highest set bit of x, i.e. floor(log2(x)) for x >= 1
+// (the equivalent of C's 31 - __builtin_clz(x)).
+// The value is reinterpreted as unsigned before the scan. For x == 0 the
+// GLSL built-in returns -1; main() can pass 0 here (pri or sec of zero), in
+// which case the caller's shift becomes damping + 1.
+int ulog2_pos(int x) {
+    uint bits = uint(x);
+    return findMSB(bits);
+}
+
+// CDEF constrain function: limits the magnitude of a sample difference.
+//   diff       neighbour minus centre sample
+//   threshold  filter strength (pri or sec)
+//   shift      damping shift applied to |diff| before it reduces the limit
+// Returns diff with its magnitude capped at
+// max(0, threshold - (|diff| >> shift)), keeping the sign of diff.
+int constrain(int diff, int threshold, int shift)
+{
+    int mag   = abs(diff);
+    int limit = threshold - (mag >> shift);
+
+    // The limit never goes below zero: large differences contribute nothing.
+    if (limit < 0) {
+        limit = 0;
+    }
+    if (mag > limit) {
+        mag = limit;
+    }
+    return (diff < 0) ? -mag : mag;
+}
+
+// Loads one neighbour sample from u_tmp and sign-extends it from 16 bits.
+// The buffer is declared uint16_t, so a plain int() conversion zero-extends:
+// the 0x8000 "unavailable" marker would load as +32768 and win every signed
+// max. Sign-extending makes it -32768, matching an int16_t reference.
+int load_tap_s16(int idx)
+{
+    return bitfieldExtract(int(u_tmp.tmp[uint(idx)]), 0, 16);
+}
+
+// Filters one pixel of one 8x8 block per invocation.
+//
+// Lane mapping: 256 lanes per workgroup = 4 blocks x 64 pixels. Lanes whose
+// block index is >= pc.n_blocks exit immediately (no barriers are used, so
+// the early return is safe).
+//
+// For each of the two tap distances (k = 0, 1) the shader reads two primary
+// taps along `dir` and four secondary taps along (dir + 2) % 8 and
+// (dir + 6) % 8, accumulates the constrained, weighted differences, tracks
+// the min/max of all samples seen, and finally stores
+//   clamp(px + round(sum / 16), min, max)
+// as one byte in u_dst.
+void main()
+{
+    uint wg_id        = gl_WorkGroupID.x;
+    uint lane_in_wg   = gl_LocalInvocationID.x;       // 0..255
+    uint block_in_wg  = lane_in_wg >> 6;              // 0..3
+    uint px_idx       = lane_in_wg & 63u;             // 0..63
+    uint row          = px_idx >> 3;                  // 0..7
+    uint col          = px_idx & 7u;                  // 0..7
+
+    uint block_idx = wg_id * 4u + block_in_wg;
+    if (block_idx >= pc.n_blocks) return;             // no barrier — safe
+
+    uvec4 m = u_meta.meta[block_idx];
+    uint dst_off = m.x + row * pc.dst_stride_u8 + col;
+    uint tmp_off = m.z + row * pc.tmp_stride_u16 + col;
+    int pri      = int(m.y & 0xffu);
+    int sec      = int((m.y >> 8) & 0xffu);
+    int damping  = int((m.y >> 16) & 0xffu);
+    int dir      = int(m.w & 7u);
+
+    // Centre sample: always a real pixel, so no sign extension is needed.
+    int base = int(tmp_off);
+    int px   = int(u_tmp.tmp[tmp_off]);
+    int sum  = 0;
+    int mn   = px;
+    int mx   = px;
+
+    // ulog2_pos() yields -1 for a zero strength; the shift then becomes
+    // damping + 1 and constrain() returns 0 because the threshold is 0.
+    int pri_shift = max(0, damping - ulog2_pos(pri));
+    int sec_shift = max(0, damping - ulog2_pos(sec));  // RED-2 fix
+
+    // Primary taps are (4, 2) for even pri and (3, 3) for odd pri;
+    // secondary taps are always (2, 1).
+    int pri_tap0 = 4 - (pri & 1);
+    int pri_tap1 = (pri_tap0 & 3) | 2;
+    int sec_tap0 = 2;
+    int sec_tap1 = 1;
+
+    int pri_idx  = dir;
+    int sec1_idx = (dir + 2) & 7;
+    int sec2_idx = (dir + 6) & 7;  // (dir - 2) % 8
+
+    // -- k = 0 --
+    {
+        int o1 = dirs8[pri_idx ].x;
+        int o2 = dirs8[sec1_idx].x;
+        int o3 = dirs8[sec2_idx].x;
+        int p0 = load_tap_s16(base + o1);
+        int p1 = load_tap_s16(base - o1);
+        int s0 = load_tap_s16(base + o2);
+        int s1 = load_tap_s16(base - o2);
+        int s2 = load_tap_s16(base + o3);
+        int s3 = load_tap_s16(base - o3);
+
+        sum += pri_tap0 * constrain(p0 - px, pri, pri_shift);
+        sum += pri_tap0 * constrain(p1 - px, pri, pri_shift);
+        sum += sec_tap0 * constrain(s0 - px, sec, sec_shift);
+        sum += sec_tap0 * constrain(s1 - px, sec, sec_shift);
+        sum += sec_tap0 * constrain(s2 - px, sec, sec_shift);
+        sum += sec_tap0 * constrain(s3 - px, sec, sec_shift);
+
+        // min/max bookkeeping — NEON umin / smax semantics.
+        // Unsigned min: the marker (-32768 -> 0xFFFF8000u) exceeds any pixel.
+        // Signed max: the marker (-32768) is below any pixel.
+        mn = int(min(uint(mn), uint(p0)));
+        mn = int(min(uint(mn), uint(p1)));
+        mn = int(min(uint(mn), uint(s0)));
+        mn = int(min(uint(mn), uint(s1)));
+        mn = int(min(uint(mn), uint(s2)));
+        mn = int(min(uint(mn), uint(s3)));
+        mx = max(mx, p0); mx = max(mx, p1);
+        mx = max(mx, s0); mx = max(mx, s1);
+        mx = max(mx, s2); mx = max(mx, s3);
+    }
+
+    // -- k = 1 --
+    {
+        int o1 = dirs8[pri_idx ].y;
+        int o2 = dirs8[sec1_idx].y;
+        int o3 = dirs8[sec2_idx].y;
+        int p0 = load_tap_s16(base + o1);
+        int p1 = load_tap_s16(base - o1);
+        int s0 = load_tap_s16(base + o2);
+        int s1 = load_tap_s16(base - o2);
+        int s2 = load_tap_s16(base + o3);
+        int s3 = load_tap_s16(base - o3);
+
+        sum += pri_tap1 * constrain(p0 - px, pri, pri_shift);
+        sum += pri_tap1 * constrain(p1 - px, pri, pri_shift);
+        sum += sec_tap1 * constrain(s0 - px, sec, sec_shift);
+        sum += sec_tap1 * constrain(s1 - px, sec, sec_shift);
+        sum += sec_tap1 * constrain(s2 - px, sec, sec_shift);
+        sum += sec_tap1 * constrain(s3 - px, sec, sec_shift);
+
+        mn = int(min(uint(mn), uint(p0)));
+        mn = int(min(uint(mn), uint(p1)));
+        mn = int(min(uint(mn), uint(s0)));
+        mn = int(min(uint(mn), uint(s1)));
+        mn = int(min(uint(mn), uint(s2)));
+        mn = int(min(uint(mn), uint(s3)));
+        mx = max(mx, p0); mx = max(mx, p1);
+        mx = max(mx, s0); mx = max(mx, s1);
+        mx = max(mx, s2); mx = max(mx, s3);
+    }
+
+    // Round sum / 16 to nearest (ties away from zero), then keep the result
+    // inside the range of samples seen. mn <= px <= mx always holds because
+    // both start at px, so the clamp bounds are well ordered.
+    int adj = (sum - int(sum < 0) + 8) >> 4;
+    int outpx = clamp(px + adj, mn, mx);
+    u_dst.dst[dst_off] = uint8_t(outpx);
+}