From a9590acee371c65b01a0ac2342ea67b03f58f89e Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Mon, 25 May 2026 21:28:04 +0200
Subject: [PATCH] wip: mc02 v2 shared-tile

---
 src/v3d_h264_qpel_mc02.comp | 75 ++++++++++++++++++++++++++++---------
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/src/v3d_h264_qpel_mc02.comp b/src/v3d_h264_qpel_mc02.comp
index 07462ed..12ea7f3 100644
--- a/src/v3d_h264_qpel_mc02.comp
+++ b/src/v3d_h264_qpel_mc02.comp
@@ -1,7 +1,6 @@
 // daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
 //
-// Sibling of cycle 9's v3d_h264_qpel_mc20.comp.  Same 6-tap filter,
-// transposed to vertical direction:
+// v2: cooperative-load shared-memory tile.  Same 6-tap vertical filter:
 //
 //   dst[r,c] = clip255(
 //       ( s[r-2,c]
@@ -14,9 +13,30 @@
 //       ) >> 5)
 //
 // src+src_off points at row 0 col 0 of the OUTPUT block; the filter
-// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
+// reads rows r-2..r+3 for output row r (2 rows above, 3 below), i.e.
+// 13 distinct source rows (-2..+10) × 8 cols = 104 bytes per 8x8 output.
 //
-// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
+// v1 had each of the 64 lanes do 6 SSBO loads → 384 loads/WG to cover
+// 104 unique bytes (3.7x redundant), and each lane's loads were stride-
+// spaced (one cache line per byte under V3D's TMU).  PR #36 bench
+// showed mc02 was the only qpel position where CPU NEON still beat
+// QPU (16.96 ns/op CPU vs 20.54 ns/op QPU; 1.21x in the CPU's favor).
+//
+// v2 splits the work into a coalesced load phase + a shared-memory
+// compute phase:
+//
+//   Phase 1: the 64 lanes cooperatively load the 104-byte source tile
+//   into shared memory.  Lanes 0..63 load tile bytes 0..63 (tile rows
+//   0..7, i.e. source rows -2..+5); lanes 0..39 then load tile bytes
+//   64..103 (tile rows 8..12, source rows +6..+10).  The 8 bytes of a
+//   row are contiguous so the SIMD groups coalesce; total SSBO loads =
+//   104, matching the unique-byte count.
+//
+//   Phase 2: all 64 lanes compute one output pixel each, reading 6
+//   values from shared.  NOTE(review): Mesa lowers V3D shared loads to
+//   TMU accesses as well, so this phase is not TMU-free — verify gain.
+//
+// Same WG layout as v1: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
 //
 // License: BSD-2-Clause.
 
@@ -36,31 +56,52 @@ layout(push_constant) uniform PC {
     uint _pad0, _pad1;
 } pc;
 
+// 13 source rows × 8 cols.  int storage (4 bytes each) — wasteful vs
+// uint8_t but avoids 8-bit-shared interop concerns on glslang+v3dv;
+// 416 bytes shared/WG is well within any reasonable local-store budget.
+shared int s_tile[13 * 8];
+
 void main()
 {
     uint block_idx = gl_WorkGroupID.x;
     if (block_idx >= pc.n_blocks) return;
 
     uint lane = gl_LocalInvocationID.x;
-    uint r = lane >> 3;
-    uint c = lane & 7u;
 
     uint dst_off = u_meta.meta[block_idx].x;
     uint src_off = u_meta.meta[block_idx].y;
     uint stride  = pc.stride_u8;
 
-    // Read the 6 rows of vertical context at col (c) of THIS output row.
-    // src_off+r*stride+c is at the OUTPUT pixel position; the kernel
-    // samples r-2..r+3 along the column.  Unsigned-safe because the
-    // public API contract guarantees src_off >= 2*stride.
-    uint col_base = src_off + c;
+    // Source-tile base: src_off points at output-row-0 col-0, the tile
+    // starts 2 rows above.  Unsigned-safe because the public API
+    // contract guarantees src_off >= 2*stride.
+    uint tile_base = src_off - 2u * stride;
 
-    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);
-    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
-    int s_0  = int(u_src.src[col_base +  r       * stride]);
-    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
-    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
-    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    // Phase 1: cooperative load — 64 lanes load 104 bytes.
+    {
+        uint sr = lane >> 3;        // 0..7
+        uint sc = lane & 7u;
+        s_tile[lane] = int(u_src.src[tile_base + sr * stride + sc]);
+    }
+    if (lane < 40u) {
+        uint idx = lane + 64u;      // 64..103
+        uint sr = idx >> 3;         // 8..12
+        uint sc = idx & 7u;
+        s_tile[idx] = int(u_src.src[tile_base + sr * stride + sc]);
+    }
+
+    barrier();
+
+    // Phase 2: each lane computes one output pixel from the shared tile.
+    uint r = lane >> 3;
+    uint c = lane & 7u;
+
+    int s_m2 = s_tile[(r + 0u) * 8u + c];
+    int s_m1 = s_tile[(r + 1u) * 8u + c];
+    int s_0  = s_tile[(r + 2u) * 8u + c];
+    int s_p1 = s_tile[(r + 3u) * 8u + c];
+    int s_p2 = s_tile[(r + 4u) * 8u + c];
+    int s_p3 = s_tile[(r + 5u) * 8u + c];
 
     int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;
     int p = clamp(v >> 5, 0, 255);