// daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
//
// v2: cooperative-load shared-memory tile.
//
//   dst[r,c] = clip255(
//       ( s[r-2,c]
//         - 5 * s[r-1,c]
//         + 20 * s[r,  c]
//         + 20 * s[r+1,c]
//         -  5 * s[r+2,c]
//         +      s[r+3,c]
//         + 16
//       ) >> 5)
//
// src+src_off points at row 0 col 0 of the OUTPUT block.  Output row r
// reads source rows r-2..r+3 (2 rows of context above, 3 below), so the
// 8 output rows together span source rows -2..+10: 13 distinct source
// rows × 8 cols = 104 bytes per 8x8 output.
//
// v1 had each of the 64 lanes do 6 SSBO loads → 384 loads/WG to cover
// 104 unique bytes (3.7x redundant), and each lane's loads were stride-
// spaced (one cache line per byte under V3D's TMU).  The PR #36 bench
// showed mc02 was the only qpel position where CPU NEON still beat the
// QPU (16.96 ns/op CPU vs 20.54 ns/op QPU, i.e. the CPU was 1.21x faster).
//
// v2 splits the work into a coalesced load phase + a shared-memory
// compute phase:
//
//   Phase 1: each of the 64 lanes cooperatively loads the 104-byte
//   source tile into shared memory.  Lanes 0..63 load bytes at indices
//   0..63 (covers source rows 0..7 of the 13-row tile); lanes 0..39
//   second-load bytes 64..103 (rows 8..12).  Reads within a row are
//   contiguous so the SIMD groups coalesce; total SSBO loads = 104,
//   matching the unique-byte count.
//
//   Phase 2: all 64 lanes compute one output pixel each, reading 6
//   bytes from shared.  Shared-memory access on V3D is local-store
//   backed (no TMU round-trip).
//
// Same WG layout as v1: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage             : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// One workgroup is 64 invocations along x; main() maps each invocation to
// one pixel of an 8x8 output block (row = id / 8, col = id % 8).
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Source pixels, addressed one byte per element (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// Destination pixels, addressed one byte per element.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// Per-block metadata, indexed by workgroup id: .x is the byte offset of the
// block in dst, .y the byte offset of the block in src.  .z/.w are not read
// by this shader.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    // Number of valid blocks; workgroups with an id at or past this exit early.
    uint n_blocks;
    // Row pitch in bytes; main() applies it to both src and dst addressing.
    uint stride_u8;
    // Not referenced by this shader.
    uint _pad0, _pad1;
} pc;

// 13 source rows × 8 cols, indexed as tile_row * 8 + col.  int storage
// (4 bytes each) — wasteful vs uint8_t but avoids 8-bit-shared interop
// concerns on glslang+v3dv; 13 * 8 * 4 = 416 bytes shared/WG is well within
// any reasonable local-store budget.
shared int s_tile[13 * 8];

// Entry point: one workgroup filters one 8x8 block, one invocation per
// output pixel.  The 13x8 source tile is staged in s_tile first, then each
// invocation applies the vertical 6-tap filter (1, -5, 20, 20, -5, 1) with
// rounding (+16, >>5) and clamps the result to 0..255.
void main()
{
    const uint TILE_COLS  = 8u;
    const uint TILE_ELEMS = 13u * TILE_COLS;    // 104 entries in s_tile
    const uint WG_LANES   = 64u;                // matches local_size_x

    // gl_WorkGroupID is the same for every invocation of a workgroup, so the
    // whole group leaves together and the barrier() below is still reached
    // in uniform control flow by every group that continues.
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    uint  lid   = gl_LocalInvocationID.x;
    uvec2 offs  = u_meta.meta[blk].xy;          // .x = dst offset, .y = src offset
    uint  pitch = pc.stride_u8;

    // Pixel coordinates of this invocation inside the 8x8 output block.
    uint px_row = lid / TILE_COLS;
    uint px_col = lid % TILE_COLS;

    // The tile begins two rows above output row 0.  The subtraction is
    // unsigned; it relies on the caller contract that the source offset is
    // at least 2 * stride.
    uint tile_origin = offs.y - 2u * pitch;

    // Stage 1: cooperative fill.  Each invocation fetches tile entry `lid`,
    // and invocations 0..39 additionally fetch entry `lid + 64`, so all 104
    // entries are loaded exactly once.
    for (uint t = lid; t < TILE_ELEMS; t += WG_LANES) {
        uint t_row = t / TILE_COLS;
        uint t_col = t % TILE_COLS;
        s_tile[t] = int(u_src.src[tile_origin + t_row * pitch + t_col]);
    }

    barrier();

    // Stage 2: vertical filter.  Tile row (px_row + k) at column px_col sits
    // at index lid + 8 * k, because px_row * 8 + px_col == lid.  The kernel
    // is symmetric, so taps are summed in mirrored pairs before weighting.
    int outer_pair  = s_tile[lid]                  + s_tile[lid + 5u * TILE_COLS];
    int middle_pair = s_tile[lid + 1u * TILE_COLS] + s_tile[lid + 4u * TILE_COLS];
    int centre_pair = s_tile[lid + 2u * TILE_COLS] + s_tile[lid + 3u * TILE_COLS];

    int acc   = outer_pair - 5 * middle_pair + 20 * centre_pair + 16;
    int pixel = clamp(acc >> 5, 0, 255);

    u_dst.dst[offs.x + px_row * pitch + px_col] = uint8_t(pixel);
}