// daedalus-fourier cycle 2 — VP9 4-tap inner loop filter, horizontal
// direction, 8-pixel edge. V3D 7.1 via Mesa v3dv compute.
//
// Bakes in cycle-1 v4 winning patterns from the start:
//   - 256 invocations / WG (max), for v3dv latency hiding
//   - uint8_t dst SSBO via storageBuffer8BitAccess (race-free byte writes)
//   - 2 lanes per "block_slot" pattern — here 2 edges per 16-lane subgroup
//   - NO chained-ternary writes, only direct named-variable writes
//
// Differs from cycle-1 IDCT structurally:
//   - NO barrier — each lane fully independent (one row of one edge)
//   - NO shared memory — no transpose needed
//   - oob early-return is SAFE here (no barrier reachability issue)
//
// Contracts (per k2_deblock_phase4.md §4, revised per phase5'' findings 2+4):
//   1. meta[i].x ≥ 4 for every edge — bench enforced via assert
//   2. pc.dst_stride_u8 ≥ 4 — bench enforced via assert
//
// License: BSD-2-Clause. Algorithm transcribed from
// tests/vp9_lpf_ref.c which mirrors libavcodec/vp9dsp_template.c
// (vendored LGPL-2.1+).

#version 450
// uint8_t element type for the Dst storage block (byte-granular loads/stores).
#extension GL_EXT_shader_8bit_storage              : require
// NOTE(review): presumably enabled for the explicit uint8_t(...) constructors
// used when storing results — confirm whether 8bit_storage alone suffices.
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 1-D workgroup of 256 invocations. main() assigns 8 consecutive invocations
// (one per row) to each edge, i.e. 32 edges per workgroup.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Read-only per-edge parameter table, indexed by the edge index that main()
// derives from the global invocation ID.
//   .x  index into u_dst.dst of the q0 pixel of row 0; main() reads back to
//       index .x - 4, so .x must be at least 4
//   .y  E — upper bound for |p0-q0|*2 + (|p1-q1|>>1) in the filter mask
//   .z  I — upper bound for each neighbouring-pixel step in the filter mask
//   .w  H — threshold on |p1-p0| / |q1-q0| selecting the hev branch
// main() converts .y/.z/.w to int before comparing.
layout(binding = 0) readonly buffer Meta {
    uvec4 meta[];   // per edge: (dst_offset_bytes, E, I, H)
} u_meta;

// Pixel buffer, filtered in place: for each row main() loads the eight bytes
// at offsets -4..+3 around the edge and stores back two (p0, q0) or four
// (p1, p0, q0, q1) of them. Elements are single bytes, so every store
// addresses exactly one pixel.
layout(binding = 1) buffer Dst {
    uint8_t dst[];
} u_dst;

// Push constants (four uints).
//   n_edges        edges to process; invocations whose edge index is
//                  >= n_edges return before touching either buffer
//   dst_stride_u8  step between consecutive rows of an edge, counted in
//                  u_dst.dst elements (bytes)
//   _pad0, _pad1   not read by this shader
layout(push_constant) uniform PC {
    uint n_edges;
    uint dst_stride_u8;
    uint _pad0;
    uint _pad1;
} pc;

// Entry point: one invocation filters one row of one 8-row edge, in place.
//
// Work mapping: invocations 8k .. 8k+7 of the 1-D global index all belong to
// edge k and handle its rows 0..7. With the 256-wide workgroup that is 32
// edges per workgroup, and it is numerically identical to splitting every
// 16-lane group into two edge slots (lanes 0..7 and lanes 8..15).
//
// Per row, the eight pixels p3 p2 p1 p0 | q0 q1 q2 q3 around the edge are
// loaded. If the filter mask passes, p0 and q0 are rewritten; p1 and q1 are
// rewritten as well when the high-edge-variance (hev) test fails.
//
// Preconditions (not checked here): u_meta.meta[edge].x >= 4, otherwise the
// unsigned tap offsets below wrap; all eight tap indices lie inside u_dst.dst.
// NOTE(review): the filter runs in place with no synchronisation, so this
// presumably relies on no other edge in the same dispatch writing the bytes
// a row reads — confirm against the host-side edge layout.
void main()
{
    uint invocation = gl_GlobalInvocationID.x;
    uint edge_idx   = invocation >> 3;   // 8 invocations per edge
    uint row        = invocation & 7u;   // 0..7 within that edge

    // Tail invocations past the last edge leave immediately. No barrier
    // follows, so a divergent return is safe.
    if (edge_idx >= pc.n_edges) {
        return;
    }

    uvec4 edge_meta = u_meta.meta[edge_idx];
    int lim_E = int(edge_meta.y);
    int lim_I = int(edge_meta.z);
    int thr_H = int(edge_meta.w);

    // Index of q0 for this row; the taps occupy q0_at-4 .. q0_at+3.
    uint q0_at = edge_meta.x + row * pc.dst_stride_u8;

    int q0 = int(u_dst.dst[q0_at]);
    int q1 = int(u_dst.dst[q0_at + 1u]);
    int q2 = int(u_dst.dst[q0_at + 2u]);
    int q3 = int(u_dst.dst[q0_at + 3u]);
    int p0 = int(u_dst.dst[q0_at - 1u]);
    int p1 = int(u_dst.dst[q0_at - 2u]);
    int p2 = int(u_dst.dst[q0_at - 3u]);
    int p3 = int(u_dst.dst[q0_at - 4u]);

    // The two steps next to the edge feed both the mask and the hev test.
    int near_step = max(abs(p1 - p0), abs(q1 - q0));

    // Filter mask, part 1: every neighbouring-pixel step must be <= I.
    int far_step   = max(max(abs(p3 - p2), abs(p2 - p1)),
                         max(abs(q2 - q1), abs(q3 - q2)));
    int worst_step = max(near_step, far_step);
    if (worst_step > lim_I) {
        return;
    }

    // Filter mask, part 2: the jump across the edge must be <= E.
    int across = abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1);
    if (across > lim_E) {
        return;
    }

    bool hev = near_step > thr_H;

    // Base filter value: 3*(q0-p0), plus the clamped outer-tap term when hev,
    // then saturated to the signed 8-bit range.
    int f = 3 * (q0 - p0);
    if (hev) {
        f += clamp(p1 - q1, -128, 127);
    }
    f = clamp(f, -128, 127);

    // Two roundings of f/8; the min() keeps f+4 / f+3 inside signed 8-bit.
    int f1 = min(f + 4, 127) >> 3;
    int f2 = min(f + 3, 127) >> 3;

    // Inner taps are adjusted in both cases.
    u_dst.dst[q0_at - 1u] = uint8_t(clamp(p0 + f2, 0, 255));
    u_dst.dst[q0_at]      = uint8_t(clamp(q0 - f1, 0, 255));

    // Without high edge variance the outer taps get half the correction.
    if (!hev) {
        int fp = (f1 + 1) >> 1;
        u_dst.dst[q0_at - 2u] = uint8_t(clamp(p1 + fp, 0, 255));
        u_dst.dst[q0_at + 1u] = uint8_t(clamp(q1 - fp, 0, 255));
    }
}