// daedalus-fourier — VP9 8×8 DCT_DCT inverse-transform-add, V3D 7.1.
// v2: post-Phase-7 loopback. Phase 4' iteration 1.
//
// Changes from v1 (per phase47 iteration 1 + Sonnet v3d perf research):
//
//   Opt 1 — kill the chained ternary. v1's row-pass write had
//           `(r==0)?o0:(r==1)?o1:...` inside a `for r` loop; that
//           kept all 8 oN scalars live across 7 phi nodes and almost
//           certainly forced register spills (Iago Toral 2021,
//           blogs.igalia.com/itoral). v2 unrolls the 8 writes
//           completely — each oN is used exactly once.
//
//   Opt 2 — 2 blocks per subgroup. v1 had 1 block per 16-lane
//           subgroup with 8 lanes idle per phase. v2 packs 2 blocks
//           per subgroup (one in lanes 0..7, one in lanes 8..15),
//           and every lane runs both passes for its own block.
//           Eliminates idle lanes AND removes the col_pass/row_pass
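//           branch divergence. 8 blocks per WG (vs 4 before),
//           dispatch count halves from 8160 to 4080 on 1080p.
//           Shared-mem footprint doubles to 2 KiB (still well under
//           16 KiB). These are the v2 figures; the v4 change below
//           raises local_size to 256, i.e. 32 blocks per WG and
//           8 KiB of shared memory.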
//
// (Opt 3 — packed uint32 storage — deferred; do it if Opt 1+2
// don't get us into the GREEN/YELLOW decision band.)
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage             : require
#extension GL_EXT_shader_16bit_storage            : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// v4: local_size 256 (was 64) — 16 subgroups × 16 lanes = 32 blocks/WG.
// More in-flight work per WG = more latency hiding for v3d's TMU.
// shared = 32 × 64 × 4 B = 8 KiB (still under 16 KiB).
//
// NOTE: the 32-blocks-per-WG factor is not derived from this declaration.
// main()'s block indexing and the size of tmp_shared both hard-code it,
// so local_size_x, main() and tmp_shared must be changed together.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Input transform coefficients: 64 consecutive int16 values per block,
// block b starting at element b * 64. In the first pass, lane k of a
// block reads elements k, k + 8, ..., k + 56 of that block.
layout(binding = 0) readonly buffer Coeffs {
    int16_t coeffs[];   // N × 64 packed
} u_coeffs;
// (v5 tried uint32-packed reads with manual unpack — no measurable
// perf change vs int16, added code complexity; reverted.)

// Destination bytes, updated in place: main() reads each byte of the
// block's 8×8 area, adds the rounded residual, clamps the sum to 0..255
// and writes it back. Rows are pc.dst_stride_u8 elements apart.
layout(binding = 1) buffer Dst {
    uint8_t dst[];      // H × stride bytes
} u_dst;

// Destination position of each block, indexed by block index.
// Both components are in units of 8: main() multiplies .x and .y by 8
// to obtain the block's first column and first row in dst.
layout(binding = 2) readonly buffer Meta {
    uvec2 meta[];       // per-block (block_x_8, block_y_8)
} u_meta;

// Per-dispatch parameters.
//   n_blocks       — number of valid blocks; lanes whose block index is
//                    >= n_blocks skip all loads and stores.
//   blocks_per_row — not read by this shader.
//   dst_stride_u8  — distance between vertically adjacent dst elements,
//                    in dst[] elements (bytes).
//   _pad           — not read by this shader; presumably pads the block
//                    to 16 bytes (confirm against the host-side struct).
layout(push_constant) uniform PC {
    uint n_blocks;
    uint blocks_per_row;   // unused (meta drives position)
    uint dst_stride_u8;
    uint _pad;
} pc;

// 32 blocks per WG × 64 i32 per block × 4 B = 8192 B shared.
// Holds the first-pass results between the two passes: one 64-int tile
// per block, tile t starting at t * 64. Each lane writes 8 consecutive
// entries of its tile before the barrier() in main() and reads 8 entries
// spaced 8 apart after it.
shared int tmp_shared[32 * 64];

// VP9 Q14 trig constants (spec §8.7.1.4).
// COSPI_N is 16384 * cos(N * pi / 64) rounded to the nearest integer
// (e.g. COSPI_16 = 16384 * cos(pi / 4) ≈ 11585). Products with these
// constants are brought back to integer scale by qround14().
const int COSPI_16 = 11585;
const int COSPI_24 =  6270;
const int COSPI_08 = 15137;
const int COSPI_28 =  3196;
const int COSPI_04 = 16069;
const int COSPI_20 =  9102;
const int COSPI_12 = 13623;

// Scales a Q14 fixed-point product back down by 2^14.
// Adds 2^13 before the arithmetic right shift by 14, so a remainder of
// exactly half rounds toward +infinity (e.g. -8192 -> 0, 8192 -> 1).
int qround14(int x)
{
    const int half_lsb = 1 << 13;
    int biased = x + half_lsb;
    return biased >> 14;
}

// One 8-point inverse DCT butterfly on integer inputs.
//
//   i0..i7 — the eight input values, in index order.
//   o0..o7 — the eight results, in index order.
//
// Every product with a COSPI_* constant is rescaled through qround14().
// All arithmetic is 32-bit int.
void idct8_1d(int i0, int i1, int i2, int i3,
              int i4, int i5, int i6, int i7,
              out int o0, out int o1, out int o2, out int o3,
              out int o4, out int o5, out int o6, out int o7)
{
    // Even-indexed inputs: two rotations, (i0, i4) and (i2, i6).
    int ev_sum  = qround14(COSPI_16 * (i0 + i4));
    int ev_diff = qround14(COSPI_16 * (i0 - i4));
    int ev_lo   = qround14(COSPI_24 * i2 - COSPI_08 * i6);
    int ev_hi   = qround14(COSPI_08 * i2 + COSPI_24 * i6);

    // Odd-indexed inputs: two rotations, (i1, i7) and (i5, i3).
    int od4 = qround14(COSPI_28 * i1 - COSPI_04 * i7);
    int od5 = qround14(COSPI_12 * i5 - COSPI_20 * i3);
    int od6 = qround14(COSPI_20 * i5 + COSPI_12 * i3);
    int od7 = qround14(COSPI_04 * i1 + COSPI_28 * i7);

    // Combine the even half into four terms.
    int even0 = ev_sum  + ev_hi;
    int even1 = ev_diff + ev_lo;
    int even2 = ev_diff - ev_lo;
    int even3 = ev_sum  - ev_hi;

    // Combine the odd half; the two middle terms need one more rotation.
    int odd4    = od4 + od5;
    int diff_45 = od4 - od5;
    int odd7    = od7 + od6;
    int diff_76 = od7 - od6;

    int odd5 = qround14(COSPI_16 * (diff_76 - diff_45));
    int odd6 = qround14(COSPI_16 * (diff_76 + diff_45));

    // Final butterfly: output n and output 7-n share one even/odd pair.
    o0 = even0 + odd7;
    o7 = even0 - odd7;

    o1 = even1 + odd6;
    o6 = even1 - odd6;

    o2 = even2 + odd5;
    o5 = even2 - odd5;

    o3 = even3 + odd4;
    o4 = even3 - odd4;
}

// Rounds a second-pass result with (+16) >> 5, adds it to an existing
// pixel value and clamps the sum to the 0..255 byte range.
int add_residual_clip(int pixel, int residual)
{
    return clamp(pixel + ((residual + 16) >> 5), 0, 255);
}

// Entry point: each group of 8 consecutive invocations transforms one
// 8×8 coefficient block and adds the result into u_dst.
void main()
{
    // ---- Lane / block decomposition --------------------------------
    // 256 invocations/WG, 8 consecutive invocations per block, so one
    // WG covers 32 blocks. With 16-lane subgroups this places two
    // blocks in each subgroup (lanes 0..7 and lanes 8..15).
    //
    // Every lane runs both passes for its own block — no idle lanes,
    // no col_pass/row_pass branch divergence.

    uint lane      = gl_LocalInvocationID.x;          // 0..255
    uint col       = lane & 7u;                       // 0..7 within the block
    uint tile      = lane >> 3u;                      // 0..31 within WG
    uint block_idx = gl_WorkGroupID.x * 32u + tile;

    // Lanes past the last block skip both work bodies below, but they
    // still have to arrive at barrier(), so this is a flag rather than
    // an early return. Per phase5.md finding 7.
    bool in_range = (block_idx < pc.n_blocks);

    // ---- Column pass ----------------------------------------------
    // v3 (Opt 4): the pass results are declared inside each pass so
    // they are dead at the barrier — v2 had them at function scope,
    // which inflated max-temps (shaderdb reported 20 max-temps /
    // 2 threads instead of the 4 threads possible). Lower temps → more
    // hardware threads → better latency hiding.
    if (in_range) {
        // Gather elements col, col + 8, ..., col + 56 of this block.
        uint src = block_idx * 64u + col;
        int in0 = int(u_coeffs.coeffs[src]);
        int in1 = int(u_coeffs.coeffs[src +  8u]);
        int in2 = int(u_coeffs.coeffs[src + 16u]);
        int in3 = int(u_coeffs.coeffs[src + 24u]);
        int in4 = int(u_coeffs.coeffs[src + 32u]);
        int in5 = int(u_coeffs.coeffs[src + 40u]);
        int in6 = int(u_coeffs.coeffs[src + 48u]);
        int in7 = int(u_coeffs.coeffs[src + 56u]);

        int f0, f1, f2, f3, f4, f5, f6, f7;
        idct8_1d(in0, in1, in2, in3, in4, in5, in6, in7,
                 f0, f1, f2, f3, f4, f5, f6, f7);

        // Transposed store: the 8 results occupy the consecutive slots
        // col * 8 .. col * 8 + 7 of this block's tile.
        uint slot = tile * 64u + col * 8u;
        tmp_shared[slot]      = f0;
        tmp_shared[slot + 1u] = f1;
        tmp_shared[slot + 2u] = f2;
        tmp_shared[slot + 3u] = f3;
        tmp_shared[slot + 4u] = f4;
        tmp_shared[slot + 5u] = f5;
        tmp_shared[slot + 6u] = f6;
        tmp_shared[slot + 7u] = f7;
    }

    // Reached unconditionally by every invocation in the WG.
    barrier();

    // ---- Row pass --------------------------------------------------
    if (in_range) {
        // Gather tile entries col, col + 8, ..., col + 56.
        uint src = tile * 64u + col;
        int s0 = tmp_shared[src];
        int s1 = tmp_shared[src +  8u];
        int s2 = tmp_shared[src + 16u];
        int s3 = tmp_shared[src + 24u];
        int s4 = tmp_shared[src + 32u];
        int s5 = tmp_shared[src + 40u];
        int s6 = tmp_shared[src + 48u];
        int s7 = tmp_shared[src + 56u];

        int r0, r1, r2, r3, r4, r5, r6, r7;
        idct8_1d(s0, s1, s2, s3, s4, s5, s6, s7,
                 r0, r1, r2, r3, r4, r5, r6, r7);

        // Each lane owns one column of its block in dst. The block's
        // position comes from meta, in units of 8.
        uvec2 pos    = u_meta.meta[block_idx];
        uint  stride = pc.dst_stride_u8;
        uint  x      = pos.x * 8u + col;

        // Opt 1: eight explicit addresses, loads and stores — each
        // result is used exactly once. No chained ternary, no loop with
        // a runtime-variable index.
        uint a0 = pos.y * 8u * stride + x;
        uint a1 = a0 + stride;
        uint a2 = a1 + stride;
        uint a3 = a2 + stride;
        uint a4 = a3 + stride;
        uint a5 = a4 + stride;
        uint a6 = a5 + stride;
        uint a7 = a6 + stride;

        // All eight loads are issued before the first store.
        int p0 = int(u_dst.dst[a0]);
        int p1 = int(u_dst.dst[a1]);
        int p2 = int(u_dst.dst[a2]);
        int p3 = int(u_dst.dst[a3]);
        int p4 = int(u_dst.dst[a4]);
        int p5 = int(u_dst.dst[a5]);
        int p6 = int(u_dst.dst[a6]);
        int p7 = int(u_dst.dst[a7]);

        u_dst.dst[a0] = uint8_t(add_residual_clip(p0, r0));
        u_dst.dst[a1] = uint8_t(add_residual_clip(p1, r1));
        u_dst.dst[a2] = uint8_t(add_residual_clip(p2, r2));
        u_dst.dst[a3] = uint8_t(add_residual_clip(p3, r3));
        u_dst.dst[a4] = uint8_t(add_residual_clip(p4, r4));
        u_dst.dst[a5] = uint8_t(add_residual_clip(p5, r5));
        u_dst.dst[a6] = uint8_t(add_residual_clip(p6, r6));
        u_dst.dst[a7] = uint8_t(add_residual_clip(p7, r7));
    }
}