// daedalus-fourier cycle 2 — VP9 4-tap inner loop filter, horizontal
// direction, 8-pixel edge. V3D 7.1 via Mesa v3dv compute.
//
// Bakes in cycle-1 v4 winning patterns from the start:
//   - 256 invocations / WG (max), for v3dv latency hiding
//   - uint8_t dst SSBO via storageBuffer8BitAccess (race-free byte writes)
//   - 2 lanes per "block_slot" pattern — here 2 edges per 16-lane subgroup
//   - NO chained-ternary writes, only direct named-variable writes
//
// Differs from cycle-1 IDCT structurally:
//   - NO barrier — each lane fully independent (one row of one edge)
//   - NO shared memory — no transpose needed
//   - oob early-return is SAFE here (no barrier reachability issue)
//
// Contracts (per k2_deblock_phase4.md §4, revised per phase5'' findings 2+4):
//   1. meta[i].x ≥ 4 for every edge — bench enforced via assert
//   2. pc.dst_stride_u8 ≥ 4         — bench enforced via assert
//
// License: BSD-2-Clause. Algorithm transcribed from
// tests/vp9_lpf_ref.c which mirrors libavcodec/vp9dsp_template.c
// (vendored LGPL-2.1+).

#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// One record per edge; all 8 rows of an edge read the same record.
layout(binding = 0) readonly buffer Meta {
    uvec4 meta[];   // per edge: (dst_offset_bytes, E, I, H)
} u_meta;

// Pixel plane, addressed in bytes. Filtered in place.
layout(binding = 1) buffer Dst {
    uint8_t dst[];
} u_dst;

layout(push_constant) uniform PC {
    uint n_edges;        // number of valid entries in u_meta.meta
    uint dst_stride_u8;  // byte distance between consecutive rows of dst
    uint _pad0;
    uint _pad1;
} pc;

// Entry point: each invocation filters one row of one edge.
//
// Reads the 8 bytes dst[base-4 .. base+3], where
// base = meta[edge].x + row * pc.dst_stride_u8, and conditionally rewrites
// up to 4 of them (base-2 .. base+1). Invocations whose edge index is
// >= pc.n_edges, or whose row fails the filter mask, write nothing.
void main() {
    // Lane / edge decomposition (cycle-1 v4 pattern adapted: 8 lanes
    // per edge instead of 8 lanes per block; 2 edges per subgroup,
    // 16 subgroups per WG, 32 edges per WG).
    //
    // The "subgroup" here is purely arithmetic grouping of 16 consecutive
    // lanes. Net effect of the steps below: edge_idx == gid >> 3 and
    // row == gid & 7.
    uint gid        = gl_GlobalInvocationID.x;
    uint wg_id      = gid / 256u;
    uint lane_in_wg = gid & 255u;
    uint sg_in_wg   = lane_in_wg >> 4;      // 0..15
    uint lane_in_sg = lane_in_wg & 15u;
    uint edge_slot  = lane_in_sg >> 3;      // 0 (lanes 0..7) or 1 (8..15)
    uint row        = lane_in_sg & 7u;      // 0..7 — which row of this edge
    uint edge_local = sg_in_wg * 2u + edge_slot;
    uint edge_idx   = wg_id * 32u + edge_local;

    // Safe early-return: no barrier follows. Per phase4 §4.
    if (edge_idx >= pc.n_edges) return;

    uvec4 m   = u_meta.meta[edge_idx];
    uint base = m.x + row * pc.dst_stride_u8;
    int E = int(m.y), I = int(m.z), H = int(m.w);

    // Four taps on each side of the edge. The indices are unsigned, so
    // base - 4u wraps around if m.x < 4 with row == 0; contract 1 in the
    // file header exists to rule that out.
    int p3 = int(u_dst.dst[base - 4u]);
    int p2 = int(u_dst.dst[base - 3u]);
    int p1 = int(u_dst.dst[base - 2u]);
    int p0 = int(u_dst.dst[base - 1u]);
    int q0 = int(u_dst.dst[base + 0u]);
    int q1 = int(u_dst.dst[base + 1u]);
    int q2 = int(u_dst.dst[base + 2u]);
    int q3 = int(u_dst.dst[base + 3u]);

    // Filter mask: every adjacent-tap step on both sides must be <= I and
    // the weighted cross-edge difference must be <= E.
    bool fm = abs(p3 - p2) <= I && abs(p2 - p1) <= I &&
              abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
              abs(q2 - q1) <= I && abs(q3 - q2) <= I &&
              abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
    if (!fm) return;

    // hev is set when either innermost step exceeds H.
    bool hev = abs(p1 - p0) > H || abs(q1 - q0) > H;

    if (hev) {
        // hev set: the p1 - q1 term feeds the delta, and only p0 / q0
        // are rewritten.
        int f = clamp(p1 - q1, -128, 127);
        f = clamp(3 * (q0 - p0) + f, -128, 127);
        int f1 = min(f + 4, 127) >> 3;
        int f2 = min(f + 3, 127) >> 3;
        u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255));
        u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255));
    } else {
        // hev clear: delta comes from q0 - p0 alone; p0 / q0 are rewritten
        // and p1 / q1 receive a rounded half of f1.
        int f = clamp(3 * (q0 - p0), -128, 127);
        int f1 = min(f + 4, 127) >> 3;
        int f2 = min(f + 3, 127) >> 3;
        u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255));
        u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255));
        int fp = (f1 + 1) >> 1;
        u_dst.dst[base - 2u] = uint8_t(clamp(p1 + fp, 0, 255));
        u_dst.dst[base + 1u] = uint8_t(clamp(q1 - fp, 0, 255));
    }
}