// daedalus-fourier cycle 4 — VP9 8-tap inner LPF, wd=8, h direction,
// 8-pixel edge. V3D 7.1 via Mesa v3dv.
//
// Extension of cycle 2's wd=4 kernel: adds flat8in test + 6-write
// flat-region path. Same lane/edge geometry (32 edges/WG, 8 lanes
// per edge, no barrier, no shared mem).
//
// Contracts (per k4_lpf8_phase4_7.md):
//   - meta[i].x: dst_off, byte offset of q0 on row 0 of edge i.
//     Must be ≥ 4: the p3 tap is read at dst_off - 4, so anything
//     smaller wraps the unsigned index. (Same invariant as cycle 2.)
//   - meta[i].y / .z / .w: the E, I and H thresholds of edge i.
//   - **dst_stride_u8 ≥ 6** (cycle 4 update: flat8in path writes
//     6 contiguous bytes per row at base-3..base+2)
//     NOTE(review): every row also *reads* base-4..base+3. At a
//     stride of exactly 6, row r's q3 read (base+3) is the same byte
//     as row r+1's p2 write (base+stride-3), and the rows run in
//     different invocations with no barrier. A stride ≥ 7 avoids the
//     overlap — confirm callers never pass 6.
//
// License: BSD-2-Clause.
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;

layout(push_constant) uniform PC {
    uint n_edges;
    uint blocks_per_row;   /* unused */
    uint dst_stride_u8;
    uint _pad;
} pc;

/* F = 1 << (BIT_DEPTH - 8) = 1 for 8-bit pixels. */
const int FLAT_LIMIT = 1;

/* Saturate to the signed 8-bit range used by the narrow filter taps. */
int sat_s8(int v)
{
    return clamp(v, -128, 127);
}

/* Saturate to the unsigned 8-bit pixel range. */
int sat_px(int v)
{
    return clamp(v, 0, 255);
}

/* Write one byte of dst; callers pass a value already within 0..255. */
void put_px(uint idx, int v)
{
    u_dst.dst[idx] = uint8_t(v);
}

void main()
{
    /* Eight consecutive invocations share one edge, one per pixel row.
     * gid >> 3 equals the spelled-out form
     *   (gid / 256) * 32 + ((gid & 255) >> 4) * 2 + (((gid & 255) & 15) >> 3)
     * i.e. 32 edges per 256-lane work group, two edges per 16 lanes. */
    uint gid      = gl_GlobalInvocationID.x;
    uint edge_idx = gid >> 3;
    uint row      = gid & 7u;

    if (edge_idx >= pc.n_edges)
        return;

    uvec4 params = u_meta.meta[edge_idx];
    uint  base   = params.x + row * pc.dst_stride_u8;   /* index of q0 */
    int   lim_e  = int(params.y);
    int   lim_i  = int(params.z);
    int   lim_h  = int(params.w);

    /* Four pixels on each side of the edge: p3 p2 p1 p0 | q0 q1 q2 q3. */
    int p3 = int(u_dst.dst[base - 4u]);
    int p2 = int(u_dst.dst[base - 3u]);
    int p1 = int(u_dst.dst[base - 2u]);
    int p0 = int(u_dst.dst[base - 1u]);
    int q0 = int(u_dst.dst[base]);
    int q1 = int(u_dst.dst[base + 1u]);
    int q2 = int(u_dst.dst[base + 2u]);
    int q3 = int(u_dst.dst[base + 3u]);

    /* These two differences feed the mask, the flat test and hev. */
    int dp = abs(p1 - p0);
    int dq = abs(q1 - q0);

    /* Filter mask: every neighbouring step on each side must be <= I ... */
    int step_max = max(max(abs(p3 - p2), abs(p2 - p1)),
                       max(max(dp, dq),
                           max(abs(q2 - q1), abs(q3 - q2))));
    if (step_max > lim_i)
        return;

    /* ... and the weighted difference across the edge must be <= E. */
    if (abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) > lim_e)
        return;

    /* flat8in: all three outer pixels per side within FLAT_LIMIT of p0 / q0. */
    int flat_max = max(max(abs(p3 - p0), abs(p2 - p0)),
                       max(max(dp, dq),
                           max(abs(q2 - q0), abs(q3 - q0))));

    if (flat_max <= FLAT_LIMIT) {
        /* wd=8 inner-flat filter: 8 input pixels, 6 outputs. Each output
         * is a weighted average whose weights sum to 8; rounding bias +4,
         * then >> 3. The result cannot leave 0..255, so no clamp. */
        put_px(base - 3u, (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3);
        put_px(base - 2u, (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3);
        put_px(base - 1u, (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3);
        put_px(base,      (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3);
        put_px(base + 1u, (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3);
        put_px(base + 2u, (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3);
        return;
    }

    /* Narrow filter. hev is set when either inner step exceeds H. With
     * hev the p1 - q1 term joins the filter value and only p0 / q0 are
     * changed; without it the term is dropped and p1 / q1 are also
     * adjusted by half of f1 (rounded). */
    bool hev = max(dp, dq) > lim_h;

    int outer = hev ? sat_s8(p1 - q1) : 0;
    int f     = sat_s8(3 * (q0 - p0) + outer);
    int f1    = min(f + 4, 127) >> 3;
    int f2    = min(f + 3, 127) >> 3;

    put_px(base - 1u, sat_px(p0 + f2));
    put_px(base,      sat_px(q0 - f1));

    if (!hev) {
        int fp = (f1 + 1) >> 1;
        put_px(base - 2u, sat_px(p1 + fp));
        put_px(base + 1u, sat_px(q1 - fp));
    }
}