// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
// filter across a vertical edge), non-intra bS<4 variant.
//
// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
// (rows).  Same 8-cell × 4-segment geometry, same WG layout (lanes
// 8..15 of each edge early-return — only 8 active per edge).
//
// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
// doesn't address.  daedalus_dispatch_h264_deblock_chroma_h is
// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Per-edge metadata, one uvec4 per edge, as consumed by main():
//   .x  byte offset into dst[] of the q0 sample on row 0 of the edge
//   .y  bits 0..7 = alpha, bits 8..15 = beta
//   .z  four packed signed 8-bit tc0 values, one per 2-row segment
//       (segment 0 in the low byte)
//   .w  not read by this shader
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;

// Byte-addressed pixel buffer, filtered in place.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;

layout(push_constant) uniform PC {
    uint n_edges;        // number of valid entries in meta[]
    uint dst_stride_u8;  // distance in bytes between consecutive rows of dst[]
    uint _pad0;
    uint _pad1;
} pc;

// Workgroup geometry: 256 lanes = 16 edges x 16 lanes per edge.
const uint LANE_SHIFT   = 4u;   // log2(lanes reserved per edge)
const uint ROW_MASK     = 15u;  // lanes per edge - 1
const uint EDGES_PER_WG = 16u;
const uint ACTIVE_ROWS  = 8u;   // only the first 8 lanes of an edge do work

// One invocation filters one row (the p0/q0 pair) of one edge.
void main() {
    // local_size_x is 256, so this is the same value as
    // gl_GlobalInvocationID.x & 255u.
    uint lane      = gl_LocalInvocationID.x;
    uint edge_slot = lane >> LANE_SHIFT;   // 0..15
    uint row       = lane & ROW_MASK;      // 0..15, 0..7 active
    uint edge      = gl_WorkGroupID.x * EDGES_PER_WG + edge_slot;

    // Drop lanes past the last edge and the idle upper half of each edge.
    if (edge >= pc.n_edges || row >= ACTIVE_ROWS) {
        return;
    }

    uvec4 info = u_meta.meta[edge];

    int alpha = int(info.y & 0xffu);
    int beta  = int((info.y >> 8) & 0xffu);

    // Each pair of rows shares one tc0 byte; reinterpret it as signed 8-bit.
    uint segment = row >> 1;
    int  tc0_raw = int((info.z >> (segment * 8u)) & 0xffu);
    int  tc0     = (tc0_raw < 128) ? tc0_raw : tc0_raw - 256;

    // Zero thresholds or a negative tc0 leave this row untouched; bail out
    // before touching dst[].
    if (alpha == 0 || beta == 0 || tc0 < 0) {
        return;
    }

    // q0 sits at the edge offset; p1, p0 are the two bytes to its left and
    // q1 is the byte to its right.
    uint q0_at = info.x + row * pc.dst_stride_u8;
    uint p0_at = q0_at - 1u;

    int p1 = int(u_dst.dst[q0_at - 2u]);
    int p0 = int(u_dst.dst[p0_at]);
    int q0 = int(u_dst.dst[q0_at]);
    int q1 = int(u_dst.dst[q0_at + 1u]);

    // Filter only when every sample difference is under its threshold.
    bool step_too_large = abs(p0 - q0) >= alpha
                       || abs(p1 - p0) >= beta
                       || abs(q1 - q0) >= beta;
    if (step_too_large) {
        return;
    }

    int tc      = tc0 + 1;
    int raw_adj = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    int delta   = clamp(raw_adj, -tc, tc);

    // Move p0 and q0 toward each other, saturating to the 8-bit range.
    u_dst.dst[p0_at] = uint8_t(clamp(p0 + delta, 0, 255));
    u_dst.dst[q0_at] = uint8_t(clamp(q0 - delta, 0, 255));
}