// daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
//
// Sibling of cycle 9's v3d_h264_qpel_mc20.comp. It applies the same 6-tap
// filter, but walks down a column instead of along a row:
//
//   dst[r,c] = clip255( (       s[r-2,c]
//                         -  5 * s[r-1,c]
//                         + 20 * s[r,  c]
//                         + 20 * s[r+1,c]
//                         -  5 * s[r+2,c]
//                         +      s[r+3,c]
//                         + 16 ) >> 5 )
//
// src + src_off addresses row 0, col 0 of the OUTPUT block. Every output
// pixel in row r taps source rows r-2 .. r+3, so across the 8 output rows
// the shader touches source rows -2 .. +10: two rows of context above the
// block and three rows below it.
//
// Work-group layout matches mc20: 64 lanes per work-group, one block per
// work-group, one lane per output pixel.
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Byte-addressed source pixels (read-only).
layout(binding = 0) readonly buffer Src {
    uint8_t src[];
} u_src;

// Byte-addressed destination pixels.
layout(binding = 1) buffer Dst {
    uint8_t dst[];
} u_dst;

// One uvec4 per block: .x = destination byte offset, .y = source byte offset.
// The .z and .w components are not read by this shader.
layout(binding = 2) readonly buffer Meta {
    uvec4 meta[];
} u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // number of valid entries in u_meta.meta
    uint stride_u8;  // row pitch in bytes, used for both source and destination
    uint _pad0, _pad1;
} pc;

// 6-tap filter weights, ordered from row r-2 down to row r+3.
const int kTapWeight[6] = int[6](1, -5, 20, 20, -5, 1);

// Loads one source byte and widens it to a signed 32-bit int.
int fetch_src(uint byte_index) {
    return int(u_src.src[byte_index]);
}

void main() {
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;  // work-group has no block descriptor to process
    }

    // Lane -> (row, col) inside the 8x8 block, row-major.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane >> 3;
    uint col  = lane & 7u;

    uvec4 desc  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    // Byte index of the source sample sitting at this lane's output position.
    uint centre = desc.y + col + row * pitch;

    // Start two rows above the output position and step down one row per tap.
    // The subtraction is done in unsigned arithmetic; per the public API
    // contract src_off >= 2 * stride, so the resulting index is the intended
    // one even for row 0.
    uint tap_addr = centre - 2u * pitch;

    int acc = 16;  // rounding term for the final >> 5
    for (int k = 0; k < 6; ++k) {
        acc += kTapWeight[k] * fetch_src(tap_addr);
        tap_addr += pitch;
    }

    // Normalise by 32 and saturate to the 8-bit pixel range.
    int pixel = clamp(acc >> 5, 0, 255);

    u_dst.dst[desc.x + row * pitch + col] = uint8_t(pixel);
}