// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction)
// (8x8, 2D half-pel "j" position).
// V3D 7.1.
//
// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
//
//   tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
//              - 5*src[r,c+2] + src[r,c+3]                       (int16)
//
//   dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
//                       + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
//                       + 512) >> 10)
//
// The single "+512 >> 10" at the end compensates for the gain of both
// 6-tap stages. This CANNOT be built by cascading mc20 -> mc02, because
// the intermediate must stay at int16 precision (no per-stage
// round/clip), so this is a dedicated kernel.
//
// Per-lane structure: each lane computes its own (r, c) output by
// running the FULL cascade — 6 horizontal lowpass int16 values for
// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
// lane. No shared memory / barriers needed; V3D L2 absorbs the
// redundant src reads across lanes.
//
// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
// (same as mc20 / mc02).
//
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
//   dst[r,c] = avg(dst[r,c], mc22_value)
// The caller pre-loads dst with the list0 prediction; this shader
// folds in the list1 contribution.
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// One work-group of 64 invocations; main() maps the lane index to an
// 8x8 (row, column) position.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Reference samples, addressed as bytes. Read-only.
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;

// Destination samples, addressed as bytes. Read AND written: the
// existing byte is averaged with the filtered value.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;

// Per-block record, indexed by work-group id. main() reads .x as the
// dst byte offset and .y as the src byte offset; .z/.w are not read
// by this shader.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

// n_blocks  — work-groups with an id >= n_blocks exit immediately.
// stride_u8 — row pitch in bytes, applied to both src and dst.
// _pad0/_pad1 are not read by this shader.
layout(push_constant) uniform PC { uint n_blocks; uint stride_u8; uint _pad0, _pad1; } pc;

// Horizontal 6-tap filter at (row_off, c) — reads src at cols c-2..c+3
// of the row identified by row_off and returns the int16-range
// intermediate as an int (NOT scaled — the v-pass does the
// +512 >> 10 for both stages).
int hpel_h(uint row_off, uint c)
{
    // Byte index of the leftmost tap (column c-2). The six taps are
    // the consecutive bytes left .. left+5.
    uint left = row_off + c - 2u;

    // The kernel (1, -5, 20, 20, -5, 1) is symmetric, so add the
    // mirrored samples first and weight each pair once.
    int outer  = int(u_src.src[left])      + int(u_src.src[left + 5u]);
    int middle = int(u_src.src[left + 1u]) + int(u_src.src[left + 4u]);
    int centre = int(u_src.src[left + 2u]) + int(u_src.src[left + 3u]);

    return outer - 5 * middle + 20 * centre;
}

// Entry point: one invocation per output pixel of an 8x8 block, one
// block per work-group. Filters src with the 2D half-pel cascade and
// averages the result into the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane >> 3;
    uint col  = lane & 7u;

    uint pitch    = pc.stride_u8;
    uint dst_base = u_meta.meta[blk].x;
    uint src_base = u_meta.meta[blk].y;

    // Offset of source row (row - 2), the first of the six rows that
    // feed the vertical filter. For row 0..1 the term (row - 2u)
    // wraps modulo 2^32; adding src_base brings the sum back in range
    // as long as src_base >= 2 * pitch (caller contract).
    // NOTE(review): hpel_h also reads 2 bytes left of column 0 and
    // 3 bytes right of column 7 — confirm the caller pads for that.
    uint top = src_base + (row - 2u) * pitch;

    // Horizontal pass on rows row-2 .. row+3, grouped by the mirrored
    // positions of the symmetric vertical kernel.
    int outer  = hpel_h(top,              col) + hpel_h(top + 5u * pitch, col);
    int middle = hpel_h(top + pitch,      col) + hpel_h(top + 4u * pitch, col);
    int centre = hpel_h(top + 2u * pitch, col) + hpel_h(top + 3u * pitch, col);

    // Vertical pass, then one rounding shift covering both stages,
    // then clip to the 8-bit sample range.
    int scaled = (outer - 5 * middle + 20 * centre + 512) >> 10;
    int pel    = clamp(scaled, 0, 255);

    // Round-to-nearest average with the byte already in dst.
    uint at    = dst_base + row * pitch + col;
    int  prior = int(u_dst.dst[at]);
    u_dst.dst[at] = uint8_t((prior + pel + 1) >> 1);
}