// daedalus-fourier cycle 3 — VP9 8-tap "regular" subpel filter,
// horizontal direction, 8-wide output, h rows. V3D 7.1 via Mesa v3dv.
//
// Bakes in cycle-1+2 v4 winning patterns from start:
//   - local_size_x = 256
//   - 8 lanes per block (1 lane per output row), 2 blocks per
//     16-lane subgroup, 16 subgroups per WG → 32 blocks per WG
//   - uint8_t SSBO via storageBuffer8BitAccess
//   - oob early-return safe (no barrier)
//
// Contracts (per k3_mc_phase4.md §5, revised per phase5''' findings):
//   - meta[i].x: dst_off (byte offset of block's row-0 col-0 dst pixel)
//   - meta[i].y: src_off (byte offset of block's row-0 col-0 SOURCE
//     pixel — note: NO +3 shift; the C bench's `src + 3` C-caller
//     convention does NOT carry into the SSBO offset. Shader reads
//     s[k] = SSBO[src_off + row*stride + k] for k=0..14, matching
//     C ref's per-row read of `master_src[block_base + row*stride
//     + (x..x+7)]` for output col x ∈ 0..7).
//   - meta[i].z: mx (subpel phase in [0..15])
//   - dst_stride_u8 ≥ 8  (race-safety lower bound; bench asserts)
//   - src_stride_u8 ≥ 15 (per-row read span; bench asserts)
//
// License: BSD-2-Clause. Algorithm transcribed from tests/vp9_mc_ref.c
// which mirrors libavcodec/vp9dsp_template.c FILTER_8TAP macro.

#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Per-block descriptors: (dst_off, src_off, mx, _pad).
layout(binding = 0) readonly buffer Meta {
    uvec4 meta[];
} u_meta;

// Destination pixels, one byte per element.
layout(binding = 1) buffer Dst {
    uint8_t dst[];
} u_dst;

// Source pixels, one byte per element.
layout(binding = 2) readonly buffer Src {
    uint8_t src[];
} u_src;

layout(push_constant) uniform PC {
    uint n_blocks;       // number of valid entries in meta[]
    uint dst_stride_u8;  // byte distance between consecutive dst rows
    uint src_stride_u8;  // byte distance between consecutive src rows
    uint _pad;
} pc;

// VP9 8-tap REGULAR filter table — verbatim from
// external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c
// (index [1] = FILTER_8TAP_REGULAR). 16 subpel phases × 8 taps.
//
// shaderdb-gate (phase5''' finding 2): if uniform count > ~144 after
// first compile, escalate this LUT to SSBO binding 3.
const int FILTER_REGULAR[16][8] = int[16][8](
    int[8]( 0, 0,   0, 128,   0,   0, 0,  0 ),
    int[8]( 0, 1,  -5, 126,   8,  -3, 1,  0 ),
    int[8](-1, 3, -10, 122,  18,  -6, 2,  0 ),
    int[8](-1, 4, -13, 118,  27,  -9, 3, -1 ),
    int[8](-1, 4, -16, 112,  37, -11, 4, -1 ),
    int[8](-1, 5, -18, 105,  48, -14, 4, -1 ),
    int[8](-1, 5, -19,  97,  58, -16, 5, -1 ),
    int[8](-1, 6, -19,  88,  68, -18, 5, -1 ),
    int[8](-1, 6, -19,  78,  78, -19, 6, -1 ),
    int[8](-1, 5, -18,  68,  88, -19, 6, -1 ),
    int[8](-1, 5, -16,  58,  97, -19, 5, -1 ),
    int[8](-1, 4, -14,  48, 105, -18, 5, -1 ),
    int[8](-1, 4, -11,  37, 112, -16, 4, -1 ),
    int[8](-1, 3,  -9,  27, 118, -13, 4, -1 ),
    int[8]( 0, 2,  -6,  18, 122, -10, 3, -1 ),
    int[8]( 0, 1,  -3,   8, 126,  -5, 1,  0 )
);

// Fetch one source byte and widen it to a signed 32-bit int.
// Returns int (never uint8_t) so 8-bit values stay confined to the
// storage access itself.
int fetch_px(uint byte_index)
{
    return int(u_src.src[byte_index]);
}

// 8-tap weighted sum: taps 0..3 arrive in t_lo, taps 4..7 in t_hi, and
// the matching eight-pixel window in p_lo / p_hi. Integer arithmetic,
// so the grouping of the additions does not affect the result.
int fir8(ivec4 t_lo, ivec4 t_hi, ivec4 p_lo, ivec4 p_hi)
{
    ivec4 partial = t_lo * p_lo + t_hi * p_hi;
    return partial.x + partial.y + partial.z + partial.w;
}

// Round a raw filter sum (+64, arithmetic >> 7), clamp it to 0..255 and
// write the resulting byte to dst[byte_index].
void store_px(uint byte_index, int acc)
{
    u_dst.dst[byte_index] = uint8_t(clamp((acc + 64) >> 7, 0, 255));
}

void main()
{
    // Every run of 8 consecutive invocations handles one block, one
    // invocation per row. With local_size_x = 256 this is the same
    // mapping as "workgroup * 32 + 16-lane group * 2 + half-group slot":
    // each 16-invocation group covers 2 blocks, each workgroup 32.
    uint invocation = gl_GlobalInvocationID.x;
    uint blk = invocation >> 3;
    uint y   = invocation & 7u;

    // Tail invocations past the last block leave immediately; there is
    // no barrier later in the shader, so returning here is safe.
    if (blk >= pc.n_blocks)
        return;

    uvec4 desc    = u_meta.meta[blk];
    uint phase    = desc.z & 15u;  // keep the table index inside 0..15
    uint in_base  = desc.y + y * pc.src_stride_u8;
    uint out_base = desc.x + y * pc.dst_stride_u8;

    // The 15 source pixels this row needs: 8 outputs × 8 taps, with
    // neighbouring outputs sharing 7 pixels of their windows.
    int p0  = fetch_px(in_base +  0u);
    int p1  = fetch_px(in_base +  1u);
    int p2  = fetch_px(in_base +  2u);
    int p3  = fetch_px(in_base +  3u);
    int p4  = fetch_px(in_base +  4u);
    int p5  = fetch_px(in_base +  5u);
    int p6  = fetch_px(in_base +  6u);
    int p7  = fetch_px(in_base +  7u);
    int p8  = fetch_px(in_base +  8u);
    int p9  = fetch_px(in_base +  9u);
    int p10 = fetch_px(in_base + 10u);
    int p11 = fetch_px(in_base + 11u);
    int p12 = fetch_px(in_base + 12u);
    int p13 = fetch_px(in_base + 13u);
    int p14 = fetch_px(in_base + 14u);

    // Filter taps for the selected subpel phase.
    ivec4 t_lo = ivec4(FILTER_REGULAR[phase][0], FILTER_REGULAR[phase][1],
                       FILTER_REGULAR[phase][2], FILTER_REGULAR[phase][3]);
    ivec4 t_hi = ivec4(FILTER_REGULAR[phase][4], FILTER_REGULAR[phase][5],
                       FILTER_REGULAR[phase][6], FILTER_REGULAR[phase][7]);

    // Output column x is the 8-tap sum over pixels x .. x+7.
    store_px(out_base + 0u, fir8(t_lo, t_hi, ivec4(p0, p1, p2,  p3),  ivec4(p4,  p5,  p6,  p7)));
    store_px(out_base + 1u, fir8(t_lo, t_hi, ivec4(p1, p2, p3,  p4),  ivec4(p5,  p6,  p7,  p8)));
    store_px(out_base + 2u, fir8(t_lo, t_hi, ivec4(p2, p3, p4,  p5),  ivec4(p6,  p7,  p8,  p9)));
    store_px(out_base + 3u, fir8(t_lo, t_hi, ivec4(p3, p4, p5,  p6),  ivec4(p7,  p8,  p9,  p10)));
    store_px(out_base + 4u, fir8(t_lo, t_hi, ivec4(p4, p5, p6,  p7),  ivec4(p8,  p9,  p10, p11)));
    store_px(out_base + 5u, fir8(t_lo, t_hi, ivec4(p5, p6, p7,  p8),  ivec4(p9,  p10, p11, p12)));
    store_px(out_base + 6u, fir8(t_lo, t_hi, ivec4(p6, p7, p8,  p9),  ivec4(p10, p11, p12, p13)));
    store_px(out_base + 7u, fir8(t_lo, t_hi, ivec4(p7, p8, p9,  p10), ivec4(p11, p12, p13, p14)));
}