Files
daedalus-fourier/src/v3d_h264_qpel_mc02.comp
T

111 lines
3.8 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
//
// v2: cooperative-load shared-memory tile.
//
// dst[r,c] = clip255(
// ( s[r-2,c]
// - 5 * s[r-1,c]
// + 20 * s[r, c]
// + 20 * s[r+1,c]
// - 5 * s[r+2,c]
// + s[r+3,c]
// + 16
// ) >> 5)
//
// src+src_off points at row 0 col 0 of the OUTPUT block. Each output
// row r reads source rows r-2..r+3, so the 8 output rows together touch
// source rows -2..+10 (2 rows of top context, the 8 block rows, 3 rows
// of bottom context): 13 distinct source rows × 8 cols = 104 bytes per
// 8x8 output.
//
// v1 had each of the 64 lanes do 6 SSBO loads → 384 loads/WG to cover
// 104 unique bytes (3.7x redundant), and each lane's loads were stride-
// spaced (one cache line per byte under V3D's TMU). PR #36 bench
// showed mc02 was the only qpel position where CPU NEON still beat
// QPU (16.96 ns/op CPU vs 20.54 ns/op QPU; 1.21x CPU favoring).
//
// v2 splits the work into a coalesced load phase + a shared-memory
// compute phase:
//
// Phase 1: the 64 lanes cooperatively load the 104-byte source tile
// into shared memory. Every lane loads the tile byte at index `lane`
// (indices 0..63 = tile rows 0..7 of 13, i.e. source rows -2..+5);
// lanes 0..39 then load a second byte at index `lane + 64` (indices
// 64..103 = tile rows 8..12). Reads within a row are contiguous so the
// SIMD groups coalesce; total SSBO loads = 104, matching the
// unique-byte count.
//
// Phase 2: all 64 lanes compute one output pixel each, reading 6
// bytes from shared. Shared-memory access on V3D is local-store
// backed (no TMU round-trip).
//
// Same WG layout as v1: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
//
// License: BSD-2-Clause.
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
// Shader interface. One workgroup of 64 invocations handles one 8x8
// output block; main() maps invocation i to pixel (row i >> 3, col i & 7).
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Byte-addressed source plane (read-only). main() reads a 13-row × 8-col
// window starting two rows above the block's src offset.
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// Byte-addressed destination plane. Each invocation writes exactly one byte.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// Per-block descriptors, indexed by gl_WorkGroupID.x:
//   .x = byte offset of the block's row 0 col 0 in u_dst
//   .y = byte offset of the block's row 0 col 0 in u_src
//   .z / .w are not read by this shader.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
// Push constants shared by every workgroup of the dispatch.
layout(push_constant) uniform PC {
// Number of valid entries in u_meta; workgroups whose ID is >= n_blocks
// return before touching any buffer.
uint n_blocks;
// Row pitch in bytes. main() applies the same value to both the source
// reads and the destination writes.
uint stride_u8;
// Never read in this shader — presumably padding to a 16-byte push-constant
// block; confirm against the host-side struct.
uint _pad0, _pad1;
} pc;
// 13 source rows × 8 cols. int storage (4 bytes each) — wasteful vs
// uint8_t but avoids 8-bit-shared interop concerns on glslang+v3dv;
// 416 bytes shared/WG is well within any reasonable local-store budget.
// Layout is row-major: tile row t, column c lives at s_tile[t * 8 + c],
// where tile row 0 is the source row two above the block's first row.
shared int s_tile[13 * 8];
// Entry point: one workgroup produces one 8x8 block of vertically
// half-pel-filtered luma (6-tap 1,-5,20,20,-5,1 with +16 rounding, >>5,
// clamped to 0..255). The 13x8 source window is first staged in s_tile,
// then every invocation filters one column position out of shared memory.
void main()
{
    // The guard depends only on the workgroup ID, so either every
    // invocation of a workgroup leaves here or none does; the barrier()
    // further down is therefore always reached uniformly.
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    uint lane  = gl_LocalInvocationID.x;   // 0..63
    uint pitch = pc.stride_u8;

    // Per-block descriptor: .x = destination offset, .y = source offset.
    uvec4 desc    = u_meta.meta[blk];
    uint  out_off = desc.x;
    uint  in_off  = desc.y;

    // The tile begins two rows above the block's first source row. The
    // unsigned subtraction relies on the public API contract that the
    // source offset is at least 2 * stride.
    uint origin = in_off - 2u * pitch;

    // ---- Stage 1: cooperative fill of the 104-entry tile ----
    // Pass A: every invocation fetches tile element `lane`
    // (tile rows 0..7).
    s_tile[lane] = int(u_src.src[origin + (lane / 8u) * pitch + (lane % 8u)]);

    // Pass B: invocations 0..39 fetch the remaining elements 64..103
    // (tile rows 8..12).
    if (lane < 40u) {
        uint t = lane + 64u;
        s_tile[t] = int(u_src.src[origin + (t / 8u) * pitch + (t % 8u)]);
    }

    // All tile writes must be complete before any invocation reads
    // entries written by other invocations.
    barrier();

    // ---- Stage 2: one output pixel per invocation ----
    uint row = lane / 8u;   // 0..7
    uint col = lane % 8u;   // 0..7

    // Output row `row` uses tile rows row..row+5. With an 8-wide
    // row-major tile, tile row (row + k), column col sits at lane + 8k.
    int tap_a = s_tile[lane];         // source row r-2
    int tap_b = s_tile[lane +  8u];   // source row r-1
    int tap_c = s_tile[lane + 16u];   // source row r
    int tap_d = s_tile[lane + 24u];   // source row r+1
    int tap_e = s_tile[lane + 32u];   // source row r+2
    int tap_f = s_tile[lane + 40u];   // source row r+3

    // Symmetric taps grouped: (1, -5, 20, 20, -5, 1), plus the rounding term.
    int acc = (tap_a + tap_f) - 5 * (tap_b + tap_e) + 20 * (tap_c + tap_d) + 16;

    u_dst.dst[out_off + row * pitch + col] = uint8_t(clamp(acc >> 5, 0, 255));
}