// daedalus-fourier/src/v3d_h264deblock_chroma_h.comp

// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
// filter across a vertical edge), non-intra bS<4 variant.
//
// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
// (rows).  Same 8-cell × 4-segment geometry, same WG layout (lanes
// 8..15 of each edge early-return — only 8 active per edge).
//
// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
// doesn't address.  daedalus_dispatch_h264_deblock_chroma_h is
// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
//
// License: BSD-2-Clause.

#version 450
#extension GL_EXT_shader_8bit_storage              : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 256 invocations per workgroup.  main() splits the local index into
// 16 edge slots (bits 4..7) of 16 lanes each (bits 0..3); only lanes
// 0..7 of every slot do any work.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

// Per-edge metadata, one uvec4 per edge, indexed by edge number.
// As decoded by main():
//   .x  byte offset into u_dst.dst of the q0 sample on row 0 of the edge
//   .y  bits 0..7 = alpha threshold, bits 8..15 = beta threshold
//   .z  four packed tc0 bytes (signed), byte k covers rows 2k and 2k+1
//   .w  not read by this shader
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
// Pixel plane, addressed per byte; filtered in place (p0 and q0 are
// read and then overwritten by the same invocation).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;

// Push constants supplied by the host for each dispatch.
layout(push_constant) uniform PC {
    // Number of valid entries in u_meta.meta; invocations whose edge
    // index is >= n_edges exit without touching memory.
    uint n_edges;
    // Distance in bytes between vertically adjacent samples in u_dst.dst.
    uint dst_stride_u8;
    // Not read by this shader.
    // NOTE(review): presumably padding to keep the block at 16 bytes /
    // in step with the sibling chroma_v shader — confirm against host code.
    uint _pad0;
    uint _pad1;
} pc;

// Entry point.  Each invocation handles one row of one vertical chroma
// edge: it reads the four samples p1 p0 | q0 q1 that straddle the edge
// on that row and, when the thresholds allow it, rewrites p0 and q0 in
// place.
//
// Workgroup layout: 256 invocations = 16 edge slots x 16 lanes.  Only
// lanes 0..7 of a slot are used (an edge is 8 rows tall here); lanes
// 8..15 and slots past pc.n_edges exit immediately.
void main()
{
    // With local_size_x = 256 the local index is the low 8 bits of the
    // global index.
    const uint local_id = gl_LocalInvocationID.x;
    const uint slot     = local_id >> 4;       // edge slot inside the workgroup, 0..15
    const uint row      = local_id & 15u;      // lane inside the slot, 0..15

    const uint edge = gl_WorkGroupID.x * 16u + slot;
    if (edge >= pc.n_edges || row >= 8u)
        return;

    const uvec4 info = u_meta.meta[edge];

    // info.y packs the two thresholds: alpha in bits 0..7, beta in bits 8..15.
    const int alpha = int(info.y & 0xffu);
    const int beta  = int((info.y >> 8) & 0xffu);

    // info.z packs four tc0 bytes; every pair of rows shares one of them.
    // (b ^ 0x80) - 128 sign-extends the 8-bit value b to a full int.
    const uint tc0_bits = (info.z >> ((row >> 1) * 8u)) & 0xffu;
    const int  tc0      = int(tc0_bits ^ 0x80u) - 128;

    // A zero threshold or a negative tc0 disables filtering for this row.
    const bool row_enabled = (alpha != 0) && (beta != 0) && (tc0 >= 0);
    if (!row_enabled)
        return;

    // info.x is the byte offset of q0 on row 0; step down by whole rows.
    const uint q0_at = info.x + row * pc.dst_stride_u8;
    const uint p0_at = q0_at - 1u;

    const int p1 = int(u_dst.dst[q0_at - 2u]);
    const int p0 = int(u_dst.dst[p0_at]);
    const int q0 = int(u_dst.dst[q0_at]);
    const int q1 = int(u_dst.dst[q0_at + 1u]);

    // Filter only when the step across the edge and the gradients on
    // both sides are all below their thresholds.
    const bool below_thresholds = abs(p0 - q0) < alpha
                               && abs(p1 - p0) < beta
                               && abs(q1 - q0) < beta;

    if (below_thresholds) {
        // Correction is limited to +/-(tc0 + 1).
        const int limit  = tc0 + 1;
        const int raw    = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
        const int adjust = clamp(raw, -limit, limit);

        u_dst.dst[p0_at] = uint8_t(clamp(p0 + adjust, 0, 255));
        u_dst.dst[q0_at] = uint8_t(clamp(q0 - adjust, 0, 255));
    }
}