Files
daedalus-fourier/src/v3d_h264deblock_chroma_h.comp
T
claude-noether d8de7754fa h264: V3D shaders for chroma deblock V + H (4:2:0)
Adds the QPU shader pair for chroma_v / chroma_h deblock (non-intra
bS<4), siblings of the cycle 8 luma_v shader and PR #28's luma_h.
Brings deblock QPU coverage to 4 of 8 kernels (all four non-intra variants):

  luma_v   ✓ cycle 8
  luma_h   ✓ PR #28
  chroma_v ✓ this PR
  chroma_h ✓ this PR
  *_intra  — CPU NEON (less common; smaller volume)

Per H.264 §8.7.2.4 chroma kernel is simpler than luma: only p0/q0
updated (never p1/p2/q1/q2), tC = tc0_seg + 1 (no luma-style ap/aq
side bonus), 8 cells per edge (vs luma's 16).  Shader: 64 lines
vs luma_v's 108 — same WG geometry (16 edges × 16 lanes, lanes
8..15 of each edge early-return).

4:2:0-only: 4:2:2 chroma_h has a 16-row edge geometry that this
shader doesn't address; daedalus_dispatch_h264_deblock_chroma_h is
4:2:0-only by design, caller-side gating already covers this in the
libavcodec substitution arc (marfrit-packages PR #98).

Recipe table flips DAEDALUS_KERNEL_H264_DEBLOCK_CV / CH from CPU to
QPU.  dispatch_h264_deblock_chroma_qpu factored to share QPU
plumbing between V and H (orientation passed as a flag for the
dst_max calculation).

Verified on hertz:

  $ ./build/test_api_h264 | grep "deblock chroma [vh]:"
    H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%)

  Recipe substrate now reports 2 (QPU) for both CV and CH.

Coverage now:
                bS<4 QPU     bS=4 (intra)
  luma_v        ✓ cycle 8    CPU NEON
  luma_h        ✓ PR #28     CPU NEON
  chroma_v      ✓ this PR    CPU NEON
  chroma_h      ✓ this PR    CPU NEON

Intra (bS=4) variants stay CPU NEON.  Less common case, smaller
per-frame contribution, and the algorithm is structurally different
(no tc0; strong-vs-weak filter quad-tree).  Can land as a follow-up
PR if perf demands.
2026-05-25 17:10:34 +02:00

70 lines
2.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
// filter across a vertical edge), non-intra bS<4 variant.
//
// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
// (rows). Same 8-cell × 4-segment geometry, same WG layout (lanes
// 8..15 of each edge early-return — only 8 active per edge).
//
// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
// doesn't address. daedalus_dispatch_h264_deblock_chroma_h is
// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
//
// License: BSD-2-Clause.
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require
// One work-group is 256 invocations along x; main() splits them into
// 16 edges of 16 lanes each (only lanes 0..7 of an edge do any work).
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
// Per-edge descriptors, one uvec4 per edge, as consumed by main():
//   .x  byte offset into u_dst.dst of the q0 pixel on the edge's first row
//   .y  alpha in bits 0..7, beta in bits 8..15
//   .z  four packed tc0 bytes (interpreted as signed), one per 2-row
//       segment, lowest byte = segment 0
//   .w  not read by this shader
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
// Byte-addressed pixel buffer. Filtered in place: main() reads p1/p0/q0/q1
// and writes back p0/q0 at the same offsets.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// Push constants shared by every invocation of the dispatch.
layout(push_constant) uniform PC {
// Edge count: invocations whose edge index is >= n_edges return before
// touching u_meta or u_dst.
uint n_edges;
// Distance in bytes between consecutive rows in u_dst.dst.
uint dst_stride_u8;
// Not read by this shader (presumably padding to a 16-byte block — confirm
// against the host-side push-constant struct).
uint _pad0;
uint _pad1;
} pc;
// Entry point: one invocation filters one row of one chroma edge.
//
// The 256 invocations of a work-group are viewed as 16 edges x 16 lanes.
// Lanes 8..15 of every edge, and edges at or past pc.n_edges, exit without
// touching memory. An active lane reads the four pixels p1 p0 | q0 q1 that
// straddle the edge on its row and, when the alpha/beta thresholds pass,
// rewrites p0 and q0 in place. p1 and q1 are never written.
void main()
{
    // Decompose the invocation index into (edge slot, row within edge).
    const uint lane      = gl_GlobalInvocationID.x % 256u;
    const uint edge_slot = lane / 16u;   // 0..15
    const uint row       = lane % 16u;   // 0..15, rows 8..15 are idle
    const uint edge      = gl_WorkGroupID.x * 16u + edge_slot;

    if (row >= 8u || edge >= pc.n_edges) {
        return;
    }

    // Unpack the per-edge descriptor.
    const uvec4 desc  = u_meta.meta[edge];
    const int   alpha = int(desc.y & 0xffu);
    const int   beta  = int((desc.y >> 8) & 0xffu);

    // Each pair of rows shares one tc0 byte; reinterpret it as signed 8-bit.
    const uint segment = row / 2u;
    const int  tc0_raw = int((desc.z >> (8u * segment)) & 0xffu);
    const int  tc0     = (tc0_raw >= 128) ? (tc0_raw - 256) : tc0_raw;

    // A zero threshold or a negative tc0 means this row is left untouched.
    if (alpha == 0 || beta == 0 || tc0 < 0) {
        return;
    }

    // Byte offsets of the two pixels adjacent to the edge on this row.
    const uint q0_at = desc.x + row * pc.dst_stride_u8;
    const uint p0_at = q0_at - 1u;

    const int p1 = int(u_dst.dst[q0_at - 2u]);
    const int p0 = int(u_dst.dst[p0_at]);
    const int q0 = int(u_dst.dst[q0_at]);
    const int q1 = int(u_dst.dst[q0_at + 1u]);

    // Filter only when all three sample differences are under threshold.
    const bool filter_row = abs(p0 - q0) < alpha
                         && abs(p1 - p0) < beta
                         && abs(q1 - q0) < beta;

    if (filter_row) {
        // Chroma clipping bound is tc0 + 1; delta is clamped to [-tc, tc].
        const int tc    = tc0 + 1;
        const int delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);

        u_dst.dst[p0_at] = uint8_t(clamp(p0 + delta, 0, 255));
        u_dst.dst[q0_at] = uint8_t(clamp(q0 - delta, 0, 255));
    }
}