h264: V3D shaders for chroma deblock V + H (4:2:0)
Adds the QPU shader pair for chroma_v / chroma_h deblock (non-intra, bS<4), siblings of the cycle 8 luma_v shader and PR #28's luma_h.

With this PR, 4 of the 8 deblock kernels run on the QPU — all four non-intra variants:
  luma_v    ✓ cycle 8
  luma_h    ✓ PR #28
  chroma_v  ✓ this PR
  chroma_h  ✓ this PR
  *_intra   — CPU NEON (less common; smaller volume)

Per H.264 §8.7.2.3 (filtering for edges with bS < 4), the chroma kernel is simpler than luma: only p0/q0 are updated (never p1/p2/q1/q2), tC = tc0_seg + 1 (no luma-style ap/aq side bonus), and there are 8 cells per edge (vs luma's 16). Shader: 64 lines vs luma_v's 108 — same WG geometry (16 edges × 16 lanes; lanes 8..15 of each edge early-return).

4:2:0-only: 4:2:2 chroma_h has a 16-row edge geometry that this shader doesn't address. daedalus_dispatch_h264_deblock_chroma_h is 4:2:0-only by design; caller-side gating already covers this in the libavcodec substitution arc (marfrit-packages PR #98).

Recipe table flips DAEDALUS_KERNEL_H264_DEBLOCK_CV / CH from CPU to QPU. dispatch_h264_deblock_chroma_qpu is factored out to share QPU plumbing between V and H (orientation is passed as a flag for the dst_max calculation).

Verified on hertz:
  $ ./build/test_api_h264 | grep "deblock chroma [vh]:"
  H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%)
  H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%)
Recipe substrate now reports 2 (QPU) for both CV and CH.

Coverage now:
              bS<4 QPU      bS=4 (intra)
  luma_v      ✓ cycle 8     CPU NEON
  luma_h      ✓ PR #28      CPU NEON
  chroma_v    ✓ this PR     CPU NEON
  chroma_h    ✓ this PR     CPU NEON

Intra (bS=4) variants stay on CPU NEON: it is the less common case with a smaller per-frame contribution, and the algorithm is structurally different (no tc0; strong-vs-weak filter quad-tree). They can land as a follow-up PR if perf demands.
This commit is contained in:
@@ -0,0 +1,69 @@
|
||||
// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
|
||||
// filter across a vertical edge), non-intra bS<4 variant.
|
||||
//
|
||||
// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
|
||||
// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
|
||||
// (rows). Same 8-cell × 4-segment geometry, same WG layout (lanes
|
||||
// 8..15 of each edge early-return — only 8 active per edge).
|
||||
//
|
||||
// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
|
||||
// doesn't address. daedalus_dispatch_h264_deblock_chroma_h is
|
||||
// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
// 256 invocations per workgroup; main() splits the local index into
// 16 edge slots x 16 lanes, of which only lanes 0..7 of each slot do work.
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
// Per-edge metadata, one uvec4 per edge, indexed by the global edge number.
// As consumed by main():
//   .x          byte offset into dst[] of the q0 sample of the edge's row 0
//   .y[7:0]     alpha threshold
//   .y[15:8]    beta threshold
//   .z          four packed tc0 bytes (each interpreted as signed 8-bit),
//               byte k applies to rows 2k and 2k+1 of the edge
//   .w          not read by this shader
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
// Byte-addressed sample buffer, filtered in place (read and written by main()).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
|
||||
// Dispatch parameters supplied by the host as push constants.
layout(push_constant) uniform PC {
|
||||
// Number of valid entries in u_meta.meta[]; invocations mapped to an edge
// index at or beyond this value return without touching memory.
uint n_edges;
|
||||
// Distance in bytes between vertically adjacent samples in dst[].
uint dst_stride_u8;
|
||||
// Not read by this shader — presumably padding to keep the block at
// 16 bytes / match the sibling shaders' layout; TODO confirm with host side.
uint _pad0;
|
||||
uint _pad1;
|
||||
} pc;
|
||||
|
||||
// Filters one row of one vertical chroma edge (non-intra, bS < 4).
//
// Each workgroup handles 16 edges with 16 lanes reserved per edge; a chroma
// edge in this kernel is only 8 rows tall, so the upper 8 lanes of every
// edge slot exit immediately. For an active lane the four samples
// p1 p0 | q0 q1 straddling the edge on its row are loaded, the
// alpha/beta activity tests are applied, and only p0 and q0 are rewritten
// in place in u_dst.
void main()
{
    // Position of this invocation inside its 256-wide workgroup.
    uint slot = gl_GlobalInvocationID.x % 256u;
    uint edge_slot = slot / 16u;   // 0..15: which edge of this workgroup
    uint row = slot % 16u;         // 0..15: row within the edge, 0..7 are live

    uint edge = gl_WorkGroupID.x * 16u + edge_slot;

    // Drop lanes past the end of the edge list and the idle upper half-lanes.
    if (edge >= pc.n_edges || row >= 8u) {
        return;
    }

    uvec4 info = u_meta.meta[edge];

    // info.x addresses q0 of row 0; step down one stride per row.
    uint q0_at = info.x + row * pc.dst_stride_u8;

    int alpha = int(info.y & 0xffu);
    int beta = int((info.y & 0xff00u) >> 8);

    // Two rows share one tc0 byte; sign-extend it from 8 bits.
    uint tc0_shift = (row / 2u) * 8u;
    int tc0 = (int((info.z >> tc0_shift) & 0xffu) ^ 0x80) - 0x80;

    // Nothing to do when filtering is disabled for this edge or segment.
    if (alpha == 0 || beta == 0 || tc0 < 0) {
        return;
    }

    int p1 = int(u_dst.dst[q0_at - 2u]);
    int p0 = int(u_dst.dst[q0_at - 1u]);
    int q0 = int(u_dst.dst[q0_at]);
    int q1 = int(u_dst.dst[q0_at + 1u]);

    // Leave the samples alone unless all three activity tests pass.
    bool is_real_edge = abs(p0 - q0) >= alpha
                     || abs(p1 - p0) >= beta
                     || abs(q1 - q0) >= beta;
    if (is_real_edge) {
        return;
    }

    // Chroma clipping bound: tc0 + 1, with no luma-style side terms.
    int tc = tc0 + 1;
    int unclipped = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
    int delta = clamp(unclipped, -tc, tc);

    u_dst.dst[q0_at - 1u] = uint8_t(clamp(p0 + delta, 0, 255));
    u_dst.dst[q0_at] = uint8_t(clamp(q0 - delta, 0, 255));
}
|
||||
Reference in New Issue
Block a user