h264: V3D shaders for chroma deblock V + H (4:2:0)

Adds the QPU shader pair for chroma_v / chroma_h deblock (non-intra, bS<4), siblings of the cycle 8 luma_v shader and PR #28's luma_h. This brings QPU coverage to 4 of the 8 deblock kernels, i.e. all of the non-intra variants:

    luma_v    ✓ cycle 8
    luma_h    ✓ PR #28
    chroma_v  ✓ this PR
    chroma_h  ✓ this PR
    *_intra   — CPU NEON (less common; smaller volume)

Per H.264 §8.7.2.4 the chroma kernel is simpler than luma: only p0/q0 are updated (never p1/p2/q1/q2), tC = tc0_seg + 1 (no luma-style ap/aq side bonus), and there are 8 cells per edge (vs luma's 16). Shader: 64 lines vs luma_v's 108 — same WG geometry (16 edges × 16 lanes; lanes 8..15 of each edge early-return).

4:2:0-only: 4:2:2 chroma_h has a 16-row edge geometry that this shader doesn't address. daedalus_dispatch_h264_deblock_chroma_h is 4:2:0-only by design; caller-side gating already covers this in the libavcodec substitution arc (marfrit-packages PR #98).

The recipe table flips DAEDALUS_KERNEL_H264_DEBLOCK_CV / CH from CPU to QPU. dispatch_h264_deblock_chroma_qpu is factored out to share the QPU plumbing between V and H (orientation is passed as a flag for the dst_max calculation).

Verified on hertz:

    $ ./build/test_api_h264 | grep "deblock chroma [vh]:"
    H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%)
    H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%)

Recipe substrate now reports 2 (QPU) for both CV and CH.

Coverage now:

                bS<4 QPU      bS=4 (intra)
    luma_v      ✓ cycle 8     CPU NEON
    luma_h      ✓ PR #28      CPU NEON
    chroma_v    ✓ this PR     CPU NEON
    chroma_h    ✓ this PR     CPU NEON

Intra (bS=4) variants stay on CPU NEON: it is the less common case with a smaller per-frame contribution, and the algorithm is structurally different (no tc0; strong-vs-weak filter quad-tree). They can land as a follow-up PR if perf demands.
2026-05-25 17:10:34 +02:00
parent de9266a6eb
commit d8de7754fa
4 changed files with 269 additions and 9 deletions
@@ -0,0 +1,69 @@
+// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
+// filter across a vertical edge), non-intra bS<4 variant.
+//
+// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
+// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
+// (rows).  Same 8-cell × 4-segment geometry, same WG layout (lanes
+// 8..15 of each edge early-return — only 8 active per edge).
+//
+// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
+// doesn't address.  daedalus_dispatch_h264_deblock_chroma_h is
+// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage              : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;  // 256 lanes per workgroup; main() splits them into 16 edges x 16 lane slots
+
+layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;  // one uvec4 per edge, as read by main(): .x = byte index into dst of q0 for row 0, .y = alpha | (beta << 8), .z = four packed tc0 bytes (one per 2-row segment); .w is not read here
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;            // pixel bytes; p0/q0 are rewritten in place
+
+layout(push_constant) uniform PC {
+    uint n_edges;        // number of usable entries in u_meta.meta; lanes mapped to an edge index >= this exit immediately
+    uint dst_stride_u8;  // byte distance between consecutive rows in dst
+    uint _pad0;          // not read by this shader
+    uint _pad1;          // not read by this shader
+} pc;
+
+// One invocation handles one row of one vertical chroma edge: it reads the
+// four bytes p1 p0 | q0 q1 straddling the edge and, if the thresholds pass,
+// rewrites p0 and q0 in place.
+void main()
+{
+    // local_size_x is 256, so the low 8 bits of the global x id give the
+    // lane's position inside its workgroup.
+    uint lane = gl_GlobalInvocationID.x & 255u;
+    uint row  = lane & 15u;                           // lane slot within the edge
+    uint edge = gl_WorkGroupID.x * 16u + (lane >> 4); // 16 edges per workgroup
+
+    // Lanes mapped past the edge list, and the upper 8 lane slots of every
+    // edge, have nothing to do.
+    if (edge >= pc.n_edges || row >= 8u) return;
+
+    uvec4 rec = u_meta.meta[edge];
+
+    // rec.y packs alpha in bits 0..7 and beta in bits 8..15.
+    int alpha = int(rec.y & 0xffu);
+    int beta  = int((rec.y >> 8) & 0xffu);
+
+    // rec.z packs four tc0 bytes, one per pair of rows; each byte is
+    // reinterpreted as a signed 8-bit value.
+    uint tc0_raw = (rec.z >> ((row >> 1) * 8u)) & 0xffu;
+    int  tc0     = (tc0_raw >= 128u) ? int(tc0_raw) - 256 : int(tc0_raw);
+
+    // A zero threshold or a negative tc0 leaves this row untouched; bail
+    // out before any access to dst.
+    if (alpha == 0 || beta == 0 || tc0 < 0) return;
+
+    // rec.x is the byte index of q0 for row 0; successive rows are one
+    // stride apart and the edge neighbours sit at -2, -1, 0, +1.
+    uint q0_at = rec.x + row * pc.dst_stride_u8;
+    uint p0_at = q0_at - 1u;
+
+    int p1 = int(u_dst.dst[q0_at - 2u]);
+    int p0 = int(u_dst.dst[p0_at]);
+    int q0 = int(u_dst.dst[q0_at]);
+    int q1 = int(u_dst.dst[q0_at + 1u]);
+
+    // Filter only when the step across the edge and both side gradients
+    // are below their thresholds.
+    bool across_ok = abs(p0 - q0) < alpha;
+    bool sides_ok  = abs(p1 - p0) < beta && abs(q1 - q0) < beta;
+    if (!(across_ok && sides_ok)) return;
+
+    // Chroma clip bound is tc0 + 1; the correction is limited to [-tc, tc].
+    int tc    = tc0 + 1;
+    int raw   = (4 * (q0 - p0) + (p1 - q1) + 4) >> 3;
+    int delta = clamp(raw, -tc, tc);
+
+    u_dst.dst[p0_at] = uint8_t(clamp(p0 + delta, 0, 255));
+    u_dst.dst[q0_at] = uint8_t(clamp(q0 - delta, 0, 255));
+}