2079fe39c6
Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except that the final write does an L2 average with the existing dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): the caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via Python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, and writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift); the L2 step only touches dst. This slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03), but the cost is negligible. It eliminates 15 wrappers + 15 src_max bound calculations that would otherwise be duplicated.

CMake foreach loops compile + install 15 new SPV files. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers are re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                  Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma          ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma          ✓ cycle 7
    Chroma DC Hadamard     CPU only (4 adds + 4 subs; not worth it)
    Deblock luma_v         ✓ cycle 8
    Deblock luma_h         ✓ PR #28
    Deblock chroma_v/h     ✓ PR #29
    Deblock *_intra        CPU only (less common, structurally different)
    qpel put_ 15 pos       ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos       ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
53 lines
2.0 KiB
Plaintext
53 lines
2.0 KiB
Plaintext
// daedalus-fourier — H.264 luma qpel avg_mc03 (biprediction) (8x8, ¾-pel vertical),
|
|
// V3D 7.1. Per H.264 §8.4.2.2.1 "n" position:
|
|
//
|
|
//   mc03_value[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1)
|
|
//
|
|
// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c].
|
|
//
|
|
//
|
|
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
|
// dst[r,c] = avg(dst[r,c], mc03_value)
|
|
// Caller pre-loads dst with the list0 prediction; this shader
|
|
// folds in the list1 contribution.
|
|
//
|
|
// License: BSD-2-Clause.
|
|
|
|
#version 450
|
|
#extension GL_EXT_shader_8bit_storage : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
|
|
|
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
|
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
|
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
|
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
|
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
|
|
|
// Widen the source byte at absolute index `idx` to a signed int.
int load_src(uint idx)
{
    return int(u_src.src[idx]);
}

// Mean of two sample values with halves rounded up: (a + b + 1) >> 1.
int rnd_avg(int a, int b)
{
    return (a + b + 1) >> 1;
}

// One workgroup handles one entry of u_meta; each of its 64 invocations
// produces a single byte of that entry's 8x8 output and blends it into
// the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> (row, col): the bits above the low three select
    // the row, the low three bits select the column.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // Per-block offsets: .x indexes into dst, .y indexes into src.
    uvec4 blk_meta = u_meta.meta[blk];
    uint pitch = pc.stride_u8;

    // Index of src[row, col]. The six vertical taps sit at whole-pitch
    // steps from it; unsigned wrap-around makes the subtractions land on
    // the same indices as negative row offsets would.
    uint centre = blk_meta.y + col + row * pitch;

    int above2 = load_src(centre - 2u * pitch);
    int above1 = load_src(centre - pitch);
    int here   = load_src(centre);
    int below1 = load_src(centre + pitch);
    int below2 = load_src(centre + 2u * pitch);
    int below3 = load_src(centre + 3u * pitch);

    // Symmetric (1, -5, 20, 20, -5, 1) tap sum with the +16 rounding term,
    // then scale down by 32 and clip to the 8-bit range.
    int tap_sum = (above2 + below3)
                - 5 * (above1 + below2)
                + 20 * (here + below1)
                + 16;
    int half_pel = clamp(tap_sum >> 5, 0, 255);

    // Blend the half-pel value with src[row + 1, col].
    int predicted = rnd_avg(half_pel, below1);

    // avg_ step: fold the new value into whatever dst currently holds.
    uint out_idx = blk_meta.x + row * pitch + col;
    int existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t(rnd_avg(existing, predicted));
}
|