2079fe39c6
Generates 15 avg_ shader variants by templating from the existing put_ shaders.
Each avg_ shader is identical to its put_ sibling except the final write does an L2 average with the existing dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift), the L2 step only touches dst. Slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03) but negligible cost. Eliminates 15 wrappers + 15 src_max bound calculations that would otherwise duplicate.

CMake foreach loops compile + install 15 new SPV files. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                 Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma         ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma         ✓ cycle 7
    Chroma DC Hadamard    CPU only (4 adds + 4 subs; not worth)
    Deblock luma_v        ✓ cycle 8
    Deblock luma_h        ✓ PR #28
    Deblock chroma_v/h    ✓ PR #29
    Deblock *_intra       CPU only (less common, structurally different)
    qpel put_ 15 pos      ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos      ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
95 lines
3.5 KiB
Plaintext
95 lines
3.5 KiB
Plaintext
// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction) (8x8, 2D half-pel "j" position).
// V3D 7.1.
//
// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
//
//   tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
//              - 5*src[r,c+2] + src[r,c+3]                        (int16)
//
//   mc22_value[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
//                              + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
//                              + 512) >> 10)
//
// The +512 >> 10 final scale compensates for both 6-tap scalings.
// CANNOT just cascade mc20→mc02 because the intermediate must be int16
// (no per-stage clip), so this is a dedicated kernel.
//
// Per-lane structure: each lane computes its own (r, c) output by
// running the FULL cascade — 6 horizontal lowpass int16 values for
// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
// lane. No shared memory / barriers needed; V3D L2 absorbs the
// redundant src reads across lanes.
//
// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
// (same as mc20 / mc02).
//
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
//   dst[r,c] = avg(dst[r,c], mc22_value[r,c])
//            = (dst[r,c] + mc22_value[r,c] + 1) >> 1
// Caller pre-loads dst with the list0 prediction; this shader
// folds in the list1 contribution.
//
// License: BSD-2-Clause.
|
|
#version 450
|
|
#extension GL_EXT_shader_8bit_storage : require
|
|
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
|
|
|
// Workgroup shape: 64 invocations laid out linearly along x.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Binding 0 — source pixels as raw bytes; only ever read.
layout(binding = 0) readonly buffer Src
{
    uint8_t src[];
} u_src;

// Binding 1 — destination pixels as raw bytes. Deliberately not readonly:
// main() loads the existing byte and stores the averaged result back.
layout(binding = 1) buffer Dst
{
    uint8_t dst[];
} u_dst;

// Binding 2 — one uvec4 per block. main() takes the dst index from .x and
// the src index from .y; .z and .w are not read by this shader.
layout(binding = 2) readonly buffer Meta
{
    uvec4 meta[];
} u_meta;
|
|
|
|
// Push constants shared by every workgroup of the dispatch.
layout(push_constant) uniform PC {
    // Number of valid entries in u_meta.meta; workgroups whose x index is
    // >= n_blocks return immediately.
    uint n_blocks;
    // Distance in bytes between consecutive rows; main() applies the same
    // value to both the src and the dst addressing.
    uint stride_u8;
    // Not read by this shader (presumably padding the block to 16 bytes —
    // confirm against the host-side struct).
    uint _pad0;
    uint _pad1;
} pc;
|
|
|
|
// Unscaled horizontal 6-tap lowpass with weights (1, -5, 20, 20, -5, 1).
//
//   row_off  index into u_src.src of column 0 of the row being filtered
//   c        column of the output pixel within that row
//
// Reads the six bytes at row_off + c - 2 .. row_off + c + 3 and returns the
// plain weighted sum. No rounding, shift or clip is applied here; the caller
// performs a single +512 >> 10 after the vertical pass, covering both stages.
// With 8-bit inputs the result lies in [-2550, 10710], so it fits in int16.
int hpel_h(uint row_off, uint c)
{
    uint centre = row_off + c;

    // The kernel is symmetric, so add each mirrored pair of taps first and
    // weight the pair once.
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int mid   = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int inner = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);

    return outer - 5 * mid + 20 * inner;
}
|
|
|
|
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Each invocation evaluates the full horizontal-then-vertical 6-tap cascade
// for its own pixel and rounding-averages the result into the byte already
// present in dst.
void main()
{
    // Workgroups beyond the block count have nothing to do.
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major mapping of the 64 lanes onto the 8x8 block: lane = 8*row + col.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane >> 3;
    uint col  = lane & 7u;

    // meta[blk].x = dst index, meta[blk].y = src index of the block origin.
    uvec2 origin = u_meta.meta[blk].xy;
    uint  pitch  = pc.stride_u8;

    // Horizontal lowpass on source rows row-2 .. row+3 at column col.
    // The starting index is built with wrapping unsigned arithmetic, so for
    // row < 2 the (row - 2u) term wraps and still lands on the intended
    // address provided the real address is non-negative. The caller contract
    // is stated as src_off >= 2*stride; note that hpel_h additionally reads
    // two bytes to the left of column 0, so two more bytes of headroom are
    // needed in front of the block.
    uint line = origin.y + (row - 2u) * pitch;
    int h0 = hpel_h(line, col);
    line += pitch;
    int h1 = hpel_h(line, col);
    line += pitch;
    int h2 = hpel_h(line, col);
    line += pitch;
    int h3 = hpel_h(line, col);
    line += pitch;
    int h4 = hpel_h(line, col);
    line += pitch;
    int h5 = hpel_h(line, col);

    // Vertical lowpass over the six intermediates (same symmetric kernel),
    // then one combined rounding shift for both filter stages and a clip to
    // the 8-bit range.
    int sum  = (h0 + h5) - 5 * (h1 + h4) + 20 * (h2 + h3) + 512;
    int pred = clamp(sum >> 10, 0, 255);

    // avg_ write-back: round-to-nearest average with the existing dst byte.
    uint out_idx = origin.x + row * pitch + col;
    int  old     = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((old + pred + 1) >> 1);
}
|