From 2079fe39c66c0272eb37cfbf42a6ade00410b2cd Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 20:22:33 +0200 Subject: [PATCH] =?UTF-8?q?h264:=20V3D=20shaders=20for=20all=2015=20avg=5F?= =?UTF-8?q?=20qpel=20positions=20=E2=80=94=20qpel=20QPU=20complete?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except the final write does an L2 (two-source, rounded) average with the existing dst: put_: dst[r,c] = result avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1 Per H.264 §8.4.2.3.1 (B-slice biprediction): caller pre-loads dst with the list0 prediction; the avg_ call folds in list1. Generated via python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, writes v3d_h264_qpel_avg_mcXY.comp. 52–96 lines each (see diffstat); 15 new shader files. Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift), the L2 step only touches dst. Slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03) but negligible cost. Eliminates 15 wrappers + 15 src_max bound calculations that would otherwise be duplicated. A CMake foreach loop compiles the 15 new SPV files and the install list ships them. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations). Verified on hertz: $ ./build/test_api_h264 | grep "qpel avg" | wc -l 15 $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%" 15 All 15 PASS 2048/2048 bytes bit-exact via QPU. 
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels: Layer Coverage ───────────────────────────────────────────────────────────── IDCT 4x4 luma ✓ cycle 6 (one QPU shader, also handles chroma) IDCT 8x8 luma ✓ cycle 7 Chroma DC Hadamard CPU only (4 adds + 4 subs; not worth offloading) Deblock luma_v ✓ cycle 8 Deblock luma_h ✓ PR #28 Deblock chroma_v/h ✓ PR #29 Deblock *_intra CPU only (less common, structurally different) qpel put_ 15 pos ✓ cycle 9 (mc20) + PRs #30-#33 qpel avg_ 15 pos ✓ THIS PR With this PR, every H.264 hot-path kernel listed above except the chroma DC Hadamard and the *_intra deblock filters runs on the QPU for any consumer that initialises daedalus with a QPU-capable context. --- CMakeLists.txt | 34 +++++++++- src/daedalus_core.c | 116 +++++++++++++++++++++++--------- src/v3d_h264_qpel_avg_mc01.comp | 52 ++++++++++++++ src/v3d_h264_qpel_avg_mc02.comp | 77 +++++++++++++++++++++ src/v3d_h264_qpel_avg_mc03.comp | 52 ++++++++++++++ src/v3d_h264_qpel_avg_mc10.comp | 55 +++++++++++++++ src/v3d_h264_qpel_avg_mc11.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc12.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc13.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc20.comp | 91 +++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc21.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc22.comp | 94 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc23.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc30.comp | 52 ++++++++++++++ src/v3d_h264_qpel_avg_mc31.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc32.comp | 96 ++++++++++++++++++++++++++ src/v3d_h264_qpel_avg_mc33.comp | 96 ++++++++++++++++++++++++++ 17 files changed, 1360 insertions(+), 31 deletions(-) create mode 100644 src/v3d_h264_qpel_avg_mc01.comp create mode 100644 src/v3d_h264_qpel_avg_mc02.comp create mode 100644 src/v3d_h264_qpel_avg_mc03.comp create mode 100644 src/v3d_h264_qpel_avg_mc10.comp create mode 100644 src/v3d_h264_qpel_avg_mc11.comp create mode 100644 src/v3d_h264_qpel_avg_mc12.comp create mode 
100644 src/v3d_h264_qpel_avg_mc13.comp create mode 100644 src/v3d_h264_qpel_avg_mc20.comp create mode 100644 src/v3d_h264_qpel_avg_mc21.comp create mode 100644 src/v3d_h264_qpel_avg_mc22.comp create mode 100644 src/v3d_h264_qpel_avg_mc23.comp create mode 100644 src/v3d_h264_qpel_avg_mc30.comp create mode 100644 src/v3d_h264_qpel_avg_mc31.comp create mode 100644 src/v3d_h264_qpel_avg_mc32.comp create mode 100644 src/v3d_h264_qpel_avg_mc33.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8210760..cfc0d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -389,7 +389,24 @@ if (DAEDALUS_BUILD_VULKAN) set(H264_QPEL_${_mc}_SPV ${_spv}) endforeach() - add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV}) + # avg_ biprediction variants — same shader as put_ + extra L2 with + # existing dst. All 15 useful positions. 
+ foreach(_mc mc20 mc02 mc22 mc10 mc30 mc01 mc03 + mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33) + set(_spv ${CMAKE_BINARY_DIR}/v3d_h264_qpel_avg_${_mc}.spv) + add_custom_command( + OUTPUT ${_spv} + COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3 + -o ${_spv} + ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp + DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp + COMMENT "glslang: v3d_h264_qpel_avg_${_mc}.comp -> .spv" + VERBATIM + ) + set(H264_QPEL_avg_${_mc}_SPV ${_spv}) + endforeach() + + add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV} ${H264_QPEL_avg_mc20_SPV} ${H264_QPEL_avg_mc02_SPV} ${H264_QPEL_avg_mc22_SPV} ${H264_QPEL_avg_mc10_SPV} ${H264_QPEL_avg_mc30_SPV} ${H264_QPEL_avg_mc01_SPV} ${H264_QPEL_avg_mc03_SPV} ${H264_QPEL_avg_mc11_SPV} ${H264_QPEL_avg_mc12_SPV} ${H264_QPEL_avg_mc13_SPV} ${H264_QPEL_avg_mc21_SPV} ${H264_QPEL_avg_mc23_SPV} ${H264_QPEL_avg_mc31_SPV} ${H264_QPEL_avg_mc32_SPV} ${H264_QPEL_avg_mc33_SPV}) # v3d_runner — reusable Vulkan plumbing. 
add_library(v3d_runner STATIC src/v3d_runner.c) @@ -542,6 +559,21 @@ if (DAEDALUS_BUILD_VULKAN) ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV} + ${H264_QPEL_avg_mc20_SPV} + ${H264_QPEL_avg_mc02_SPV} + ${H264_QPEL_avg_mc22_SPV} + ${H264_QPEL_avg_mc10_SPV} + ${H264_QPEL_avg_mc30_SPV} + ${H264_QPEL_avg_mc01_SPV} + ${H264_QPEL_avg_mc03_SPV} + ${H264_QPEL_avg_mc11_SPV} + ${H264_QPEL_avg_mc12_SPV} + ${H264_QPEL_avg_mc13_SPV} + ${H264_QPEL_avg_mc21_SPV} + ${H264_QPEL_avg_mc23_SPV} + ${H264_QPEL_avg_mc31_SPV} + ${H264_QPEL_avg_mc32_SPV} + ${H264_QPEL_avg_mc33_SPV} DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders ) endif() diff --git a/src/daedalus_core.c b/src/daedalus_core.c index a09b9c7..9d9c857 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -72,6 +72,22 @@ struct daedalus_ctx { int h264_qpel_mc31_pipe_ready; v3d_pipeline h264_qpel_mc31_pipe; int h264_qpel_mc32_pipe_ready; v3d_pipeline h264_qpel_mc32_pipe; int h264_qpel_mc33_pipe_ready; v3d_pipeline h264_qpel_mc33_pipe; + /* avg_ biprediction pipelines — same shaders + L2 with existing dst. 
*/ + int h264_qpel_avg_mc20_pipe_ready; v3d_pipeline h264_qpel_avg_mc20_pipe; + int h264_qpel_avg_mc02_pipe_ready; v3d_pipeline h264_qpel_avg_mc02_pipe; + int h264_qpel_avg_mc22_pipe_ready; v3d_pipeline h264_qpel_avg_mc22_pipe; + int h264_qpel_avg_mc10_pipe_ready; v3d_pipeline h264_qpel_avg_mc10_pipe; + int h264_qpel_avg_mc30_pipe_ready; v3d_pipeline h264_qpel_avg_mc30_pipe; + int h264_qpel_avg_mc01_pipe_ready; v3d_pipeline h264_qpel_avg_mc01_pipe; + int h264_qpel_avg_mc03_pipe_ready; v3d_pipeline h264_qpel_avg_mc03_pipe; + int h264_qpel_avg_mc11_pipe_ready; v3d_pipeline h264_qpel_avg_mc11_pipe; + int h264_qpel_avg_mc12_pipe_ready; v3d_pipeline h264_qpel_avg_mc12_pipe; + int h264_qpel_avg_mc13_pipe_ready; v3d_pipeline h264_qpel_avg_mc13_pipe; + int h264_qpel_avg_mc21_pipe_ready; v3d_pipeline h264_qpel_avg_mc21_pipe; + int h264_qpel_avg_mc23_pipe_ready; v3d_pipeline h264_qpel_avg_mc23_pipe; + int h264_qpel_avg_mc31_pipe_ready; v3d_pipeline h264_qpel_avg_mc31_pipe; + int h264_qpel_avg_mc32_pipe_ready; v3d_pipeline h264_qpel_avg_mc32_pipe; + int h264_qpel_avg_mc33_pipe_ready; v3d_pipeline h264_qpel_avg_mc33_pipe; }; daedalus_ctx *daedalus_ctx_create(void) @@ -146,6 +162,21 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx) if (ctx->h264_qpel_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc31_pipe); if (ctx->h264_qpel_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc32_pipe); if (ctx->h264_qpel_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc33_pipe); + if (ctx->h264_qpel_avg_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc20_pipe); + if (ctx->h264_qpel_avg_mc02_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc02_pipe); + if (ctx->h264_qpel_avg_mc22_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc22_pipe); + if (ctx->h264_qpel_avg_mc10_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, 
&ctx->h264_qpel_avg_mc10_pipe); + if (ctx->h264_qpel_avg_mc30_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc30_pipe); + if (ctx->h264_qpel_avg_mc01_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc01_pipe); + if (ctx->h264_qpel_avg_mc03_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc03_pipe); + if (ctx->h264_qpel_avg_mc11_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc11_pipe); + if (ctx->h264_qpel_avg_mc12_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc12_pipe); + if (ctx->h264_qpel_avg_mc13_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc13_pipe); + if (ctx->h264_qpel_avg_mc21_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc21_pipe); + if (ctx->h264_qpel_avg_mc23_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc23_pipe); + if (ctx->h264_qpel_avg_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc31_pipe); + if (ctx->h264_qpel_avg_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc32_pipe); + if (ctx->h264_qpel_avg_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc33_pipe); v3d_runner_destroy(ctx->runner); } free(ctx); @@ -195,21 +226,21 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc31.spv */ case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc32.spv */ case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc33.spv */ - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */ - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU; - case 
DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 avg */ - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonals avg */ - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU; - case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc20.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc02.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc22.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc10.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc30.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc01.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc03.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc11.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc12.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc13.spv */ + case 
DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc21.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc23.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc31.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc32.spv */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc33.spv */ } return DAEDALUS_SUBSTRATE_CPU; } @@ -1837,6 +1868,26 @@ DEFINE_QPEL_DIAG_QPU(mc31) DEFINE_QPEL_DIAG_QPU(mc32) DEFINE_QPEL_DIAG_QPU(mc33) +/* avg_ variants — same diag-style envelope (10*stride+11 covers any + * (r±1, c±1) offset the avg_ shaders use), different SPV file. + * Slightly over-allocates for avg_mc20/02/10/30/01/03 (which need + * less src context) but the cost is negligible. */ +DEFINE_QPEL_DIAG_QPU(avg_mc20) +DEFINE_QPEL_DIAG_QPU(avg_mc02) +DEFINE_QPEL_DIAG_QPU(avg_mc22) +DEFINE_QPEL_DIAG_QPU(avg_mc10) +DEFINE_QPEL_DIAG_QPU(avg_mc30) +DEFINE_QPEL_DIAG_QPU(avg_mc01) +DEFINE_QPEL_DIAG_QPU(avg_mc03) +DEFINE_QPEL_DIAG_QPU(avg_mc11) +DEFINE_QPEL_DIAG_QPU(avg_mc12) +DEFINE_QPEL_DIAG_QPU(avg_mc13) +DEFINE_QPEL_DIAG_QPU(avg_mc21) +DEFINE_QPEL_DIAG_QPU(avg_mc23) +DEFINE_QPEL_DIAG_QPU(avg_mc31) +DEFINE_QPEL_DIAG_QPU(avg_mc32) +DEFINE_QPEL_DIAG_QPU(avg_mc33) + #undef DEFINE_QPEL_DIAG_QPU /* -------------------- Public dispatch entry points -------------- */ @@ -2142,22 +2193,27 @@ DEFINE_QPEL_DIAG_PUBLIC(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23) DEFINE_QPEL_DIAG_PUBLIC(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31) DEFINE_QPEL_DIAG_PUBLIC(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32) DEFINE_QPEL_DIAG_PUBLIC(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33) + +/* avg_ biprediction dispatchers (15 positions) — same macro, the + * underlying _qpu dispatch fns also reuse the diag QPU helper since + * the avg_ shaders share the put_ src envelope (the L2 step only + * touches dst). 
*/ +DEFINE_QPEL_DIAG_PUBLIC(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32) +DEFINE_QPEL_DIAG_PUBLIC(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33) #undef DEFINE_QPEL_DIAG_PUBLIC -DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20) -DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02) -DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22) -DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10) -DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30) -DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01) -DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03) -DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11) -DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12) -DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13) -DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21) -DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23) -DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31) 
-DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32) -DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33) #undef DEFINE_QPEL_DISPATCH diff --git a/src/v3d_h264_qpel_avg_mc01.comp b/src/v3d_h264_qpel_avg_mc01.comp new file mode 100644 index 0000000..4394330 --- /dev/null +++ b/src/v3d_h264_qpel_avg_mc01.comp @@ -0,0 +1,52 @@ +// daedalus-fourier — H.264 luma qpel avg_mc01 (biprediction) (8x8, ¼-pel vertical), +// V3D 7.1. Per H.264 §8.4.2.2.1 "d" position: +// +// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r,c] + 1) >> 1) +// +// Sibling of v3d_h264_qpel_mc02.comp with L2 step against src[r, c]. +// +// +// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1: +// dst[r,c] = avg(dst[r,c], mc01_value) +// Caller pre-loads dst with the list0 prediction; this shader +// folds in the list1 contribution. +// +// License: BSD-2-Clause. + +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src; +layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst; +layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta; +layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc; + +void main() +{ + uint block_idx = gl_WorkGroupID.x; + if (block_idx >= pc.n_blocks) return; + + uint lane = gl_LocalInvocationID.x; + uint r = lane >> 3, c = lane & 7u; + + uint dst_off = u_meta.meta[block_idx].x; + uint src_off = u_meta.meta[block_idx].y; + uint stride = pc.stride_u8; + uint col_base = src_off + c; + + int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]); + int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]); + int s_0 = int(u_src.src[col_base + r * stride]); + int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]); + int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]); + int s_p3 = int(u_src.src[col_base + (r + 3u) * 
stride]); + int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16; + int vp = clamp(v >> 5, 0, 255); + + int avg = (vp + s_0 + 1) >> 1; // L2 with src[r, c] + uint final_off = dst_off + r * stride + c; + int prev = int(u_dst.dst[final_off]); + u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1); +} diff --git a/src/v3d_h264_qpel_avg_mc02.comp b/src/v3d_h264_qpel_avg_mc02.comp new file mode 100644 index 0000000..29bfa48 --- /dev/null +++ b/src/v3d_h264_qpel_avg_mc02.comp @@ -0,0 +1,77 @@ +// daedalus-fourier — H.264 luma qpel avg_mc02 (biprediction) (8x8, vertical half-pel), V3D 7.1. +// +// Sibling of cycle 9's v3d_h264_qpel_mc20.comp. Same 6-tap filter, +// transposed to vertical direction: +// +// dst[r,c] = clip255( +// ( s[r-2,c] +// - 5 * s[r-1,c] +// + 20 * s[r, c] +// + 20 * s[r+1,c] +// - 5 * s[r+2,c] +// + s[r+3,c] +// + 16 +// ) >> 5) +// +// src+src_off points at row 0 col 0 of the OUTPUT block; the filter +// reads rows -2..+3 (2 rows of top context, 3 rows of bottom). +// +// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel. +// +// +// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1: +// dst[r,c] = avg(dst[r,c], mc02_value) +// Caller pre-loads dst with the list0 prediction; this shader +// folds in the list1 contribution. +// +// License: BSD-2-Clause. 
+ +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src; +layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst; +layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta; + +layout(push_constant) uniform PC { + uint n_blocks; + uint stride_u8; + uint _pad0, _pad1; +} pc; + +void main() +{ + uint block_idx = gl_WorkGroupID.x; + if (block_idx >= pc.n_blocks) return; + + uint lane = gl_LocalInvocationID.x; + uint r = lane >> 3; + uint c = lane & 7u; + + uint dst_off = u_meta.meta[block_idx].x; + uint src_off = u_meta.meta[block_idx].y; + uint stride = pc.stride_u8; + + // Read the 6 rows of vertical context at col (c) of THIS output row. + // src_off+r*stride+c is at the OUTPUT pixel position; the kernel + // samples r-2..r+3 along the column. Unsigned-safe because the + // public API contract guarantees src_off >= 2*stride. + uint col_base = src_off + c; + + int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]); + int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]); + int s_0 = int(u_src.src[col_base + r * stride]); + int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]); + int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]); + int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]); + + int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16; + int p = clamp(v >> 5, 0, 255); + + uint final_off = dst_off + r * stride + c; + int prev = int(u_dst.dst[final_off]); + u_dst.dst[final_off] = uint8_t((prev + p + 1) >> 1); +} diff --git a/src/v3d_h264_qpel_avg_mc03.comp b/src/v3d_h264_qpel_avg_mc03.comp new file mode 100644 index 0000000..3fd62ea --- /dev/null +++ b/src/v3d_h264_qpel_avg_mc03.comp @@ -0,0 +1,52 @@ +// daedalus-fourier — H.264 luma qpel avg_mc03 (biprediction) (8x8, ¾-pel vertical), +// V3D 7.1. 
Per H.264 §8.4.2.2.1 "n" position: +// +// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1) +// +// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c]. +// +// +// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1: +// dst[r,c] = avg(dst[r,c], mc03_value) +// Caller pre-loads dst with the list0 prediction; this shader +// folds in the list1 contribution. +// +// License: BSD-2-Clause. + +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src; +layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst; +layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta; +layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc; + +void main() +{ + uint block_idx = gl_WorkGroupID.x; + if (block_idx >= pc.n_blocks) return; + + uint lane = gl_LocalInvocationID.x; + uint r = lane >> 3, c = lane & 7u; + + uint dst_off = u_meta.meta[block_idx].x; + uint src_off = u_meta.meta[block_idx].y; + uint stride = pc.stride_u8; + uint col_base = src_off + c; + + int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]); + int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]); + int s_0 = int(u_src.src[col_base + r * stride]); + int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]); + int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]); + int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]); + int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16; + int vp = clamp(v >> 5, 0, 255); + + int avg = (vp + s_p1 + 1) >> 1; // L2 with src[r+1, c] + uint final_off = dst_off + r * stride + c; + int prev = int(u_dst.dst[final_off]); + u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1); +} diff --git a/src/v3d_h264_qpel_avg_mc10.comp b/src/v3d_h264_qpel_avg_mc10.comp new file mode 100644 index 0000000..941993c --- 
/dev/null +++ b/src/v3d_h264_qpel_avg_mc10.comp @@ -0,0 +1,55 @@ +// daedalus-fourier — H.264 luma qpel avg_mc10 (biprediction) (8x8, ¼-pel horizontal), +// V3D 7.1. Per H.264 §8.4.2.2.1 "a" position: +// +// dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c] + 1) >> 1) +// +// = horizontal half-pel filter, clipped to u8, then L2 rounded-averaged +// with the integer source pixel at the SAME position. Sibling of +// v3d_h264_qpel_mc20.comp with the L2 step added at the tail. +// +// +// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1: +// dst[r,c] = avg(dst[r,c], mc10_value) +// Caller pre-loads dst with the list0 prediction; this shader +// folds in the list1 contribution. +// +// License: BSD-2-Clause. + +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src; +layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst; +layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta; +layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc; + +void main() +{ + uint block_idx = gl_WorkGroupID.x; + if (block_idx >= pc.n_blocks) return; + + uint lane = gl_LocalInvocationID.x; + uint r = lane >> 3, c = lane & 7u; + + uint dst_off = u_meta.meta[block_idx].x; + uint src_off = u_meta.meta[block_idx].y; + uint stride = pc.stride_u8; + uint row_base = src_off + r * stride + c; + + int s_m2 = int(u_src.src[row_base - 2u]); + int s_m1 = int(u_src.src[row_base - 1u]); + int s_0 = int(u_src.src[row_base ]); + int s_p1 = int(u_src.src[row_base + 1u]); + int s_p2 = int(u_src.src[row_base + 2u]); + int s_p3 = int(u_src.src[row_base + 3u]); + int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16; + int hp = clamp(v >> 5, 0, 255); + + // L2 average with the integer source at the SAME (r, c) position. 
+ int avg = (hp + s_0 + 1) >> 1; + uint final_off = dst_off + r * stride + c; + int prev = int(u_dst.dst[final_off]); + u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1); +} diff --git a/src/v3d_h264_qpel_avg_mc11.comp b/src/v3d_h264_qpel_avg_mc11.comp new file mode 100644 index 0000000..ee95e61 --- /dev/null +++ b/src/v3d_h264_qpel_avg_mc11.comp @@ -0,0 +1,96 @@ +// daedalus-fourier — H.264 luma qpel avg_mc11 (biprediction) (8x8, diagonal quarter-pel), +// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel +// anchors via L2 rounded-average: +// +// mc11[r,c] = avg(mc20(r, c), +// mc02(r, c)) +// +// Per-lane structure: each lane computes BOTH anchor outputs at its +// own (r, c) target offset, then L2 averages. No shared memory. +// Same WG geometry as the other qpel shaders. +// +// +// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1: +// dst[r,c] = avg(dst[r,c], mc11_value) +// Caller pre-loads dst with the list0 prediction; this shader +// folds in the list1 contribution. +// +// License: BSD-2-Clause. 
+ +#version 450 +#extension GL_EXT_shader_8bit_storage : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; +layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src; +layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst; +layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta; +layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc; + +int hpel_h(uint src_off, uint stride, uint r, uint c) { + uint row_base = src_off + r * stride + c; + int s_m2 = int(u_src.src[row_base - 2u]); + int s_m1 = int(u_src.src[row_base - 1u]); + int s_0 = int(u_src.src[row_base ]); + int s_p1 = int(u_src.src[row_base + 1u]); + int s_p2 = int(u_src.src[row_base + 2u]); + int s_p3 = int(u_src.src[row_base + 3u]); + int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16; + return clamp(v >> 5, 0, 255); +} + +int hpel_v(uint src_off, uint stride, uint r, uint c) { + uint col_base = src_off + c; + int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]); + int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]); + int s_0 = int(u_src.src[col_base + r * stride]); + int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]); + int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]); + int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]); + int v = s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3 + 16; + return clamp(v >> 5, 0, 255); +} + +int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) { + // Single row's int16 horizontal lowpass (NOT clipped — used as + // intermediate for the vertical pass of hpel_hv). 
+ uint row_base = src_off + rr * stride + c; + int s_m2 = int(u_src.src[row_base - 2u]); + int s_m1 = int(u_src.src[row_base - 1u]); + int s_0 = int(u_src.src[row_base ]); + int s_p1 = int(u_src.src[row_base + 1u]); + int s_p2 = int(u_src.src[row_base + 2u]); + int s_p3 = int(u_src.src[row_base + 3u]); + return s_m2 - 5*s_m1 + 20*s_0 + 20*s_p1 - 5*s_p2 + s_p3; +} + +int hpel_hv(uint src_off, uint stride, uint r, uint c) { + int t0 = hpel_hv_row(src_off, stride, r - 2u, c); + int t1 = hpel_hv_row(src_off, stride, r - 1u, c); + int t2 = hpel_hv_row(src_off, stride, r, c); + int t3 = hpel_hv_row(src_off, stride, r + 1u, c); + int t4 = hpel_hv_row(src_off, stride, r + 2u, c); + int t5 = hpel_hv_row(src_off, stride, r + 3u, c); + int v = t0 - 5*t1 + 20*t2 + 20*t3 - 5*t4 + t5 + 512; + return clamp(v >> 10, 0, 255); +} + +void main() +{ + uint block_idx = gl_WorkGroupID.x; + if (block_idx >= pc.n_blocks) return; + + uint lane = gl_LocalInvocationID.x; + uint r = lane >> 3, c = lane & 7u; + + uint dst_off = u_meta.meta[block_idx].x; + uint src_off = u_meta.meta[block_idx].y; + uint stride = pc.stride_u8; + + int a = hpel_h(src_off, stride, r, c); + int b = hpel_v(src_off, stride, r, c); + int avg = (a + b + 1) >> 1; + uint final_off = dst_off + r * stride + c; + int prev = int(u_dst.dst[final_off]); + u_dst.dst[final_off] = uint8_t((prev + avg + 1) >> 1); +} diff --git a/src/v3d_h264_qpel_avg_mc12.comp b/src/v3d_h264_qpel_avg_mc12.comp new file mode 100644 index 0000000..705213b --- /dev/null +++ b/src/v3d_h264_qpel_avg_mc12.comp @@ -0,0 +1,96 @@ +// daedalus-fourier — H.264 luma qpel avg_mc12 (biprediction) (8x8, diagonal quarter-pel), +// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel +// anchors via L2 rounded-average: +// +// mc12[r,c] = avg(mc22(r, c), +// mc02(r, c)) +// +// Per-lane structure: each lane computes BOTH anchor outputs at its +// own (r, c) target offset, then L2 averages. No shared memory. 
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc12_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc12 sample: rounded mean of the 2D and the vertical half-pel.
+    int anchor_hv = hpel_hv(offs.y, pitch, r, c);
+    int anchor_v  = hpel_v(offs.y, pitch, r, c);
+    int pred = (anchor_hv + anchor_v + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc13.comp b/src/v3d_h264_qpel_avg_mc13.comp
new file mode 100644
index 0000000..8d34b39
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc13.comp
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc13 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc13[r,c] = avg(mc20(r+1, c),
+//                   mc02(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc13_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc13 sample: mean of the row r+1 horizontal and the vertical half-pel.
+    int anchor_h = hpel_h(offs.y, pitch, r + 1u, c);
+    int anchor_v = hpel_v(offs.y, pitch, r, c);
+    int pred = (anchor_h + anchor_v + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc20.comp b/src/v3d_h264_qpel_avg_mc20.comp
new file mode 100644
index 0000000..de74073
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc20.comp
@@ -0,0 +1,91 @@
+// daedalus-fourier — H.264 luma qpel avg_mc20 (biprediction) (8x8, horizontal half-pel), V3D 7.1.
+//
+// H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
+//
+//   dst[r,c] = clip255(
+//       ( s[r,c-2]
+//         - 5 * s[r,c-1]
+//         + 20 * s[r,c]
+//         + 20 * s[r,c+1]
+//         - 5 * s[r,c+2]
+//         + s[r,c+3]
+//         + 16
+//       ) >> 5)
+//
+// Single-stride: dst and src share `stride` (H264QpelContext
+// convention).
src+src_off already points at the leftmost output
+// column (col 0); the filter reads cols -2..+3. Caller guarantees
+// edge-padding context per the public API docstring.
+//
+// Workgroup layout: 64 invocations = 1 lane per output pixel.
+// 1 block per WG; n_blocks WGs total. This is the simplest layout
+// that avoids any inter-lane communication — each lane independently
+// reads its 6 src samples and writes its 1 dst sample. V3D's L2
+// cache handles the redundant reads from adjacent lanes.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc20_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Src {
+    uint8_t src[];
+} u_src;
+
+layout(binding = 1) buffer Dst {
+    uint8_t dst[];
+} u_dst;
+
+layout(binding = 2) readonly buffer Meta {
+    uvec4 meta[]; // .x = dst_off, .y = src_off
+} u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+void main()
+{
+    // Each workgroup owns one 8x8 block; each of its 64 lanes owns one pixel.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;    // column, 0..7
+    uint r = lane_id >> 3;    // row, 0..7
+
+    uvec2 offs  = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint  pitch = pc.stride_u8;
+
+    // centre indexes the src sample under this lane's output pixel.
+    // The taps reach two samples to its left and three to its right;
+    // the unsigned subtractions rely on the caller-guaranteed left
+    // context (src_off >= 2).
+    uint centre = offs.y + r * pitch + c;
+
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre + 0u]) + int(u_src.src[centre + 1u]);
+
+    // (1,-5,20,20,-5,1) kernel, rounded with +16, scaled by >> 5, clipped.
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int half_pel = clamp(sum >> 5, 0, 255);
+
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    int  list0 = int(u_dst.dst[out_idx]);
+    int  mixed = (list0 + half_pel + 1) >> 1;
+
+    u_dst.dst[out_idx] = uint8_t(mixed);
+}
diff --git a/src/v3d_h264_qpel_avg_mc21.comp b/src/v3d_h264_qpel_avg_mc21.comp
new file mode 100644
index 0000000..7b46ea1
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc21.comp
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc21 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc21[r,c] = avg(mc22(r, c),
+//                   mc20(r, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc21_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc21 sample: rounded mean of the 2D and the horizontal half-pel.
+    int anchor_hv = hpel_hv(offs.y, pitch, r, c);
+    int anchor_h  = hpel_h(offs.y, pitch, r, c);
+    int pred = (anchor_hv + anchor_h + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc22.comp b/src/v3d_h264_qpel_avg_mc22.comp
new file mode 100644
index 0000000..0387ad0
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc22.comp
@@ -0,0 +1,94 @@
+// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction) (8x8, 2D half-pel "j" position).
+// V3D 7.1.
+//
+// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
+//
+//   tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
+//              - 5*src[r,c+2] + src[r,c+3] (int16)
+//
+//   dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
+//                       + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
+//                       + 512) >> 10)
+//
+// The +512 >> 10 final scale compensates for both 6-tap scalings.
+// CANNOT just cascade mc20→mc02 because intermediate must be int16
+// (no per-stage clip), so this is a dedicated kernel.
+//
+// Per-lane structure: each lane computes its own (r, c) output by
+// running the FULL cascade — 6 horizontal lowpass int16 values for
+// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
+// lane. No shared memory / barriers needed; V3D L2 absorbs the
+// redundant src reads across lanes.
+//
+// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
+// (same as mc20 / mc02).
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc22_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+
+layout(push_constant) uniform PC {
+    uint n_blocks;
+    uint stride_u8;
+    uint _pad0, _pad1;
+} pc;
+
+// Raw horizontal 6-tap sum for one source row.  row_off is the offset of
+// that row's column 0; the taps cover columns c-2..c+3.  Nothing is
+// rounded, shifted or clipped here — main() applies +512 >> 10 once.
+int hpel_h(uint row_off, uint c)
+{
+    uint centre = row_off + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+
+    uvec2 offs  = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint  pitch = pc.stride_u8;
+
+    // Vertical pass over the raw row sums of rows r-2..r+3.  The row
+    // offsets use wrapping unsigned arithmetic, so rows above the block
+    // resolve correctly provided src_off >= 2*stride (caller contract).
+    // Symmetric taps are paired: outer * 1, inner * -5, core * 20.
+    int outer = hpel_h(offs.y + (r - 2u) * pitch, c) + hpel_h(offs.y + (r + 3u) * pitch, c);
+    int inner = hpel_h(offs.y + (r - 1u) * pitch, c) + hpel_h(offs.y + (r + 2u) * pitch, c);
+    int core  = hpel_h(offs.y + r * pitch, c) + hpel_h(offs.y + (r + 1u) * pitch, c);
+
+    // +512 >> 10 rounds and removes the gain of both 6-tap stages.
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int centre_pel = clamp(sum >> 10, 0, 255);
+
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    int  list0 = int(u_dst.dst[out_idx]);
+    int  mixed = (list0 + centre_pel + 1) >> 1;
+
+    u_dst.dst[out_idx] = uint8_t(mixed);
+}
diff --git a/src/v3d_h264_qpel_avg_mc23.comp b/src/v3d_h264_qpel_avg_mc23.comp
new file mode 100644
index 0000000..222df1f
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc23.comp
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc23 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.
Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc23[r,c] = avg(mc22(r, c),
+//                   mc20(r+1, c))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc23_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc23 sample: mean of the 2D and the row r+1 horizontal half-pel.
+    int anchor_hv = hpel_hv(offs.y, pitch, r, c);
+    int anchor_h  = hpel_h(offs.y, pitch, r + 1u, c);
+    int pred = (anchor_hv + anchor_h + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc30.comp b/src/v3d_h264_qpel_avg_mc30.comp
new file mode 100644
index 0000000..e91f6dc
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc30.comp
@@ -0,0 +1,52 @@
+// daedalus-fourier — H.264 luma qpel avg_mc30 (biprediction) (8x8, ¾-pel horizontal),
+// V3D 7.1.
Per H.264 §8.4.2.2.1 "c" position:
+//
+//   dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c+1] + 1) >> 1)
+//
+// Same as mc10 but L2-averages with src[r, c+1] instead of src[r, c].
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc30_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+void main()
+{
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u, r = lane_id >> 3;
+
+    uvec2 offs   = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint  pitch  = pc.stride_u8;
+    uint  centre = offs.y + r * pitch + c;
+    int   right  = int(u_src.src[centre + 1u]);   // integer sample at (r, c+1)
+
+    // Horizontal half-pel between (r, c) and (r, c+1): 6-tap sum, +16, >> 5, clip.
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre]) + right;
+    int sum   = outer - 5 * inner + 20 * core + 16;
+    int half_pel = clamp(sum >> 5, 0, 255);
+
+    // Three-quarter position: rounded mean of the half-pel and its right neighbour.
+    int pred = (half_pel + right + 1) >> 1;
+
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc31.comp b/src/v3d_h264_qpel_avg_mc31.comp
new file mode 100644
index 0000000..60a20df
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc31.comp
@@ -0,0 +1,96 @@
+//
daedalus-fourier — H.264 luma qpel avg_mc31 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc31[r,c] = avg(mc20(r, c),
+//                   mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc31_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc31 sample: mean of the horizontal and the column c+1 vertical half-pel.
+    int anchor_h = hpel_h(offs.y, pitch, r, c);
+    int anchor_v = hpel_v(offs.y, pitch, r, c + 1u);
+    int pred = (anchor_h + anchor_v + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc32.comp b/src/v3d_h264_qpel_avg_mc32.comp
new file mode 100644
index 0000000..80655f6
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc32.comp
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc32 (biprediction) (8x8, diagonal quarter-pel),
+// V3D
7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc32[r,c] = avg(mc22(r, c),
+//                   mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc32_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc32 sample: mean of the 2D and the column c+1 vertical half-pel.
+    int anchor_hv = hpel_hv(offs.y, pitch, r, c);
+    int anchor_v  = hpel_v(offs.y, pitch, r, c + 1u);
+    int pred = (anchor_hv + anchor_v + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}
diff --git a/src/v3d_h264_qpel_avg_mc33.comp b/src/v3d_h264_qpel_avg_mc33.comp
new file mode 100644
index 0000000..6d1f171
--- /dev/null
+++ b/src/v3d_h264_qpel_avg_mc33.comp
@@ -0,0 +1,96 @@
+// daedalus-fourier — H.264 luma qpel avg_mc33 (biprediction) (8x8, diagonal quarter-pel),
+// V3D 7.1.
Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
+// anchors via L2 rounded-average:
+//
+//   mc33[r,c] = avg(mc20(r+1, c),
+//                   mc02(r, c+1))
+//
+// Per-lane structure: each lane computes BOTH anchor outputs at its
+// own (r, c) target offset, then L2 averages. No shared memory.
+// Same WG geometry as the other qpel shaders.
+//
+//
+// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
+//   dst[r,c] = avg(dst[r,c], mc33_value)
+// Caller pre-loads dst with the list0 prediction; this shader
+// folds in the list1 contribution.
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
+layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
+
+int hpel_h(uint src_off, uint stride, uint r, uint c) {
+    // Clipped horizontal half-pel: taps (1,-5,20,20,-5,1) over columns
+    // c-2..c+3 of row r, rounded with +16 and scaled by >> 5.
+    uint centre = src_off + r * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_v(uint src_off, uint stride, uint r, uint c) {
+    // Clipped vertical half-pel: the same taps run down column c over
+    // rows r-2..r+3, rounded with +16 and scaled by >> 5.
+    uint top = src_off + c;
+    int outer = int(u_src.src[top + (r - 2u) * stride]) + int(u_src.src[top + (r + 3u) * stride]);
+    int inner = int(u_src.src[top + (r - 1u) * stride]) + int(u_src.src[top + (r + 2u) * stride]);
+    int core  = int(u_src.src[top + r * stride]) + int(u_src.src[top + (r + 1u) * stride]);
+    int sum = outer - 5 * inner + 20 * core + 16;
+    int scaled = sum >> 5;
+    return clamp(scaled, 0, 255);
+}
+
+int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
+    // Raw horizontal 6-tap sum for row rr.  Deliberately not rounded,
+    // shifted or clipped: hpel_hv runs its vertical pass over six of
+    // these sums and normalises once at the end.
+    uint centre = src_off + rr * stride + c;
+    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
+    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
+    int core  = int(u_src.src[centre     ]) + int(u_src.src[centre + 1u]);
+
+    int sum = outer - 5 * inner + 20 * core;
+    return sum;
+}
+
+int hpel_hv(uint src_off, uint stride, uint r, uint c) {
+    // 2D half-pel: vertical 6-tap over the raw row sums of rows r-2..r+3;
+    // +512 >> 10 rounds and removes the gain of both filter stages.
+    int outer = hpel_hv_row(src_off, stride, r - 2u, c) + hpel_hv_row(src_off, stride, r + 3u, c);
+    int inner = hpel_hv_row(src_off, stride, r - 1u, c) + hpel_hv_row(src_off, stride, r + 2u, c);
+    int core  = hpel_hv_row(src_off, stride, r, c) + hpel_hv_row(src_off, stride, r + 1u, c);
+    int sum = outer - 5 * inner + 20 * core + 512;
+    int scaled = sum >> 10;
+    return clamp(scaled, 0, 255);
+}
+
+void main()
+{
+    // One workgroup per 8x8 block; surplus workgroups exit at once.
+    uint blk = gl_WorkGroupID.x;
+    if (blk >= pc.n_blocks) { return; }
+    // Lanes map row-major onto the block: lane = 8 * r + c.
+    uint lane_id = gl_LocalInvocationID.x;
+    uint c = lane_id & 7u;
+    uint r = lane_id >> 3;
+    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
+    uint pitch = pc.stride_u8;
+    // mc33 sample: mean of the row r+1 horizontal and column c+1 vertical half-pel.
+    int anchor_h = hpel_h(offs.y, pitch, r + 1u, c);
+    int anchor_v = hpel_v(offs.y, pitch, r, c + 1u);
+    int pred = (anchor_h + anchor_v + 1) >> 1;
+    // Biprediction: rounded mean with the sample already held in dst.
+    uint out_idx = offs.x + r * pitch + c;
+    u_dst.dst[out_idx] = uint8_t((int(u_dst.dst[out_idx]) + pred + 1) >> 1);
+}