h264: V3D shaders for all 15 avg_ qpel positions — qpel QPU complete
Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except that the final write performs an "L2" step, i.e. a rounding average of the new result with the value already in dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): the caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, and writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — the src envelope is the same (10*stride+11 covers any (r±1, c±1) shift), and the L2 step only touches dst. This slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03), but the cost is negligible. It eliminates 15 wrappers + 15 src_max bound calculations that would otherwise be duplicated.

CMake foreach loops compile + install 15 new SPV files. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers are re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                 Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma         ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma         ✓ cycle 7
    Chroma DC Hadamard    CPU only (4 adds + 4 subs; not worth offloading)
    Deblock luma_v        ✓ cycle 8
    Deblock luma_h        ✓ PR #28
    Deblock chroma_v/h    ✓ PR #29
    Deblock *_intra       CPU only (less common, structurally different)
    qpel put_ 15 pos      ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos      ✓ THIS PR

Apart from the two CPU-only rows above (chroma DC Hadamard and the *_intra deblock variants), the H.264 hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
This commit is contained in:
+33
-1
@@ -389,7 +389,24 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
set(H264_QPEL_${_mc}_SPV ${_spv})
|
||||
endforeach()
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV})
|
||||
# avg_ biprediction variants — same shaders as put_, plus a final averaging
|
||||
# step ("L2") with the existing dst contents. Covers all 15 useful positions.
# Each iteration defines one shader build rule and records the output path in
# H264_QPEL_avg_<pos>_SPV, which the daedalus_shaders target and the install
# list reference by name.
|
||||
foreach(_mc mc20 mc02 mc22 mc10 mc30 mc01 mc03
|
||||
mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
|
||||
# Output path of the compiled shader for this position (in the build tree).
set(_spv ${CMAKE_BINARY_DIR}/v3d_h264_qpel_avg_${_mc}.spv)
|
||||
# Build rule: run the tool in GLSLANG_VALIDATOR on the matching .comp source.
# DEPENDS on that source so the rule re-runs whenever the shader changes.
add_custom_command(
|
||||
OUTPUT ${_spv}
|
||||
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||
-o ${_spv}
|
||||
${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp
|
||||
COMMENT "glslang: v3d_h264_qpel_avg_${_mc}.comp -> .spv"
|
||||
VERBATIM
|
||||
)
|
||||
# Publish the output path under a per-position name (note the lowercase
# "avg_<pos>" infix; consumers must spell it the same way).
set(H264_QPEL_avg_${_mc}_SPV ${_spv})
|
||||
endforeach()
|
||||
|
||||
# Aggregate shader target, built by default (ALL). It depends on every shader
# output listed here, now including the 15 H264_QPEL_avg_*_SPV files, so
# building it triggers their add_custom_command rules.
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV} ${H264_QPEL_avg_mc20_SPV} ${H264_QPEL_avg_mc02_SPV} ${H264_QPEL_avg_mc22_SPV} ${H264_QPEL_avg_mc10_SPV} ${H264_QPEL_avg_mc30_SPV} ${H264_QPEL_avg_mc01_SPV} ${H264_QPEL_avg_mc03_SPV} ${H264_QPEL_avg_mc11_SPV} ${H264_QPEL_avg_mc12_SPV} ${H264_QPEL_avg_mc13_SPV} ${H264_QPEL_avg_mc21_SPV} ${H264_QPEL_avg_mc23_SPV} ${H264_QPEL_avg_mc31_SPV} ${H264_QPEL_avg_mc32_SPV} ${H264_QPEL_avg_mc33_SPV})
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -542,6 +559,21 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
${H264_QPEL_mc31_SPV}
|
||||
${H264_QPEL_mc32_SPV}
|
||||
${H264_QPEL_mc33_SPV}
|
||||
${H264_QPEL_avg_mc20_SPV}
|
||||
${H264_QPEL_avg_mc02_SPV}
|
||||
${H264_QPEL_avg_mc22_SPV}
|
||||
${H264_QPEL_avg_mc10_SPV}
|
||||
${H264_QPEL_avg_mc30_SPV}
|
||||
${H264_QPEL_avg_mc01_SPV}
|
||||
${H264_QPEL_avg_mc03_SPV}
|
||||
${H264_QPEL_avg_mc11_SPV}
|
||||
${H264_QPEL_avg_mc12_SPV}
|
||||
${H264_QPEL_avg_mc13_SPV}
|
||||
${H264_QPEL_avg_mc21_SPV}
|
||||
${H264_QPEL_avg_mc23_SPV}
|
||||
${H264_QPEL_avg_mc31_SPV}
|
||||
${H264_QPEL_avg_mc32_SPV}
|
||||
${H264_QPEL_avg_mc33_SPV}
|
||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
||||
)
|
||||
endif()
|
||||
|
||||
Reference in New Issue
Block a user