h264: V3D shaders for the 8 diagonal qpel positions
Closes the put_ qpel QPU matrix. Adds mc11/12/13/21/23/31/32/33 — each
composes two half-pel anchor outputs via L2 rounded-average:

    mc11 ¼¼ : avg(mc20[r, c], mc02[r, c])
    mc12 ¼½ : avg(mc22[r, c], mc02[r, c])
    mc13 ¼¾ : avg(mc20[r+1, c], mc02[r, c])
    mc21 ½¼ : avg(mc22[r, c], mc20[r, c])
    mc23 ½¾ : avg(mc22[r, c], mc20[r+1, c])
    mc31 ¾¼ : avg(mc20[r, c], mc02[r, c+1])
    mc32 ¾½ : avg(mc22[r, c], mc02[r, c+1])
    mc33 ¾¾ : avg(mc20[r+1, c], mc02[r, c+1])

Per-lane structure: each lane runs the FULL cascade for BOTH anchors at
its own (r, c) target, then L2 averages. No shared memory. Shaders inline
hpel_h() / hpel_v() / hpel_hv() helpers (the latter does the 13×6 int16
cascade per cell). ~88 lines each.

Shaders generated from a python template (POSITIONS table + format
string) — the 8 .comp files are 1:1 with the C reference's
DEFINE_DIAG_REF macro from fourier PR #18.

Dispatch plumbing: shared dispatch_h264_qpel_diag_qpu helper covers all 8
(same src envelope as mc22: src_max = src_off + 10*stride + 11, covering
rows -2..+10 and cols -2..+10 for any (r±1, c±1) offset).

Recipe table: all 8 DAEDALUS_KERNEL_H264_QPEL_MC{11..33} flipped to QPU.
Public dispatchers re-defined via DEFINE_QPEL_DIAG_PUBLIC macro (replaces
the old DEFINE_QPEL_DISPATCH which fast-failed QPU).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel mc[1-3][1-3]"
    H.264 qpel mc11: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc12: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc13: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc21: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc23: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc31: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc32: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc33: 2048/2048 bytes bit-exact (100.0000%)

Meaningful: the (r±1, c±1) offsets are easy to transpose between
positions; passing first try on the asymmetric variants (mc13/23/31/33)
means the position-specific shifts are correct in all 8 templates.
put_ qpel QPU matrix is now COMPLETE: 15 of 15 useful positions (mc00 =
integer copy, no shader needed). avg_ qpel positions (15 more) remain on
CPU NEON; can land as a follow-up since avg_ is just put_ + one extra L2
against existing dst.

    put_  mc20 ✓ mc02 ✓ mc22 ✓          (anchors)
          mc10 ✓ mc30 ✓ mc01 ✓ mc03 ✓   (single-axis ¼-pel)
          mc11 ✓ mc12 ✓ mc13 ✓          (this PR — row-1 diagonals)
          mc21 ✓ mc23 ✓                 (this PR — row-2 diagonals)
          mc31 ✓ mc32 ✓ mc33 ✓          (this PR — row-3 diagonals)
    avg_  all 15 — CPU NEON
This commit is contained in:
+13
-5
@@ -372,10 +372,10 @@ if (DAEDALUS_BUILD_VULKAN)
 VERBATIM
 )

-# Quarter-pel single-axis variants (mc10/30/01/03) — each is the
-# corresponding half-pel filter + L2 average with an integer-source
-# pixel. Same WG geometry as mc20/mc02.
-foreach(_mc mc10 mc30 mc01 mc03)
+# Quarter-pel single-axis variants (mc10/30/01/03) + diagonal
+# variants (mc11/12/13/21/23/31/32/33) — each composes 1-2 half-pel
+# results with optional L2 averaging. Same WG geometry as mc20/mc02.
+foreach(_mc mc10 mc30 mc01 mc03 mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
 set(_spv ${CMAKE_BINARY_DIR}/v3d_h264_qpel_${_mc}.spv)
 add_custom_command(
 OUTPUT ${_spv}
@@ -389,7 +389,7 @@ if (DAEDALUS_BUILD_VULKAN)
 set(H264_QPEL_${_mc}_SPV ${_spv})
 endforeach()

-add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV})
+add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV})

 # v3d_runner — reusable Vulkan plumbing.
 add_library(v3d_runner STATIC src/v3d_runner.c)
@@ -534,6 +534,14 @@ if (DAEDALUS_BUILD_VULKAN)
 ${H264_QPEL_mc30_SPV}
 ${H264_QPEL_mc01_SPV}
 ${H264_QPEL_mc03_SPV}
+${H264_QPEL_mc11_SPV}
+${H264_QPEL_mc12_SPV}
+${H264_QPEL_mc13_SPV}
+${H264_QPEL_mc21_SPV}
+${H264_QPEL_mc23_SPV}
+${H264_QPEL_mc31_SPV}
+${H264_QPEL_mc32_SPV}
+${H264_QPEL_mc33_SPV}
 DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
 )
 endif()
Reference in New Issue
Block a user