h264: V3D shaders for chroma deblock V + H (4:2:0)
Adds the QPU shader pair for chroma_v / chroma_h deblock (non-intra bS<4), siblings of the cycle 8 luma_v shader and PR #28's luma_h. Closes 4 of 8 deblock QPU coverage at non-intra: luma_v ✓ cycle 8 luma_h ✓ PR #28 chroma_v ✓ this PR chroma_h ✓ this PR *_intra — CPU NEON (less common; smaller volume) Per H.264 §8.7.2.4 chroma kernel is simpler than luma: only p0/q0 updated (never p1/p2/q1/q2), tC = tc0_seg + 1 (no luma-style ap/aq side bonus), 8 cells per edge (vs luma's 16). Shader: 64 lines vs luma_v's 108 — same WG geometry (16 edges × 16 lanes, lanes 8..15 of each edge early-return). 4:2:0-only: 4:2:2 chroma_h has a 16-row edge geometry that this shader doesn't address; daedalus_dispatch_h264_deblock_chroma_h is 4:2:0-only by design, caller-side gating already covers this in the libavcodec substitution arc (marfrit-packages PR #98). Recipe table flips DAEDALUS_KERNEL_H264_DEBLOCK_CV / CH from CPU to QPU. dispatch_h264_deblock_chroma_qpu factored to share QPU plumbing between V and H (orientation passed as a flag for the dst_max calculation). Verified on hertz: $ ./build/test_api_h264 | grep "deblock chroma [vh]:" H.264 deblock chroma v: 256/256 bytes bit-exact (100.0000%) H.264 deblock chroma h: 256/256 bytes bit-exact (100.0000%) Recipe substrate now reports 2 (QPU) for both CV and CH. Coverage now: bS<4 QPU bS=4 (intra) luma_v ✓ cycle 8 CPU NEON luma_h ✓ PR #28 CPU NEON chroma_v ✓ this PR CPU NEON chroma_h ✓ this PR CPU NEON Intra (bS=4) variants stay CPU NEON. Less common case, smaller per-frame contribution, and the algorithm is structurally different (no tc0; strong-vs-weak filter quad-tree). Can land as a follow-up PR if perf demands.
This commit is contained in:
+25
-1
@@ -295,6 +295,28 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
set(H264DEBLOCK_CHROMA_V_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_v.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${H264DEBLOCK_CHROMA_V_SPV}
|
||||
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||
-o ${H264DEBLOCK_CHROMA_V_SPV}
|
||||
${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
|
||||
COMMENT "glslang: v3d_h264deblock_chroma_v.comp -> .spv"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
set(H264DEBLOCK_CHROMA_H_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_h.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${H264DEBLOCK_CHROMA_H_SPV}
|
||||
COMMAND ${GLSLANG_VALIDATOR} -V --target-env vulkan1.3
|
||||
-o ${H264DEBLOCK_CHROMA_H_SPV}
|
||||
${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
|
||||
COMMENT "glslang: v3d_h264deblock_chroma_h.comp -> .spv"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${H264_IDCT4_SPV}
|
||||
@@ -328,7 +350,7 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -462,6 +484,8 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
${CDEF_SPV}
|
||||
${H264DEBLOCK_SPV}
|
||||
${H264DEBLOCK_H_SPV}
|
||||
${H264DEBLOCK_CHROMA_V_SPV}
|
||||
${H264DEBLOCK_CHROMA_H_SPV}
|
||||
${H264_IDCT4_SPV}
|
||||
${H264_IDCT8_SPV}
|
||||
${H264_QPEL_MC20_SPV}
|
||||
|
||||
Reference in New Issue
Block a user