Compare commits
56 Commits
f4af24020f
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 432d127ea9 | |||
| 1347fb961c | |||
| 9be02a9470 | |||
| 989818c2e6 | |||
| 1446b779a6 | |||
| c2d1e9790e | |||
| e506ef0803 | |||
| 2079fe39c6 | |||
| 55d3618408 | |||
| 746533582e | |||
| 224f4be9e2 | |||
| e3c28495ae | |||
| 8b8e8dc6e8 | |||
| 02d564b43e | |||
| 2074a50554 | |||
| bc5edf656d | |||
| 37b75b5813 | |||
| d8de7754fa | |||
| de9266a6eb | |||
| 3db059ffab | |||
| 2faa849ce2 | |||
| cb3aef3dac | |||
| 31c68d0d0e | |||
| df9e1c9d78 | |||
| b9f9ff2a89 | |||
| 1f07f3cd70 | |||
| b21b35c74b | |||
| ba5bbae8e2 | |||
| eef7f034b0 | |||
| 854bdeda20 | |||
| 17d672ebef | |||
| 5565cc2bef | |||
| 18ca708f87 | |||
| 8bc6d27ea7 | |||
| 1ee8b1c0ab | |||
| 01f782cfaf | |||
| 1cc0990c9f | |||
| 1113953f97 | |||
| 76e3076670 | |||
| 0894a46114 | |||
| d0a1db3c8f | |||
| e01f7bc7c6 | |||
| f3d4b15b9a | |||
| 20a4299c5c | |||
| a2575d5e42 | |||
| c3301b0c2e | |||
| 9abc73d308 | |||
| d7100459f2 | |||
| dff610e13d | |||
| c43ee84d8e | |||
| fad600000b | |||
| ce6703a862 | |||
| 5306bf0f61 | |||
| 9b1c106dc5 | |||
| ce436bfd96 | |||
| a5c47aa51c |
+196
-1
@@ -284,6 +284,55 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
# SPIR-V build rules for three further H.264 deblock compute shaders:
# v3d_h264deblock_h, v3d_h264deblock_chroma_v and v3d_h264deblock_chroma_h.
# Each rule runs ${GLSLANG_VALIDATOR} on the .comp source (Vulkan 1.3
# target environment) and is re-run only when that source file changes.
# The *_SPV variables hold the output paths for later commands.

set(H264DEBLOCK_H_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_h.spv)
add_custom_command(
  OUTPUT ${H264DEBLOCK_H_SPV}
  DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_h.comp
  COMMAND ${GLSLANG_VALIDATOR}
          -V --target-env vulkan1.3
          -o ${H264DEBLOCK_H_SPV}
          ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_h.comp
  COMMENT "glslang: v3d_h264deblock_h.comp -> v3d_h264deblock_h.spv"
  VERBATIM
)

set(H264DEBLOCK_CHROMA_V_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_v.spv)
add_custom_command(
  OUTPUT ${H264DEBLOCK_CHROMA_V_SPV}
  DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
  COMMAND ${GLSLANG_VALIDATOR}
          -V --target-env vulkan1.3
          -o ${H264DEBLOCK_CHROMA_V_SPV}
          ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_v.comp
  COMMENT "glslang: v3d_h264deblock_chroma_v.comp -> .spv"
  VERBATIM
)

set(H264DEBLOCK_CHROMA_H_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_chroma_h.spv)
add_custom_command(
  OUTPUT ${H264DEBLOCK_CHROMA_H_SPV}
  DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
  COMMAND ${GLSLANG_VALIDATOR}
          -V --target-env vulkan1.3
          -o ${H264DEBLOCK_CHROMA_H_SPV}
          ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_chroma_h.comp
  COMMENT "glslang: v3d_h264deblock_chroma_h.comp -> .spv"
  VERBATIM
)
|
||||
|
||||
# Intra (bS=4) deblock shaders.  The strong/weak filter selection they
# implement is specified in H.264 §8.7.2.4 (filtering for bS equal to 4).
# Four variants: luma_v / luma_h / chroma_v / chroma_h.
#
# For every variant <k> this defines H264DEBLOCK_<k>_SPV (the output
# path) and a rule that compiles src/v3d_h264deblock_<k>.comp into it.
foreach(_variant IN ITEMS luma_v_intra luma_h_intra chroma_v_intra chroma_h_intra)
  set(H264DEBLOCK_${_variant}_SPV ${CMAKE_BINARY_DIR}/v3d_h264deblock_${_variant}.spv)
  # _spv mirrors the per-variant variable, as in the other shader loops.
  set(_spv ${H264DEBLOCK_${_variant}_SPV})
  add_custom_command(
    OUTPUT ${_spv}
    DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_${_variant}.comp
    COMMAND ${GLSLANG_VALIDATOR}
            -V --target-env vulkan1.3
            -o ${_spv}
            ${CMAKE_SOURCE_DIR}/src/v3d_h264deblock_${_variant}.comp
    COMMENT "glslang: v3d_h264deblock_${_variant}.comp -> .spv"
    VERBATIM
  )
endforeach()
|
||||
|
||||
set(H264_IDCT4_SPV ${CMAKE_BINARY_DIR}/v3d_h264_idct4.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${H264_IDCT4_SPV}
|
||||
@@ -317,7 +366,63 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV})
|
||||
# SPIR-V build rules for the mc02 and mc22 qpel compute shaders.  Each
# rule compiles the matching src/v3d_h264_qpel_<pos>.comp with
# ${GLSLANG_VALIDATOR} and is re-run only when that source changes.

set(H264_QPEL_MC02_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_mc02.spv)
add_custom_command(
  OUTPUT ${H264_QPEL_MC02_SPV}
  DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc02.comp
  COMMAND ${GLSLANG_VALIDATOR}
          -V --target-env vulkan1.3
          -o ${H264_QPEL_MC02_SPV}
          ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc02.comp
  COMMENT "glslang: v3d_h264_qpel_mc02.comp -> v3d_h264_qpel_mc02.spv"
  VERBATIM
)

set(H264_QPEL_MC22_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_mc22.spv)
add_custom_command(
  OUTPUT ${H264_QPEL_MC22_SPV}
  DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc22.comp
  COMMAND ${GLSLANG_VALIDATOR}
          -V --target-env vulkan1.3
          -o ${H264_QPEL_MC22_SPV}
          ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_mc22.comp
  COMMENT "glslang: v3d_h264_qpel_mc22.comp -> v3d_h264_qpel_mc22.spv"
  VERBATIM
)
|
||||
|
||||
# put_ qpel shaders for the quarter-pel single-axis positions
# (mc10/30/01/03) and the diagonal positions (mc11/12/13/21/23/31/32/33).
# Each composes 1-2 half-pel results with optional L2 averaging and uses
# the same WG geometry as mc20/mc02.
#
# For every position <p> this defines H264_QPEL_<p>_SPV (the output path)
# and a rule that compiles src/v3d_h264_qpel_<p>.comp into it.
foreach(_pos IN ITEMS
    mc10 mc30 mc01 mc03
    mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
  set(H264_QPEL_${_pos}_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_${_pos}.spv)
  # _spv mirrors the per-position variable, as in the other shader loops.
  set(_spv ${H264_QPEL_${_pos}_SPV})
  add_custom_command(
    OUTPUT ${_spv}
    DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_${_pos}.comp
    COMMAND ${GLSLANG_VALIDATOR}
            -V --target-env vulkan1.3
            -o ${_spv}
            ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_${_pos}.comp
    COMMENT "glslang: v3d_h264_qpel_${_pos}.comp -> .spv"
    VERBATIM
  )
endforeach()
|
||||
|
||||
# avg_ biprediction qpel shaders: same shader as the put_ variant plus an
# extra L2 average with the existing dst.  Covers all 15 useful positions.
#
# For every position <p> this defines H264_QPEL_avg_<p>_SPV (the output
# path) and a rule that compiles src/v3d_h264_qpel_avg_<p>.comp into it.
foreach(_pos IN ITEMS
    mc20 mc02 mc22
    mc10 mc30 mc01 mc03
    mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
  set(H264_QPEL_avg_${_pos}_SPV ${CMAKE_BINARY_DIR}/v3d_h264_qpel_avg_${_pos}.spv)
  # _spv mirrors the per-position variable, as in the other shader loops.
  set(_spv ${H264_QPEL_avg_${_pos}_SPV})
  add_custom_command(
    OUTPUT ${_spv}
    DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_pos}.comp
    COMMAND ${GLSLANG_VALIDATOR}
            -V --target-env vulkan1.3
            -o ${_spv}
            ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_pos}.comp
    COMMENT "glslang: v3d_h264_qpel_avg_${_pos}.comp -> .spv"
    VERBATIM
  )
endforeach()
|
||||
|
||||
# Aggregate shader target.  It is part of ALL and depends on every
# compiled SPIR-V output, which is what causes the add_custom_command
# rules for those outputs to run during a normal build.
add_custom_target(daedalus_shaders ALL
  DEPENDS
    ${NOOP_SPV}
    ${IDCT8_SPV}
    ${LPF_SPV}
    ${MC_SPV}
    ${LPF8_SPV}
    ${CDEF_SPV}
    ${H264DEBLOCK_SPV}
    ${H264DEBLOCK_H_SPV}
    ${H264DEBLOCK_CHROMA_V_SPV}
    ${H264DEBLOCK_CHROMA_H_SPV}
    ${H264DEBLOCK_luma_v_intra_SPV}
    ${H264DEBLOCK_luma_h_intra_SPV}
    ${H264DEBLOCK_chroma_v_intra_SPV}
    ${H264DEBLOCK_chroma_h_intra_SPV}
    ${H264_IDCT4_SPV}
    ${H264_IDCT8_SPV}
    ${H264_QPEL_MC20_SPV}
    ${H264_QPEL_MC02_SPV}
    ${H264_QPEL_MC22_SPV}
    ${H264_QPEL_mc10_SPV}
    ${H264_QPEL_mc30_SPV}
    ${H264_QPEL_mc01_SPV}
    ${H264_QPEL_mc03_SPV}
    ${H264_QPEL_mc11_SPV}
    ${H264_QPEL_mc12_SPV}
    ${H264_QPEL_mc13_SPV}
    ${H264_QPEL_mc21_SPV}
    ${H264_QPEL_mc23_SPV}
    ${H264_QPEL_mc31_SPV}
    ${H264_QPEL_mc32_SPV}
    ${H264_QPEL_mc33_SPV}
    ${H264_QPEL_avg_mc20_SPV}
    ${H264_QPEL_avg_mc02_SPV}
    ${H264_QPEL_avg_mc22_SPV}
    ${H264_QPEL_avg_mc10_SPV}
    ${H264_QPEL_avg_mc30_SPV}
    ${H264_QPEL_avg_mc01_SPV}
    ${H264_QPEL_avg_mc03_SPV}
    ${H264_QPEL_avg_mc11_SPV}
    ${H264_QPEL_avg_mc12_SPV}
    ${H264_QPEL_avg_mc13_SPV}
    ${H264_QPEL_avg_mc21_SPV}
    ${H264_QPEL_avg_mc23_SPV}
    ${H264_QPEL_avg_mc31_SPV}
    ${H264_QPEL_avg_mc32_SPV}
    ${H264_QPEL_avg_mc33_SPV}
)
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -391,6 +496,11 @@ endif()
|
||||
|
||||
add_library(daedalus_core STATIC
|
||||
src/daedalus_core.c
|
||||
src/h264_chroma_dc.c
|
||||
src/h264_intra_pred_4x4.c
|
||||
src/h264_intra_pred_16x16.c
|
||||
src/h264_intra_pred_chroma8x8.c
|
||||
src/h264_intra_pred_8x8_luma.c
|
||||
src/v3d_runner.c
|
||||
${FFASM_SOURCES}
|
||||
${FFASM_LPF_SOURCES}
|
||||
@@ -445,9 +555,45 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
${LPF8_SPV}
|
||||
${CDEF_SPV}
|
||||
${H264DEBLOCK_SPV}
|
||||
${H264DEBLOCK_H_SPV}
|
||||
${H264DEBLOCK_CHROMA_V_SPV}
|
||||
${H264DEBLOCK_CHROMA_H_SPV}
|
||||
${H264DEBLOCK_luma_v_intra_SPV}
|
||||
${H264DEBLOCK_luma_h_intra_SPV}
|
||||
${H264DEBLOCK_chroma_v_intra_SPV}
|
||||
${H264DEBLOCK_chroma_h_intra_SPV}
|
||||
${H264_IDCT4_SPV}
|
||||
${H264_IDCT8_SPV}
|
||||
${H264_QPEL_MC20_SPV}
|
||||
${H264_QPEL_MC02_SPV}
|
||||
${H264_QPEL_MC22_SPV}
|
||||
${H264_QPEL_mc10_SPV}
|
||||
${H264_QPEL_mc30_SPV}
|
||||
${H264_QPEL_mc01_SPV}
|
||||
${H264_QPEL_mc03_SPV}
|
||||
${H264_QPEL_mc11_SPV}
|
||||
${H264_QPEL_mc12_SPV}
|
||||
${H264_QPEL_mc13_SPV}
|
||||
${H264_QPEL_mc21_SPV}
|
||||
${H264_QPEL_mc23_SPV}
|
||||
${H264_QPEL_mc31_SPV}
|
||||
${H264_QPEL_mc32_SPV}
|
||||
${H264_QPEL_mc33_SPV}
|
||||
${H264_QPEL_avg_mc20_SPV}
|
||||
${H264_QPEL_avg_mc02_SPV}
|
||||
${H264_QPEL_avg_mc22_SPV}
|
||||
${H264_QPEL_avg_mc10_SPV}
|
||||
${H264_QPEL_avg_mc30_SPV}
|
||||
${H264_QPEL_avg_mc01_SPV}
|
||||
${H264_QPEL_avg_mc03_SPV}
|
||||
${H264_QPEL_avg_mc11_SPV}
|
||||
${H264_QPEL_avg_mc12_SPV}
|
||||
${H264_QPEL_avg_mc13_SPV}
|
||||
${H264_QPEL_avg_mc21_SPV}
|
||||
${H264_QPEL_avg_mc23_SPV}
|
||||
${H264_QPEL_avg_mc31_SPV}
|
||||
${H264_QPEL_avg_mc32_SPV}
|
||||
${H264_QPEL_avg_mc33_SPV}
|
||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
||||
)
|
||||
endif()
|
||||
@@ -520,7 +666,15 @@ add_executable(test_api_h264
|
||||
tests/h264_idct8_ref.c
|
||||
tests/h264_deblock_ref.c
|
||||
tests/h264_h_loop_filter_luma_ref.c
|
||||
tests/h264_chroma_loop_filter_ref.c
|
||||
tests/h264_intra_loop_filter_ref.c
|
||||
tests/h264_qpel8_mc20_ref.c
|
||||
tests/h264_qpel8_mc02_ref.c
|
||||
tests/h264_qpel8_mc22_ref.c
|
||||
tests/h264_qpel8_quarter_axis_ref.c
|
||||
tests/h264_qpel8_diag_ref.c
|
||||
tests/h264_qpel8_avg_anchors_ref.c
|
||||
tests/h264_qpel8_avg_rest_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||
@@ -529,6 +683,47 @@ add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
|
||||
target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
|
||||
|
||||
# Tests for the public H.264 intra-prediction primitives.  The bodies
# live in src/ (e.g. src/h264_intra_pred_4x4.c) and are linked into
# daedalus_core for use by libavcodec.so substitution-arc consumers;
# each test below exercises the public symbols through daedalus_core.
#
#   test_intra_pred_4x4        Intra_4x4 luma, 9 modes
#   test_intra_pred_16x16      Intra_16x16 luma, 4 modes
#   test_intra_pred_chroma8x8  Intra_8x8 chroma, 4 modes
#   test_intra_pred_8x8_luma   Intra_8x8 luma (High profile), 9 modes
#                              + 1-2-1 pre-filter
#
# Every test is a single tests/<name>.c translation unit.
foreach(_intra_test IN ITEMS
    test_intra_pred_4x4
    test_intra_pred_16x16
    test_intra_pred_chroma8x8
    test_intra_pred_8x8_luma)
  add_executable(${_intra_test} tests/${_intra_test}.c)
  target_link_libraries(${_intra_test} PRIVATE daedalus_core)
  target_compile_options(${_intra_test} PRIVATE -O2)
endforeach()

# H.264 chroma DC 2x2 Hadamard pre-pass primitive.  Pure transform with
# no QP-dependent scaling (that is caller-side composition).  Built from
# the test driver plus the standalone reference implementation.
add_executable(test_chroma_dc_hadamard
  tests/test_chroma_dc_hadamard.c
  tests/h264_chroma_dc_hadamard_ref.c
)
# daedalus_core supplies the public daedalus_h264_chroma_dc_hadamard_2x2
# symbol used by the public-API parity test.
target_link_libraries(test_chroma_dc_hadamard PRIVATE daedalus_core)
target_compile_options(test_chroma_dc_hadamard PRIVATE -O2)

# H.264 primitives latency benchmark (NEON CPU baseline).
add_executable(bench_h264_primitives tests/bench_h264_primitives.c)
target_link_libraries(bench_h264_primitives PRIVATE daedalus_core)
target_compile_options(bench_h264_primitives PRIVATE -O2)
|
||||
|
||||
add_executable(bench_pool_overhead tests/bench_pool_overhead.c)
|
||||
target_link_libraries(bench_pool_overhead PRIVATE daedalus_core)
|
||||
target_compile_options(bench_pool_overhead PRIVATE -O2)
|
||||
|
||||
@@ -286,6 +286,79 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 chroma (4:2:0) loop filters — bS<4 variant. Chroma uses
|
||||
* the SAME daedalus_h264_deblock_meta struct as luma but on smaller
|
||||
* tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
|
||||
* rows for H (4 segments of 2 rows). Each segment has its own tc0
|
||||
* strength (tc0[s] applies to both cells in segment s).
|
||||
*
|
||||
* Algorithm difference vs luma: chroma updates only p0 and q0
|
||||
* (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
|
||||
* luma-style ap/aq side-condition bonus).
|
||||
*
|
||||
* QPU shaders for chroma deblock not implemented yet; recipe table
|
||||
* routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 bS=4 "intra" loop filters — used at I-MB and inter
|
||||
* macroblock boundaries where boundary strength is forced to 4 per
|
||||
* H.264 §8.7.2.1. Different algorithm from bS<4: per-side strong
|
||||
* vs weak filter decided by quad-tree condition (luma only);
|
||||
* chroma is always weak. No tc0 — the daedalus_h264_deblock_meta
|
||||
* struct's tc0[] field is IGNORED for intra dispatches (callers can
|
||||
* leave it uninitialised or share a single edge list across both
|
||||
* intra and non-intra kernels).
|
||||
*
|
||||
* Reuses the same meta layout as bS<4 dispatches for alpha + beta +
|
||||
* dst_off; tile geometry per orientation is identical to the bS<4
|
||||
* sibling (16-col / 16-row luma; 8-col / 8-row chroma).
|
||||
*
|
||||
* QPU shaders not implemented for any of the four; recipe routes
|
||||
* AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1 (fast fail).
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
int daedalus_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
|
||||
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
|
||||
@@ -319,6 +392,240 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20.
|
||||
* 6-tap filter applied vertically:
|
||||
* dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5)
|
||||
*
|
||||
* Same single-stride convention as mc20. src + src_off points at
|
||||
* row 0 col 0 of the OUTPUT block; the filter reads rows -2..+3, so
|
||||
* the caller must guarantee 2 rows of top context and 3 rows of
|
||||
* bottom context per block (FFmpeg edge-emulated buffer handles
|
||||
* frame boundaries; same contract as mc20).
|
||||
*
|
||||
* QPU shader not implemented yet; recipe table routes AUTO to CPU
|
||||
* NEON. Explicit DAEDALUS_SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma qpel mc22 (2D half-pel "j" position per spec §8.4.2.2.1).
|
||||
* Horizontal 6-tap cascaded into vertical 6-tap with intermediate
|
||||
* 16-bit precision; final +512 >> 10 with clip255. Common position
|
||||
* in real H.264 streams.
|
||||
*
|
||||
* src + src_off points at row 0 col 0 of the OUTPUT block; the
|
||||
* cascade reads rows -2..+10 (13 rows of context) and cols -2..+10
|
||||
* (13 cols of context). Caller must guarantee this context is readable.
|
||||
*
|
||||
* QPU shader not implemented yet (the HV lowpass is the meatiest
|
||||
* qpel kernel; structurally distinct from the 1D mc20 shader).
|
||||
* Recipe routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma single-axis quarter-pel qpel positions ("put"):
|
||||
* mc10 ¼-H ("a" position): clip255(mc20(s)) avg src[r,c]
|
||||
* mc30 ¾-H ("c" position): clip255(mc20(s)) avg src[r,c+1]
|
||||
* mc01 ¼-V ("d" position): clip255(mc02(s)) avg src[r,c]
|
||||
* mc03 ¾-V ("n" position): clip255(mc02(s)) avg src[r+1,c]
|
||||
*
|
||||
* Each is a half-pel lowpass clipped to u8 then averaged with an
|
||||
* integer-aligned source pixel (rounded +1 >> 1). Same edge
|
||||
* context contract as mc20/mc02. CPU-only for now; QPU shaders
|
||||
* not yet implemented. Explicit SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc10(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
int daedalus_dispatch_h264_qpel_mc10(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc30(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
int daedalus_dispatch_h264_qpel_mc30(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc01(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
int daedalus_dispatch_h264_qpel_mc01(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc03(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma diagonal qpel positions ("put", 8 variants). Each is
|
||||
* the rounded average of two half-pel intermediates per H.264
|
||||
* §8.4.2.2.1 / Table 8-4 (decomposition matches the FFmpeg .S
|
||||
* structure; see tests/h264_qpel8_diag_ref.c for the formulas).
|
||||
*
|
||||
* mc11 ¼¼ : avg(mc20[r,c], mc02[r,c])
|
||||
* mc12 ¼½ : avg(mc22[r,c], mc02[r,c])
|
||||
* mc13 ¼¾ : avg(mc20[r+1,c], mc02[r,c])
|
||||
* mc21 ½¼ : avg(mc22[r,c], mc20[r,c])
|
||||
* mc23 ½¾ : avg(mc22[r,c], mc20[r+1,c])
|
||||
* mc31 ¾¼ : avg(mc20[r,c], mc02[r,c+1])
|
||||
* mc32 ¾½ : avg(mc22[r,c], mc02[r,c+1])
|
||||
* mc33 ¾¾ : avg(mc20[r+1,c], mc02[r,c+1])
|
||||
*
|
||||
* CPU-only via vendored FFmpeg NEON; QPU shaders pending.
|
||||
* Explicit SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
#define DECLARE_QPEL_DIAG(name) \
|
||||
int daedalus_recipe_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, \
|
||||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta); \
|
||||
int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate sub, \
|
||||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
DECLARE_QPEL_DIAG(mc11)
|
||||
DECLARE_QPEL_DIAG(mc12)
|
||||
DECLARE_QPEL_DIAG(mc13)
|
||||
DECLARE_QPEL_DIAG(mc21)
|
||||
DECLARE_QPEL_DIAG(mc23)
|
||||
DECLARE_QPEL_DIAG(mc31)
|
||||
DECLARE_QPEL_DIAG(mc32)
|
||||
DECLARE_QPEL_DIAG(mc33)
|
||||
|
||||
#undef DECLARE_QPEL_DIAG
|
||||
|
||||
/* H.264 luma qpel avg_ biprediction variants — all 15 sub-pel positions
|
||||
* (the put_ result is L2-averaged into the existing dst buffer per
|
||||
* H.264 §8.4.2.3.1). Caller is responsible for pre-loading dst with
|
||||
* the list0 prediction; the avg_ call adds list1.
|
||||
*
|
||||
* Same single-stride convention as put_; CPU NEON only for now.
|
||||
*/
|
||||
#define DECLARE_QPEL_AVG(name) \
|
||||
int daedalus_recipe_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, \
|
||||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta); \
|
||||
int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate sub, \
|
||||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
DECLARE_QPEL_AVG(avg_mc20)
|
||||
DECLARE_QPEL_AVG(avg_mc02)
|
||||
DECLARE_QPEL_AVG(avg_mc22)
|
||||
DECLARE_QPEL_AVG(avg_mc10)
|
||||
DECLARE_QPEL_AVG(avg_mc30)
|
||||
DECLARE_QPEL_AVG(avg_mc01)
|
||||
DECLARE_QPEL_AVG(avg_mc03)
|
||||
DECLARE_QPEL_AVG(avg_mc11)
|
||||
DECLARE_QPEL_AVG(avg_mc12)
|
||||
DECLARE_QPEL_AVG(avg_mc13)
|
||||
DECLARE_QPEL_AVG(avg_mc21)
|
||||
DECLARE_QPEL_AVG(avg_mc23)
|
||||
DECLARE_QPEL_AVG(avg_mc31)
|
||||
DECLARE_QPEL_AVG(avg_mc32)
|
||||
DECLARE_QPEL_AVG(avg_mc33)
|
||||
|
||||
#undef DECLARE_QPEL_AVG
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 chroma DC 2x2 Hadamard pre-pass (per H.264 §8.5.11.1).
|
||||
*
|
||||
* Operates in-place on 4 int16 (the DC coefficients of an MB's
|
||||
* chroma 4x4 AC blocks). Pure CPU primitive — no substrate
|
||||
* dispatch wrapper because the work is 4 adds + 4 subs. Callers
|
||||
* compose with QP-dependent scaling themselves; the scale shape
|
||||
* varies by slice/PPS chroma_qp offset context.
|
||||
*
|
||||
* Bit-exact validated against tests/h264_chroma_dc_hadamard_ref.c
|
||||
* (7-case spec-derived test suite including the H·H = 4·I algebraic
|
||||
* invariant; see PR #23).
|
||||
* ----------------------------------------------------------------- */
|
||||
void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 Intra_4x4 luma prediction (per H.264 §8.3.1.4). 9 modes.
|
||||
*
|
||||
* Pure CPU primitives — each is a small straightforward fill of a
|
||||
* 4x4 output block from neighbour pixels in the same buffer. No
|
||||
* substrate-dispatch wrapper (the work is too small to amortise).
|
||||
*
|
||||
* FFmpeg-style interface: `dst` at row 0 col 0 of the 4x4 output.
|
||||
* Reads top-left at dst[-stride-1], top at dst[-stride..-stride+7]
|
||||
* (top-right for DDL/VL), and left at dst[r*stride - 1] for r=0..3.
|
||||
* Caller must ensure all 13 neighbour bytes are valid (interior-MB
|
||||
* assumption — H.264 availability fallback handled at caller).
|
||||
*
|
||||
* Bit-exact validated against tests/test_intra_pred_4x4.c (10-case
|
||||
* spec-derived test suite including the asymmetric Vertical_Right
|
||||
* 16-cell hand-derived case; see fourier PR #12).
|
||||
* ----------------------------------------------------------------- */
|
||||
void daedalus_h264_pred_4x4_vertical (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_dc (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_ddl (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_ddr (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_vr (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_hd (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_vl (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_4x4_hu (uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 Intra_16x16 luma prediction (per §8.3.3). 4 modes:
|
||||
* Vertical / Horizontal / DC / Plane. Same FFmpeg-style interface
|
||||
* as the 4x4 family at 16x16 scale. Same neighbour availability
|
||||
* assumption (interior-MB).
|
||||
* ----------------------------------------------------------------- */
|
||||
void daedalus_h264_pred_16x16_vertical (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_16x16_dc (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_16x16_plane (uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 Intra_8x8 chroma prediction (per §8.3.4, 4:2:0). 4 modes:
|
||||
* DC / Horizontal / Vertical / Plane. Note: DC is per-quadrant
|
||||
* asymmetric; Plane uses slope coefficient 34 (not luma's 5).
|
||||
* ----------------------------------------------------------------- */
|
||||
void daedalus_h264_pred_chroma8x8_dc (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_chroma8x8_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_chroma8x8_vertical (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_chroma8x8_plane (uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 Intra_8x8 luma prediction (High profile, per §8.3.2.1).
|
||||
* 9 modes with the spec-defined 1-2-1 reference-sample pre-filter
|
||||
* applied internally to the 25 neighbours before the mode arithmetic.
|
||||
*
|
||||
* "_8x8l" naming follows the FFmpeg h264pred_template convention
|
||||
* (pred8x8l_<mode>_c) to keep the substitution wrappers a 1:1 name
|
||||
* map.
|
||||
* ----------------------------------------------------------------- */
|
||||
void daedalus_h264_pred_8x8l_vertical (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_dc (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_ddl (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_ddr (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_vr (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_hd (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_vl (uint8_t *dst, ptrdiff_t stride);
|
||||
void daedalus_h264_pred_8x8l_hu (uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* Recipe query — what does the API recommend for each kernel?
|
||||
* ----------------------------------------------------------------- */
|
||||
@@ -333,6 +640,41 @@ typedef enum {
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA = 13,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC22 = 18,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC10 = 19,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC30 = 20,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC01 = 21,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC03 = 22,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC11 = 23,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC12 = 24,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC13 = 25,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC21 = 26,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC23 = 27,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC31 = 28,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC32 = 29,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC33 = 30,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC20 = 31,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC02 = 32,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC22 = 33,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC10 = 34,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC30 = 35,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC01 = 36,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC03 = 37,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC11 = 38,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC12 = 39,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC13 = 40,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC21 = 41,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC23 = 42,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC31 = 43,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC32 = 44,
|
||||
DAEDALUS_KERNEL_H264_QPEL_AVG_MC33 = 45,
|
||||
} daedalus_kernel;
|
||||
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
+1214
-13
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,34 @@
|
||||
/* SPDX-License-Identifier: BSD-2-Clause */
|
||||
/*
|
||||
* H.264 chroma DC 2x2 Hadamard pre-pass (public, in-tree CPU).
|
||||
*
|
||||
* The 4 DC coefficients of an MB's chroma 4x4 AC blocks go through
|
||||
* this 2x2 Hadamard before quant-scaling and re-injection into the
|
||||
* AC blocks' [0,0] coefficient. Algorithm per H.264 §8.5.11.1.
|
||||
*
|
||||
* Pure CPU primitive — there's no substrate-dispatch wrapper because
|
||||
* the work is 4 adds + 4 subs. Callers compose with QP-dependent
|
||||
* scaling themselves (the scale shape varies by slice/PPS chroma_qp
|
||||
* offset context and shouldn't be baked into the kernel).
|
||||
*
|
||||
* Bit-exact validated against tests/h264_chroma_dc_hadamard_ref.c
|
||||
* (7-case spec-derived test suite including the H·H = 4·I algebraic
|
||||
* invariant; see PR #23). Same algorithm; this is the public
|
||||
* src-tree copy.
|
||||
*/
|
||||
#include "daedalus.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4])
{
    /* In-place 2x2 Hadamard butterfly over the four chroma DC
     * coefficients. c[] is read as a row-major 2x2 matrix; all four
     * inputs are consumed before any output is stored. Intermediate
     * arithmetic is done in int and narrowed back to int16_t on store. */
    const int top_sum  = c[0] + c[1];
    const int top_diff = c[0] - c[1];
    const int bot_sum  = c[2] + c[3];
    const int bot_diff = c[2] - c[3];

    c[0] = (int16_t)(top_sum  + bot_sum);   /* sum of all four          */
    c[1] = (int16_t)(top_diff + bot_diff);  /* left column minus right  */
    c[2] = (int16_t)(top_sum  - bot_sum);   /* top row minus bottom     */
    c[3] = (int16_t)(top_diff - bot_diff);  /* diagonal minus anti-diag */
}
|
||||
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma Intra_16x16
|
||||
* prediction modes (per H.264 spec §8.3.2). All 4 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 7-15):
|
||||
* 0 = Vertical
|
||||
* 1 = Horizontal
|
||||
* 2 = DC
|
||||
* 3 = Plane
|
||||
*
|
||||
* Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
|
||||
* pred_16x16_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 16x16 output block. Neighbours:
|
||||
* top[0..15] = dst[-stride + 0 .. -stride + 15]
|
||||
* top-left = dst[-stride - 1]
|
||||
* left[0..15] = dst[ 0*stride - 1 .. 15*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: assumes all neighbours valid (interior-MB case). The
|
||||
* H.264 spec defines fallback for boundary cases (DC averages just
|
||||
* the available side, etc.); the eventual libavcodec intercept
|
||||
* handles availability before calling.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* Mode 0 — Vertical: every one of the 16 rows is a copy of the 16
 * samples directly above the block (dst[-stride + 0..15]). */
void daedalus_h264_pred_16x16_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++)
            row[x] = above[x];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: each row is filled with the sample immediately
 * to its left (dst[row * stride - 1]). */
void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left_sample = row[-1];
        for (int x = 0; x < 16; x++)
            row[x] = left_sample;
    }
}
|
||||
|
||||
/* Mode 2 — DC: the whole block is filled with the rounded mean of the
 * 16 top and 16 left neighbours, (sum + 16) >> 5. */
void daedalus_h264_pred_16x16_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int total = 0;

    /* Gather all 32 neighbours before anything is written. */
    for (int k = 0; k < 16; k++)
        total += above[k] + dst[k * stride - 1];

    const uint8_t dc = (uint8_t)((total + 16) >> 5);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++)
            row[x] = dc;
    }
}
|
||||
|
||||
/* Mode 3 — Plane (per H.264 §8.3.2.4):
|
||||
* H = sum_{i=0..7} (i+1) * (p[7+i+1, -1] - p[7-i-1, -1])
|
||||
* = sum_{i=0..7} (i+1) * (top[8+i] - top[6-i])
|
||||
* V = sum_{j=0..7} (j+1) * (p[-1, 7+j+1] - p[-1, 7-j-1])
|
||||
* = sum_{j=0..7} (j+1) * (left[8+j] - left[6-j])
|
||||
* b = (5*H + 32) >> 6
|
||||
* c = (5*V + 32) >> 6
|
||||
* a = 16 * (p[-1, 15] + p[15, -1])
|
||||
* = 16 * (left[15] + top[15])
|
||||
* pred[y][x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)
|
||||
*
|
||||
* Note: spec indexing uses [x, y] with x = col, y = row (or vice
|
||||
* versa depending on the section). Here I use the FFmpeg convention
|
||||
* pred[y][x] = pred[row][col]; the H = horizontal-slope formula uses
|
||||
* the TOP row's left-vs-right asymmetry; V = vertical-slope uses the
|
||||
* LEFT col's top-vs-bottom asymmetry. The top-left corner p[-1,-1]
* DOES participate in both sums in the 16x16 case: at i = 7 (j = 7)
* the subtracted term top[6-i] (left[6-j]) has index -1, which is the
* corner sample.
|
||||
*/
|
||||
void daedalus_h264_pred_16x16_plane(uint8_t *dst, ptrdiff_t stride)
{
    /* above[-1] and left[-stride] both address the top-left corner, so
     * the k == 8 term of each gradient picks it up without a special
     * case: above[7 - 8] and left[(7 - 8) * stride]. */
    const uint8_t *above = dst - stride;
    const uint8_t *left  = dst - 1;      /* left[k * stride] = left sample of row k */

    /* Weighted differences between samples mirrored around position 7
     * of the top row (h_grad) and of the left column (v_grad). */
    int h_grad = 0;
    int v_grad = 0;
    for (int k = 1; k <= 8; k++) {
        h_grad += k * (above[7 + k] - above[7 - k]);
        v_grad += k * (left[(7 + k) * stride] - left[(7 - k) * stride]);
    }

    const int slope_x = (5 * h_grad + 32) >> 6;
    const int slope_y = (5 * v_grad + 32) >> 6;
    const int base    = 16 * (left[15 * stride] + above[15]);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        /* Row-invariant part of the plane equation, rounding included. */
        const int row_base = base + slope_y * (y - 7) + 16;
        for (int x = 0; x < 16; x++)
            row[x] = (uint8_t) clip_u8((row_base + slope_x * (x - 7)) >> 5);
    }
}
|
||||
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma Intra_4x4
|
||||
* prediction modes (per H.264 spec §8.3.1.4). All 9 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 8-2):
|
||||
* 0 = Vertical
|
||||
* 1 = Horizontal
|
||||
* 2 = DC
|
||||
* 3 = Diagonal_Down_Left
|
||||
* 4 = Diagonal_Down_Right
|
||||
* 5 = Vertical_Right
|
||||
* 6 = Horizontal_Down
|
||||
* 7 = Vertical_Left
|
||||
* 8 = Horizontal_Up
|
||||
*
|
||||
* Calling convention matches FFmpeg's h264pred:
|
||||
* pred_4x4_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 4x4 output block. Neighbour
|
||||
* pixels come from the already-decoded surrounding pixels in the same
|
||||
* buffer:
|
||||
* top-left = dst[-stride - 1]
|
||||
* top[0..3] = dst[-stride + 0 .. -stride + 3]
|
||||
* top-right = dst[-stride + 4 .. -stride + 7] (DDL / VL only)
|
||||
* left[0..3] = dst[ 0*stride - 1 .. 3*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: this reference assumes ALL neighbours are available
|
||||
* (the "interior MB" case). The H.264 spec defines fallback behaviour
|
||||
* for unavailable neighbours (e.g. DC averages only the available
|
||||
* side, top-right substitution from top[3] for DDL/VL near the right
|
||||
* frame edge); those branches are NOT modelled here. Tests must
|
||||
* exercise the kernel with all 13 neighbour bytes valid. The eventual
|
||||
* libavcodec intercept handles availability before calling.
|
||||
*
|
||||
* License: BSD-2-Clause for the reference + tests; the underlying
|
||||
* algorithm is from H.264/ITU-T H.264 (2003) and AVC standards, free
|
||||
* to implement.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Helper: rounded 1-2-1 weighted average, (a + 2*b + c + 2) >> 2,
 * narrowed to uint8_t. */
static inline uint8_t avg3(int a, int b, int c)
{
    const int weighted = a + 2 * b + c + 2;
    return (uint8_t)(weighted >> 2);
}
|
||||
|
||||
/* Helper: rounded mean of two samples, (a + b + 1) >> 1, narrowed to
 * uint8_t. */
static inline uint8_t avg2(int a, int b)
{
    const int rounded = a + b + 1;
    return (uint8_t)(rounded >> 1);
}
|
||||
|
||||
/* Mode 0 — Vertical: each of the 4 rows is a copy of the 4 samples
 * directly above the block (dst[-stride + 0..3]). */
void daedalus_h264_pred_4x4_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++)
            row[x] = above[x];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: each row is filled with the sample immediately
 * to its left (dst[row * stride - 1]). */
void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left_sample = row[-1];
        for (int x = 0; x < 4; x++)
            row[x] = left_sample;
    }
}
|
||||
|
||||
/* Mode 2 — DC: the block is filled with the rounded mean of the 4 top
 * and 4 left neighbours, (sum + 4) >> 3. */
void daedalus_h264_pred_4x4_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int total = 0;

    /* Gather all 8 neighbours before anything is written. */
    for (int k = 0; k < 4; k++)
        total += above[k] + dst[k * stride - 1];

    const uint8_t dc = (uint8_t)((total + 4) >> 3);

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++)
            row[x] = dc;
    }
}
|
||||
|
||||
/* Mode 3 — Diagonal_Down_Left. Reads top[0..7] (top plus top-right);
 * every output sample is a 1-2-1 filtered top sample selected by its
 * anti-diagonal index (col + row). */
void daedalus_h264_pred_4x4_ddl(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int t[9];
    uint8_t diag[7];

    /* Load the 8 top samples and replicate the last one, so the final
     * tap (t6, t7, t7) falls out of the same 3-tap expression. */
    for (int k = 0; k < 8; k++)
        t[k] = above[k];
    t[8] = t[7];

    for (int k = 0; k < 7; k++)
        diag[k] = avg3(t[k], t[k + 1], t[k + 2]);

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++)
            row[x] = diag[x + y];
    }
}
|
||||
|
||||
/* Mode 4 — Diagonal_Down_Right. Reads top-left, top[0..3] and
 * left[0..3]; every output sample is a 1-2-1 filtered edge sample
 * selected by its diagonal index (col - row). */
void daedalus_h264_pred_4x4_ddr(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int edge[9];
    uint8_t diag[7];

    /* Unroll the L-shaped border into one line running from the
     * bottom-left sample, up the left column, through the corner and
     * along the top row: l3 l2 l1 l0 tl t0 t1 t2 t3. */
    for (int j = 0; j < 4; j++)
        edge[3 - j] = dst[j * stride - 1];
    edge[4] = above[-1];
    for (int i = 0; i < 4; i++)
        edge[5 + i] = above[i];

    /* diag[k] belongs to diagonal (col - row) == k - 3. */
    for (int k = 0; k < 7; k++)
        diag[k] = avg3(edge[k], edge[k + 1], edge[k + 2]);

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++)
            row[x] = diag[x - y + 3];
    }
}
|
||||
|
||||
/* Mode 5 — Vertical_Right. Reads top-left, top[0..3] and left[0..2].
 * Rows 2 and 3 repeat rows 0 and 1 shifted one column to the right,
 * with a left-column tap entering at column 0. */
void daedalus_h264_pred_4x4_vr(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    const int tl = above[-1];
    const int t0 = above[0], t1 = above[1], t2 = above[2], t3 = above[3];
    const int l0 = dst[-1];
    const int l1 = dst[stride - 1];
    const int l2 = dst[2 * stride - 1];

    /* 2-tap values used on the even rows. */
    const uint8_t e0 = avg2(tl, t0);
    const uint8_t e1 = avg2(t0, t1);
    const uint8_t e2 = avg2(t1, t2);
    const uint8_t e3 = avg2(t2, t3);
    /* 3-tap values used on the odd rows. */
    const uint8_t o0 = avg3(l0, tl, t0);
    const uint8_t o1 = avg3(tl, t0, t1);
    const uint8_t o2 = avg3(t0, t1, t2);
    const uint8_t o3 = avg3(t1, t2, t3);
    /* Left-column taps entering at column 0 of rows 2 and 3. */
    const uint8_t in2 = avg3(tl, l0, l1);
    const uint8_t in3 = avg3(l0, l1, l2);

    const uint8_t out[4][4] = {
        { e0,  e1, e2, e3 },
        { o0,  o1, o2, o3 },
        { in2, e0, e1, e2 },
        { in3, o0, o1, o2 },
    };

    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            dst[y * stride + x] = out[y][x];
}
|
||||
|
||||
/* Mode 6 — Horizontal_Down. Reads top-left, top[0..2] and left[0..3].
 * Each row repeats the first two samples of the row above it in its
 * columns 2 and 3. */
void daedalus_h264_pred_4x4_hd(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    const int tl = above[-1];
    const int t0 = above[0], t1 = above[1], t2 = above[2];
    const int l0 = dst[-1];
    const int l1 = dst[stride - 1];
    const int l2 = dst[2 * stride - 1];
    const int l3 = dst[3 * stride - 1];

    /* Column-0 (2-tap) and column-1 (3-tap) value of each row. */
    const uint8_t a0 = avg2(tl, l0), b0 = avg3(l0, tl, t0);
    const uint8_t a1 = avg2(l0, l1), b1 = avg3(tl, l0, l1);
    const uint8_t a2 = avg2(l1, l2), b2 = avg3(l0, l1, l2);
    const uint8_t a3 = avg2(l2, l3), b3 = avg3(l1, l2, l3);
    /* Top-row-only taps for the right half of row 0. */
    const uint8_t c0 = avg3(tl, t0, t1);
    const uint8_t d0 = avg3(t0, t1, t2);

    const uint8_t out[4][4] = {
        { a0, b0, c0, d0 },
        { a1, b1, a0, b0 },
        { a2, b2, a1, b1 },
        { a3, b3, a2, b2 },
    };

    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            dst[y * stride + x] = out[y][x];
}
|
||||
|
||||
/* Mode 7 — Vertical_Left. Reads top[0..6] only: the largest tap index
 * is reached at row 3, col 3 (t4, t5, t6), so top[7] is never needed
 * and is not touched.
 *
 * With k = col + (row >> 1):
 *   even rows: avg2(t[k], t[k+1])
 *   odd rows : avg3(t[k], t[k+1], t[k+2])
 */
void daedalus_h264_pred_4x4_vl(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int t[7];

    /* Snapshot the neighbours before any output sample is written. */
    for (int k = 0; k < 7; k++)
        t[k] = above[k];

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++) {
            const int k = x + (y >> 1);
            row[x] = (y & 1) ? avg3(t[k], t[k + 1], t[k + 2])
                             : avg2(t[k], t[k + 1]);
        }
    }
}
|
||||
|
||||
/* Mode 8 — Horizontal_Up. Reads left[0..3] only.
 *
 * With z = col + 2*row and j = z >> 1:
 *   z even, z <= 4: avg2(l[j], l[j+1])
 *   z odd,  z <= 5: avg3(l[j], l[j+1], l[j+2])   (l[4] replicates l[3])
 *   z > 5         : l[3]
 */
void daedalus_h264_pred_4x4_hu(uint8_t *dst, ptrdiff_t stride)
{
    int l[5];

    /* Snapshot the left column; the extra slot repeats the last sample
     * so the z == 5 tap is (l2, l3, l3). */
    for (int j = 0; j < 4; j++)
        l[j] = dst[j * stride - 1];
    l[4] = l[3];

    for (int y = 0; y < 4; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 4; x++) {
            const int z = x + 2 * y;
            const int j = z >> 1;
            if (z > 5)
                row[x] = (uint8_t) l[3];
            else if (z & 1)
                row[x] = avg3(l[j], l[j + 1], l[j + 2]);
            else
                row[x] = avg2(l[j], l[j + 1]);
        }
    }
}
|
||||
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma Intra_8x8
|
||||
* prediction modes (per H.264 spec §8.3.2.1). High-profile-only
|
||||
* MB type — Baseline/Main/Extended profiles don't see Intra_8x8.
|
||||
*
|
||||
* Distinct from Intra_4x4 in two ways:
|
||||
*
|
||||
* 1. REFERENCE SAMPLE FILTERING (§8.3.2.1.1). The 25 raw
|
||||
* neighbour samples are pre-filtered with a 1-2-1 smoothing
|
||||
* filter BEFORE prediction. The filtering has spec-defined
|
||||
* boundary handling at the corners and the right-edge of the
|
||||
* top-row extension.
|
||||
*
|
||||
* 2. SCALE. All 9 prediction modes operate at 8x8 with the
|
||||
* filtered samples (Intra_4x4 operates at 4x4 with the raw
|
||||
* samples).
|
||||
*
|
||||
* This file implements the filter and all 9 modes: the 3 simple modes
* (Vertical, Horizontal, DC) and the 6 directional modes (DDL, DDR, VR,
* HD, VL, HU at 8x8), which follow the same template with different
* formulas per spec sections §8.3.2.1.5..§8.3.2.1.10.
|
||||
*
|
||||
* Calling convention (FFmpeg-style):
|
||||
* pred_8x8_<mode>_ref(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0 col 0 of the 8x8 output block. Reads from
|
||||
* top[0..15] = dst[-stride + 0..15]
|
||||
* top-left = dst[-stride - 1]
|
||||
* left[0..7] = dst[ 0*stride - 1 .. 7*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: assumes all neighbours valid (interior-MB case).
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* H.264 §8.3.2.1.1 reference sample filtering. Filters the 25 raw
|
||||
* samples around the 8x8 block into a `filt` array with the same
|
||||
* indices. When called against an "all neighbours available" tile,
|
||||
* the filtered output uses these spec-defined formulas:
|
||||
*
|
||||
* filt[top -1] (= filtered top-left) = (top[0] + 2*tl + left[0] + 2) >> 2
|
||||
*
|
||||
* filt[top 0] = (tl + 2*top[0] + top[1] + 2) >> 2
|
||||
* filt[top i] for 1<=i<=14 = (top[i-1] + 2*top[i] + top[i+1] + 2) >> 2
|
||||
* filt[top 15] = (top[14] + 3*top[15] + 2) >> 2 (boundary)
|
||||
*
|
||||
* filt[left 0] = (tl + 2*left[0] + left[1] + 2) >> 2
|
||||
* filt[left j] for 1<=j<=6 = (left[j-1] + 2*left[j] + left[j+1] + 2) >> 2
|
||||
* filt[left 7] = (left[6] + 3*left[7] + 2) >> 2 (boundary)
|
||||
*
|
||||
* Reads neighbours from the dst buffer; writes filtered values to
|
||||
* a caller-provided 25-element array indexed as:
|
||||
* filt[0] = filtered top-left
|
||||
* filt[1..16] = filtered top[0..15]
|
||||
* filt[17..24] = filtered left[0..7]
|
||||
*/
|
||||
static void filter_refs(const uint8_t *dst, ptrdiff_t stride,
                        uint8_t filt[25])
{
    /* Copy the raw neighbours into two padded lines so that every
     * filtered sample, including both ends, is the same 1-2-1 tap:
     *   row[] = corner, top[0..15],  top[15] repeated
     *   col[] = corner, left[0..7],  left[7] repeated
     * Repeating the last sample gives the (x[n-1] + 3*x[n] + 2) >> 2
     * boundary formula. All raw samples are read before filt[] is
     * written. */
    const uint8_t *above = dst - stride;
    const int corner = above[-1];
    int row[18];
    int col[10];

    row[0] = corner;
    for (int i = 0; i < 16; i++)
        row[1 + i] = above[i];
    row[17] = row[16];

    col[0] = corner;
    for (int j = 0; j < 8; j++)
        col[1 + j] = dst[j * stride - 1];
    col[9] = col[8];

    /* filt[0]: corner, smoothed between top[0] and left[0]. */
    filt[0] = (uint8_t)((row[1] + 2 * corner + col[1] + 2) >> 2);

    /* filt[1..16]: filtered top[0..15]. */
    for (int i = 0; i < 16; i++)
        filt[1 + i] = (uint8_t)((row[i] + 2 * row[i + 1] + row[i + 2] + 2) >> 2);

    /* filt[17..24]: filtered left[0..7]. */
    for (int j = 0; j < 8; j++)
        filt[17 + j] = (uint8_t)((col[j] + 2 * col[j + 1] + col[j + 2] + 2) >> 2);
}
|
||||
|
||||
/* Convenience macros for accessing the filt[] array by spec-style index. */
|
||||
#define FT(i) filt[1 + (i)] /* filtered top[i], i in 0..15 */
|
||||
#define FL(j) filt[17 + (j)] /* filtered left[j], j in 0..7 */
|
||||
#define FTL filt[0] /* filtered top-left */
|
||||
|
||||
/* Mode 0 Vertical (§8.3.2.1.2): every row is a copy of the first 8
 * filtered top samples (filt[1..8]). */
void daedalus_h264_pred_8x8l_vertical(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    const uint8_t *ftop = filt + 1;   /* ftop[i] = filtered top[i] */

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = ftop[x];
    }
}
|
||||
|
||||
/* Mode 1 Horizontal (§8.3.2.1.3): each row is filled with its filtered
 * left sample (filt[17 + row]). */
void daedalus_h264_pred_8x8l_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    const uint8_t *fleft = filt + 17;   /* fleft[j] = filtered left[j] */

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t fill = fleft[y];
        for (int x = 0; x < 8; x++)
            row[x] = fill;
    }
}
|
||||
|
||||
/* Mode 2 DC (§8.3.2.1.4): the block is filled with the rounded mean of
 * the 8 filtered top and 8 filtered left samples. 16 samples are
 * summed, so the divide is >> 4 with a half-step rounding term of +8
 * (not +4 as in the 4x4 case). */
void daedalus_h264_pred_8x8l_dc(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    const uint8_t *ftop  = filt + 1;    /* filtered top[0..15] */
    const uint8_t *fleft = filt + 17;   /* filtered left[0..7] */

    int total = 0;
    for (int k = 0; k < 8; k++)
        total += ftop[k] + fleft[k];

    const uint8_t dc = (uint8_t)((total + 8) >> 4);

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = dc;
    }
}
|
||||
|
||||
/* --- 6 directional modes for Intra_8x8 (H.264 §8.3.2.1.5..§8.3.2.1.10).
|
||||
* Transcribed from FFmpeg libavcodec/h264pred_template.c
|
||||
* pred8x8l_{down_left, down_right, vertical_right, horizontal_down,
|
||||
* vertical_left, horizontal_up} (LGPL-2.1+ in the original; algorithm
|
||||
* reproduced here for test purposes).
|
||||
*
|
||||
* All 6 use the same FILTERED reference samples produced by
|
||||
* filter_refs() above. Mapping from FFmpeg's t0..t15 / l0..l7 / lt
|
||||
* notation:
|
||||
* tN = FT(N) for N in 0..15
|
||||
* lN = FL(N) for N in 0..7
|
||||
* lt = FTL
|
||||
*
|
||||
* SRC(x,y) maps to dst[y*stride + x] (col x, row y).
|
||||
*/
|
||||
#define SRC(x, y) dst[(y) * stride + (x)]
|
||||
#define T(i) FT(i)
|
||||
#define L(j) FL(j)
|
||||
#define LT FTL
|
||||
|
||||
/* Mode 3 DDL (Diagonal_Down_Left) — uses TOP + TOP_RIGHT, no LEFT.
 *
 * Every sample depends only on its anti-diagonal k = col + row:
 *   k in 0..13: (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2
 *   k == 14   : (t[14] + 3*t[15] + 2) >> 2
 * where t[] are the filtered top samples.
 */
void daedalus_h264_pred_8x8l_ddl(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    const uint8_t *t = filt + 1;   /* t[i] = filtered top[i], i in 0..15 */
    uint8_t diag[15];

    for (int k = 0; k < 14; k++)
        diag[k] = (uint8_t)((t[k] + 2 * t[k + 1] + t[k + 2] + 2) >> 2);
    diag[14] = (uint8_t)((t[14] + 3 * t[15] + 2) >> 2);

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = diag[x + y];
    }
}
|
||||
|
||||
/* Mode 4 DDR (Diagonal_Down_Right).
 *
 * The filtered border is unrolled into one line
 *   edge[] = l7 l6 ... l0  lt  t0 t1 ... t7
 * and every sample on diagonal d = col - row is the 1-2-1 tap centred
 * on edge[8 + d].
 */
void daedalus_h264_pred_8x8l_ddr(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    int edge[17];
    for (int j = 0; j < 8; j++)
        edge[7 - j] = filt[17 + j];     /* filtered left[j]  */
    edge[8] = filt[0];                  /* filtered top-left */
    for (int i = 0; i < 8; i++)
        edge[9 + i] = filt[1 + i];      /* filtered top[i]   */

    /* diag[k] belongs to diagonal d == k - 7. */
    uint8_t diag[15];
    for (int k = 0; k < 15; k++)
        diag[k] = (uint8_t)((edge[k] + 2 * edge[k + 1] + edge[k + 2] + 2) >> 2);

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = diag[x - y + 7];
    }
}
|
||||
|
||||
/* Mode 5 VR (Vertical_Right).
 *
 * The filtered border is unrolled as
 *   edge[] = l7 l6 ... l0  lt  t0 t1 ... t7
 * (left[j] at edge[7 - j], top-left at edge[8], top[i] at edge[9 + i]).
 * With z = 2*col - row and i = col - (row >> 1):
 *   z <= -2          : 3-tap down the left column, centred on edge[9 + z]
 *   z >= -1, row odd : 3-tap centred on edge[8 + i]
 *   z >=  0, row even: 2-tap of edge[8 + i] and edge[9 + i]
 */
void daedalus_h264_pred_8x8l_vr(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    int edge[17];
    for (int j = 0; j < 8; j++)
        edge[7 - j] = filt[17 + j];
    edge[8] = filt[0];
    for (int i = 0; i < 8; i++)
        edge[9 + i] = filt[1 + i];

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++) {
            const int z = 2 * x - y;
            int v;
            if (z < -1) {
                v = (edge[8 + z] + 2 * edge[9 + z] + edge[10 + z] + 2) >> 2;
            } else {
                const int i = x - (y >> 1);
                /* z and row share parity, so test the non-negative row. */
                if (y & 1)
                    v = (edge[7 + i] + 2 * edge[8 + i] + edge[9 + i] + 2) >> 2;
                else
                    v = (edge[8 + i] + edge[9 + i] + 1) >> 1;
            }
            row[x] = (uint8_t) v;
        }
    }
}
|
||||
|
||||
/* Mode 6 HD (Horizontal_Down).
 *
 * The filtered border is unrolled as
 *   edge[] = t7 t6 ... t0  lt  l0 l1 ... l7
 * (top[i] at edge[7 - i], top-left at edge[8], left[j] at edge[9 + j]).
 * With z = 2*row - col and j = row - (col >> 1):
 *   z <= -2          : 3-tap along the top row, centred on edge[9 + z]
 *   z >= -1, col odd : 3-tap centred on edge[8 + j]
 *   z >=  0, col even: 2-tap of edge[8 + j] and edge[9 + j]
 */
void daedalus_h264_pred_8x8l_hd(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    int edge[17];
    for (int i = 0; i < 8; i++)
        edge[7 - i] = filt[1 + i];
    edge[8] = filt[0];
    for (int j = 0; j < 8; j++)
        edge[9 + j] = filt[17 + j];

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++) {
            const int z = 2 * y - x;
            int v;
            if (z < -1) {
                v = (edge[8 + z] + 2 * edge[9 + z] + edge[10 + z] + 2) >> 2;
            } else {
                const int j = y - (x >> 1);
                /* z and col share parity, so test the non-negative col. */
                if (x & 1)
                    v = (edge[7 + j] + 2 * edge[8 + j] + edge[9 + j] + 2) >> 2;
                else
                    v = (edge[8 + j] + edge[9 + j] + 1) >> 1;
            }
            row[x] = (uint8_t) v;
        }
    }
}
|
||||
|
||||
/* Mode 7 VL (Vertical_Left) — uses TOP + TOP_RIGHT only.
 *
 * With k = col + (row >> 1) and t[] the filtered top samples:
 *   even rows: (t[k] + t[k+1] + 1) >> 1
 *   odd rows : (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2
 * The largest index touched is t[12] (row 7, col 7).
 */
void daedalus_h264_pred_8x8l_vl(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    const uint8_t *t = filt + 1;   /* t[i] = filtered top[i] */

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++) {
            const int k = x + (y >> 1);
            int v;
            if (y & 1)
                v = (t[k] + 2 * t[k + 1] + t[k + 2] + 2) >> 2;
            else
                v = (t[k] + t[k + 1] + 1) >> 1;
            row[x] = (uint8_t) v;
        }
    }
}
|
||||
|
||||
/* Mode 8 HU (Horizontal_Up) — uses LEFT only.
 *
 * With z = col + 2*row, j = z >> 1 and l[] the filtered left samples
 * (l[8] repeats l[7]):
 *   z even, z <= 12: (l[j] + l[j+1] + 1) >> 1
 *   z odd,  z <= 13: (l[j] + 2*l[j+1] + l[j+2] + 2) >> 2
 *                    (at z == 13 this is (l6 + 3*l7 + 2) >> 2)
 *   z > 13         : l[7]
 */
void daedalus_h264_pred_8x8l_hu(uint8_t *dst, ptrdiff_t stride)
{
    uint8_t filt[25];
    filter_refs(dst, stride, filt);

    int l[9];
    for (int j = 0; j < 8; j++)
        l[j] = filt[17 + j];
    l[8] = l[7];

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++) {
            const int z = x + 2 * y;
            const int j = z >> 1;
            int v;
            if (z > 13)
                v = l[7];
            else if (z & 1)
                v = (l[j] + 2 * l[j + 1] + l[j + 2] + 2) >> 2;
            else
                v = (l[j] + l[j + 1] + 1) >> 1;
            row[x] = (uint8_t) v;
        }
    }
}
|
||||
|
||||
#undef SRC
|
||||
#undef T
|
||||
#undef L
|
||||
#undef LT
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 chroma Intra_8x8
|
||||
* prediction modes (per H.264 §8.3.3), used for both Cb and Cr
|
||||
* planes at 4:2:0. All 4 modes.
|
||||
*
|
||||
* Mode index → name (per H.264 Table 7-16):
|
||||
* 0 = DC (per-quadrant — asymmetric, see §8.3.3.2)
|
||||
* 1 = Horizontal
|
||||
* 2 = Vertical
|
||||
* 3 = Plane (slope coefficient 34, distinct from luma's 5)
|
||||
*
|
||||
* Calling convention (same shape as luma intra refs):
|
||||
* pred_chroma8x8_<mode>(uint8_t *dst, ptrdiff_t stride)
|
||||
*
|
||||
* `dst` points at row 0, col 0 of the 8x8 output block (single
|
||||
* component plane — Cb or Cr, dispatched independently). Neighbours:
|
||||
* top[0..7] = dst[-stride + 0 .. -stride + 7]
|
||||
* top-left = dst[-stride - 1]
|
||||
* left[0..7] = dst[ 0*stride - 1 .. 7*stride - 1]
|
||||
*
|
||||
* AVAILABILITY: assumes all neighbours valid (interior-MB case).
|
||||
* The H.264 spec defines per-quadrant fallback for the DC mode at
|
||||
* MB boundaries; that's caller-side via the libavcodec intercept.
|
||||
*
|
||||
* License: BSD-2-Clause.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range: values below 0 become 0,
 * values above 255 become 255, everything else is returned unchanged. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* Mode 0 — DC, evaluated separately for each 4x4 quadrant of the 8x8
 * chroma block (H.264 §8.3, Intra_Chroma_DC, 4:2:0 layout).
 *
 * With all neighbours available (the only case handled here) the four
 * quadrant values are:
 *   top-left  : (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
 *   top-right : (sum(top[4..7])                   + 2) >> 2
 *   bot-left  : (                 sum(left[4..7]) + 2) >> 2
 *   bot-right : (sum(top[4..7]) + sum(left[4..7]) + 4) >> 3
 *
 * The rule is intentionally asymmetric: the top-right quadrant uses only
 * the four samples directly above it (top[4..7]) and ignores the left
 * column, and the bottom-left quadrant uses only the four samples
 * directly to its left (left[4..7]) and ignores the top row.
 *
 * dst    : row 0, col 0 of the 8x8 block; top row is read at dst - stride,
 *          left column at dst[y * stride - 1].
 * stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_chroma8x8_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    /* Index 0 accumulates samples 0..3, index 1 accumulates samples 4..7. */
    int sum_top[2]  = { 0, 0 };
    int sum_left[2] = { 0, 0 };
    for (int half = 0; half < 2; half++) {
        for (int k = 0; k < 4; k++) {
            const int idx = 4 * half + k;
            sum_top[half]  += above[idx];
            sum_left[half] += dst[idx * stride - 1];
        }
    }

    /* dc[quadrant row][quadrant column] */
    const uint8_t dc[2][2] = {
        { (uint8_t)((sum_top[0] + sum_left[0] + 4) >> 3),
          (uint8_t)((sum_top[1]               + 2) >> 2) },
        { (uint8_t)((sum_left[1]              + 2) >> 2),
          (uint8_t)((sum_top[1] + sum_left[1] + 4) >> 3) },
    };

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = dc[y >> 2][x >> 2];
    }
}
|
||||
|
||||
/* Mode 1 — Horizontal: every sample in a row takes the value of that
 * row's left neighbour, dst[y * stride - 1].
 *
 * dst    : row 0, col 0 of the 8x8 block.
 * stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_chroma8x8_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left = row[-1];
        for (int x = 0; x < 8; x++)
            row[x] = left;
    }
}
|
||||
|
||||
/* Mode 2 — Vertical: every sample in a column takes the value of the
 * sample directly above the block in that column, dst[-stride + x].
 *
 * dst    : row 0, col 0 of the 8x8 block.
 * stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_chroma8x8_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 8; x++)
            row[x] = above[x];
    }
}
|
||||
|
||||
/* Mode 3 — Plane (H.264 §8.3, Intra_Chroma_Plane, 4:2:0).
 *
 * With p[x, y] denoting neighbour samples (y = -1 is the row above the
 * block, x = -1 the column to its left):
 *   H = sum_{k=1..4} k * (p[3+k, -1] - p[3-k, -1])
 *   V = sum_{k=1..4} k * (p[-1, 3+k] - p[-1, 3-k])
 *   b = (34 * H + 32) >> 6
 *   c = (34 * V + 32) >> 6
 *   a = 16 * (p[-1, 7] + p[7, -1])
 *   pred[y][x] = clip to [0,255] of ((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
 * For k = 4 the subtracted sample has index -1, i.e. the top-left corner
 * p[-1, -1].
 *
 * Differences from the Intra_16x16 luma Plane mode:
 *   - slope coefficient is 34 (not 5);
 *   - the plane is centred on (x - 3, y - 3) (not x - 7, y - 7);
 *   - each gradient sums 4 differences (not 8).
 *
 * dst    : row 0, col 0 of the 8x8 block.
 * stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_chroma8x8_plane(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    int grad_h = 0;
    int grad_v = 0;
    for (int k = 1; k <= 4; k++) {
        /* k == 4 addresses above[-1] / dst[-stride - 1]: the corner sample. */
        grad_h += k * (above[3 + k] - above[3 - k]);
        grad_v += k * (dst[(3 + k) * stride - 1] - dst[(3 - k) * stride - 1]);
    }

    const int slope_x = (34 * grad_h + 32) >> 6;
    const int slope_y = (34 * grad_v + 32) >> 6;
    /* Plane offset with the +16 rounding term already folded in. */
    const int base = 16 * (dst[7 * stride - 1] + above[7]) + 16;

    /* All neighbour reads are finished; the block can now be overwritten. */
    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        const int row_term = base + slope_y * (y - 3);
        for (int x = 0; x < 8; x++) {
            const int v = (row_term + slope_x * (x - 3)) >> 5;
            row[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc01 (biprediction) (8x8, ¼-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "d" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// Sibling of v3d_h264_qpel_mc02.comp with L2 step against src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc01_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Computes the vertical half-pel value, averages it with src[r, c] to get the
// quarter-pel sample, then folds that into the value already held in dst
// with a second rounded average.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    uint row = gl_LocalInvocationID.x >> 3;
    uint col = gl_LocalInvocationID.x & 7u;

    uvec2 offs  = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
    uint  pitch = pc.stride_u8;

    // Six vertical taps: rows r-2 .. r+3 of this column. For r < 2 the row
    // term wraps as a uint and un-wraps when added to the column base.
    uint column = offs.y + col;
    int tap[6];
    for (uint k = 0u; k < 6u; ++k) {
        tap[k] = int(u_src.src[column + (row + k - 2u) * pitch]);
    }

    int filtered = (tap[0] + tap[5]) - 5 * (tap[1] + tap[4]) + 20 * (tap[2] + tap[3]) + 16;
    int half_pel = clamp(filtered >> 5, 0, 255);

    // Quarter-pel: rounded average with the integer sample at src[r, c].
    int sample = (half_pel + tap[2] + 1) >> 1;

    uint out_idx  = offs.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc02 (biprediction) (8x8, vertical half-pel), V3D 7.1.
|
||||
//
|
||||
// Sibling of cycle 9's v3d_h264_qpel_mc20.comp. Same 6-tap filter,
|
||||
// transposed to vertical direction:
|
||||
//
|
||||
// dst[r,c] = clip255(
|
||||
// ( s[r-2,c]
|
||||
// - 5 * s[r-1,c]
|
||||
// + 20 * s[r, c]
|
||||
// + 20 * s[r+1,c]
|
||||
// - 5 * s[r+2,c]
|
||||
// + s[r+3,c]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// src+src_off points at row 0 col 0 of the OUTPUT block; the filter
|
||||
// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
|
||||
//
|
||||
// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc02_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Computes the vertical half-pel value and rounded-averages it with the value
// already stored at the same position in dst.
void main()
{
    if (gl_WorkGroupID.x < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;
        uint x   = pix & 7u;

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[gl_WorkGroupID.x].x;
        uint src_base = u_meta.meta[gl_WorkGroupID.x].y;

        // Start two rows above the output pixel and walk down six rows.
        // The uint subtraction may wrap for y < 2; this is safe because the
        // public API contract guarantees src_off >= 2*stride.
        uint at  = src_base + x + (y - 2u) * pitch;
        int  acc = 16;                                   // rounding term
        acc +=      int(u_src.src[at]);  at += pitch;    // row y-2
        acc -=  5 * int(u_src.src[at]);  at += pitch;    // row y-1
        acc += 20 * int(u_src.src[at]);  at += pitch;    // row y
        acc += 20 * int(u_src.src[at]);  at += pitch;    // row y+1
        acc -=  5 * int(u_src.src[at]);  at += pitch;    // row y+2
        acc +=      int(u_src.src[at]);                  // row y+3

        int half_pel = clamp(acc >> 5, 0, 255);

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + half_pel + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc03 (biprediction) (8x8, ¾-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "n" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1)
|
||||
//
|
||||
// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc03_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Computes the vertical half-pel value, averages it with src[r+1, c] to get
// the three-quarter-pel sample, then folds that into the value already held
// in dst with a second rounded average.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    uint row = gl_LocalInvocationID.x >> 3;
    uint col = gl_LocalInvocationID.x & 7u;

    uvec2 offs  = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
    uint  pitch = pc.stride_u8;

    // Six vertical taps: rows r-2 .. r+3 of this column. For r < 2 the row
    // term wraps as a uint and un-wraps when added to the column base.
    uint column = offs.y + col;
    int tap[6];
    for (uint k = 0u; k < 6u; ++k) {
        tap[k] = int(u_src.src[column + (row + k - 2u) * pitch]);
    }

    int filtered = (tap[0] + tap[5]) - 5 * (tap[1] + tap[4]) + 20 * (tap[2] + tap[3]) + 16;
    int half_pel = clamp(filtered >> 5, 0, 255);

    // Three-quarter-pel: rounded average with the integer sample one row
    // below, src[r+1, c] (tap index 3).
    int sample = (half_pel + tap[3] + 1) >> 1;

    uint out_idx  = offs.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
}
|
||||
@@ -0,0 +1,55 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc10 (biprediction) (8x8, ¼-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "a" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// = horizontal half-pel filter, clipped to u8, then L2 rounded-averaged
|
||||
// with the integer source pixel at the SAME position. Sibling of
|
||||
// v3d_h264_qpel_mc20.comp with the L2 step added at the tail.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc10_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Computes the horizontal half-pel value, averages it with src[r, c] to get
// the quarter-pel sample, then folds that into the value already held in dst
// with a second rounded average.
void main()
{
    if (gl_WorkGroupID.x < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;
        uint x   = pix & 7u;

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[gl_WorkGroupID.x].x;
        uint src_base = u_meta.meta[gl_WorkGroupID.x].y;

        // Index of src[r, c]; the filter spans columns c-2 .. c+3 around it.
        uint centre = src_base + y * pitch + x;
        int  here   = int(u_src.src[centre]);

        int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
        int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
        int inner  = here                        + int(u_src.src[centre + 1u]);

        int half_pel = clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);

        // Quarter-pel: rounded average with the integer sample at (r, c).
        int sample = (half_pel + here + 1) >> 1;

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc11 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc11[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc11_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over columns c-2 .. c+3 of row r, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int inner  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over rows r-2 .. r+3 of column c, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // Start two rows above (r, c) and step down one row per tap. For r < 2
    // the uint row term wraps and un-wraps again once src_off is added
    // (assumes the caller provides rows of context above the block —
    // TODO confirm against the host-side contract).
    uint at  = src_off + c + (r - 2u) * stride;
    int  acc = 16;
    acc +=      int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc +=      int(u_src.src[at]);
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c: no rounding term, no
// shift, no clamp. hpel_hv feeds these unclipped intermediates into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    const int weight[6] = int[6](1, -5, 20, 20, -5, 1);
    uint centre = src_off + rr * stride + c;
    int  acc    = 0;
    for (uint k = 0u; k < 6u; ++k) {
        // Columns c-2 .. c+3.
        acc += weight[k] * int(u_src.src[centre + k - 2u]);
    }
    return acc;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel sample for position (r, c): the
// 6-tap filter applied vertically to the unclipped horizontal sums of rows
// r-2 .. r+3, rounded with +512, shifted right by 10 and clamped to [0, 255].
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int acc = 512;
    acc +=      hpel_hv_row(src_off, stride, r - 2u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r - 1u, c);
    acc += 20 * hpel_hv_row(src_off, stride, r,      c);
    acc += 20 * hpel_hv_row(src_off, stride, r + 1u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r + 2u, c);
    acc +=      hpel_hv_row(src_off, stride, r + 3u, c);
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Quarter-pel sample = rounded average of the horizontal and vertical
// half-pel values at (r, c); the result is then rounded-averaged with the
// value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;
        uint x   = pix & 7u;

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[blk].x;
        uint src_base = u_meta.meta[blk].y;

        int sample = (hpel_h(src_base, pitch, y, x)
                    + hpel_v(src_base, pitch, y, x) + 1) >> 1;

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc12 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc12[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc12_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over columns c-2 .. c+3 of row r, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int inner  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over rows r-2 .. r+3 of column c, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // Start two rows above (r, c) and step down one row per tap. For r < 2
    // the uint row term wraps and un-wraps again once src_off is added
    // (assumes the caller provides rows of context above the block —
    // TODO confirm against the host-side contract).
    uint at  = src_off + c + (r - 2u) * stride;
    int  acc = 16;
    acc +=      int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc +=      int(u_src.src[at]);
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c: no rounding term, no
// shift, no clamp. hpel_hv feeds these unclipped intermediates into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    const int weight[6] = int[6](1, -5, 20, 20, -5, 1);
    uint centre = src_off + rr * stride + c;
    int  acc    = 0;
    for (uint k = 0u; k < 6u; ++k) {
        // Columns c-2 .. c+3.
        acc += weight[k] * int(u_src.src[centre + k - 2u]);
    }
    return acc;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel sample for position (r, c): the
// 6-tap filter applied vertically to the unclipped horizontal sums of rows
// r-2 .. r+3, rounded with +512, shifted right by 10 and clamped to [0, 255].
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int acc = 512;
    acc +=      hpel_hv_row(src_off, stride, r - 2u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r - 1u, c);
    acc += 20 * hpel_hv_row(src_off, stride, r,      c);
    acc += 20 * hpel_hv_row(src_off, stride, r + 1u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r + 2u, c);
    acc +=      hpel_hv_row(src_off, stride, r + 3u, c);
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Quarter-pel sample = rounded average of the centre (hv) half-pel value and
// the vertical half-pel value at (r, c); the result is then rounded-averaged
// with the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;
        uint x   = pix & 7u;

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[blk].x;
        uint src_base = u_meta.meta[blk].y;

        int sample = (hpel_hv(src_base, pitch, y, x)
                    + hpel_v(src_base, pitch, y, x) + 1) >> 1;

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc13 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc13[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc13_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over columns c-2 .. c+3 of row r, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int inner  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over rows r-2 .. r+3 of column c, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // Start two rows above (r, c) and step down one row per tap. For r < 2
    // the uint row term wraps and un-wraps again once src_off is added
    // (assumes the caller provides rows of context above the block —
    // TODO confirm against the host-side contract).
    uint at  = src_off + c + (r - 2u) * stride;
    int  acc = 16;
    acc +=      int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc +=      int(u_src.src[at]);
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c: no rounding term, no
// shift, no clamp. hpel_hv feeds these unclipped intermediates into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    const int weight[6] = int[6](1, -5, 20, 20, -5, 1);
    uint centre = src_off + rr * stride + c;
    int  acc    = 0;
    for (uint k = 0u; k < 6u; ++k) {
        // Columns c-2 .. c+3.
        acc += weight[k] * int(u_src.src[centre + k - 2u]);
    }
    return acc;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel sample for position (r, c): the
// 6-tap filter applied vertically to the unclipped horizontal sums of rows
// r-2 .. r+3, rounded with +512, shifted right by 10 and clamped to [0, 255].
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int acc = 512;
    acc +=      hpel_hv_row(src_off, stride, r - 2u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r - 1u, c);
    acc += 20 * hpel_hv_row(src_off, stride, r,      c);
    acc += 20 * hpel_hv_row(src_off, stride, r + 1u, c);
    acc -=  5 * hpel_hv_row(src_off, stride, r + 2u, c);
    acc +=      hpel_hv_row(src_off, stride, r + 3u, c);
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, one invocation per output pixel.
// Quarter-pel sample = rounded average of the horizontal half-pel value taken
// one row below, at (r+1, c), and the vertical half-pel value at (r, c); the
// result is then rounded-averaged with the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;
        uint x   = pix & 7u;

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[blk].x;
        uint src_base = u_meta.meta[blk].y;

        int sample = (hpel_h(src_base, pitch, y + 1u, x)
                    + hpel_v(src_base, pitch, y, x) + 1) >> 1;

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + sample + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc20 (biprediction) (8x8, horizontal half-pel), V3D 7.1.
|
||||
//
|
||||
// H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
|
||||
//
|
||||
// dst[r,c] = clip255(
|
||||
// ( s[r,c-2]
|
||||
// - 5 * s[r,c-1]
|
||||
// + 20 * s[r,c]
|
||||
// + 20 * s[r,c+1]
|
||||
// - 5 * s[r,c+2]
|
||||
// + s[r,c+3]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// Single-stride: dst and src share `stride` (H264QpelContext
|
||||
// convention). src+src_off already points at the leftmost output
|
||||
// column (col 0); the filter reads cols -2..+3. Caller guarantees
|
||||
// edge-padding context per the public API docstring.
|
||||
//
|
||||
// Workgroup layout: 64 invocations = 1 lane per output pixel.
|
||||
// 1 block per WG; n_blocks WGs total. This is the simplest layout
|
||||
// that avoids any inter-lane communication — each lane independently
|
||||
// reads its 6 src samples and writes its 1 dst sample. V3D's L2
|
||||
// cache handles the redundant reads from adjacent lanes.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc20_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src {
|
||||
uint8_t src[];
|
||||
} u_src;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Meta {
|
||||
uvec4 meta[]; // .x = dst_off, .y = src_off
|
||||
} u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Entry point: one workgroup per 8x8 block, 64 invocations covering the
// block with one invocation per output pixel. Computes the horizontal
// half-pel value and rounded-averages it with the value already stored at
// the same position in dst.
void main()
{
    if (gl_WorkGroupID.x < pc.n_blocks) {
        uint pix = gl_LocalInvocationID.x;
        uint y   = pix >> 3;   // row, 0..7
        uint x   = pix & 7u;   // column, 0..7

        uint pitch    = pc.stride_u8;
        uint dst_base = u_meta.meta[gl_WorkGroupID.x].x;
        uint src_base = u_meta.meta[gl_WorkGroupID.x].y;

        // Index of src[r, c]; the filter reads columns c-2 .. c+3 of this
        // row. Subtracting from the uint index is safe because the caller
        // guarantees left context (src_off >= 2).
        uint centre = src_base + y * pitch + x;

        int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
        int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
        int inner  = int(u_src.src[centre + 0u]) + int(u_src.src[centre + 1u]);

        int half_pel = clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);

        uint out_idx  = dst_base + y * pitch + x;
        int  existing = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((existing + half_pel + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc21 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc21[r,c] = avg(mc22(r, c),
|
||||
// mc20(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc21_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over columns c-2 .. c+3 of row r, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer  = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int middle = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int inner  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * middle + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c): 6-tap (1,-5,20,20,-5,1)
// filter over rows r-2 .. r+3 of column c, rounded with +16, shifted right
// by 5 and clamped to [0, 255].
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // Start two rows above (r, c) and step down one row per tap. For r < 2
    // the uint row term wraps and un-wraps again once src_off is added
    // (assumes the caller provides rows of context above the block —
    // TODO confirm against the host-side contract).
    uint at  = src_off + c + (r - 2u) * stride;
    int  acc = 16;
    acc +=      int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc += 20 * int(u_src.src[at]);  at += stride;
    acc -=  5 * int(u_src.src[at]);  at += stride;
    acc +=      int(u_src.src[at]);
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift and
// no clamp. hpel_hv() feeds six of these into its vertical pass. The value
// always fits in 16 bits but is carried in a 32-bit int.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Centre (2-D half-pel) sample for pixel (r, c): runs the vertical 6-tap
// kernel over the unscaled horizontal sums of rows r-2 .. r+3, then applies
// the combined rounding (+512), shift (>> 10) and clamp to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc21 entry point: one workgroup per 8x8 block, one lane per pixel.
// Averages the centre half-pel sample with the horizontal half-pel sample
// at (r, c), then folds that into the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    int center_hv = hpel_hv(desc.y, pitch, r, c);
    int horiz     = hpel_h(desc.y, pitch, r, c);
    int pred      = (center_hv + horiz + 1) >> 1;

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,94 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction) (8x8, 2D half-pel "j" position).
|
||||
// V3D 7.1.
|
||||
//
|
||||
// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
|
||||
//
|
||||
// tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
|
||||
// - 5*src[r,c+2] + src[r,c+3] (int16)
|
||||
//
|
||||
// dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
|
||||
// + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
|
||||
// + 512) >> 10)
|
||||
//
|
||||
// The +512 >> 10 final scale compensates for both 6-tap scalings.
|
||||
// CANNOT just cascade mc20→mc02 because intermediate must be int16
|
||||
// (no per-stage clip), so this is a dedicated kernel.
|
||||
//
|
||||
// Per-lane structure: each lane computes its own (r, c) output by
|
||||
// running the FULL cascade — 6 horizontal lowpass int16 values for
|
||||
// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
|
||||
// lane. No shared memory / barriers needed; V3D L2 absorbs the
|
||||
// redundant src reads across lanes.
|
||||
//
|
||||
// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
|
||||
// (same as mc20 / mc02).
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc22_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _pad0;
    uint _pad1;
} pc;
|
||||
|
||||
// Unscaled horizontal 6-tap sum at column c of the row starting at row_off.
// Reads columns c-2 .. c+3 with weights (1, -5, 20, 20, -5, 1). No rounding,
// shift or clamp is applied here: main() performs the single +512 >> 10
// that covers both filter stages. The value fits in 16 bits but is carried
// in a 32-bit int.
int hpel_h(uint row_off, uint c)
{
    uint center = row_off + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// avg_mc22 entry point: one workgroup per 8x8 block, one lane per pixel.
// Each lane runs the full horizontal-then-vertical cascade for its own
// pixel and averages the result into the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint src_base = desc.y;
    uint pitch = pc.stride_u8;

    // Horizontal sums for rows r-2 .. r+3, paired by their shared tap weight.
    // (r - 2u) and (r - 1u) wrap for small r; the index is still exact
    // modulo 2^32. The lowest byte touched is src_base - 2*pitch - 2, so the
    // caller must provide at least that much leading context.
    int outer = hpel_h(src_base + (r - 2u) * pitch, c)
              + hpel_h(src_base + (r + 3u) * pitch, c);
    int inner = hpel_h(src_base + (r - 1u) * pitch, c)
              + hpel_h(src_base + (r + 2u) * pitch, c);
    int core  = hpel_h(src_base + r * pitch, c)
              + hpel_h(src_base + (r + 1u) * pitch, c);

    int pred = clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc23 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc23[r,c] = avg(mc22(r, c),
|
||||
// mc20(r+1, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc23_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// Horizontal half-pel sample for pixel (r, c) of the block based at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    int acc = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for pixel (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over rows r-2 .. r+3 of column c, rounded (+16), shifted right by 5 and
// clamped to 0..255. (r - 2u) / (r - 1u) wrap for small r; the resulting
// index is still exact modulo 2^32.
// NOTE(review): not referenced by main() in this shader.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift and
// no clamp. hpel_hv() feeds six of these into its vertical pass. The value
// always fits in 16 bits but is carried in a 32-bit int.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Centre (2-D half-pel) sample for pixel (r, c): runs the vertical 6-tap
// kernel over the unscaled horizontal sums of rows r-2 .. r+3, then applies
// the combined rounding (+512), shift (>> 10) and clamp to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc23 entry point: one workgroup per 8x8 block, one lane per pixel.
// Averages the centre half-pel sample at (r, c) with the horizontal
// half-pel sample one row below (r+1, c), then folds that into the byte
// already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    int center_hv   = hpel_hv(desc.y, pitch, r, c);
    int horiz_below = hpel_h(desc.y, pitch, r + 1u, c);
    int pred        = (center_hv + horiz_below + 1) >> 1;

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc30 (biprediction) (8x8, ¾-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "c" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c+1] + 1) >> 1)
|
||||
//
|
||||
// Same as mc10 but L2-averages with src[r, c+1] instead of src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc30_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// avg_mc30 entry point: one workgroup per 8x8 block, one lane per pixel.
// Computes the horizontal half-pel sample at (r, c), averages it with the
// integer source pixel at (r, c+1), then folds the result into the byte
// already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;
    uint center = desc.y + r * pitch + c;

    // Taps paired by shared weight; `right` doubles as the averaging partner.
    int right = int(u_src.src[center + 1u]);
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + right;

    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
    int pred = (half_pel + right + 1) >> 1;   // quarter-pel toward src[r, c+1]

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc31 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc31[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc31_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// Horizontal half-pel sample for pixel (r, c) of the block based at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    int acc = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for pixel (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over rows r-2 .. r+3 of column c, rounded (+16), shifted right by 5 and
// clamped to 0..255. (r - 2u) / (r - 1u) wrap for small r; the resulting
// index is still exact modulo 2^32.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift and
// no clamp. Only hpel_hv() calls it. The value always fits in 16 bits but is
// carried in a 32-bit int.
// NOTE(review): main() in this shader never reaches this helper.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Centre (2-D half-pel) sample for pixel (r, c): runs the vertical 6-tap
// kernel over the unscaled horizontal sums of rows r-2 .. r+3, then applies
// the combined rounding (+512), shift (>> 10) and clamp to 0..255.
// NOTE(review): not referenced by main() in this shader.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc31 entry point: one workgroup per 8x8 block, one lane per pixel.
// Averages the horizontal half-pel sample at (r, c) with the vertical
// half-pel sample one column to the right (r, c+1), then folds that into
// the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    int horiz      = hpel_h(desc.y, pitch, r, c);
    int vert_right = hpel_v(desc.y, pitch, r, c + 1u);
    int pred       = (horiz + vert_right + 1) >> 1;

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc32 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc32[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc32_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// Horizontal half-pel sample for pixel (r, c) of the block based at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to 0..255.
// NOTE(review): not referenced by main() in this shader.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    int acc = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for pixel (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over rows r-2 .. r+3 of column c, rounded (+16), shifted right by 5 and
// clamped to 0..255. (r - 2u) / (r - 1u) wrap for small r; the resulting
// index is still exact modulo 2^32.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift and
// no clamp. hpel_hv() feeds six of these into its vertical pass. The value
// always fits in 16 bits but is carried in a 32-bit int.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Centre (2-D half-pel) sample for pixel (r, c): runs the vertical 6-tap
// kernel over the unscaled horizontal sums of rows r-2 .. r+3, then applies
// the combined rounding (+512), shift (>> 10) and clamp to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc32 entry point: one workgroup per 8x8 block, one lane per pixel.
// Averages the centre half-pel sample at (r, c) with the vertical half-pel
// sample one column to the right (r, c+1), then folds that into the byte
// already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    int center_hv  = hpel_hv(desc.y, pitch, r, c);
    int vert_right = hpel_v(desc.y, pitch, r, c + 1u);
    int pred       = (center_hv + vert_right + 1) >> 1;

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc33 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc33[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc33_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (read-modify-write in this avg_ variant).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// Horizontal half-pel sample for pixel (r, c) of the block based at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    int acc = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for pixel (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over rows r-2 .. r+3 of column c, rounded (+16), shifted right by 5 and
// clamped to 0..255. (r - 2u) / (r - 1u) wrap for small r; the resulting
// index is still exact modulo 2^32.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift and
// no clamp. Only hpel_hv() calls it. The value always fits in 16 bits but is
// carried in a 32-bit int.
// NOTE(review): main() in this shader never reaches this helper.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center])      + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Centre (2-D half-pel) sample for pixel (r, c): runs the vertical 6-tap
// kernel over the unscaled horizontal sums of rows r-2 .. r+3, then applies
// the combined rounding (+512), shift (>> 10) and clamp to 0..255.
// NOTE(review): not referenced by main() in this shader.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc33 entry point: one workgroup per 8x8 block, one lane per pixel.
// Averages the horizontal half-pel sample one row below (r+1, c) with the
// vertical half-pel sample one column to the right (r, c+1), then folds
// that into the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    int horiz_below = hpel_h(desc.y, pitch, r + 1u, c);
    int vert_right  = hpel_v(desc.y, pitch, r, c + 1u);
    int pred        = (horiz_below + vert_right + 1) >> 1;

    // Rounded average with the existing dst byte (the biprediction step).
    uint out_idx = desc.x + r * pitch + c;
    int prior = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc01 (8x8, ¼-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "d" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// Sibling of v3d_h264_qpel_mc02.comp with L2 step against src[r, c].
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (only written by this shader).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// mc01 entry point: one workgroup per 8x8 block, one lane per pixel.
// Computes the vertical half-pel sample at (r, c) and stores its rounded
// average with the integer source pixel at (r, c).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    // Index of the integer pixel; neighbours are whole rows above and below.
    // Equivalent (modulo 2^32) to indexing from the column base by (r +/- k).
    uint center = desc.y + r * pitch + c;

    int here  = int(u_src.src[center]);
    int outer = int(u_src.src[center - 2u * pitch]) + int(u_src.src[center + 3u * pitch]);
    int inner = int(u_src.src[center - pitch])      + int(u_src.src[center + 2u * pitch]);
    int core  = here + int(u_src.src[center + pitch]);

    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);

    // Quarter-pel: rounded average with src[r, c].
    u_dst.dst[desc.x + r * pitch + c] = uint8_t((half_pel + here + 1) >> 1);
}
|
||||
@@ -0,0 +1,69 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
|
||||
//
|
||||
// Sibling of cycle 9's v3d_h264_qpel_mc20.comp. Same 6-tap filter,
|
||||
// transposed to vertical direction:
|
||||
//
|
||||
// dst[r,c] = clip255(
|
||||
// ( s[r-2,c]
|
||||
// - 5 * s[r-1,c]
|
||||
// + 20 * s[r, c]
|
||||
// + 20 * s[r+1,c]
|
||||
// - 5 * s[r+2,c]
|
||||
// + s[r+3,c]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// src+src_off points at row 0 col 0 of the OUTPUT block; the filter
|
||||
// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
|
||||
//
|
||||
// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (only written by this shader).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _pad0;
    uint _pad1;
} pc;
|
||||
|
||||
// mc02 entry point: one workgroup per 8x8 block, one lane per pixel.
// Stores the vertical half-pel sample for each pixel of the block.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    // Index of the output pixel's integer position in src. The kernel reads
    // rows r-2 .. r+3 of this column, so the lowest byte touched by the block
    // is src offset - 2*pitch; the caller must supply that top context
    // (and three rows of bottom context).
    uint center = desc.y + r * pitch + c;

    int outer = int(u_src.src[center - 2u * pitch]) + int(u_src.src[center + 3u * pitch]);
    int inner = int(u_src.src[center - pitch])      + int(u_src.src[center + 2u * pitch]);
    int core  = int(u_src.src[center])              + int(u_src.src[center + pitch]);

    // Taps (1, -5, 20, 20, -5, 1), rounded, scaled by 1/32 and clamped to a byte.
    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);

    u_dst.dst[desc.x + r * pitch + c] = uint8_t(half_pel);
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc03 (8x8, ¾-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "n" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1)
|
||||
//
|
||||
// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c].
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
#extension GL_EXT_shader_8bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types : require

// 64 invocations per workgroup; main() maps them row-major onto one 8x8 block.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// binding 0: source bytes (read-only).
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
// binding 1: destination bytes (only written by this shader).
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
// binding 2: one uvec4 per block; main() reads .x as dst offset, .y as src offset.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;

layout(push_constant) uniform PC {
    uint n_blocks;   // workgroups with index >= n_blocks exit immediately
    uint stride_u8;  // row pitch in bytes, applied to both src and dst
    uint _p0;
    uint _p1;
} pc;
|
||||
|
||||
// mc03 entry point: one workgroup per 8x8 block, one lane per pixel.
// Computes the vertical half-pel sample at (r, c) and stores its rounded
// average with the integer source pixel one row below, (r+1, c).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Row-major lane -> (row, column) inside the 8x8 block.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane >> 3;
    uint c = lane & 7u;

    uvec4 desc = u_meta.meta[blk];   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;

    // Index of the integer pixel; neighbours are whole rows above and below.
    // Equivalent (modulo 2^32) to indexing from the column base by (r +/- k).
    uint center = desc.y + r * pitch + c;

    int below = int(u_src.src[center + pitch]);
    int outer = int(u_src.src[center - 2u * pitch]) + int(u_src.src[center + 3u * pitch]);
    int inner = int(u_src.src[center - pitch])      + int(u_src.src[center + 2u * pitch]);
    int core  = int(u_src.src[center]) + below;

    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);

    // Three-quarter-pel: rounded average with src[r+1, c].
    u_dst.dst[desc.x + r * pitch + c] = uint8_t((half_pel + below + 1) >> 1);
}
|
||||
@@ -0,0 +1,47 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc10 (8x8, ¼-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "a" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// = horizontal half-pel filter, clipped to u8, then L2 rounded-averaged
|
||||
// with the integer source pixel at the SAME position. Sibling of
|
||||
// v3d_h264_qpel_mc20.comp with the L2 step added at the tail.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// mc10 entry point: one workgroup per block, one lane per output pixel.
// Each lane runs the horizontal 6-tap half-pel filter along its row,
// clamps the result to 0..255 and rounded-averages it with the integer
// sample at the same position (src[r, c]). Source reads are not
// bounds-checked; the caller must leave 2 readable samples to the left
// and 3 to the right of the block.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    uint centre = info.y + row * pitch + col;
    int  here   = int(u_src.src[centre]);

    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = here                        + int(u_src.src[centre + 1u]);

    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);

    // Rounded average with src[r, c].
    u_dst.dst[info.x + row * pitch + col] = uint8_t((half_pel + here + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc11 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc11[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc11 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the horizontal and vertical half-pel
// values, both taken at the lane's own (row, col).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int horiz = hpel_h(info.y, pitch, row, col);
    int vert  = hpel_v(info.y, pitch, row, col);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((horiz + vert + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc12 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc12[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc12 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the 2D (H+V) half-pel value and the
// vertical half-pel value, both taken at the lane's own (row, col).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int centre_hv = hpel_hv(info.y, pitch, row, col);
    int vert      = hpel_v(info.y, pitch, row, col);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((centre_hv + vert + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc13 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc13[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc13 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the horizontal half-pel value taken one
// row down (row + 1, col) and the vertical half-pel value at (row, col).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int horiz_below = hpel_h(info.y, pitch, row + 1u, col);
    int vert        = hpel_v(info.y, pitch, row, col);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((horiz_below + vert + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc21 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc21[r,c] = avg(mc22(r, c),
|
||||
// mc20(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc21 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the 2D (H+V) half-pel value and the
// horizontal half-pel value, both taken at the lane's own (row, col).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int centre_hv = hpel_hv(info.y, pitch, row, col);
    int horiz     = hpel_h(info.y, pitch, row, col);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((centre_hv + horiz + 1) >> 1);
}
|
||||
@@ -0,0 +1,86 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc22 (8x8, 2D half-pel "j" position).
|
||||
// V3D 7.1.
|
||||
//
|
||||
// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
|
||||
//
|
||||
// tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
|
||||
// - 5*src[r,c+2] + src[r,c+3] (int16)
|
||||
//
|
||||
// dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
|
||||
// + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
|
||||
// + 512) >> 10)
|
||||
//
|
||||
// The +512 >> 10 final scale compensates for both 6-tap scalings.
|
||||
// CANNOT just cascade mc20→mc02 because intermediate must be int16
|
||||
// (no per-stage clip), so this is a dedicated kernel.
|
||||
//
|
||||
// Per-lane structure: each lane computes its own (r, c) output by
|
||||
// running the FULL cascade — 6 horizontal lowpass int16 values for
|
||||
// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
|
||||
// lane. No shared memory / barriers needed; V3D L2 absorbs the
|
||||
// redundant src reads across lanes.
|
||||
//
|
||||
// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
|
||||
// (same as mc20 / mc02).
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Unscaled horizontal 6-tap sum at column c of the row that starts at
// element offset row_off. Reads columns c-2..c+3 and weights them with
// (1, -5, 20, 20, -5, 1). No rounding, shift or clamp is applied here:
// main() performs the combined +512 >> 10 after its vertical pass.
int hpel_h(uint row_off, uint c)
{
    uint centre = row_off + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// mc22 entry point: one workgroup per block, one lane per output pixel.
// Each lane evaluates the unscaled horizontal filter on rows r-2..r+3 of
// its column, runs the same 6-tap kernel vertically over those six sums,
// then rounds (+512), shifts right by 10 and clamps to 0..255.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    // Start of the output pixel's own source row. Rows above it are reached
    // by subtracting multiples of pitch; this stays in range because the
    // caller contract requires src_off >= 2 * stride.
    uint row_start = info.y + row * pitch;

    int outer = hpel_h(row_start - 2u * pitch, col) + hpel_h(row_start + 3u * pitch, col);
    int inner = hpel_h(row_start - pitch,      col) + hpel_h(row_start + 2u * pitch, col);
    int core  = hpel_h(row_start,              col) + hpel_h(row_start + pitch,      col);

    int pixel = clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);

    u_dst.dst[info.x + row * pitch + col] = uint8_t(pixel);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc23 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc23[r,c] = avg(mc22(r, c),
|
||||
// mc20(r+1, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc23 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the 2D (H+V) half-pel value at (row, col)
// and the horizontal half-pel value taken one row down (row + 1, col).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int centre_hv   = hpel_hv(info.y, pitch, row, col);
    int horiz_below = hpel_h(info.y, pitch, row + 1u, col);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((centre_hv + horiz_below + 1) >> 1);
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc30 (8x8, ¾-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "c" position:
|
||||
//
|
||||
// dst[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c+1] + 1) >> 1)
|
||||
//
|
||||
// Same as mc10 but L2-averages with src[r, c+1] instead of src[r, c].
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// mc30 entry point: one workgroup per block, one lane per output pixel.
// Each lane runs the horizontal 6-tap half-pel filter along its row,
// clamps the result to 0..255 and rounded-averages it with the integer
// sample one column to the right (src[r, c+1]). Source reads are not
// bounds-checked; the caller must leave 2 readable samples to the left
// and 3 to the right of the block.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    uint centre = info.y + row * pitch + col;
    int  right  = int(u_src.src[centre + 1u]);

    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + right;

    int half_pel = clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);

    // Rounded average with src[r, c+1].
    u_dst.dst[info.x + row * pitch + col] = uint8_t((half_pel + right + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc31 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc31[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Clipped vertical half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2..r+3 of
// column c, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    // uint arithmetic wraps, so subtracting multiples of stride steps upward.
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u * stride]) + int(u_src.src[centre + 3u * stride]);
    int inner = int(u_src.src[centre - stride])      + int(u_src.src[centre + 2u * stride]);
    int core  = int(u_src.src[centre])               + int(u_src.src[centre + stride]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr, column c: no rounding, no shift,
// no clamp. With 8-bit inputs the result lies in -2550..10710. It is the
// per-row intermediate that hpel_hv feeds into its vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint centre = src_off + rr * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Clipped 2D half-pel value at (r, c): runs the 6-tap kernel vertically
// over the unscaled horizontal sums of rows r-2..r+3, then applies a
// single +512 >> 10 rounding step for both passes and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // "r - 2u" may wrap for small r; hpel_hv_row's uint addressing undoes it.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    return clamp((outer - 5 * inner + 20 * core + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc31 entry point: one workgroup per block, one lane per output pixel.
// Writes the rounded average of the horizontal half-pel value at
// (row, col) and the vertical half-pel value taken one column to the
// right (row, col + 1).
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Lane index -> (row, col) inside the 8-wide block.
    uint lane_id = gl_LocalInvocationID.x;
    uint row = lane_id >> 3;
    uint col = lane_id & 7u;

    // meta[blk].x is the destination offset, .y the source offset.
    uvec4 info  = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int horiz      = hpel_h(info.y, pitch, row, col);
    int vert_right = hpel_v(info.y, pitch, row, col + 1u);

    u_dst.dst[info.x + row * pitch + col] = uint8_t((horiz + vert_right + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc32 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc32[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Clipped horizontal half-pel value at (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2..c+3 of
// row r, adds the rounding term 16, shifts right by 5 and clamps to
// 0..255. Reads are not bounds-checked.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint centre = src_off + r * stride + c;
    int outer = int(u_src.src[centre - 2u]) + int(u_src.src[centre + 3u]);
    int inner = int(u_src.src[centre - 1u]) + int(u_src.src[centre + 2u]);
    int core  = int(u_src.src[centre])      + int(u_src.src[centre + 1u]);
    return clamp((outer - 5 * inner + 20 * core + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c) of the block at src_off.
// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to the source bytes at
// rows r-2 .. r+3 of column c, rounds with +16, shifts right by 5 and clips
// the result to 0..255.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;

    // Row offsets use unsigned arithmetic; (r - 2u) * stride wraps for r < 2
    // and the wrap cancels when added to the column base.
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int mid   = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int inner = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);

    return clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c: no rounding, no shift and
// no clipping. hpel_hv() feeds six of these intermediates into its vertical
// pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;

    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int mid   = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int inner = int(u_src.src[center     ]) + int(u_src.src[center + 1u]);

    return outer - 5 * mid + 20 * inner;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel sample for position (r, c).
// Runs the 6-tap kernel vertically over the unclipped horizontal sums of
// rows r-2 .. r+3, then rounds with +512, shifts right by 10 and clips to
// 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int mid   = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int inner = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);

    return clamp((outer - 5 * mid + 20 * inner + 512) >> 10, 0, 255);
}
|
||||
|
||||
// mc32 entry point: one work group per 8x8 block, one lane per output sample.
// Each lane averages (with rounding) the centre half-pel sample at (r, c)
// and the vertical half-pel sample at (r, c + 1), then stores the byte at
// dst_off + r * stride + c.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) return;

    // 64 lanes -> 8 rows x 8 columns.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane >> 3;
    uint col  = lane & 7u;

    // meta[blk].x = destination byte offset, meta[blk].y = source byte offset.
    uvec4 rec   = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int centre = hpel_hv(rec.y, pitch, row, col);
    int vert   = hpel_v(rec.y, pitch, row, col + 1u);

    u_dst.dst[rec.x + row * pitch + col] = uint8_t((centre + vert + 1) >> 1);
}
|
||||
@@ -0,0 +1,88 @@
|
||||
// daedalus-fourier — H.264 luma qpel mc33 (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc33[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample for position (r, c) of the block at src_off.
// 6-tap kernel (1, -5, 20, 20, -5, 1) over columns c-2 .. c+3 of row r,
// rounded with +16, shifted right by 5 and clipped to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint center = src_off + r * stride + c;

    // Mirrored tap pairs share a coefficient, so add each pair first.
    int pair_1  = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int pair_5  = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int pair_20 = int(u_src.src[center     ]) + int(u_src.src[center + 1u]);

    int acc = pair_1 - 5 * pair_5 + 20 * pair_20 + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample for position (r, c) of the block at src_off.
// 6-tap kernel (1, -5, 20, 20, -5, 1) over rows r-2 .. r+3 of column c,
// rounded with +16, shifted right by 5 and clipped to 0..255.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;

    // Unsigned row math: (r - 2u) * stride wraps for r < 2, and the wrap
    // cancels once the column base is added.
    int pair_1  = int(u_src.src[column + (r - 2u) * stride])
                + int(u_src.src[column + (r + 3u) * stride]);
    int pair_5  = int(u_src.src[column + (r - 1u) * stride])
                + int(u_src.src[column + (r + 2u) * stride]);
    int pair_20 = int(u_src.src[column + r * stride])
                + int(u_src.src[column + (r + 1u) * stride]);

    int acc = pair_1 - 5 * pair_5 + 20 * pair_20 + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c, returned without
// rounding, shifting or clipping so hpel_hv() can filter it vertically.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;

    int pair_1  = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int pair_5  = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int pair_20 = int(u_src.src[center     ]) + int(u_src.src[center + 1u]);

    return pair_1 - 5 * pair_5 + 20 * pair_20;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel sample for position (r, c):
// vertical 6-tap kernel over the raw horizontal sums of rows r-2 .. r+3,
// rounded with +512, shifted right by 10 and clipped to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    int pair_1  = hpel_hv_row(src_off, stride, r - 2u, c)
                + hpel_hv_row(src_off, stride, r + 3u, c);
    int pair_5  = hpel_hv_row(src_off, stride, r - 1u, c)
                + hpel_hv_row(src_off, stride, r + 2u, c);
    int pair_20 = hpel_hv_row(src_off, stride, r,      c)
                + hpel_hv_row(src_off, stride, r + 1u, c);

    int acc = pair_1 - 5 * pair_5 + 20 * pair_20 + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// mc33 entry point: one work group per 8x8 block, one lane per output sample.
// Each lane averages (with rounding) the horizontal half-pel sample taken
// one row down, at (r + 1, c), and the vertical half-pel sample at
// (r, c + 1), then stores the byte at dst_off + r * stride + c.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) return;

    // 64 lanes -> 8 rows x 8 columns.
    uint lane = gl_LocalInvocationID.x;
    uint row  = lane >> 3;
    uint col  = lane & 7u;

    // meta[blk].x = destination byte offset, meta[blk].y = source byte offset.
    uvec4 rec   = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    int horiz = hpel_h(rec.y, pitch, row + 1u, col);
    int vert  = hpel_v(rec.y, pitch, row, col + 1u);

    u_dst.dst[rec.x + row * pitch + col] = uint8_t((horiz + vert + 1) >> 1);
}
|
||||
@@ -0,0 +1,69 @@
|
||||
// daedalus-fourier — H.264 chroma 4:2:0 H loop filter (horizontal
|
||||
// filter across a vertical edge), non-intra bS<4 variant.
|
||||
//
|
||||
// Sibling of v3d_h264deblock_chroma_v.comp; same kernel transposed
|
||||
// to read pix[-2..+1] (cols) instead of pix[-2*stride..+1*stride]
|
||||
// (rows). Same 8-cell × 4-segment geometry, same WG layout (lanes
|
||||
// 8..15 of each edge early-return — only 8 active per edge).
|
||||
//
|
||||
// 4:2:0-only: 4:2:2 chroma_h has a 16-row edge that this shader
|
||||
// doesn't address. daedalus_dispatch_h264_deblock_chroma_h is
|
||||
// 4:2:0-only by design; caller (libavcodec init) gates accordingly.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges;
|
||||
uint dst_stride_u8;
|
||||
uint _pad0;
|
||||
uint _pad1;
|
||||
} pc;
|
||||
|
||||
// Chroma bS<4 filter across a vertical edge: one lane per row of the edge.
// A work group holds 16 edge slots of 16 lanes; only lanes 0..7 of a slot
// do work because a chroma edge here is 8 rows long. Only p0 and q0 are
// rewritten.
void main()
{
    uint local_lane = gl_GlobalInvocationID.x & 255u;
    uint edge_slot  = local_lane >> 4;    // 0..15
    uint row        = local_lane & 15u;   // 0..15, rows 8..15 are idle

    uint edge_idx = gl_WorkGroupID.x * 16u + edge_slot;
    if (edge_idx >= pc.n_edges || row >= 8u) return;

    // rec.x = byte offset of q0 in row 0, rec.y = alpha | beta << 8,
    // rec.z = four packed tc0 bytes.
    uvec4 rec   = u_meta.meta[edge_idx];
    uint  q0_at = rec.x + row * pc.dst_stride_u8;
    int   alpha = int(rec.y & 0xffu);
    int   beta  = int((rec.y >> 8) & 0xffu);

    // Two rows share one tc0 byte; sign-extend it from 8 bits.
    int tc0 = int((rec.z >> ((row >> 1) * 8u)) & 0xffu);
    tc0 = (tc0 ^ 0x80) - 0x80;

    // A negative tc0 marks a segment that must not be filtered.
    if (alpha == 0 || beta == 0 || tc0 < 0) return;

    int p1 = int(u_dst.dst[q0_at - 2u]);
    int p0 = int(u_dst.dst[q0_at - 1u]);
    int q0 = int(u_dst.dst[q0_at     ]);
    int q1 = int(u_dst.dst[q0_at + 1u]);

    bool edge_active = abs(p0 - q0) < alpha
                    && abs(p1 - p0) < beta
                    && abs(q1 - q0) < beta;
    if (!edge_active) return;

    // NOTE(review): tc is formed as tc0 + 1, so rec.z presumably carries the
    // raw tc0 value rather than an already-incremented tc - confirm against
    // the host-side packing.
    int tc    = tc0 + 1;
    int delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);

    u_dst.dst[q0_at - 1u] = uint8_t(clamp(p0 + delta, 0, 255));
    u_dst.dst[q0_at     ] = uint8_t(clamp(q0 - delta, 0, 255));
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// daedalus-fourier — H.264 chroma 4:2:0 intra (bS=4) H deblock —
|
||||
// V3D 7.1. Transpose of v3d_h264deblock_chroma_v_intra.comp.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges, dst_stride_u8, _p0, _p1;
|
||||
} pc;
|
||||
|
||||
// Chroma intra (bS=4) filter across a vertical edge: one lane per row.
// A work group holds 16 edge slots of 16 lanes; only lanes 0..7 of a slot
// do work because a chroma edge here is 8 rows long. When the alpha/beta
// tests pass, p0 and q0 are replaced by
//   p0' = (2*p1 + p0 + q1 + 2) >> 2
//   q0' = (2*q1 + q0 + p1 + 2) >> 2
void main()
{
    uint lane_in_wg  = gl_GlobalInvocationID.x & 255u;
    uint edge_in_wg  = lane_in_wg >> 4;
    uint row_in_edge = lane_in_wg & 15u;
    uint edge_idx    = gl_WorkGroupID.x * 16u + edge_in_wg;
    if (edge_idx >= pc.n_edges) return;
    if (row_in_edge >= 8u) return;

    // m.x = byte offset of q0 in row 0, m.y = alpha | beta << 8.
    uvec4 m       = u_meta.meta[edge_idx];
    uint  stride  = pc.dst_stride_u8;
    uint  dst_off = m.x + row_in_edge * stride;
    int   alpha   = int(m.y & 0xffu);
    int   beta    = int((m.y >> 8) & 0xffu);

    // Either threshold being zero guarantees one of the tests below rejects
    // the sample (abs(...) >= 0 is always true), so leave before touching
    // memory. This matches the guard used by the bS<4 sibling shaders.
    if (alpha == 0 || beta == 0) return;

    int p1 = int(u_dst.dst[dst_off - 2u]);
    int p0 = int(u_dst.dst[dst_off - 1u]);
    int q0 = int(u_dst.dst[dst_off     ]);
    int q1 = int(u_dst.dst[dst_off + 1u]);

    if (abs(p0 - q0) >= alpha) return;
    if (abs(p1 - p0) >= beta)  return;
    if (abs(q1 - q0) >= beta)  return;

    u_dst.dst[dst_off - 1u] = uint8_t(clamp((2*p1 + p0 + q1 + 2) >> 2, 0, 255));
    u_dst.dst[dst_off     ] = uint8_t(clamp((2*q1 + q0 + p1 + 2) >> 2, 0, 255));
}
|
||||
@@ -0,0 +1,76 @@
|
||||
// daedalus-fourier — H.264 chroma 4:2:0 V loop filter (vertical
|
||||
// filter across a horizontal edge), non-intra bS<4 variant.
|
||||
//
|
||||
// Per H.264 §8.7.2.3: chroma kernel is simpler than luma's bS<4 —
|
||||
// only p0 / q0 are updated (chroma never modifies p1, p2, q1, q2),
|
||||
// tC = tc0_seg + 1 (no luma-style ap/aq side bonus), and the edge
|
||||
// spans 8 cells (4 segments × 2 cells/seg).
|
||||
//
|
||||
// V3D 7.1 via Mesa v3dv compute. WG geometry kept identical to the
|
||||
// luma shader (16 edges × 16 lanes/WG) for uniform dispatch math
|
||||
// across the deblock family; lanes 8..15 of each edge early-return.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Meta {
|
||||
uvec4 meta[]; // per edge: (dst_off, alpha|beta<<8, packed_tc0, _pad)
|
||||
} u_meta;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges;
|
||||
uint dst_stride_u8;
|
||||
uint _pad0;
|
||||
uint _pad1;
|
||||
} pc;
|
||||
|
||||
// Chroma bS<4 filter across a horizontal edge: one lane per column.
// A work group holds 16 edge slots of 16 lanes; only lanes 0..7 of a slot
// do work because a chroma edge here is 8 cells wide. Only p0 and q0 are
// rewritten; p1 and q1 are read but never stored.
void main()
{
    uint local_lane = gl_GlobalInvocationID.x & 255u;
    uint edge_slot  = local_lane >> 4;    // 0..15
    uint col        = local_lane & 15u;   // 0..15, columns 8..15 are idle

    uint edge_idx = gl_WorkGroupID.x * 16u + edge_slot;
    if (edge_idx >= pc.n_edges || col >= 8u) return;

    // rec.x = byte offset of q0 in column 0, rec.y = alpha | beta << 8,
    // rec.z = four packed tc0 bytes.
    uvec4 rec   = u_meta.meta[edge_idx];
    uint  q0_at = rec.x + col;
    uint  pitch = pc.dst_stride_u8;
    int   alpha = int(rec.y & 0xffu);
    int   beta  = int((rec.y >> 8) & 0xffu);

    // Two columns share one tc0 byte; sign-extend it from 8 bits.
    int tc0 = int((rec.z >> ((col >> 1) * 8u)) & 0xffu);
    tc0 = (tc0 ^ 0x80) - 0x80;

    // A negative tc0 marks a segment that must not be filtered.
    if (alpha == 0 || beta == 0 || tc0 < 0) return;

    int p1 = int(u_dst.dst[q0_at - 2u * pitch]);
    int p0 = int(u_dst.dst[q0_at - 1u * pitch]);
    int q0 = int(u_dst.dst[q0_at]);
    int q1 = int(u_dst.dst[q0_at + 1u * pitch]);

    bool edge_active = abs(p0 - q0) < alpha
                    && abs(p1 - p0) < beta
                    && abs(q1 - q0) < beta;
    if (!edge_active) return;

    // NOTE(review): tc is formed as tc0 + 1, so rec.z presumably carries the
    // raw tc0 value rather than an already-incremented tc - confirm against
    // the host-side packing.
    int tc    = tc0 + 1;
    int delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);

    u_dst.dst[q0_at - 1u * pitch] = uint8_t(clamp(p0 + delta, 0, 255));
    u_dst.dst[q0_at             ] = uint8_t(clamp(q0 - delta, 0, 255));
}
|
||||
@@ -0,0 +1,54 @@
|
||||
// daedalus-fourier — H.264 chroma 4:2:0 intra (bS=4) V deblock —
|
||||
// V3D 7.1. Per H.264 §8.7.2.4 chroma intra path: simpler than luma
|
||||
// — always weak filter, only p0/q0 updated, 8 cells per edge.
|
||||
//
|
||||
// p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||
// q0' = (2*q1 + q0 + p1 + 2) >> 2
|
||||
//
|
||||
// Same 16-edges × 16-lanes/edge WG shape as luma; lanes 8..15 of each
|
||||
// edge early-return (chroma edges are only 8 cells wide).
|
||||
//
|
||||
// 4:2:0-only — caller-side gating handles 4:2:2 (chroma_format_idc>1)
|
||||
// at the libavcodec init layer.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges, dst_stride_u8, _p0, _p1;
|
||||
} pc;
|
||||
|
||||
// Chroma intra (bS=4) filter across a horizontal edge: one lane per column.
// A work group holds 16 edge slots of 16 lanes; only lanes 0..7 of a slot
// do work because a chroma edge here is 8 cells wide. When the alpha/beta
// tests pass, p0 and q0 are replaced by
//   p0' = (2*p1 + p0 + q1 + 2) >> 2
//   q0' = (2*q1 + q0 + p1 + 2) >> 2
void main()
{
    uint lane_in_wg  = gl_GlobalInvocationID.x & 255u;
    uint edge_in_wg  = lane_in_wg >> 4;
    uint col_in_edge = lane_in_wg & 15u;
    uint edge_idx    = gl_WorkGroupID.x * 16u + edge_in_wg;
    if (edge_idx >= pc.n_edges) return;
    if (col_in_edge >= 8u) return;

    // m.x = byte offset of q0 in column 0, m.y = alpha | beta << 8.
    uvec4 m       = u_meta.meta[edge_idx];
    uint  dst_off = m.x + col_in_edge;
    uint  stride  = pc.dst_stride_u8;
    int   alpha   = int(m.y & 0xffu);
    int   beta    = int((m.y >> 8) & 0xffu);

    // Either threshold being zero guarantees one of the tests below rejects
    // the sample (abs(...) >= 0 is always true), so leave before touching
    // memory. This matches the guard used by the bS<4 sibling shaders.
    if (alpha == 0 || beta == 0) return;

    int p1 = int(u_dst.dst[dst_off - 2u * stride]);
    int p0 = int(u_dst.dst[dst_off - 1u * stride]);
    int q0 = int(u_dst.dst[dst_off]);
    int q1 = int(u_dst.dst[dst_off + 1u * stride]);

    if (abs(p0 - q0) >= alpha) return;
    if (abs(p1 - p0) >= beta)  return;
    if (abs(q1 - q0) >= beta)  return;

    u_dst.dst[dst_off - 1u * stride] = uint8_t(clamp((2*p1 + p0 + q1 + 2) >> 2, 0, 255));
    u_dst.dst[dst_off             ] = uint8_t(clamp((2*q1 + q0 + p1 + 2) >> 2, 0, 255));
}
|
||||
@@ -0,0 +1,111 @@
|
||||
// daedalus-fourier — H.264 luma "h_loop_filter" (horizontal filtering
|
||||
// across a vertical edge), non-intra bS<4 variant. Sibling of cycle 8's
|
||||
// v3d_h264deblock.comp; same algorithm with row/col access transposed.
|
||||
//
|
||||
// V3D 7.1 via Mesa v3dv compute. Same WG geometry as the V shader:
|
||||
// - 256 invocations / WG, 16 edges/WG (16 lanes/edge = 1 sg/edge)
|
||||
// - uint8_t dst SSBO via storageBuffer8BitAccess
|
||||
// - No barrier (each lane independent)
|
||||
// - lane_in_edge = ROW index (0..15) along the vertical edge
|
||||
// - meta.dst_off points to (row 0, col 0) of the RIGHT block;
|
||||
// the kernel reads cols [-4..+3] of each row and writes [-2..+1].
|
||||
//
|
||||
// Filter contract (per H.264 §8.7.2.3):
|
||||
// 1. (m.x % pc.dst_stride_u8) ≥ 4 (kernel reads p3 at pix[-4])
|
||||
// 2. pc.dst_stride_u8 = byte stride between rows
|
||||
// 3. tc0_s pre-stored as signed int8 in m.z packed 4 bytes (one per
|
||||
// 4-row segment along the 16-row edge)
|
||||
//
|
||||
// License: BSD-2-Clause. Algorithm transcribed from
|
||||
// tests/h264_h_loop_filter_luma_ref.c which mirrors FFmpeg
|
||||
// ff_h264_h_loop_filter_luma_neon (LGPL-2.1+).
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Meta {
|
||||
uvec4 meta[]; // per edge: (dst_off, alpha|beta<<8, packed_tc0, _pad)
|
||||
} u_meta;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges;
|
||||
uint dst_stride_u8;
|
||||
uint _pad0;
|
||||
uint _pad1;
|
||||
} pc;
|
||||
|
||||
// Luma bS<4 filter across a vertical edge: one lane per row of the edge.
// A work group holds 16 edge slots of 16 lanes, and all 16 lanes are active
// (the edge is 16 rows long). Each lane reads the six samples p2..q2 at
// column offsets -3..+2 of its row and stores p1, p0, q0, q1 at -2..+1.
void main()
{
    uint local_lane = gl_GlobalInvocationID.x & 255u;
    uint edge_slot  = local_lane >> 4;    // 0..15 (16 edges per work group)
    uint row        = local_lane & 15u;   // 0..15, row along the edge

    uint edge_idx = gl_WorkGroupID.x * 16u + edge_slot;
    if (edge_idx >= pc.n_edges) return;

    // rec.x = byte offset of q0 in row 0, rec.y = alpha | beta << 8,
    // rec.z = four packed tc0 bytes.
    uvec4 rec   = u_meta.meta[edge_idx];
    uint  q0_at = rec.x + row * pc.dst_stride_u8;
    int   alpha = int(rec.y & 0xffu);
    int   beta  = int((rec.y >> 8) & 0xffu);

    // Four rows share one tc0 byte; sign-extend it from 8 bits.
    int tc0 = int((rec.z >> ((row >> 2) * 8u)) & 0xffu);
    tc0 = (tc0 ^ 0x80) - 0x80;

    // A negative tc0 marks a segment that must not be filtered.
    if (alpha == 0 || beta == 0 || tc0 < 0) return;

    // p3 / q3 (offsets -4 / +3) are not needed by the bS<4 kernel.
    int p2 = int(u_dst.dst[q0_at - 3u]);
    int p1 = int(u_dst.dst[q0_at - 2u]);
    int p0 = int(u_dst.dst[q0_at - 1u]);
    int q0 = int(u_dst.dst[q0_at     ]);
    int q1 = int(u_dst.dst[q0_at + 1u]);
    int q2 = int(u_dst.dst[q0_at + 2u]);

    // Edge activity tests.
    if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
        return;

    // Each smooth side widens the clip range by one and enables the
    // p1 / q1 correction on that side.
    bool p_smooth = abs(p2 - p0) < beta;
    bool q_smooth = abs(q2 - q0) < beta;
    int  tc       = tc0 + (p_smooth ? 1 : 0) + (q_smooth ? 1 : 0);

    int delta  = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    int new_p0 = clamp(p0 + delta, 0, 255);
    int new_q0 = clamp(q0 - delta, 0, 255);

    int avg_pq = (p0 + q0 + 1) >> 1;

    int new_p1 = p1;
    if (p_smooth)
        new_p1 = clamp(p1 + clamp((p2 + avg_pq - 2*p1) >> 1, -tc0, tc0), 0, 255);

    int new_q1 = q1;
    if (q_smooth)
        new_q1 = clamp(q1 + clamp((q2 + avg_pq - 2*q1) >> 1, -tc0, tc0), 0, 255);

    // p1 / q1 are stored even when unchanged.
    u_dst.dst[q0_at - 2u] = uint8_t(new_p1);
    u_dst.dst[q0_at - 1u] = uint8_t(new_p0);
    u_dst.dst[q0_at     ] = uint8_t(new_q0);
    u_dst.dst[q0_at + 1u] = uint8_t(new_q1);
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// daedalus-fourier — H.264 luma intra (bS=4) H deblock — V3D 7.1.
|
||||
//
|
||||
// Sibling of v3d_h264deblock_luma_v_intra.comp transposed to the
|
||||
// horizontal axis: lane → row, reads pix[-4..+3] (cols) instead of
|
||||
// pix[-4*stride..+3*stride] (rows). Same strong/weak filter
|
||||
// selector + same write-back algebra.
|
||||
//
|
||||
// dst_off contract: (m.x % stride) ≥ 4 (kernel reads p3 at pix[-4]).
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges, dst_stride_u8, _p0, _p1;
|
||||
} pc;
|
||||
|
||||
// Luma intra (bS=4) filter across a vertical edge: one lane per row.
// A work group holds 16 edge slots of 16 lanes, all active. Each lane reads
// p3..q3 at column offsets -4..+3 of its row. Per side, the strong filter
// rewrites three samples and the weak filter rewrites only p0 / q0.
void main()
{
    uint lane_in_wg  = gl_GlobalInvocationID.x & 255u;
    uint edge_in_wg  = lane_in_wg >> 4;
    uint row_in_edge = lane_in_wg & 15u;
    uint edge_idx    = gl_WorkGroupID.x * 16u + edge_in_wg;
    if (edge_idx >= pc.n_edges) return;

    // m.x = byte offset of q0 in row 0, m.y = alpha | beta << 8.
    uvec4 m       = u_meta.meta[edge_idx];
    uint  stride  = pc.dst_stride_u8;
    uint  dst_off = m.x + row_in_edge * stride;
    int   alpha   = int(m.y & 0xffu);
    int   beta    = int((m.y >> 8) & 0xffu);

    // Either threshold being zero guarantees one of the tests below rejects
    // the sample (abs(...) >= 0 is always true), so leave before the eight
    // loads. This matches the guard used by the bS<4 sibling shaders.
    if (alpha == 0 || beta == 0) return;

    int p3 = int(u_dst.dst[dst_off - 4u]);
    int p2 = int(u_dst.dst[dst_off - 3u]);
    int p1 = int(u_dst.dst[dst_off - 2u]);
    int p0 = int(u_dst.dst[dst_off - 1u]);
    int q0 = int(u_dst.dst[dst_off     ]);
    int q1 = int(u_dst.dst[dst_off + 1u]);
    int q2 = int(u_dst.dst[dst_off + 2u]);
    int q3 = int(u_dst.dst[dst_off + 3u]);

    if (abs(p0 - q0) >= alpha) return;
    if (abs(p1 - p0) >= beta)  return;
    if (abs(q1 - q0) >= beta)  return;

    // Strong filtering needs a small step across the edge plus a smooth
    // side; the two sides are decided independently.
    bool strong_common = abs(p0 - q0) < (alpha >> 2) + 2;
    bool strong_p      = strong_common && abs(p2 - p0) < beta;
    bool strong_q      = strong_common && abs(q2 - q0) < beta;

    // Every store below uses the values loaded above, never a freshly
    // written one.
    if (strong_p) {
        u_dst.dst[dst_off - 1u] = uint8_t(clamp((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, 0, 255));
        u_dst.dst[dst_off - 2u] = uint8_t(clamp((p2 + p1 + p0 + q0 + 2) >> 2, 0, 255));
        u_dst.dst[dst_off - 3u] = uint8_t(clamp((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, 0, 255));
    } else {
        u_dst.dst[dst_off - 1u] = uint8_t(clamp((2*p1 + p0 + q1 + 2) >> 2, 0, 255));
    }

    if (strong_q) {
        u_dst.dst[dst_off     ] = uint8_t(clamp((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3, 0, 255));
        u_dst.dst[dst_off + 1u] = uint8_t(clamp((q2 + q1 + q0 + p0 + 2) >> 2, 0, 255));
        u_dst.dst[dst_off + 2u] = uint8_t(clamp((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3, 0, 255));
    } else {
        u_dst.dst[dst_off     ] = uint8_t(clamp((2*q1 + q0 + p1 + 2) >> 2, 0, 255));
    }
}
|
||||
@@ -0,0 +1,81 @@
|
||||
// daedalus-fourier — H.264 luma intra (bS=4) V deblock — V3D 7.1.
|
||||
//
|
||||
// Per H.264 §8.7.2.4: at I-MB edges and certain inter-MB edges that
|
||||
// force boundary strength to 4, the deblock kernel is structurally
|
||||
// different from bS<4 — it has a per-side strong/weak filter
|
||||
// selector that decides whether to update 3 cells (strong) or 1
|
||||
// (weak), reads p3/q3, and ignores tc0.
|
||||
//
|
||||
// strong_common = |p0-q0| < (α>>2) + 2
|
||||
// strong_p = strong_common AND |p2-p0| < β
|
||||
// strong_q = strong_common AND |q2-q0| < β
|
||||
//
|
||||
// Strong-p updates p0/p1/p2 with specific 5-/4-/3-tap blends.
|
||||
// Weak-p updates p0 only with (2*p1 + p0 + q1 + 2) >> 2.
|
||||
// Mirror for q-side.
|
||||
//
|
||||
// WG geometry identical to v3d_h264deblock.comp (16 edges × 16 lanes/WG).
|
||||
// dst_off contract: m.x ≥ 4*stride (kernel reads p3 at -4*stride).
|
||||
//
|
||||
// License: BSD-2-Clause. Algorithm transcribed from
|
||||
// tests/h264_intra_loop_filter_ref.c (PR #11).
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_edges, dst_stride_u8, _p0, _p1;
|
||||
} pc;
|
||||
|
||||
// Luma intra (bS=4) filter across a horizontal edge: one lane per column.
// A work group holds 16 edge slots of 16 lanes, all active. Each lane reads
// p3..q3 at row offsets -4..+3 (in units of stride) of its column. Per side,
// the strong filter rewrites three samples and the weak filter rewrites
// only p0 / q0.
void main()
{
    uint lane_in_wg  = gl_GlobalInvocationID.x & 255u;
    uint edge_in_wg  = lane_in_wg >> 4;
    uint col_in_edge = lane_in_wg & 15u;
    uint edge_idx    = gl_WorkGroupID.x * 16u + edge_in_wg;
    if (edge_idx >= pc.n_edges) return;

    // m.x = byte offset of q0 in column 0, m.y = alpha | beta << 8.
    uvec4 m       = u_meta.meta[edge_idx];
    uint  dst_off = m.x + col_in_edge;
    uint  stride  = pc.dst_stride_u8;
    int   alpha   = int(m.y & 0xffu);
    int   beta    = int((m.y >> 8) & 0xffu);

    // Either threshold being zero guarantees one of the tests below rejects
    // the sample (abs(...) >= 0 is always true), so leave before the eight
    // loads. This matches the guard used by the bS<4 sibling shaders.
    if (alpha == 0 || beta == 0) return;

    int p3 = int(u_dst.dst[dst_off - 4u * stride]);
    int p2 = int(u_dst.dst[dst_off - 3u * stride]);
    int p1 = int(u_dst.dst[dst_off - 2u * stride]);
    int p0 = int(u_dst.dst[dst_off - 1u * stride]);
    int q0 = int(u_dst.dst[dst_off]);
    int q1 = int(u_dst.dst[dst_off + 1u * stride]);
    int q2 = int(u_dst.dst[dst_off + 2u * stride]);
    int q3 = int(u_dst.dst[dst_off + 3u * stride]);

    if (abs(p0 - q0) >= alpha) return;
    if (abs(p1 - p0) >= beta)  return;
    if (abs(q1 - q0) >= beta)  return;

    // Strong filtering needs a small step across the edge plus a smooth
    // side; the two sides are decided independently.
    bool strong_common = abs(p0 - q0) < (alpha >> 2) + 2;
    bool strong_p      = strong_common && abs(p2 - p0) < beta;
    bool strong_q      = strong_common && abs(q2 - q0) < beta;

    // Every store below uses the values loaded above, never a freshly
    // written one.
    if (strong_p) {
        u_dst.dst[dst_off - 1u * stride] = uint8_t(clamp((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, 0, 255));
        u_dst.dst[dst_off - 2u * stride] = uint8_t(clamp((p2 + p1 + p0 + q0 + 2) >> 2, 0, 255));
        u_dst.dst[dst_off - 3u * stride] = uint8_t(clamp((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, 0, 255));
    } else {
        u_dst.dst[dst_off - 1u * stride] = uint8_t(clamp((2*p1 + p0 + q1 + 2) >> 2, 0, 255));
    }

    if (strong_q) {
        u_dst.dst[dst_off             ] = uint8_t(clamp((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3, 0, 255));
        u_dst.dst[dst_off + 1u * stride] = uint8_t(clamp((q2 + q1 + q0 + p0 + 2) >> 2, 0, 255));
        u_dst.dst[dst_off + 2u * stride] = uint8_t(clamp((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3, 0, 255));
    } else {
        u_dst.dst[dst_off             ] = uint8_t(clamp((2*q1 + q0 + p1 + 2) >> 2, 0, 255));
    }
}
|
||||
+62
-2
@@ -8,6 +8,8 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
|
||||
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
|
||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||
@@ -368,10 +370,68 @@ void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
|
||||
|
||||
/* ---- Pipelines -------------------------------------------------- */
|
||||
|
||||
/* SPV lookup tries a small set of locations. The caller passes a bare
|
||||
* filename (e.g. "v3d_h264_idct4.spv"); we try, in order:
|
||||
*
|
||||
* 1. cwd-relative (legacy contract; works when run from build/)
|
||||
* 2. $DAEDALUS_SHADER_DIR (env override for tests / packaged installs)
|
||||
* 3. <binary-dir>/<name> (so the bench/test binary finds the SPV next
|
||||
* to itself regardless of cwd — this is the
|
||||
* fix for the silent-no-SPV regression that
|
||||
* made PR #36's bench numbers meaningless)
|
||||
* 4. /opt/fourier/share/daedalus-fourier/<name> (Pi 5 install layout)
|
||||
* 5. /usr/share/daedalus-fourier/<name> (system-wide install)
|
||||
*
|
||||
* Returns NULL only if every location fails, with a single perror naming
|
||||
* the bare filename so the user can grep for it. */
|
||||
static FILE *open_spv(const char *name)
|
||||
{
|
||||
FILE *f = fopen(name, "rb");
|
||||
if (f) return f;
|
||||
|
||||
const char *envdir = getenv("DAEDALUS_SHADER_DIR");
|
||||
if (envdir && *envdir) {
|
||||
char p[PATH_MAX];
|
||||
snprintf(p, sizeof(p), "%s/%s", envdir, name);
|
||||
f = fopen(p, "rb");
|
||||
if (f) return f;
|
||||
}
|
||||
|
||||
char exe[PATH_MAX];
|
||||
ssize_t n = readlink("/proc/self/exe", exe, sizeof(exe) - 1);
|
||||
if (n > 0) {
|
||||
exe[n] = 0;
|
||||
char *slash = strrchr(exe, '/');
|
||||
if (slash) {
|
||||
*slash = 0;
|
||||
char p[PATH_MAX];
|
||||
snprintf(p, sizeof(p), "%s/%s", exe, name);
|
||||
f = fopen(p, "rb");
|
||||
if (f) return f;
|
||||
}
|
||||
}
|
||||
|
||||
char p[PATH_MAX];
|
||||
snprintf(p, sizeof(p), "/opt/fourier/share/daedalus-fourier/%s", name);
|
||||
f = fopen(p, "rb");
|
||||
if (f) return f;
|
||||
|
||||
snprintf(p, sizeof(p), "/usr/share/daedalus-fourier/%s", name);
|
||||
f = fopen(p, "rb");
|
||||
if (f) return f;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static uint32_t *read_spv(const char *path, size_t *out_size)
|
||||
{
|
||||
FILE *f = fopen(path, "rb");
|
||||
if (!f) { perror(path); return NULL; }
|
||||
FILE *f = open_spv(path);
|
||||
if (!f) {
|
||||
fprintf(stderr,
|
||||
"daedalus: SPV not found via cwd / $DAEDALUS_SHADER_DIR / "
|
||||
"binary-dir / /opt/fourier/share / /usr/share: %s\n", path);
|
||||
return NULL;
|
||||
}
|
||||
fseek(f, 0, SEEK_END);
|
||||
long sz = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
|
||||
@@ -0,0 +1,299 @@
|
||||
/* SPDX-License-Identifier: BSD-2-Clause */
|
||||
/* Needed for clock_gettime / CLOCK_MONOTONIC under -std=c11 with CMAKE_C_EXTENSIONS=OFF. */
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
/*
|
||||
* bench_h264_primitives — latency baseline for the H.264 primitive
|
||||
* library landed across PRs #9–#35.
|
||||
*
|
||||
* Each kernel is exercised at a representative per-frame N for 1080p
|
||||
* (8160 MBs); the per-kernel total + ns/op + ms/frame are reported,
|
||||
* once per substrate (CPU NEON, QPU V3D7 compute). The QPU column
|
||||
* appears only when the host has a usable Vulkan device. When both
|
||||
* columns exist a CPU/QPU ratio is printed; that's the per-kernel
|
||||
* data the QPU-substrate decree (2026-05-23) deliberately overrides
|
||||
* but which is still useful to track over time as dispatch overhead
|
||||
* shrinks (buffer pool, persistent cmdbuf, dmabuf import — tasks 160-162).
|
||||
*
|
||||
* NOT a ctest — produces wall-time numbers, doesn't pass/fail.
|
||||
*
|
||||
* Invoke: ./build/bench_h264_primitives [iters [warmup]]
|
||||
* (default iters = 50, warmup = 5)
|
||||
*/
|
||||
|
||||
#include "daedalus.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/* State for xs64(); seeded with a fixed non-zero constant so every run of
 * the bench draws the same sequence. */
static uint64_t xs64_state = 0xfeedface5a5a5a5aULL;

/* xorshift64 step (shift triple 13 / 7 / 17): advances xs64_state and
 * returns the new state. */
static uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
|
||||
|
||||
/* Current CLOCK_MONOTONIC reading, in milliseconds, as a double. */
static double now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);

    const double whole_ms = ts.tv_sec * 1000.0;   /* seconds part     */
    const double frac_ms  = ts.tv_nsec / 1.0e6;   /* nanoseconds part */
    return whole_ms + frac_ms;
}
|
||||
|
||||
/* Per-1080p-frame counts (8160 MBs at 1920x1088). */
#define MBS_1080P 8160

/* Benchmark callback: performs one dispatch and returns its int rc
 * (0 = success). */
typedef int (*bench_fn)(void);

/* Standard benchmark loop.
 *
 *   name          label printed at the start of the result line
 *   iters         number of timed calls to fn(); must be > 0
 *   warmup        number of untimed calls made before timing starts
 *   ops_per_iter  kernel invocations one fn() call represents; must be > 0
 *   fn            the dispatch under test
 *
 * Returns the measured ns per operation, or -1 when the kernel could
 * not be measured; the caller must NOT report a -1 result as a timing.
 *
 * A single preflight call is made before the hot loop. If it fails
 * (which on the QPU substrate almost always means "SPV not found via
 * any search path") the kernel is skipped. Without this, a missing
 * SPV makes every dispatch fail fast at the cost of one fprintf+open
 * call (~1-5 µs), and the loop times that cost as if it were real QPU
 * work — producing absurdly-small ns/op numbers that look like a QPU
 * speedup. This is exactly what made PR #36's bench numbers a
 * measurement artifact.
 *
 * The same reasoning applies to the warmup and timed calls: a nonzero
 * rc from any of them invalidates the measurement, so it is reported
 * as skipped as well. Non-positive iters / ops_per_iter are rejected
 * up front because they would make the ns/op division meaningless. */
static double bench_ns(const char *name, int iters, int warmup,
                       int ops_per_iter, bench_fn fn)
{
    if (iters <= 0 || ops_per_iter <= 0) {
        printf("  %-32s INVALID iters=%d ops=%d — kernel skipped\n",
               name, iters, ops_per_iter);
        return -1;
    }

    int rc = fn();
    if (rc != 0) {
        printf("  %-32s DISPATCH FAILED rc=%d — kernel skipped\n", name, rc);
        return -1;
    }

    /* Sticky failure flag; a single OR per call keeps the overhead
     * inside the timed loop negligible. */
    int failed = 0;

    for (int i = 0; i < warmup; i++)
        failed |= (fn() != 0);

    double t0 = now_ms();
    for (int i = 0; i < iters; i++)
        failed |= (fn() != 0);
    double t1 = now_ms();

    if (failed) {
        printf("  %-32s DISPATCH FAILED mid-run — kernel skipped\n", name);
        return -1;
    }

    double total_ms  = t1 - t0;
    double ns_per_op = (total_ms * 1e6) / ((double) iters * ops_per_iter);

    printf("  %-32s %10.2f ns/op  (%d iters x %d ops)\n",
           name, ns_per_op, iters, ops_per_iter);
    return ns_per_op;
}
|
||||
|
||||
/* ---- Per-kernel scaffolding. Each section sets up the buffers +
|
||||
* meta, then defines a static fn() that calls the corresponding
|
||||
* dispatch with a representative N. The substrate is read from the
|
||||
* global g_sub so the same fn() can be re-driven with CPU then QPU. */
|
||||
|
||||
static daedalus_ctx *ctx;
|
||||
static daedalus_substrate g_sub = DAEDALUS_SUBSTRATE_CPU;
|
||||
|
||||
/* --- IDCT 4x4 luma: N = 16 blocks per MB. Bench with 1024 blocks
|
||||
* per call (64 MBs worth). Per-MB the dispatch overhead is the
|
||||
* same regardless of N — we want ns per block. */
|
||||
static int16_t idct4_coeffs[1024 * 16];
|
||||
static daedalus_h264_block_meta idct4_meta[1024];
|
||||
static uint8_t idct_dst[64 * 4 * 16 * 16]; /* 64 MB-rows × ... */
|
||||
|
||||
static int bench_idct4(void) {
|
||||
return daedalus_dispatch_h264_idct4(ctx, g_sub,
|
||||
idct_dst, 64*16, idct4_coeffs, 1024, idct4_meta);
|
||||
}
|
||||
|
||||
/* --- IDCT 8x8 luma: 256 8x8 blocks per call. */
|
||||
static int16_t idct8_coeffs[256 * 64];
|
||||
static daedalus_h264_block_meta idct8_meta[256];
|
||||
|
||||
static int bench_idct8(void) {
|
||||
return daedalus_dispatch_h264_idct8(ctx, g_sub,
|
||||
idct_dst, 64*16, idct8_coeffs, 256, idct8_meta);
|
||||
}
|
||||
|
||||
/* --- Deblock luma_v (cycle 8 baseline; M3 path). */
|
||||
static daedalus_h264_deblock_meta deblock_meta[256];
|
||||
static uint8_t deblock_dst[256 * 16 * 16];
|
||||
|
||||
static int bench_deblock_v(void) {
|
||||
return daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
|
||||
deblock_dst, 16, 256, deblock_meta);
|
||||
}
|
||||
|
||||
static int bench_deblock_h(void) {
|
||||
return daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
|
||||
deblock_dst, 16, 256, deblock_meta);
|
||||
}
|
||||
|
||||
/* --- qpel mc20 + mc02 + mc22 (the H/V/HV anchors). */
|
||||
static uint8_t qpel_src[256 * 16 * 16];
|
||||
static uint8_t qpel_dst[256 * 16 * 16];
|
||||
static daedalus_h264_qpel_meta qpel_meta[256];
|
||||
|
||||
static int bench_qpel_mc20(void) {
|
||||
return daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
|
||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||
}
|
||||
static int bench_qpel_mc02(void) {
|
||||
return daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
|
||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||
}
|
||||
static int bench_qpel_mc22(void) {
|
||||
return daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
|
||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||
}
|
||||
|
||||
/* ---- One row of bench output:
|
||||
* - kernel name + N
|
||||
* - CPU ns/op
|
||||
* - QPU ns/op (or "n/a" if Vulkan absent)
|
||||
* - CPU/QPU ratio (>1 means QPU wins; <1 means CPU wins) */
|
||||
struct row {
|
||||
const char *name;
|
||||
int n_per_call;
|
||||
bench_fn fn;
|
||||
double cpu_ns;
|
||||
double qpu_ns; /* -1 if not measured */
|
||||
int frame_n; /* count per 1080p frame */
|
||||
};
|
||||
|
||||
static struct row rows[] = {
|
||||
{"IDCT 4x4 luma", 1024, bench_idct4, 0, -1, MBS_1080P * 16},
|
||||
{"IDCT 8x8 luma", 256, bench_idct8, 0, -1, MBS_1080P * 4},
|
||||
{"Deblock luma_v", 256, bench_deblock_v, 0, -1, MBS_1080P * 4},
|
||||
{"Deblock luma_h", 256, bench_deblock_h, 0, -1, MBS_1080P * 4},
|
||||
{"qpel mc20 (8x8)", 256, bench_qpel_mc20, 0, -1, MBS_1080P * 4},
|
||||
{"qpel mc02 (8x8)", 256, bench_qpel_mc02, 0, -1, MBS_1080P * 4},
|
||||
{"qpel mc22 (8x8)", 256, bench_qpel_mc22, 0, -1, MBS_1080P * 4},
|
||||
};
|
||||
#define N_ROWS ((int)(sizeof(rows)/sizeof(rows[0])))
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int iters = argc > 1 ? atoi(argv[1]) : 50;
|
||||
int warmup = argc > 2 ? atoi(argv[2]) : 5;
|
||||
|
||||
ctx = daedalus_ctx_create();
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "ctx create failed (Vulkan?)\n");
|
||||
return 1;
|
||||
}
|
||||
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||
|
||||
/* Pre-fill all input buffers with random data so the NEON inner
|
||||
* loops see realistic memory access patterns. */
|
||||
for (size_t i = 0; i < sizeof(idct4_coeffs)/2; i++)
|
||||
idct4_coeffs[i] = (int16_t)((int)(xs64() % 1024) - 512);
|
||||
for (size_t i = 0; i < sizeof(idct8_coeffs)/2; i++)
|
||||
idct8_coeffs[i] = (int16_t)((int)(xs64() % 1024) - 512);
|
||||
for (size_t i = 0; i < sizeof(qpel_src); i++) qpel_src[i] = (uint8_t)(xs64() & 0xff);
|
||||
|
||||
/* IDCT meta. */
|
||||
for (size_t i = 0; i < 1024; i++)
|
||||
idct4_meta[i].dst_off = (uint32_t)((i / 16) * 64 + (i % 16) * 4);
|
||||
for (size_t i = 0; i < 256; i++)
|
||||
idct8_meta[i].dst_off = (uint32_t)((i / 8) * 64 + (i % 8) * 8);
|
||||
|
||||
/* Deblock meta: edge offsets within 256 16x16 tiles. */
|
||||
for (size_t i = 0; i < 256; i++) {
|
||||
deblock_meta[i].dst_off = (uint32_t)(i * 256 + 4 * 16);
|
||||
deblock_meta[i].alpha = 30;
|
||||
deblock_meta[i].beta = 10;
|
||||
for (int s = 0; s < 4; s++) deblock_meta[i].tc0[s] = (int8_t)(s + 1);
|
||||
}
|
||||
|
||||
/* qpel meta. */
|
||||
for (size_t i = 0; i < 256; i++) {
|
||||
qpel_meta[i].src_off = (uint32_t)(i * 256 + 3 * 16 + 3);
|
||||
qpel_meta[i].dst_off = (uint32_t)(i * 256 + 3 * 16 + 3);
|
||||
}
|
||||
|
||||
printf("bench_h264_primitives: %d iters (%d warmup)\n", iters, warmup);
|
||||
printf(" ctx has_qpu=%d (CPU pass always runs; QPU pass skipped without Vulkan)\n\n", has_qpu);
|
||||
|
||||
/* Pass 1: CPU NEON. */
|
||||
g_sub = DAEDALUS_SUBSTRATE_CPU;
|
||||
printf("== CPU NEON ==\n");
|
||||
for (int i = 0; i < N_ROWS; i++)
|
||||
rows[i].cpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
||||
|
||||
/* Pass 2: QPU compute (if available). */
|
||||
int qpu_failures = 0;
|
||||
if (has_qpu) {
|
||||
g_sub = DAEDALUS_SUBSTRATE_QPU;
|
||||
printf("\n== QPU V3D7 compute ==\n");
|
||||
for (int i = 0; i < N_ROWS; i++) {
|
||||
rows[i].qpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
||||
if (rows[i].qpu_ns < 0) qpu_failures++;
|
||||
}
|
||||
if (qpu_failures) {
|
||||
fprintf(stderr,
|
||||
"\nbench_h264_primitives: %d of %d QPU dispatches failed.\n"
|
||||
" Almost always means SPV files were not found via any of:\n"
|
||||
" cwd / $DAEDALUS_SHADER_DIR / binary-dir /\n"
|
||||
" /opt/fourier/share/daedalus-fourier / /usr/share/daedalus-fourier\n"
|
||||
" Set DAEDALUS_SHADER_DIR=<path> or run from a dir where the\n"
|
||||
" .spv files exist (e.g. the cmake build dir).\n",
|
||||
qpu_failures, N_ROWS);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Summary table — both substrates side by side. */
|
||||
printf("\n== Per-kernel comparison ==\n");
|
||||
printf(" %-24s %12s %12s %8s %7s\n",
|
||||
"kernel", "CPU ns/op", "QPU ns/op", "winner", "ms/frame");
|
||||
for (int i = 0; i < N_ROWS; i++) {
|
||||
double cpu_ms = rows[i].cpu_ns * rows[i].frame_n / 1e6;
|
||||
double qpu_ms = rows[i].qpu_ns > 0 ? rows[i].qpu_ns * rows[i].frame_n / 1e6 : -1;
|
||||
const char *winner;
|
||||
char ratio[16];
|
||||
if (rows[i].qpu_ns <= 0) {
|
||||
winner = "CPU"; /* QPU n/a */
|
||||
snprintf(ratio, sizeof(ratio), "n/a");
|
||||
} else if (rows[i].cpu_ns < rows[i].qpu_ns) {
|
||||
winner = "CPU";
|
||||
snprintf(ratio, sizeof(ratio), "%.2fx", rows[i].qpu_ns / rows[i].cpu_ns);
|
||||
} else {
|
||||
winner = "QPU";
|
||||
snprintf(ratio, sizeof(ratio), "%.2fx", rows[i].cpu_ns / rows[i].qpu_ns);
|
||||
}
|
||||
char qpu_field[16];
|
||||
if (rows[i].qpu_ns > 0) snprintf(qpu_field, sizeof(qpu_field), "%.2f", rows[i].qpu_ns);
|
||||
else snprintf(qpu_field, sizeof(qpu_field), "n/a");
|
||||
char ms_field[24];
|
||||
if (qpu_ms > 0)
|
||||
snprintf(ms_field, sizeof(ms_field), "%.2f/%.2f", cpu_ms, qpu_ms);
|
||||
else
|
||||
snprintf(ms_field, sizeof(ms_field), "%.2f/n/a", cpu_ms);
|
||||
printf(" %-24s %12.2f %12s %3s %s %s\n",
|
||||
rows[i].name, rows[i].cpu_ns, qpu_field, winner, ratio, ms_field);
|
||||
}
|
||||
|
||||
/* Per-frame budget summary at 1080p (8160 MBs). */
|
||||
double cpu_idct4 = rows[0].cpu_ns * MBS_1080P * 16 / 1e6;
|
||||
double cpu_debl = (rows[2].cpu_ns + rows[3].cpu_ns) * MBS_1080P * 4 / 1e6;
|
||||
double cpu_mc = rows[6].cpu_ns * MBS_1080P * 4 / 1e6; /* mc22 worst-case */
|
||||
double cpu_sum = cpu_idct4 + cpu_debl + cpu_mc;
|
||||
|
||||
printf("\n== Projected 1080p worst-case (CPU NEON only) ==\n");
|
||||
printf(" IDCT 4x4 + deblock luma + qpel mc22: %.2f ms (30fps deadline 33.33)\n", cpu_sum);
|
||||
printf(" Margin: %+.2f ms\n", 33.33 - cpu_sum);
|
||||
|
||||
if (has_qpu) {
|
||||
double qpu_idct4 = rows[0].qpu_ns * MBS_1080P * 16 / 1e6;
|
||||
double qpu_debl = (rows[2].qpu_ns + rows[3].qpu_ns) * MBS_1080P * 4 / 1e6;
|
||||
double qpu_mc = rows[6].qpu_ns * MBS_1080P * 4 / 1e6;
|
||||
double qpu_sum = qpu_idct4 + qpu_debl + qpu_mc;
|
||||
printf("\n== Projected 1080p worst-case (QPU V3D7 compute only) ==\n");
|
||||
printf(" IDCT 4x4 + deblock luma + qpel mc22: %.2f ms (30fps deadline 33.33)\n", qpu_sum);
|
||||
printf(" Margin: %+.2f ms\n", 33.33 - qpu_sum);
|
||||
printf("\n CPU vs QPU sum ratio: %.2fx (>1 means QPU wins)\n",
|
||||
qpu_sum > 0 ? cpu_sum / qpu_sum : 0.0);
|
||||
}
|
||||
|
||||
printf("\n(NOT included: chroma deblock, chroma IDCT, intra prediction,\n");
|
||||
printf(" CABAC/CAVLC entropy. These bench numbers are a budget LOWER\n");
|
||||
printf(" bound; the real decode stack adds 20-40%% on top.\n");
|
||||
printf(" Per-kernel substrate decisions belong in daedalus_core.c recipe\n");
|
||||
printf(" table; the QPU substrate decree (2026-05-23) keeps everything\n");
|
||||
printf(" on QPU regardless of these numbers as a policy choice.)\n");
|
||||
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for the H.264 chroma DC 2x2
|
||||
* Hadamard transform (per H.264 §8.5.11.1).
|
||||
*
|
||||
* In 4:2:0 chroma, the four DC coefficients (one from each chroma
|
||||
* 4x4 AC block within an MB) are arranged into a 2x2 block:
|
||||
*
|
||||
* c[0,0] c[0,1] block (0,0) DC block (0,1) DC
|
||||
* c[1,0] c[1,1] block (1,0) DC block (1,1) DC
|
||||
*
|
||||
* The 2x2 Hadamard transform:
|
||||
*
|
||||
* f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1]
|
||||
* f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1]
|
||||
* f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1]
|
||||
* f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1]
|
||||
*
|
||||
* Equivalently expressed as 2-stage butterflies (row then col), which
|
||||
* the NEON impl uses for SIMD friendliness — we present that form
|
||||
* here too so the QPU/NEON ports are 1:1.
|
||||
*
|
||||
* Output f[] replaces the input c[]. The QP-dependent scaling per
|
||||
* §8.5.11.2 happens AFTER this primitive — the intercept patch
|
||||
* composes Hadamard + LevelScale + shift itself, since the scaling
|
||||
* shape depends on QP and on whether we're in the chroma_qp_offset
|
||||
* adjustment regime.
|
||||
*
|
||||
* Input/output layout:
|
||||
* c[0..3] in row-major order: [c[0,0], c[0,1], c[1,0], c[1,1]]
|
||||
*
|
||||
* License: BSD-2-Clause. Algorithm is in the H.264 spec.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
|
||||
/* In-place 2x2 Hadamard transform of four chroma DC coefficients.
 *
 * c[] holds the 2x2 block in row-major order and is overwritten with
 * the transformed values in the same order. Intermediate sums are
 * held in int; each result is narrowed back to int16_t on store. */
void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4])
{
    const int c00 = c[0], c01 = c[1];
    const int c10 = c[2], c11 = c[3];

    /* Row pass: sum and difference of each row's pair. */
    const int top_sum = c00 + c01;
    const int top_dif = c00 - c01;
    const int bot_sum = c10 + c11;
    const int bot_dif = c10 - c11;

    /* Column pass: combine the two rows. */
    c[0] = (int16_t)(top_sum + bot_sum);   /* f[0,0]: sum of all four       */
    c[1] = (int16_t)(top_dif + bot_dif);   /* f[0,1]: (c00-c01) + (c10-c11) */
    c[2] = (int16_t)(top_sum - bot_sum);   /* f[1,0]: (c00+c01) - (c10+c11) */
    c[3] = (int16_t)(top_dif - bot_dif);   /* f[1,1]: (c00-c01) - (c10-c11) */
}
|
||||
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 chroma loop filters
|
||||
* (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
|
||||
* when added). Covers both orientations:
|
||||
*
|
||||
* v_loop_filter_chroma: filter applied VERTICALLY across a
|
||||
* HORIZONTAL edge. Tile is 8 cols × 4 rows of context
|
||||
* (rows -2..+1); pix points to row 0 of the bottom block.
|
||||
* h_loop_filter_chroma: filter applied HORIZONTALLY across a
|
||||
* VERTICAL edge. Tile is 4 cols × 8 rows of context
|
||||
* (cols -2..+1); pix points to col 0 of the right block.
|
||||
*
|
||||
* Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
|
||||
* `ff_h264_h_loop_filter_chroma_neon` (line 430) in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||
*
|
||||
* Algorithm per H.264 §8.7.2.3 (filtering process for edges with bS < 4; chroma case):
|
||||
* - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
|
||||
* - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
|
||||
* - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
|
||||
* - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
|
||||
* - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
|
||||
*
|
||||
* tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge. This
* matches the 4:2:0 chroma block geometry in both orientations:
* 8 columns for the v_ filter, 8 rows for the h_ filter.
|
||||
*
|
||||
* Signature (matches FFmpeg + the existing luma refs):
|
||||
* void(uint8_t *pix, ptrdiff_t stride,
|
||||
* int alpha, int beta, int8_t tc0[4]);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
}

/* Clamp v to [lo, hi]; the lower bound is tested first. */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
}

/* Absolute value of x. */
static inline int abs_i(int x) { return x >= 0 ? x : -x; }
|
||||
|
||||
/* Chroma bS<4 filter for a single column crossing a horizontal edge.
 *
 * pix addresses q0 (first sample below the edge); p0 is one stride
 * above it, p1 two strides above, q1 one stride below. The cell is
 * left untouched unless |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. Only p0 and q0 are ever written.
 *
 * NOTE(review): tc is derived as tc0_s + 1 here. FFmpeg's dsp entry
 * points are normally handed a chroma tc0 that the caller has already
 * incremented — confirm which convention the caller of this
 * reference uses. */
static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
                               int alpha, int beta, int tc0_s)
{
    uint8_t *const above = pix - stride;   /* address of p0 */

    const int p1 = above[-stride];
    const int p0 = *above;
    const int q0 = *pix;
    const int q1 = pix[stride];

    const int edge_active = abs_i(p0 - q0) < alpha
                         && abs_i(p1 - p0) < beta
                         && abs_i(q1 - q0) < beta;
    if (!edge_active)
        return;

    const int tc       = tc0_s + 1;
    const int unclamped = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta    = clip3(unclamped, -tc, tc);

    *above = (uint8_t) clip_u8(p0 + delta);
    *pix   = (uint8_t) clip_u8(q0 - delta);
}
|
||||
|
||||
/* Chroma bS<4 filter for a single row crossing a vertical edge.
 *
 * pix addresses q0 (first sample right of the edge); p0 is pix[-1],
 * p1 is pix[-2], q1 is pix[+1]. Same preconditions and update rule as
 * the column variant; only p0 and q0 are ever written.
 *
 * NOTE(review): tc is derived as tc0_s + 1 here. FFmpeg's dsp entry
 * points are normally handed a chroma tc0 that the caller has already
 * incremented — confirm which convention the caller of this
 * reference uses. */
static void h264_chroma_cell_h(uint8_t *pix,
                               int alpha, int beta, int tc0_s)
{
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    const int edge_active = abs_i(p0 - q0) < alpha
                         && abs_i(p1 - p0) < beta
                         && abs_i(q1 - q0) < beta;
    if (!edge_active)
        return;

    const int tc        = tc0_s + 1;
    const int unclamped = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    const int delta     = clip3(unclamped, -tc, tc);

    pix[-1] = (uint8_t) clip_u8(p0 + delta);
    pix[0]  = (uint8_t) clip_u8(q0 - delta);
}
|
||||
|
||||
/* Chroma bS<4 filter across a horizontal edge, 8 columns wide.
 *
 *   pix     first sample of the row just below the edge
 *   stride  byte distance between rows
 *   alpha, beta  edge thresholds; the call is a no-op if either is 0
 *   tc0     one entry per 2-column segment; a negative entry disables
 *           that segment, and all four negative disables the call
 */
void daedalus_h264_v_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    if (alpha == 0 || beta == 0)
        return;

    const int any_enabled = tc0[0] >= 0 || tc0[1] >= 0
                         || tc0[2] >= 0 || tc0[3] >= 0;
    if (!any_enabled)
        return;

    /* Columns 2k and 2k+1 share tc0[k]. */
    for (int col = 0; col < 8; col++) {
        const int seg_tc0 = tc0[col / 2];
        if (seg_tc0 >= 0)
            h264_chroma_cell_v(pix + col, stride, alpha, beta, seg_tc0);
    }
}
|
||||
|
||||
/* Chroma bS<4 filter across a vertical edge, 8 rows tall.
 *
 *   pix     first sample right of the edge in the top row
 *   stride  byte distance between rows
 *   alpha, beta  edge thresholds; the call is a no-op if either is 0
 *   tc0     one entry per 2-row segment; a negative entry disables
 *           that segment, and all four negative disables the call
 */
void daedalus_h264_h_loop_filter_chroma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    if (alpha == 0 || beta == 0)
        return;

    const int any_enabled = tc0[0] >= 0 || tc0[1] >= 0
                         || tc0[2] >= 0 || tc0[3] >= 0;
    if (!any_enabled)
        return;

    /* Rows 2k and 2k+1 share tc0[k]. */
    for (int row = 0; row < 8; row++) {
        const int seg_tc0 = tc0[row / 2];
        if (seg_tc0 >= 0)
            h264_chroma_cell_h(pix + row * stride, alpha, beta, seg_tc0);
    }
}
|
||||
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma + chroma "intra"
|
||||
* loop filters (bS = 4 variant, used at I-MB edges where the
|
||||
* boundary strength is forced to 4). Covers all four orientations:
|
||||
*
|
||||
* v_loop_filter_luma_intra — 16 cols × 8 rows, edge between
|
||||
* rows -1 and 0
|
||||
* h_loop_filter_luma_intra — 8 cols × 16 rows, edge between
|
||||
* cols -1 and 0
|
||||
* v_loop_filter_chroma_intra — 8 cols × 4 rows
|
||||
* h_loop_filter_chroma_intra — 4 cols × 8 rows
|
||||
*
|
||||
* Mirrors FFmpeg's `ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon`
|
||||
* in external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||
*
|
||||
* Algorithm per H.264 §8.7.2.4 (filtering process for edges with bS equal to 4):
|
||||
*
|
||||
* Preconditions (same as bS<4):
|
||||
* |p0-q0| < α AND |p1-p0| < β AND |q1-q0| < β
|
||||
*
|
||||
* Luma — strong/weak filter selector per side:
|
||||
* strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||
* strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||
*
|
||||
* If strong_p, update p0/p1/p2:
|
||||
* p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
|
||||
* p1' = (p2 + p1 + p0 + q0 + 2) >> 2
|
||||
* p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
|
||||
* Else weak (single cell):
|
||||
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||
* Mirror for q-side.
|
||||
*
|
||||
* Chroma — always weak (no quad-tree selector):
|
||||
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||
* q0' = (2*q1 + q0 + p1 + 2) >> 2
|
||||
* Chroma never updates p1/p2/q1/q2.
|
||||
*
|
||||
* Signature (no tc0 in the intra path — the daedalus_h264_deblock_meta
|
||||
* struct's tc0 field is ignored at the dispatch layer):
|
||||
* void(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
}

/* Absolute value of x. */
static inline int abs_i(int x) { return x >= 0 ? x : -x; }
|
||||
|
||||
/* --- luma intra (bS=4), one column crossing a horizontal edge ---
 *
 * pix addresses q0; p0..p3 sit 1..4 strides above it and q1..q3 sit
 * 1..3 strides below. All eight samples are read before any store.
 * Nothing is written unless |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. Each side then independently takes the strong
 * 3-sample update or the weak 1-sample update. */
static void h264_luma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                   int alpha, int beta)
{
    int p[4], q[4];
    for (int k = 0; k < 4; k++) {
        p[k] = pix[-(ptrdiff_t)(k + 1) * stride];
        q[k] = pix[(ptrdiff_t) k * stride];
    }

    const int step = abs_i(p[0] - q[0]);
    if (step >= alpha
        || abs_i(p[1] - p[0]) >= beta
        || abs_i(q[1] - q[0]) >= beta)
        return;

    /* Condition shared by both sides for choosing the strong filter. */
    const int small_step = step < (alpha >> 2) + 2;

    if (small_step && abs_i(p[2] - p[0]) < beta) {
        pix[-1 * stride] = (uint8_t) clip_u8((p[2] + 2*p[1] + 2*p[0] + 2*q[0] + q[1] + 4) >> 3);
        pix[-2 * stride] = (uint8_t) clip_u8((p[2] + p[1] + p[0] + q[0] + 2) >> 2);
        pix[-3 * stride] = (uint8_t) clip_u8((2*p[3] + 3*p[2] + p[1] + p[0] + q[0] + 4) >> 3);
    } else {
        pix[-1 * stride] = (uint8_t) clip_u8((2*p[1] + p[0] + q[1] + 2) >> 2);
    }

    if (small_step && abs_i(q[2] - q[0]) < beta) {
        pix[0 * stride] = (uint8_t) clip_u8((q[2] + 2*q[1] + 2*q[0] + 2*p[0] + p[1] + 4) >> 3);
        pix[1 * stride] = (uint8_t) clip_u8((q[2] + q[1] + q[0] + p[0] + 2) >> 2);
        pix[2 * stride] = (uint8_t) clip_u8((2*q[3] + 3*q[2] + q[1] + q[0] + p[0] + 4) >> 3);
    } else {
        pix[0 * stride] = (uint8_t) clip_u8((2*q[1] + q[0] + p[1] + 2) >> 2);
    }
}
|
||||
|
||||
/* --- luma intra (bS=4), one row crossing a vertical edge ---
 *
 * pix addresses q0; p0..p3 are pix[-1..-4] and q1..q3 are pix[1..3].
 * All eight samples are read before any store. Preconditions and
 * update rules are the same as in the column variant. */
static void h264_luma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    int p[4], q[4];
    for (int k = 0; k < 4; k++) {
        p[k] = pix[-(k + 1)];
        q[k] = pix[k];
    }

    const int step = abs_i(p[0] - q[0]);
    if (step >= alpha
        || abs_i(p[1] - p[0]) >= beta
        || abs_i(q[1] - q[0]) >= beta)
        return;

    /* Condition shared by both sides for choosing the strong filter. */
    const int small_step = step < (alpha >> 2) + 2;

    if (small_step && abs_i(p[2] - p[0]) < beta) {
        pix[-1] = (uint8_t) clip_u8((p[2] + 2*p[1] + 2*p[0] + 2*q[0] + q[1] + 4) >> 3);
        pix[-2] = (uint8_t) clip_u8((p[2] + p[1] + p[0] + q[0] + 2) >> 2);
        pix[-3] = (uint8_t) clip_u8((2*p[3] + 3*p[2] + p[1] + p[0] + q[0] + 4) >> 3);
    } else {
        pix[-1] = (uint8_t) clip_u8((2*p[1] + p[0] + q[1] + 2) >> 2);
    }

    if (small_step && abs_i(q[2] - q[0]) < beta) {
        pix[0] = (uint8_t) clip_u8((q[2] + 2*q[1] + 2*q[0] + 2*p[0] + p[1] + 4) >> 3);
        pix[1] = (uint8_t) clip_u8((q[2] + q[1] + q[0] + p[0] + 2) >> 2);
        pix[2] = (uint8_t) clip_u8((2*q[3] + 3*q[2] + q[1] + q[0] + p[0] + 4) >> 3);
    } else {
        pix[0] = (uint8_t) clip_u8((2*q[1] + q[0] + p[1] + 2) >> 2);
    }
}
|
||||
|
||||
/* --- chroma intra (bS=4), one column crossing a horizontal edge ---
 *
 * pix addresses q0; p0 / p1 are one / two strides above, q1 is one
 * stride below. When the alpha/beta preconditions hold, p0 and q0
 * are each replaced by the weak 3-tap average; nothing else is
 * written. */
static void h264_chroma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                     int alpha, int beta)
{
    uint8_t *const above = pix - stride;   /* address of p0 */

    const int p1 = above[-stride];
    const int p0 = *above;
    const int q0 = *pix;
    const int q1 = pix[stride];

    if (abs_i(p0 - q0) >= alpha
        || abs_i(p1 - p0) >= beta
        || abs_i(q1 - q0) >= beta)
        return;

    *above = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2);
    *pix   = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2);
}
|
||||
|
||||
/* --- chroma intra (bS=4), one row crossing a vertical edge ---
 *
 * pix addresses q0; p0 / p1 are pix[-1] / pix[-2], q1 is pix[1].
 * When the alpha/beta preconditions hold, p0 and q0 are each replaced
 * by the weak 3-tap average; nothing else is written. */
static void h264_chroma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    if (abs_i(p0 - q0) >= alpha
        || abs_i(p1 - p0) >= beta
        || abs_i(q1 - q0) >= beta)
        return;

    pix[-1] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2);
    pix[0]  = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2);
}
|
||||
|
||||
/* --- public refs --- */
|
||||
|
||||
/* Luma intra (bS=4) filter across a horizontal edge, 16 columns wide.
 *
 * pix is the first sample of the row just below the edge; stride is
 * the byte distance between rows. The call returns immediately only
 * when alpha and beta are BOTH zero (the source notes this mirrors
 * FFmpeg's `h264_loop_filter_start_intra` macro). Otherwise every
 * column is handed to the per-cell filter, whose own preconditions
 * decide whether pixels change. There is no per-segment skip in the
 * intra path. */
void daedalus_h264_v_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int col = 0;
    while (col < 16) {
        h264_luma_intra_cell_v(pix + col, stride, alpha, beta);
        col++;
    }
}
|
||||
|
||||
/* Luma intra (bS=4) filter across a vertical edge, 16 rows tall.
 *
 * pix is the first sample right of the edge in the top row; stride is
 * the byte distance between rows. Returns immediately only when alpha
 * and beta are both zero; otherwise each row goes through the
 * per-cell filter. */
void daedalus_h264_h_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int row = 0;
    while (row < 16) {
        h264_luma_intra_cell_h(pix + row * stride, alpha, beta);
        row++;
    }
}
|
||||
|
||||
/* Chroma intra (bS=4) filter across a horizontal edge, 8 columns wide.
 *
 * pix is the first sample of the row just below the edge; stride is
 * the byte distance between rows. Returns immediately only when alpha
 * and beta are both zero; otherwise each column goes through the
 * per-cell filter. */
void daedalus_h264_v_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int col = 0;
    while (col < 8) {
        h264_chroma_intra_cell_v(pix + col, stride, alpha, beta);
        col++;
    }
}
|
||||
|
||||
/* Chroma intra (bS=4) filter across a vertical edge, 8 rows tall.
 *
 * pix is the first sample right of the edge in the top row; stride is
 * the byte distance between rows. Returns immediately only when alpha
 * and beta are both zero; otherwise each row goes through the
 * per-cell filter. */
void daedalus_h264_h_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    if (alpha == 0 && beta == 0)
        return;

    int row = 0;
    while (row < 8) {
        h264_chroma_intra_cell_h(pix + row * stride, alpha, beta);
        row++;
    }
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Standalone bit-exact C references for the avg_ qpel anchors —
|
||||
* the biprediction "average against existing dst" form of mc20,
|
||||
* mc02, mc22. Used in B-slices where two qpel-interpolated samples
|
||||
* (one from list0, one from list1) are averaged per H.264 §8.4.2.3.
|
||||
*
|
||||
* Each kernel computes the same half-pel formula as the put_ form,
|
||||
* then averages with dst[r,c] via L2 ((dst + put_val + 1) >> 1).
|
||||
* The dst buffer carries the list0 prediction on entry; the avg_
|
||||
* call adds the list1 contribution.
|
||||
*
|
||||
* Mirror FFmpeg's `ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon` in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* (same `\type=avg` expansion as the put_ functions).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
}

/* Rounded-up average of two 8-bit samples: (a + b + 1) >> 1. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    const int sum = a + b + 1;
    return (uint8_t)(sum >> 1);
}

/* Per-cell half-pel helpers. They are defined locally (rather than
 * shared with the other qpel reference files) so that this
 * translation unit compiles on its own. */

/* Horizontal half-pel sample at (r, c): 6-tap (1,-5,20,20,-5,1)
 * filter over columns c-2 .. c+3 of row r, rounded (+16), shifted
 * right by 5 and clamped to 8 bits. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *t = s + r*stride + c;

    const int acc = (t[-2] + t[3])
                  - 5  * (t[-1] + t[2])
                  + 20 * (t[0]  + t[1])
                  + 16;
    return (uint8_t) clip_u8(acc >> 5);
}

/* Vertical half-pel sample at (r, c): the same 6-tap filter applied
 * over rows r-2 .. r+3 of column c. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *t = s + r*stride + c;

    const int acc = (t[-2*stride] + t[3*stride])
                  - 5  * (t[-1*stride] + t[2*stride])
                  + 20 * (t[0]         + t[1*stride])
                  + 16;
    return (uint8_t) clip_u8(acc >> 5);
}
|
||||
|
||||
/* avg_ mc20, 8x8: each destination sample is replaced by the rounded
 * average of its current value and the horizontal half-pel sample
 * computed from src. dst and src use the same stride. */
void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        uint8_t *out = dst + r*stride;
        for (int c = 0; c < 8; c++) {
            const uint8_t pred = hpel_h(src, r, c, stride);
            out[c] = avg2(out[c], pred);
        }
    }
}
|
||||
|
||||
/* avg_ mc02, 8x8: each destination sample is replaced by the rounded
 * average of its current value and the vertical half-pel sample
 * computed from src. dst and src use the same stride. */
void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        uint8_t *out = dst + r*stride;
        for (int c = 0; c < 8; c++) {
            const uint8_t pred = hpel_v(src, r, c, stride);
            out[c] = avg2(out[c], pred);
        }
    }
}
|
||||
|
||||
/* avg_ mc22, 8x8: centre (H+V) half-pel position averaged into dst.
 *
 * Pass 1 runs the horizontal 6-tap filter, unrounded, over source
 * rows -2 .. +10 and keeps the 13 x 8 results as int16.
 * Pass 2 runs the vertical 6-tap filter over those intermediates,
 * adds 512, shifts right by 10 and clamps to 8 bits; the result is
 * then averaged with the existing dst sample. */
void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    int16_t horiz[13][8];

    for (int k = 0; k < 13; k++) {
        const uint8_t *line = src + (k - 2) * stride;
        for (int c = 0; c < 8; c++) {
            const uint8_t *t = line + c;
            const int acc = (t[-2] + t[3])
                          - 5  * (t[-1] + t[2])
                          + 20 * (t[0]  + t[1]);
            horiz[k][c] = (int16_t) acc;
        }
    }

    for (int r = 0; r < 8; r++) {
        uint8_t *out = dst + r*stride;
        for (int c = 0; c < 8; c++) {
            const int acc = (horiz[r][c]     + horiz[r + 5][c])
                          - 5  * (horiz[r + 1][c] + horiz[r + 4][c])
                          + 20 * (horiz[r + 2][c] + horiz[r + 3][c])
                          + 512;
            const uint8_t pred = (uint8_t) clip_u8(acc >> 10);
            out[c] = avg2(out[c], pred);
        }
    }
}
|
||||
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Standalone bit-exact C references for the 12 remaining avg_
|
||||
* biprediction qpel positions (B-slice list0 + list1 averaging):
|
||||
* 4 quarter-axis: avg_mc{10,30,01,03}
|
||||
* 8 diagonals : avg_mc{11,12,13,21,23,31,32,33}
|
||||
*
|
||||
* Each is the put_ formula (per H.264 §8.4.2.2.1 / Table 8-4) with
|
||||
* a final L2 average against the existing dst contents per §8.4.2.3.1.
|
||||
* Caller pre-loads dst with the list0 prediction; the avg_ call
|
||||
* folds in list1.
|
||||
*
|
||||
* Mirror FFmpeg's `ff_avg_h264_qpel8_mc{XY}_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* — same `\type=avg` expansion as the put_ functions).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return v;
}

/* Rounded-up average of two 8-bit samples: (a + b + 1) >> 1. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    const int sum = a + b + 1;
    return (uint8_t)(sum >> 1);
}

/* Horizontal half-pel sample at (r, c): 6-tap (1,-5,20,20,-5,1)
 * filter over columns c-2 .. c+3 of row r, rounded (+16), shifted
 * right by 5 and clamped to 8 bits. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *t = s + r*stride + c;

    const int acc = (t[-2] + t[3])
                  - 5  * (t[-1] + t[2])
                  + 20 * (t[0]  + t[1])
                  + 16;
    return (uint8_t) clip_u8(acc >> 5);
}

/* Vertical half-pel sample at (r, c): the same 6-tap filter applied
 * over rows r-2 .. r+3 of column c. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *t = s + r*stride + c;

    const int acc = (t[-2*stride] + t[3*stride])
                  - 5  * (t[-1*stride] + t[2*stride])
                  + 20 * (t[0]         + t[1*stride])
                  + 16;
    return (uint8_t) clip_u8(acc >> 5);
}

/* Centre (H+V) half-pel sample at (r, c): unrounded horizontal 6-tap
 * results for rows r-2 .. r+3 are fed through the vertical 6-tap
 * filter, then +512, >> 10 and clamped to 8 bits. */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    int horiz[6];

    for (int k = 0; k < 6; k++) {
        const uint8_t *t = s + (r - 2 + k)*stride + c;
        horiz[k] = (t[-2] + t[3])
                 - 5  * (t[-1] + t[2])
                 + 20 * (t[0]  + t[1]);
    }

    const int acc = (horiz[0] + horiz[5])
                  - 5  * (horiz[1] + horiz[4])
                  + 20 * (horiz[2] + horiz[3])
                  + 512;
    return (uint8_t) clip_u8(acc >> 10);
}
|
||||
|
||||
/* Quarter-axis variants. For every cell of the 8x8 block:
 *   1. evaluate the half-pel sample (A_EXPR),
 *   2. average it with an integer-position source sample (INT_EXPR),
 *   3. average that prediction into the existing dst sample.
 * A_EXPR / INT_EXPR are evaluated inside the loops and may refer to
 * src, stride, r (row) and c (column). */
#define DEFINE_AVG_QUARTER(NAME, A_EXPR, INT_EXPR)                        \
void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,               \
        const uint8_t *src, ptrdiff_t stride)                             \
{                                                                         \
    for (int r = 0; r < 8; r++) {                                         \
        for (int c = 0; c < 8; c++) {                                     \
            const uint8_t half_ = (A_EXPR);                               \
            const uint8_t pred_ = avg2(half_, (INT_EXPR));                \
            uint8_t *cell_ = &dst[r*stride + c];                          \
            *cell_ = avg2(*cell_, pred_);                                 \
        }                                                                 \
    }                                                                     \
}

DEFINE_AVG_QUARTER(mc10, hpel_h(src, r, c, stride), src[r*stride + c    ])
DEFINE_AVG_QUARTER(mc30, hpel_h(src, r, c, stride), src[r*stride + c + 1])
DEFINE_AVG_QUARTER(mc01, hpel_v(src, r, c, stride), src[(r    )*stride + c])
DEFINE_AVG_QUARTER(mc03, hpel_v(src, r, c, stride), src[(r + 1)*stride + c])

#undef DEFINE_AVG_QUARTER
|
||||
|
||||
/* Diagonal avg_ variants. The prediction for each pixel is the rounded
 * mean of two half-pel samples (A_EXPR, B_EXPR); it is then
 * rounded-averaged with the value already stored in dst. Both expressions
 * are evaluated with src, r, c and stride in scope. */
#define DEFINE_AVG_DIAG(NAME, A_EXPR, B_EXPR)                               \
void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,                  \
        const uint8_t *src, ptrdiff_t stride)                                \
{                                                                            \
    for (int r = 0; r < 8; r++) {                                            \
        uint8_t *row = dst + r * stride;                                     \
        for (int c = 0; c < 8; c++) {                                        \
            const uint8_t first  = (A_EXPR);                                 \
            const uint8_t second = (B_EXPR);                                 \
            row[c] = avg2(row[c], avg2(first, second));                      \
        }                                                                    \
    }                                                                        \
}

DEFINE_AVG_DIAG(mc11, hpel_h(src, r, c, stride),     hpel_v(src, r, c, stride))
DEFINE_AVG_DIAG(mc12, hpel_hv(src, r, c, stride),    hpel_v(src, r, c, stride))
DEFINE_AVG_DIAG(mc13, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c, stride))
DEFINE_AVG_DIAG(mc21, hpel_hv(src, r, c, stride),    hpel_h(src, r, c, stride))
DEFINE_AVG_DIAG(mc23, hpel_hv(src, r, c, stride),    hpel_h(src, r + 1, c, stride))
DEFINE_AVG_DIAG(mc31, hpel_h(src, r, c, stride),     hpel_v(src, r, c + 1, stride))
DEFINE_AVG_DIAG(mc32, hpel_hv(src, r, c, stride),    hpel_v(src, r, c + 1, stride))
DEFINE_AVG_DIAG(mc33, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c + 1, stride))

#undef DEFINE_AVG_DIAG
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Standalone bit-exact C references for the 8 diagonal H.264 luma
|
||||
* qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33).
|
||||
* Each is the rounded average of two half-pel intermediates per
|
||||
* H.264 §8.4.2.2.1 / Table 8-4, decomposed to match the FFmpeg .S
|
||||
* reference structure (see comments in mc{11,12,21,...}_neon in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S).
|
||||
*
|
||||
* Position decompositions (verified against the .S):
|
||||
* mc11 (e, ¼¼): avg(mc20[r,c], mc02[r,c])
|
||||
* mc12 (i, ¼½): avg(mc22[r,c], mc02[r,c])
|
||||
* mc13 (p, ¼¾): avg(mc20[r+1,c], mc02[r,c])
|
||||
* mc21 (f, ½¼): avg(mc22[r,c], mc20[r,c])
|
||||
* mc23 (q, ½¾): avg(mc22[r,c], mc20[r+1,c])
|
||||
* mc31 (g, ¾¼): avg(mc20[r,c], mc02[r,c+1])
|
||||
* mc32 (k, ¾½): avg(mc22[r,c], mc02[r,c+1])
|
||||
* mc33 (r, ¾¾): avg(mc20[r+1,c], mc02[r,c+1])
|
||||
*
|
||||
* (The mc20[r,c] notation means "the mc20-style horizontal half-pel
|
||||
* result at source-relative integer position (r, c)"; analogously
|
||||
* for mc02 and mc22.)
|
||||
*
|
||||
* Single-stride convention; same edge-context contract as the simpler
|
||||
* variants (the cells "[r+1,c]" etc. demand one extra row/col of
|
||||
* source context beyond what mc20/mc02 alone would need).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* Single-cell helpers — same arithmetic as the dedicated mc20/mc02
|
||||
* refs but computed point-by-point so the diagonal refs can mix them
|
||||
* cheaply. Each returns a u8 (already clipped). */
|
||||
/* Horizontal half-pel sample at (r, c): the 6-tap (1,-5,20,20,-5,1)
 * filter over cols c-2..c+3 of row r, rounded with +16, scaled by >> 5
 * and clipped to the u8 range. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 16;                       /* rounding term for the >> 5 */

    for (int k = 0; k < 6; k++)
        acc += taps[k] * (int) s[r * stride + c + k - 2];

    return (uint8_t) clip_u8(acc >> 5);
}
|
||||
/* Vertical half-pel sample at (r, c): the 6-tap (1,-5,20,20,-5,1) filter
 * run down column c over rows r-2..r+3, rounded with +16, scaled by >> 5
 * and clipped to the u8 range. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 16;                       /* rounding term for the >> 5 */

    for (int k = 0; k < 6; k++)
        acc += taps[k] * (int) s[(r - 2 + k) * stride + c];

    return (uint8_t) clip_u8(acc >> 5);
}
|
||||
|
||||
/* hpel_hv — 2D half-pel sample at (r, c), the "j" cascade of H.264
 * §8.4.2.2.1 evaluated for a single point. For each of the six rows
 * r-2..r+3 an unrounded, unclipped horizontal 6-tap sum over cols
 * c-2..c+3 is formed (held in an int); those six sums then go through
 * the same 6-tap filter vertically, followed by +512, >> 10 and a clip
 * to u8. */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 512;                      /* rounding term for the >> 10 */

    for (int i = 0; i < 6; i++) {
        const int row = r - 2 + i;
        int h = 0;                      /* horizontal sum for this row */

        for (int k = 0; k < 6; k++)
            h += taps[k] * (int) s[row * stride + c + k - 2];
        acc += taps[i] * h;
    }

    return (uint8_t) clip_u8(acc >> 10);
}
|
||||
|
||||
/* Rounded mean of two u8 samples: (a + b + 1) >> 1. The sum of two
 * 8-bit inputs plus one is at most 511, so the shifted result always
 * fits in a u8 and needs no clip. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    const int sum = a + b + 1;
    return (uint8_t) (sum >> 1);
}
|
||||
|
||||
/* Generates one put_ reference per diagonal qpel position: every pixel of
 * the 8x8 output is the rounded mean of two half-pel samples (A_EXPR,
 * B_EXPR). Both expressions are evaluated with src, r, c and stride in
 * scope. */
#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR)                               \
void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,                  \
        const uint8_t *src, ptrdiff_t stride)                                \
{                                                                            \
    for (int r = 0; r < 8; r++) {                                            \
        uint8_t *row = dst + r * stride;                                     \
        for (int c = 0; c < 8; c++) {                                        \
            const uint8_t first  = (A_EXPR);                                 \
            const uint8_t second = (B_EXPR);                                 \
            row[c] = avg2(first, second);                                    \
        }                                                                    \
    }                                                                        \
}

DEFINE_DIAG_REF(mc11, hpel_h(src, r, c, stride),     hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc12, hpel_hv(src, r, c, stride),    hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc13, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc21, hpel_hv(src, r, c, stride),    hpel_h(src, r, c, stride))
DEFINE_DIAG_REF(mc23, hpel_hv(src, r, c, stride),    hpel_h(src, r + 1, c, stride))
DEFINE_DIAG_REF(mc31, hpel_h(src, r, c, stride),     hpel_v(src, r, c + 1, stride))
DEFINE_DIAG_REF(mc32, hpel_hv(src, r, c, stride),    hpel_v(src, r, c + 1, stride))
DEFINE_DIAG_REF(mc33, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c + 1, stride))

#undef DEFINE_DIAG_REF
|
||||
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02
|
||||
* (vertical half-pel, "put" variant). Mirror of mc20 with rows
|
||||
* and columns transposed. 6-tap filter applied vertically:
|
||||
*
|
||||
* dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5 )
|
||||
*
|
||||
* Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* line 678, which tail-calls put_h264_qpel8_v_lowpass_neon).
|
||||
*
|
||||
* Signature:
|
||||
* void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
*
|
||||
* Both dst and src use the SAME stride. src points at row 0 col 0
|
||||
* of the output block; the filter reads rows -2..+3 (2 rows of top
|
||||
* context, 3 rows of bottom context). Caller must guarantee the
|
||||
* source buffer has those rows available (FFmpeg's edge-emulated
|
||||
* buffer handles this at the frame boundary; matches the contract
|
||||
* documented for mc20).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* put_ reference for the vertical half-pel position (mc02), 8x8 block.
 * dst and src share `stride`; src points at row 0 / col 0 of the block
 * and rows -2..+10 of each of the 8 columns are read. Each output is the
 * 6-tap (1,-5,20,20,-5,1) column sum, +16, >> 5, clipped to u8. */
void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };

    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            int acc = 16;               /* rounding term for the >> 5 */

            for (int k = 0; k < 6; k++)
                acc += taps[k] * (int) src[(r - 2 + k) * stride + c];
            dst[r * stride + c] = (uint8_t) clip_u8(acc >> 5);
        }
    }
}
|
||||
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22
|
||||
* (2D half-pel, "put" variant). Cascade of horizontal 6-tap then
|
||||
* vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage
|
||||
* clip/round), final +512 >> 10 to scale back.
|
||||
*
|
||||
* Per H.264 §8.4.2.2.1, "j" position:
|
||||
*
|
||||
* tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1]
|
||||
* - 5*s[r,c+2] + s[r,c+3] (16-bit signed)
|
||||
*
|
||||
* dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
|
||||
* + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
|
||||
* + 512) >> 10)
|
||||
*
|
||||
* The tmp[] array spans rows r-2 .. r+3 around each output row, so
|
||||
* we need 13 intermediate rows (rows -2..+10 of the SOURCE
|
||||
* neighbourhood) for 8 output rows. Caller's src must have 2 rows
|
||||
* of top context + 3 rows of bottom context AND 2 cols of left +
|
||||
* 3 cols of right context (FFmpeg's edge-emulated buffer provides
|
||||
* this at the frame boundary; same contract as mc20).
|
||||
*
|
||||
* Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon).
|
||||
*
|
||||
* Signature:
|
||||
* void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
*
|
||||
* Same single-stride convention as mc20/mc02.
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* put_ reference for the 2D half-pel position (mc22), 8x8 block.
 * dst and src share `stride`; src points at row 0 / col 0 of the block.
 * Reads source rows -2..+10 and cols -2..+10. */
void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    enum { TAPS = 6, BLOCK = 8, MID_ROWS = BLOCK + TAPS - 1 /* 13 */ };
    static const int taps[TAPS] = { 1, -5, 20, 20, -5, 1 };

    /* mid[m] holds the horizontal sums for source row m - 2, so
     * mid[0..12] covers source rows -2..+10. */
    int16_t mid[MID_ROWS][BLOCK];

    /* Pass 1: horizontal 6-tap, no rounding or clipping. */
    for (int m = 0; m < MID_ROWS; m++) {
        const uint8_t *line = src + (m - 2) * stride;

        for (int c = 0; c < BLOCK; c++) {
            int acc = 0;

            for (int k = 0; k < TAPS; k++)
                acc += taps[k] * (int) line[c + k - 2];
            mid[m][c] = (int16_t) acc;
        }
    }

    /* Pass 2: vertical 6-tap over the intermediates. Output row r uses
     * mid[r..r+5], i.e. source rows r-2..r+3. Final scale is +512, >> 10. */
    for (int r = 0; r < BLOCK; r++) {
        for (int c = 0; c < BLOCK; c++) {
            int acc = 512;

            for (int k = 0; k < TAPS; k++)
                acc += taps[k] * mid[r + k][c];
            dst[r * stride + c] = (uint8_t) clip_u8(acc >> 10);
        }
    }
}
|
||||
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Standalone bit-exact C references for the four single-axis quarter-
|
||||
* pel luma qpel positions (H.264 §8.4.2.2.1, "put" variants). Each
|
||||
* is a half-pel lowpass clipped to u8 followed by an L2 rounded-average
|
||||
* with an integer-position source pixel.
|
||||
*
|
||||
* mc10 ("a" pos, ¼ horiz): a = clip255(mc20(s)); dst = (a + s[r,c] + 1) >> 1
|
||||
* mc30 ("c" pos, ¾ horiz): a = clip255(mc20(s)); dst = (a + s[r,c+1] + 1) >> 1
|
||||
* mc01 ("d" pos, ¼ vert ): a = clip255(mc02(s)); dst = (a + s[r, c] + 1) >> 1
|
||||
* mc03 ("n" pos, ¾ vert ): a = clip255(mc02(s)); dst = (a + s[r+1,c] + 1) >> 1
|
||||
*
|
||||
* Mirror FFmpeg's `ff_put_h264_qpel8_mc{10,30,01,03}_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* lines 587, 603, 611, 729 — each tail-calls the corresponding
|
||||
* lowpass_l2 helper).
|
||||
*
|
||||
* Same single-stride convention as mc20/mc02 — dst and src share the
|
||||
* same stride; src points at row 0 col 0 of the output
|
||||
* block, with appropriate edge context already in-buffer.
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
|
||||
/* One horizontal half-pel sample at (r, c): 6-tap (1,-5,20,20,-5,1)
 * filter over cols c-2..c+3 of row r, then +16, >> 5 and a clip to u8. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 16;                       /* rounding term for the >> 5 */

    for (int k = 0; k < 6; k++)
        acc += taps[k] * (int) s[r * stride + c + k - 2];

    return (uint8_t) clip_u8(acc >> 5);
}
|
||||
|
||||
/* One vertical half-pel sample at (r, c): 6-tap (1,-5,20,20,-5,1) filter
 * over rows r-2..r+3 of column c, then +16, >> 5 and a clip to u8. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 16;                       /* rounding term for the >> 5 */

    for (int k = 0; k < 6; k++)
        acc += taps[k] * (int) s[(r - 2 + k) * stride + c];

    return (uint8_t) clip_u8(acc >> 5);
}
|
||||
|
||||
/* mc10 put_ reference: rounded mean of the horizontal half-pel sample
 * and the integer source pixel at the same (r, c). */
void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        const uint8_t *in  = src + r * stride;
        uint8_t       *out = dst + r * stride;

        for (int c = 0; c < 8; c++) {
            const int half = hpel_h(src, r, c, stride);
            out[c] = (uint8_t) ((half + in[c] + 1) >> 1);
        }
    }
}
|
||||
|
||||
/* mc30 put_ reference: rounded mean of the horizontal half-pel sample at
 * (r, c) and the integer source pixel one column to the right, (r, c+1). */
void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        const uint8_t *in  = src + r * stride;
        uint8_t       *out = dst + r * stride;

        for (int c = 0; c < 8; c++) {
            const int half = hpel_h(src, r, c, stride);
            out[c] = (uint8_t) ((half + in[c + 1] + 1) >> 1);
        }
    }
}
|
||||
|
||||
/* mc01 put_ reference: rounded mean of the vertical half-pel sample and
 * the integer source pixel at the same (r, c). */
void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        const uint8_t *in  = src + r * stride;
        uint8_t       *out = dst + r * stride;

        for (int c = 0; c < 8; c++) {
            const int half = hpel_v(src, r, c, stride);
            out[c] = (uint8_t) ((half + in[c] + 1) >> 1);
        }
    }
}
|
||||
|
||||
/* mc03 put_ reference: rounded mean of the vertical half-pel sample at
 * (r, c) and the integer source pixel one row below, (r+1, c). */
void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int r = 0; r < 8; r++) {
        const uint8_t *below = src + (r + 1) * stride;
        uint8_t       *out   = dst + r * stride;

        for (int c = 0; c < 8; c++) {
            const int half = hpel_v(src, r, c, stride);
            out[c] = (uint8_t) ((half + below[c] + 1) >> 1);
        }
    }
}
|
||||
+456
-1
@@ -18,8 +18,55 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
|
||||
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -191,6 +238,162 @@ static int test_deblock_h(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_v(void)
|
||||
{
|
||||
/* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
|
||||
* (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
|
||||
EDGE_OFF = EDGE_ROW * TILE_STRIDE };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_h(void)
|
||||
{
|
||||
/* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
|
||||
* (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
/* --- bS=4 intra-strength deblock tests ---
|
||||
* Tile geometry per orientation matches the bS<4 variant; only the
|
||||
* dispatch + reference function change. alpha/beta are non-trivial
|
||||
* (the C ref + NEON both early-return when alpha|beta == 0).
|
||||
*/
|
||||
typedef struct {
|
||||
const char *name;
|
||||
int n_edges, tile_stride, tile_rows, edge_off;
|
||||
void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||
int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
} intra_test_spec;
|
||||
|
||||
static int run_intra_test(const intra_test_spec *t)
|
||||
{
|
||||
int total = t->n_edges * t->tile_stride * t->tile_rows;
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t *dst = malloc((size_t) total);
|
||||
uint8_t *dst_ref = malloc((size_t) total);
|
||||
daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta));
|
||||
if (!dst || !dst_ref || !meta) return 1;
|
||||
|
||||
for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
int tile_bytes = t->tile_stride * t->tile_rows;
|
||||
for (int i = 0; i < t->n_edges; i++) {
|
||||
meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
/* tc0[] unused for intra; leave at 0 from calloc. */
|
||||
}
|
||||
for (int i = 0; i < t->n_edges; i++) {
|
||||
t->ref(dst_ref + meta[i].dst_off,
|
||||
(ptrdiff_t) t->tile_stride,
|
||||
meta[i].alpha, meta[i].beta);
|
||||
}
|
||||
int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
|
||||
(size_t) t->n_edges, meta);
|
||||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; }
|
||||
|
||||
int diff = 0;
|
||||
for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
t->name, total - diff, total, 100.0 * (total - diff) / total);
|
||||
|
||||
free(meta); free(dst_ref); free(dst);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_intra_all(void)
|
||||
{
|
||||
intra_test_spec specs[] = {
|
||||
{ "luma v intra", 8, 16, 8, 4 * 16,
|
||||
daedalus_h264_v_loop_filter_luma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
|
||||
{ "luma h intra", 8, 8, 16, 4,
|
||||
daedalus_h264_h_loop_filter_luma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
|
||||
{ "chroma v intra", 8, 8, 4, 2 * 8,
|
||||
daedalus_h264_v_loop_filter_chroma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
|
||||
{ "chroma h intra", 8, 4, 8, 2,
|
||||
daedalus_h264_h_loop_filter_chroma_intra_ref,
|
||||
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
|
||||
};
|
||||
int fail = 0;
|
||||
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
|
||||
fail |= run_intra_test(&specs[i]);
|
||||
return fail;
|
||||
}
|
||||
|
||||
static int test_qpel_mc20(void)
|
||||
{
|
||||
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||||
@@ -231,6 +434,243 @@ static int test_qpel_mc20(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc02(void)
|
||||
{
|
||||
/* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel
|
||||
* can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
|
||||
* SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
|
||||
* the tile) and rows 8..10 below (rows 11..13). */
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
TILE_STRIDE);
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
|
||||
TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc22(void)
|
||||
{
|
||||
/* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows
|
||||
* top + 3 rows bottom of context per 8x8 output. Tile is 16x16
|
||||
* with output positioned at (SRC_ROW=3, SRC_COL=3) so the read
|
||||
* range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3, SRC_COL = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
TILE_STRIDE);
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src,
|
||||
TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Generic harness for the 4 single-axis quarter-pel positions; same
|
||||
* tile geometry as mc22 since each one reads the largest of the H/V
|
||||
* lowpass windows (mc10/mc30 need cols -2..+3, mc01/mc03 need rows
|
||||
* -2..+3 OR +1..+3 on the integer side). */
|
||||
typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst,
|
||||
const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
static int run_quarter_axis_qpel(const char *name,
|
||||
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||||
{
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3, SRC_COL = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||||
|
||||
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_quarter_axis_all(void)
|
||||
{
|
||||
int fail = 0;
|
||||
fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc10);
|
||||
fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc30);
|
||||
fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc01);
|
||||
fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc03);
|
||||
return fail;
|
||||
}
|
||||
|
||||
static int test_qpel_diag_all(void)
|
||||
{
|
||||
/* Diagonal positions need TWO half-pel intermediates per output;
|
||||
* some of them read at (r+1,c) or (r,c+1) so the test geometry
|
||||
* needs an extra row + col of context. run_quarter_axis_qpel
|
||||
* already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile)
|
||||
* — reusing that harness is fine. */
|
||||
int fail = 0;
|
||||
fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc11);
|
||||
fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc12);
|
||||
fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc13);
|
||||
fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc21);
|
||||
fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc23);
|
||||
fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc31);
|
||||
fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc32);
|
||||
fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_mc33);
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* Avg-form harness: pre-loads dst + dst_ref with the same random
|
||||
* content so we can verify the L2 averaging is happening (not just
|
||||
* put_-style overwrite). If the dispatch incorrectly overwrote
|
||||
* dst, the bit-exact compare would still catch the mismatch against
|
||||
* the avg_ reference. */
|
||||
static int run_avg_qpel(const char *name,
|
||||
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||||
{
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3, SRC_COL = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
/* Two random buffers: src for the qpel input, dst seeded with
|
||||
* different random content as the "list0 prediction" — both
|
||||
* dst and dst_ref get the SAME seed so the avg compare is fair. */
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < TOTAL; i++) {
|
||||
uint8_t v = (uint8_t)(xs() & 0xff);
|
||||
dst[i] = dst_ref[i] = v;
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||||
|
||||
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_avg_anchors(void)
|
||||
{
|
||||
int fail = 0;
|
||||
fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc20);
|
||||
fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc02);
|
||||
fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc22);
|
||||
return fail;
|
||||
}
|
||||
|
||||
static int test_qpel_avg_rest(void)
|
||||
{
|
||||
int fail = 0;
|
||||
/* Ref fns are named daedalus_avg_h264_qpel8_<mcXX>_ref (no
|
||||
* second "avg_"); dispatch fns are named ..._avg_mcXX. Macro
|
||||
* builds both from the bare mcXX name. */
|
||||
#define RUN(MC) fail |= run_avg_qpel("avg_" #MC, \
|
||||
daedalus_avg_h264_qpel8_ ## MC ## _ref, \
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_ ## MC)
|
||||
RUN(mc10); RUN(mc30); RUN(mc01); RUN(mc03);
|
||||
RUN(mc11); RUN(mc12); RUN(mc13);
|
||||
RUN(mc21); RUN(mc23);
|
||||
RUN(mc31); RUN(mc32); RUN(mc33);
|
||||
#undef RUN
|
||||
return fail;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||
@@ -243,14 +683,29 @@ int main(void)
|
||||
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
|
||||
|
||||
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
|
||||
printf(" H264_DEBLOCK_LH recipe substrate: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
|
||||
printf(" H264_DEBLOCK_CV recipe substrate: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
||||
printf(" H264_DEBLOCK_CH recipe substrate: %d\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
||||
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (bS=4 family, all on QPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
|
||||
|
||||
int fail = 0;
|
||||
fail |= test_idct4();
|
||||
fail |= test_idct8();
|
||||
fail |= test_deblock();
|
||||
fail |= test_deblock_h();
|
||||
fail |= test_deblock_chroma_v();
|
||||
fail |= test_deblock_chroma_h();
|
||||
fail |= test_deblock_intra_all();
|
||||
fail |= test_qpel_mc20();
|
||||
fail |= test_qpel_mc02();
|
||||
fail |= test_qpel_mc22();
|
||||
fail |= test_qpel_quarter_axis_all();
|
||||
fail |= test_qpel_diag_all();
|
||||
fail |= test_qpel_avg_anchors();
|
||||
fail |= test_qpel_avg_rest();
|
||||
return fail;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Tests the H.264 chroma DC 2x2 Hadamard primitive against
|
||||
* spec-derived expected outputs.
|
||||
*
|
||||
* f[0,0] = c[0,0] + c[0,1] + c[1,0] + c[1,1] "sum"
|
||||
* f[0,1] = c[0,0] - c[0,1] + c[1,0] - c[1,1] "col-diff"
|
||||
* f[1,0] = c[0,0] + c[0,1] - c[1,0] - c[1,1] "row-diff"
|
||||
* f[1,1] = c[0,0] - c[0,1] - c[1,0] + c[1,1] "anti-diag"
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_chroma_dc_hadamard_2x2_ref(int16_t c[4]);
|
||||
extern void daedalus_h264_chroma_dc_hadamard_2x2(int16_t c[4]); /* public API */
|
||||
|
||||
/*
 * Applies daedalus_h264_chroma_dc_hadamard_2x2_ref to a private copy
 * of `in` and compares the result with `expect`.
 *
 * Every mismatching coefficient is reported on stderr; a single
 * PASS/FAIL line for `name` goes to stdout.  `in` itself is not
 * modified.  Returns 0 on a full match, 1 otherwise.
 */
static int check(const char *name, int16_t in[4], int16_t expect[4])
{
    int16_t work[4];
    int mismatched = 0;

    memcpy(work, in, sizeof(work));
    daedalus_h264_chroma_dc_hadamard_2x2_ref(work);

    for (int k = 0; k < 4; k++) {
        if (work[k] == expect[k])
            continue;
        fprintf(stderr, "%s: c[%d] = %d, expected %d\n",
                name, k, work[k], expect[k]);
        mismatched = 1;
    }

    if (mismatched)
        printf(" %-32s FAIL\n", name);
    else
        printf(" %-32s PASS\n", name);
    return mismatched;
}
|
||||
|
||||
/*
 * Test driver for the chroma DC 2x2 Hadamard.
 *
 * Runs six direct input/expected vectors through check(), then the
 * double-application property (H * H = 4 * I), then a parity check of
 * the public entry point against the _ref implementation.
 *
 * Returns 0 if every case passes, 1 otherwise.  Failures are counted
 * (rather than OR-ed into a flag) so that the "%d test(s) FAILED"
 * summary reports the real number of failing cases.
 */
int main(void)
{
    int failures = 0;

    /* Direct vectors: c[] is row-major { c00, c01, c10, c11 } and the
     * expected f[] is { sum, col-diff, row-diff, anti-diag }. */
    struct {
        const char *label;
        int16_t     in[4];
        int16_t     want[4];
    } vectors[] = {
        /* Test 1: all-same input; only the sum band is non-zero.
         *   f = [5+5+5+5, 0, 0, 0] = [20, 0, 0, 0] */
        { "all-uniform 5",                {  5,  5,  5,  5 }, { 20,   0,   0,  0 } },

        /* Test 2: single-axis variation (col 1 = 0, col 2 = 10).
         *   f[0,0] = 0+10+0+10 =  20
         *   f[0,1] = 0-10+0-10 = -20
         *   f[1,0] = 0+10-0-10 =   0
         *   f[1,1] = 0-10-0+10 =   0 */
        { "col gradient [0,10,0,10]",     {  0, 10,  0, 10 }, { 20, -20,   0,  0 } },

        /* Test 3: row gradient.
         *   f[0,0] = 20, f[0,1] = 0, f[1,0] = 0-20 = -20, f[1,1] = 0 */
        { "row gradient [0,0,10,10]",     {  0,  0, 10, 10 }, { 20,   0, -20,  0 } },

        /* Test 4: anti-diagonal pattern.
         *   f[0,0] = 20
         *   f[0,1] = 10-0+0-10 =  0
         *   f[1,0] = 10+0-0-10 =  0
         *   f[1,1] = 10-0-0+10 = 20 */
        { "anti-diagonal [10,0,0,10]",    { 10,  0,  0, 10 }, { 20,   0,   0, 20 } },

        /* Test 5: asymmetric input.
         *   f[0,0] = 10
         *   f[0,1] = 1-2+3-4 = -2
         *   f[1,0] = 1+2-3-4 = -4
         *   f[1,1] = 1-2-3+4 =  0 */
        { "asymmetric [1,2,3,4]",         {  1,  2,  3,  4 }, { 10,  -2,  -4,  0 } },

        /* Test 6: negative inputs (the Hadamard is linear, so signs
         * carry through).
         *   f[0,0] = -5+5-5+5 =   0
         *   f[0,1] = -5-5-5-5 = -20
         *   f[1,0] = -5+5+5-5 =   0
         *   f[1,1] = -5-5+5+5 =   0 */
        { "sign-alternating [-5,5,-5,5]", { -5,  5, -5,  5 }, {  0, -20,   0,  0 } },
    };
    const int n_vectors = (int)(sizeof(vectors) / sizeof(vectors[0]));

    for (int k = 0; k < n_vectors; k++)
        failures += check(vectors[k].label, vectors[k].in, vectors[k].want);

    /* Test 7: inverse-property check.  H * H = 4*I for the unscaled
     * 2x2 Hadamard, so applying it twice multiplies each input by 4.
     *   c               = [ 1,  2,  3,  4]
     *   first Hadamard  = [10, -2, -4,  0]
     *   second Hadamard = [ 4,  8, 12, 16] */
    {
        int16_t in[4] = { 1, 2, 3, 4 };
        int16_t ex[4] = { 4, 8, 12, 16 };
        int16_t c[4];
        int local_fail = 0;

        memcpy(c, in, sizeof(c));
        daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
        daedalus_h264_chroma_dc_hadamard_2x2_ref(c);
        for (int i = 0; i < 4; i++) if (c[i] != ex[i]) local_fail = 1;
        printf(" %-32s %s\n", "double-Hadamard = 4*orig",
               local_fail ? "FAIL" : "PASS");
        failures += local_fail;
    }

    /* Test 8: public API parity.  The public symbol must produce
     * identical output to the test-only ref for the same input, so a
     * drift between the two implementations is caught here. */
    {
        int16_t input[4] = { 7, -11, 23, -42 };
        int16_t a[4], b[4];
        int local_fail = 0;

        memcpy(a, input, sizeof(a));
        memcpy(b, input, sizeof(b));
        daedalus_h264_chroma_dc_hadamard_2x2_ref(a);
        daedalus_h264_chroma_dc_hadamard_2x2(b);
        for (int i = 0; i < 4; i++) if (a[i] != b[i]) local_fail = 1;
        printf(" %-32s %s\n", "public API parity vs _ref",
               local_fail ? "FAIL" : "PASS");
        failures += local_fail;
    }

    if (failures == 0) printf("\nALL chroma DC Hadamard tests PASS\n");
    else fprintf(stderr, "\n%d test(s) FAILED\n", failures);
    return failures ? 1 : 0;
}
|
||||
@@ -0,0 +1,167 @@
|
||||
/*
|
||||
* Tests the 4 H.264 Intra_16x16 luma prediction modes against
|
||||
* spec-derived expected patterns. Same layout as the 4x4 test:
|
||||
* a buffer that holds the 16x16 output plus 1-pixel top/left
|
||||
* context and 1-pixel top-left corner.
|
||||
*
|
||||
* row 0: [tl][t0..t15]
|
||||
* row 1: [l0][output row 0]
|
||||
* row 2: [l1][output row 1]
|
||||
* ...
|
||||
* row 16: [l15][output row 15]
|
||||
*
|
||||
* Buffer dimensions: 17 rows × 17 cols, total 289 bytes.
|
||||
* dst (passed to the pred fns) points at row 1 col 1.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_16x16_vertical(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_dc(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_16x16_plane(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 17
|
||||
#define ROWS 17
|
||||
|
||||
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
|
||||
const int t[16], const int l[16])
|
||||
{
|
||||
for (int r = 0; r < ROWS; r++)
|
||||
for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 16; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 16; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/*
 * Compares the 16x16 output region of buf (rows 1..16, cols 1..16)
 * against expect_at(r, c, cookie), visiting cells in row-major order.
 *
 * Prints one PASS line, or one FAIL line carrying the mismatch count
 * and the first mismatching cell, to stdout.  Returns 0 when every
 * cell matches and 1 otherwise.
 */
static int check(const uint8_t buf[ROWS][STRIDE], const char *name,
                 uint8_t (*expect_at)(int r, int c, void *), void *cookie)
{
    int wrong = 0;
    int bad_r = 0, bad_c = 0, bad_got = 0, bad_want = 0;

    for (int cell = 0; cell < 16 * 16; cell++) {
        const int r = cell / 16;
        const int c = cell % 16;
        const uint8_t got  = buf[1 + r][1 + c];
        const uint8_t want = expect_at(r, c, cookie);

        if (got == want)
            continue;
        if (wrong == 0) {
            /* Remember only the first mismatch for the report. */
            bad_r = r;
            bad_c = c;
            bad_got = got;
            bad_want = want;
        }
        wrong++;
    }

    if (wrong != 0) {
        printf(" %-30s FAIL (%d/256 wrong, first r=%d c=%d got=%u exp=%u)\n",
               name, wrong, bad_r, bad_c, bad_got, bad_want);
        return 1;
    }
    printf(" %-30s PASS\n", name);
    return 0;
}
|
||||
|
||||
/* Per-mode expectation callbacks, in the shape check() takes:
 * (row, col, cookie) -> expected sample value. */

/* Uniform block: cookie points at the single expected uint8_t. */
static uint8_t expect_uniform(int r, int c, void *cookie)
{
    const uint8_t *value = cookie;

    (void) r;
    (void) c;
    return *value;
}

/* Vertical: cookie carries the top-neighbour array. */
struct vertical_ctx { const int *t; };

/* Expected sample is the top neighbour of column c, truncated to
 * uint8_t; the row is ignored. */
static uint8_t expect_vertical(int r, int c, void *cookie)
{
    const struct vertical_ctx *vctx = cookie;

    (void) r;
    return (uint8_t) vctx->t[c];
}

/* Horizontal: cookie carries the left-neighbour array. */
struct horizontal_ctx { const int *l; };

/* Expected sample is the left neighbour of row r, truncated to
 * uint8_t; the column is ignored. */
static uint8_t expect_horizontal(int r, int c, void *cookie)
{
    const struct horizontal_ctx *hctx = cookie;

    (void) c;
    return (uint8_t) hctx->l[r];
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* --- Mode 0 Vertical: each col = top[col] --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 10 + i; l[i] = 0; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_vertical(&buf[1][1], STRIDE);
|
||||
struct vertical_ctx vc = { t };
|
||||
fail |= check(buf, "Vertical (mode 0)", expect_vertical, &vc);
|
||||
}
|
||||
|
||||
/* --- Mode 1 Horizontal: each row = left[row] --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16] = {0}, l[16];
|
||||
for (int i = 0; i < 16; i++) l[i] = 50 + i;
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_horizontal(&buf[1][1], STRIDE);
|
||||
struct horizontal_ctx hc = { l };
|
||||
fail |= check(buf, "Horizontal (mode 1)", expect_horizontal, &hc);
|
||||
}
|
||||
|
||||
/* --- Mode 2 DC: ((sum + 16) >> 5) --- */
|
||||
/* All top = 2, all left = 6: sum = 32 + 96 = 128, +16 = 144,
|
||||
* >>5 = 144/32 = 4. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 2; l[i] = 6; }
|
||||
set_ctx(buf, 99, t, l);
|
||||
daedalus_h264_pred_16x16_dc(&buf[1][1], STRIDE);
|
||||
uint8_t exp_val = 4;
|
||||
fail |= check(buf, "DC (mode 2)", expect_uniform, &exp_val);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane: uniform neighbours → uniform output --- */
|
||||
/* H=V=0 when neighbours are uniform. a = 16*(p+p) = 32p.
|
||||
* pred[y][x] = (32p + 0 + 0 + 16) >> 5 = (32p + 16) >> 5 = p
|
||||
* (exact integer for any p, since 32p/32 = p and +16/32 = 0).
|
||||
* Verifies the orientation-free portion of the formula. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = 100; l[i] = 100; }
|
||||
set_ctx(buf, 100, t, l); /* uniform tl too — H/V sums actually zero */
|
||||
daedalus_h264_pred_16x16_plane(&buf[1][1], STRIDE);
|
||||
uint8_t exp_val = 100;
|
||||
fail |= check(buf, "Plane (mode 3, uniform)", expect_uniform, &exp_val);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane: gradient sanity ---
|
||||
* Top row = 0..15 (gradient), left col = 0..15, tl = 0.
|
||||
* H = sum_{i=0..7} (i+1) * (t[8+i] - t[6-i] for i<7; or t[15]-tl=15 for i=7)
|
||||
* = 1*(8-6) + 2*(9-5) + 3*(10-4) + 4*(11-3) + 5*(12-2) + 6*(13-1)
|
||||
* + 7*(14-0) + 8*(15-0)
|
||||
* = 2 + 8 + 18 + 32 + 50 + 72 + 98 + 120 = 400
|
||||
* V = same shape on left col = 400
|
||||
* b = (5*400 + 32) >> 6 = 2032 >> 6 = 31
|
||||
* c = (5*400 + 32) >> 6 = 31
|
||||
* a = 16 * (l[15] + t[15]) = 16 * (15 + 15) = 480
|
||||
* pred[0][0] = (480 + 31*(-7) + 31*(-7) + 16) >> 5
|
||||
* = (480 - 217 - 217 + 16) >> 5
|
||||
* = 62 >> 5 = 1
|
||||
* pred[15][15] = (480 + 31*8 + 31*8 + 16) >> 5
|
||||
* = (480 + 248 + 248 + 16) >> 5
|
||||
* = 992 >> 5 = 31
|
||||
* Just spot-check those two corners. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[16];
|
||||
for (int i = 0; i < 16; i++) { t[i] = i; l[i] = i; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_16x16_plane(&buf[1][1], STRIDE);
|
||||
uint8_t tl_actual = buf[1 + 0][1 + 0];
|
||||
uint8_t br_actual = buf[1 + 15][1 + 15];
|
||||
int spot_fail = 0;
|
||||
if (tl_actual != 1) { fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n", tl_actual); spot_fail = 1; }
|
||||
if (br_actual != 31) { fprintf(stderr, "Plane gradient pred[15][15] = %u, expected 31\n", br_actual); spot_fail = 1; }
|
||||
if (!spot_fail) printf(" %-30s PASS (corners 1, 31)\n", "Plane (mode 3, gradient)");
|
||||
else printf(" %-30s FAIL\n", "Plane (mode 3, gradient)");
|
||||
fail |= spot_fail;
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL Intra_16x16 mode references PASS\n");
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,246 @@
|
||||
/*
|
||||
* Tests the 9 H.264 Intra_4x4 luma prediction modes against
|
||||
* spec-derived expected patterns. Goal: catch any mistake in
|
||||
* the reference (sign / shift / table mapping) before it lands
|
||||
* downstream. Each mode is exercised with a deterministic
|
||||
* neighbour context and checked against a hand-computed (or
|
||||
* spec-derived) expected 4x4 output.
|
||||
*
|
||||
* The test buffer layout reserves a 1-pixel top/left context border
|
||||
* + a 4-pixel top-right (for modes 3 / 7):
|
||||
*
|
||||
* row 0: [tl][t0 t1 t2 t3 t4 t5 t6 t7] <- TOP_STRIDE = 9 bytes
|
||||
* row 1: [l0][ 4x4 output goes here ]
|
||||
* row 2: [l1][ ]
|
||||
* row 3: [l2][ ]
|
||||
* row 4: [l3][ ]
|
||||
*
|
||||
* dst (passed to the pred fns) points at row 1 col 1.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_4x4_vertical(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_dc(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_ddl(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_ddr(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_vr(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_hd(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_vl(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_4x4_hu(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 9
|
||||
typedef void (*pred_fn)(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* Set up the buffer: 5 rows × STRIDE cols.
|
||||
* top-left = tl, top[0..7] = t[0..7], left[0..3] = l[0..3].
|
||||
* The 4x4 output region (rows 1..4, cols 1..4) is filled with 0xff
|
||||
* sentinels so any unwritten cell shows up as 255 in the compare. */
|
||||
static void set_ctx(uint8_t buf[5][STRIDE], int tl, const int t[8], const int l[4])
|
||||
{
|
||||
for (int r = 0; r < 5; r++) for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 8; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 4; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/*
 * Compares the 4x4 output region of buf (rows 1..4, cols 1..4) with
 * expect[][], visiting cells in row-major order.
 *
 * The first mismatch is detailed on stderr; a PASS line or a FAIL line
 * with the mismatch count goes to stdout.  Returns 0 when all 16 bytes
 * match and 1 otherwise.
 */
static int check(const uint8_t buf[5][STRIDE], const char *name,
                 const uint8_t expect[4][4])
{
    int wrong = 0;

    for (int cell = 0; cell < 16; cell++) {
        const int r = cell / 4;
        const int c = cell % 4;
        const uint8_t got  = buf[1 + r][1 + c];
        const uint8_t want = expect[r][c];

        if (got == want)
            continue;
        if (wrong == 0)
            fprintf(stderr,
                    "%s: first mismatch r=%d c=%d got=%u exp=%u\n",
                    name, r, c, got, want);
        wrong++;
    }

    if (wrong != 0) {
        printf(" %-26s FAIL (%d/16 bytes wrong)\n", name, wrong);
        return 1;
    }
    printf(" %-26s PASS\n", name);
    return 0;
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* Mode 0 — Vertical: each col = top[col]. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int tl = 0;
|
||||
int t[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };
|
||||
int l[4] = { 0, 0, 0, 0 };
|
||||
set_ctx(buf, tl, t, l);
|
||||
daedalus_h264_pred_4x4_vertical(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{10,20,30,40}, {10,20,30,40}, {10,20,30,40}, {10,20,30,40}
|
||||
};
|
||||
fail |= check(buf, "Vertical (mode 0)", exp);
|
||||
}
|
||||
|
||||
/* Mode 1 — Horizontal: each row = left[row]. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 0,0,0,0, 0,0,0,0 };
|
||||
int l[4] = { 50, 60, 70, 80 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_horizontal(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{50,50,50,50}, {60,60,60,60}, {70,70,70,70}, {80,80,80,80}
|
||||
};
|
||||
fail |= check(buf, "Horizontal (mode 1)", exp);
|
||||
}
|
||||
|
||||
/* Mode 2 — DC: all 8 neighbours valid → ((sum + 4) >> 3) broadcast.
|
||||
* top sum = 4*1 = 4, left sum = 4*3 = 12, total 16, +4 = 20,
|
||||
* >>3 = 2. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 1,1,1,1, 0,0,0,0 };
|
||||
int l[4] = { 3,3,3,3 };
|
||||
set_ctx(buf, 99, t, l); /* tl unused for DC */
|
||||
daedalus_h264_pred_4x4_dc(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{2,2,2,2}, {2,2,2,2}, {2,2,2,2}, {2,2,2,2}
|
||||
};
|
||||
fail |= check(buf, "DC (mode 2)", exp);
|
||||
}
|
||||
|
||||
/* Mode 3 — Diagonal_Down_Left: zz[i] = avg3(t[i], t[i+1], t[i+2]);
|
||||
* dst[r][c] = zz[c + r].
|
||||
* With all t[]=100 → all zz=100 → all dst=100. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 100,100,100,100, 100,100,100,100 };
|
||||
int l[4] = { 0,0,0,0 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_ddl(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{100,100,100,100}, {100,100,100,100},
|
||||
{100,100,100,100}, {100,100,100,100}
|
||||
};
|
||||
fail |= check(buf, "DiagDownLeft (mode 3)", exp);
|
||||
}
|
||||
|
||||
/* Mode 4 — Diagonal_Down_Right: zz[c-r] with c-r ∈ {-3..+3}.
|
||||
* If all 9 surrounding pixels = 200 → all zz = 200 → all dst = 200. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 200,200,200,200, 0,0,0,0 };
|
||||
int l[4] = { 200,200,200,200 };
|
||||
set_ctx(buf, 200, t, l);
|
||||
daedalus_h264_pred_4x4_ddr(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{200,200,200,200}, {200,200,200,200},
|
||||
{200,200,200,200}, {200,200,200,200}
|
||||
};
|
||||
fail |= check(buf, "DiagDownRight (mode 4)", exp);
|
||||
}
|
||||
|
||||
/* Mode 5 — Vertical_Right. With all neighbours = 80 the 3-tap
|
||||
* (a+2b+c+2)>>2 and 2-tap (a+b+1)>>1 both yield 80. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 80,80,80,80, 0,0,0,0 };
|
||||
int l[4] = { 80,80,80,80 };
|
||||
set_ctx(buf, 80, t, l);
|
||||
daedalus_h264_pred_4x4_vr(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{80,80,80,80}, {80,80,80,80}, {80,80,80,80}, {80,80,80,80}
|
||||
};
|
||||
fail |= check(buf, "VerticalRight (mode 5)", exp);
|
||||
}
|
||||
|
||||
/* Mode 6 — Horizontal_Down. Same uniform-context degenerate case. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 120,120,120,120, 0,0,0,0 };
|
||||
int l[4] = { 120,120,120,120 };
|
||||
set_ctx(buf, 120, t, l);
|
||||
daedalus_h264_pred_4x4_hd(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{120,120,120,120}, {120,120,120,120},
|
||||
{120,120,120,120}, {120,120,120,120}
|
||||
};
|
||||
fail |= check(buf, "HorizontalDown (mode 6)", exp);
|
||||
}
|
||||
|
||||
/* Mode 7 — Vertical_Left. Uniform context. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 64,64,64,64, 64,64,64,64 };
|
||||
int l[4] = { 0,0,0,0 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_vl(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{64,64,64,64}, {64,64,64,64}, {64,64,64,64}, {64,64,64,64}
|
||||
};
|
||||
fail |= check(buf, "VerticalLeft (mode 7)", exp);
|
||||
}
|
||||
|
||||
/* Mode 8 — Horizontal_Up. Uniform context. */
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 0,0,0,0, 0,0,0,0 };
|
||||
int l[4] = { 200,200,200,200 };
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_4x4_hu(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{200,200,200,200}, {200,200,200,200},
|
||||
{200,200,200,200}, {200,200,200,200}
|
||||
};
|
||||
fail |= check(buf, "HorizontalUp (mode 8)", exp);
|
||||
}
|
||||
|
||||
/* Asymmetric Vertical_Right test: detects orientation /
|
||||
* row-vs-col confusion. Top=10,20,30,40, Left=50,60,70,
|
||||
* top-left=5. Spec-derived expected output computed by hand
|
||||
* from §8.3.1.4.6.
|
||||
*
|
||||
* d[0][0] = (tl+t0+1)>>1 = (5+10+1)>>1 = 8
|
||||
* d[0][1] = (t0+t1+1)>>1 = (10+20+1)>>1 = 15
|
||||
* d[0][2] = (t1+t2+1)>>1 = (20+30+1)>>1 = 25
|
||||
* d[0][3] = (t2+t3+1)>>1 = (30+40+1)>>1 = 35
|
||||
* d[1][0] = avg3(l0,tl,t0) = (50+2*5+10+2)>>2 = 72/4 = 18
|
||||
* d[1][1] = avg3(tl,t0,t1) = (5+20+20+2)>>2 = 47/4 = 11
|
||||
* d[1][2] = avg3(t0,t1,t2) = (10+40+30+2)>>2 = 82/4 = 20
|
||||
* d[1][3] = avg3(t1,t2,t3) = (20+60+40+2)>>2 = 122/4 = 30
|
||||
* d[2][0] = avg3(tl,l0,l1) = (5+100+60+2)>>2 = 167/4 = 41
|
||||
* d[2][1] = d[0][0] = 8
|
||||
* d[2][2] = d[0][1] = 15
|
||||
* d[2][3] = d[0][2] = 25
|
||||
* d[3][0] = avg3(l0,l1,l2) = (50+120+70+2)>>2 = 242/4 = 60
|
||||
* d[3][1] = d[1][0] = 18
|
||||
* d[3][2] = d[1][1] = 11
|
||||
* d[3][3] = d[1][2] = 20
|
||||
*/
|
||||
{
|
||||
uint8_t buf[5][STRIDE];
|
||||
int t[8] = { 10,20,30,40, 0,0,0,0 };
|
||||
int l[4] = { 50,60,70,0 };
|
||||
set_ctx(buf, 5, t, l);
|
||||
daedalus_h264_pred_4x4_vr(&buf[1][1], STRIDE);
|
||||
uint8_t exp[4][4] = {
|
||||
{ 8,15,25,35},
|
||||
{18,11,20,30},
|
||||
{41, 8,15,25},
|
||||
{60,18,11,20},
|
||||
};
|
||||
fail |= check(buf, "VR asym (sanity)", exp);
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL %d intra-4x4 mode references PASS\n", 10);
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Tests the H.264 Intra_8x8 luma prediction modes against spec-derived
|
||||
* expectations. Buffer layout is 9 rows × 17 cols (extra cols for the
|
||||
* top-right extension that DDL/VL need; not exercised by V/H/DC but
|
||||
* already in-place for the eventual directional-modes follow-up):
|
||||
*
|
||||
* row 0: [tl][t0..t15] — 17 bytes
|
||||
* row 1: [l0][output row 0 ..] — 17 bytes
|
||||
* ...
|
||||
* row 8: [l7][output row 7 ..]
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
extern void daedalus_h264_pred_8x8l_vertical(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_horizontal(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_dc(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_ddl(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_ddr(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_vr(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_hd(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_vl(uint8_t *dst, ptrdiff_t stride);
|
||||
extern void daedalus_h264_pred_8x8l_hu(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
#define STRIDE 17
|
||||
#define ROWS 9
|
||||
|
||||
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
|
||||
const int t[16], const int l[8])
|
||||
{
|
||||
for (int r = 0; r < ROWS; r++)
|
||||
for (int c = 0; c < STRIDE; c++) buf[r][c] = 0xff;
|
||||
buf[0][0] = (uint8_t) tl;
|
||||
for (int c = 0; c < 16; c++) buf[0][1 + c] = (uint8_t) t[c];
|
||||
for (int r = 0; r < 8; r++) buf[1 + r][0] = (uint8_t) l[r];
|
||||
}
|
||||
|
||||
/*
 * Verifies that every sample of the 8x8 output region of buf
 * (rows 1..8, cols 1..8) equals expect_val.
 *
 * Prints one PASS line, or one FAIL line with the mismatch count and
 * the expected value, to stdout.  Returns 0 when all 64 samples match
 * and 1 otherwise.
 */
static int check_uniform(const uint8_t buf[ROWS][STRIDE], const char *name,
                         uint8_t expect_val)
{
    int wrong = 0;

    for (int cell = 0; cell < 8 * 8; cell++) {
        if (buf[1 + cell / 8][1 + cell % 8] != expect_val)
            wrong++;
    }

    if (wrong != 0) {
        printf(" %-30s FAIL (%d/64 wrong, expected %u)\n", name, wrong, expect_val);
        return 1;
    }
    printf(" %-30s PASS\n", name);
    return 0;
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* Mode 0 Vertical with uniform top → uniform output.
|
||||
* Filtered top[c] = (a + 2*a + a + 2) >> 2 = a for uniform a. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[8];
|
||||
for (int i = 0; i < 16; i++) t[i] = 50;
|
||||
for (int j = 0; j < 8; j++) l[j] = 0;
|
||||
set_ctx(buf, 50, t, l);
|
||||
daedalus_h264_pred_8x8l_vertical(&buf[1][1], STRIDE);
|
||||
fail |= check_uniform(buf, "Vertical (mode 0, uniform top)", 50);
|
||||
}
|
||||
|
||||
/* Mode 1 Horizontal with uniform left → uniform output. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16] = {0}, l[8];
|
||||
for (int j = 0; j < 8; j++) l[j] = 70;
|
||||
set_ctx(buf, 70, t, l);
|
||||
daedalus_h264_pred_8x8l_horizontal(&buf[1][1], STRIDE);
|
||||
fail |= check_uniform(buf, "Horizontal (mode 1, uniform left)", 70);
|
||||
}
|
||||
|
||||
/* Mode 2 DC with all-uniform neighbours → uniform output.
|
||||
* Filtered top[c] = top for uniform; filtered left[j] = left.
|
||||
* sum = 8*a + 8*a + 8 = 16a + 8. >> 4 = a (exact when +8 rounds). */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[8];
|
||||
for (int i = 0; i < 16; i++) t[i] = 33;
|
||||
for (int j = 0; j < 8; j++) l[j] = 33;
|
||||
set_ctx(buf, 33, t, l);
|
||||
daedalus_h264_pred_8x8l_dc(&buf[1][1], STRIDE);
|
||||
fail |= check_uniform(buf, "DC (mode 2, uniform)", 33);
|
||||
}
|
||||
|
||||
    /* Mode 0 Vertical with NON-uniform top: gradient t[c] = c for c = 0..15,
     * tl = 0.  Vertical predicts from the 1-2-1 filtered top row:
     *   filt[top c], c = 1..14 = (t[c-1] + 2*t[c] + t[c+1] + 2) >> 2
     *                          = ((c-1) + 2c + (c+1) + 2) >> 2
     *                          = (4c + 2) >> 2 = c
     *     (4c is a multiple of 4 and the +2 rounding term is below 4, so
     *      the shift discards it).
     *   filt[top 0]  = (tl + 2*t[0] + t[1] + 2) >> 2 = (0 + 0 + 1 + 2) >> 2 = 0
     *                  (tl=0, t[0]=0, t[1]=1)
     *   filt[top 15] = (t[14] + 3*t[15] + 2) >> 2 = (14 + 45 + 2) >> 2
     *                = 61 >> 2 = 15
     *   The filtered top-left sample, (t[0] + 2*tl + l[0] + 2) >> 2, is not
     *   exercised directly by Vertical mode.
     *
     * So Vertical output col 0 = filt[top 0] = 0, col 1 = filt[top 1] = 1,
     * ..., col 7 = filt[top 7] = 7.  Same for all 8 rows. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[8] = {0};
|
||||
for (int i = 0; i < 16; i++) t[i] = i;
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_8x8l_vertical(&buf[1][1], STRIDE);
|
||||
int diff = 0;
|
||||
for (int r = 0; r < 8; r++)
|
||||
for (int c = 0; c < 8; c++)
|
||||
if (buf[1+r][1+c] != c) diff++;
|
||||
if (diff == 0) printf(" %-30s PASS (filtered gradient)\n", "Vertical (mode 0, gradient)");
|
||||
else printf(" %-30s FAIL (%d/64 wrong)\n", "Vertical (mode 0, gradient)", diff);
|
||||
fail |= (diff == 0) ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Mode 1 Horizontal gradient: left = 0..7. Filtered left:
|
||||
* filt[left 0] = (tl + 2*l[0] + l[1] + 2) >> 2 = (0 + 0 + 1 + 2) >> 2 = 0
|
||||
* filt[left j] for j=1..6 = (l[j-1] + 2*l[j] + l[j+1] + 2) >> 2 = j
|
||||
* (same arithmetic as top)
|
||||
* filt[left 7] = (l[6] + 3*l[7] + 2) >> 2 = (6 + 21 + 2) >> 2 = 7
|
||||
* So Horizontal output row 0 = 0, row 7 = 7. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16] = {0}, l[8];
|
||||
for (int j = 0; j < 8; j++) l[j] = j;
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_8x8l_horizontal(&buf[1][1], STRIDE);
|
||||
int diff = 0;
|
||||
for (int r = 0; r < 8; r++)
|
||||
for (int c = 0; c < 8; c++)
|
||||
if (buf[1+r][1+c] != r) diff++;
|
||||
if (diff == 0) printf(" %-30s PASS (filtered gradient)\n", "Horizontal (mode 1, gradient)");
|
||||
else printf(" %-30s FAIL (%d/64 wrong)\n", "Horizontal (mode 1, gradient)", diff);
|
||||
fail |= (diff == 0) ? 0 : 1;
|
||||
}
|
||||
|
||||
/* Directional modes — uniform-context sanity tests. With all
|
||||
* neighbours = N, the 1-2-1 filter produces uniform N, and any
|
||||
* 3-tap / 2-tap on uniform N produces N. So every directional
|
||||
* mode should output uniform N on uniform input. */
|
||||
{
|
||||
typedef void (*pred_fn_t)(uint8_t *dst, ptrdiff_t stride);
|
||||
struct { const char *name; pred_fn_t fn; } modes[] = {
|
||||
{ "DDL (mode 3, uniform)", daedalus_h264_pred_8x8l_ddl },
|
||||
{ "DDR (mode 4, uniform)", daedalus_h264_pred_8x8l_ddr },
|
||||
{ "VR (mode 5, uniform)", daedalus_h264_pred_8x8l_vr },
|
||||
{ "HD (mode 6, uniform)", daedalus_h264_pred_8x8l_hd },
|
||||
{ "VL (mode 7, uniform)", daedalus_h264_pred_8x8l_vl },
|
||||
{ "HU (mode 8, uniform)", daedalus_h264_pred_8x8l_hu },
|
||||
};
|
||||
for (size_t i = 0; i < sizeof(modes)/sizeof(modes[0]); i++) {
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[16], l[8];
|
||||
for (int k = 0; k < 16; k++) t[k] = 120;
|
||||
for (int k = 0; k < 8; k++) l[k] = 120;
|
||||
set_ctx(buf, 120, t, l);
|
||||
modes[i].fn(&buf[1][1], STRIDE);
|
||||
fail |= check_uniform(buf, modes[i].name, 120);
|
||||
}
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL Intra_8x8 luma PASS (9 modes — V, H, DC, DDL, DDR, VR, HD, VL, HU)\n");
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Tests the 4 H.264 Intra_8x8 chroma prediction modes against
|
||||
* spec-derived expected patterns. Same buffer layout idea as the
|
||||
* other intra tests: a buffer that holds the 8x8 output + 1-pixel
|
||||
* top/left context + 1-pixel top-left corner.
|
||||
*
|
||||
* row 0: [tl][t0..t7]
|
||||
* row 1: [l0][output row 0]
|
||||
* ...
|
||||
* row 8: [l7][output row 7]
|
||||
*
|
||||
* Dimensions: 9 rows × 9 cols. dst (passed to pred fns) = &buf[1][1].
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Chroma 8x8 intra predictors under test.  No definition is visible in
 * this file, so they are presumably provided by another translation unit
 * at link time.
 *
 * dst    - top-left pixel of the 8x8 output region (the tests below always
 *          pass &buf[1][1])
 * stride - distance between rows of dst, in uint8_t elements (the tests
 *          below always pass STRIDE)
 *
 * NOTE(review): the predictors receive no explicit neighbour arguments, so
 * they presumably read the top/left/top-left context at negative offsets
 * from dst -- confirm against the implementation.
 */
void daedalus_h264_pred_chroma8x8_dc(uint8_t *dst, ptrdiff_t stride);
void daedalus_h264_pred_chroma8x8_horizontal(uint8_t *dst, ptrdiff_t stride);
void daedalus_h264_pred_chroma8x8_vertical(uint8_t *dst, ptrdiff_t stride);
void daedalus_h264_pred_chroma8x8_plane(uint8_t *dst, ptrdiff_t stride);
|
||||
|
||||
/* Scratch-buffer geometry: 8 output rows/columns plus one context row/column. */
#define STRIDE 9
#define ROWS 9

/*
 * Initialise a scratch buffer for one prediction call.
 *
 * buf - ROWS x STRIDE buffer to set up
 * tl  - value stored in the top-left corner, buf[0][0]
 * t   - eight top-neighbour values, stored in buf[0][1..8]
 * l   - eight left-neighbour values, stored in buf[1..8][0]
 *
 * Every other byte (the 8x8 output region) is set to 0xff.  All values are
 * truncated to uint8_t on store.
 */
static void set_ctx(uint8_t buf[ROWS][STRIDE], int tl,
                    const int t[8], const int l[8])
{
    /* Fill the whole buffer with 0xff before writing the context pixels. */
    memset(buf, 0xff, (size_t) ROWS * STRIDE);

    buf[0][0] = (uint8_t) tl;
    for (int i = 0; i < 8; i++) {
        buf[0][i + 1] = (uint8_t) t[i];   /* top row    */
        buf[i + 1][0] = (uint8_t) l[i];   /* left column */
    }
}
|
||||
|
||||
/*
 * Compare the 8x8 output region of buf with expect and print a one-line
 * PASS/FAIL report to stdout.
 *
 * buf    - scratch buffer; output pixel (r, c) is read from buf[1 + r][1 + c]
 * name   - label printed at the start of the report line
 * expect - expected value of every output pixel
 *
 * Returns 0 when all 64 pixels match and 1 otherwise.  A FAIL line carries
 * the number of mismatching pixels and the position/values of the first
 * mismatch in row-major order.
 */
static int check_per_cell(const uint8_t buf[ROWS][STRIDE], const char *name,
                          const uint8_t expect[8][8])
{
    int mismatches = 0;
    int first_r = 0, first_c = 0;
    /* unsigned so the arguments agree with the %u conversions below. */
    unsigned first_got = 0, first_exp = 0;

    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            const uint8_t got = buf[1 + r][1 + c];
            const uint8_t exp = expect[r][c];

            if (got == exp)
                continue;

            /* Remember only the first mismatch; keep counting the rest. */
            if (mismatches == 0) {
                first_r = r;
                first_c = c;
                first_got = got;
                first_exp = exp;
            }
            mismatches++;
        }
    }

    if (mismatches != 0) {
        printf(" %-30s FAIL (%d/64 wrong, first r=%d c=%d got=%u exp=%u)\n",
               name, mismatches, first_r, first_c, first_got, first_exp);
        return 1;
    }

    printf(" %-30s PASS\n", name);
    return 0;
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int fail = 0;
|
||||
|
||||
/* --- Mode 1 Horizontal --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = {0}, l[8] = {10, 20, 30, 40, 50, 60, 70, 80};
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_horizontal(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) l[r];
|
||||
fail |= check_per_cell(buf, "Horizontal (mode 1)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 2 Vertical --- */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = {15, 25, 35, 45, 55, 65, 75, 85}, l[8] = {0};
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_vertical(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) t[c];
|
||||
fail |= check_per_cell(buf, "Vertical (mode 2)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 0 DC: per-quadrant. Test with distinct halves so any
|
||||
* quadrant mix-up surfaces immediately.
|
||||
*
|
||||
* top[0..3] = 4 × 8 → sum_top_lo = 32
|
||||
* top[4..7] = 4 × 16 → sum_top_hi = 64
|
||||
* left[0..3] = 4 × 24 → sum_left_lo = 96
|
||||
* left[4..7] = 4 × 40 → sum_left_hi = 160
|
||||
*
|
||||
* dc00 = (32 + 96 + 4) >> 3 = 132/8 = 16
|
||||
* dc01 = (64 + 2) >> 2 = 66/4 = 16
|
||||
* dc10 = ( 160 + 2) >> 2 = 162/4 = 40
|
||||
* dc11 = (64 + 160 + 4) >> 3 = 228/8 = 28
|
||||
*/
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8] = { 8, 8, 8, 8, 16, 16, 16, 16 };
|
||||
int l[8] = { 24, 24, 24, 24, 40, 40, 40, 40 };
|
||||
set_ctx(buf, 99, t, l);
|
||||
daedalus_h264_pred_chroma8x8_dc(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8] = {
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{16,16,16,16, 16,16,16,16},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
{40,40,40,40, 28,28,28,28},
|
||||
};
|
||||
fail |= check_per_cell(buf, "DC quadrants (mode 0)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane (uniform): H = V = 0; a = 16 * (100 + 100) = 3200.
|
||||
* pred[y][x] = (3200 + 0 + 0 + 16) >> 5 = 3216 >> 5 = 100. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8], l[8];
|
||||
for (int i = 0; i < 8; i++) { t[i] = 100; l[i] = 100; }
|
||||
set_ctx(buf, 100, t, l);
|
||||
daedalus_h264_pred_chroma8x8_plane(&buf[1][1], STRIDE);
|
||||
uint8_t exp[8][8];
|
||||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = 100;
|
||||
fail |= check_per_cell(buf, "Plane uniform (mode 3)", exp);
|
||||
}
|
||||
|
||||
/* --- Mode 3 Plane gradient sanity ---
|
||||
* t = 0..7, l = 0..7, tl = 0.
|
||||
* H = 1*(t[4]-t[2]) + 2*(t[5]-t[1]) + 3*(t[6]-t[0]) + 4*(t[7]-tl)
|
||||
* = 1*(4-2) + 2*(5-1) + 3*(6-0) + 4*(7-0)
|
||||
* = 2 + 8 + 18 + 28 = 56
|
||||
* V = same shape on left = 56
|
||||
* b = (34*56 + 32) >> 6 = 1936 >> 6 = 30
|
||||
* c = 30
|
||||
* a = 16 * (l[7] + t[7]) = 16 * (7 + 7) = 224
|
||||
*
|
||||
* pred[0][0] = (224 + 30*(-3) + 30*(-3) + 16) >> 5
|
||||
* = (224 - 90 - 90 + 16) >> 5
|
||||
* = 60 >> 5 = 1
|
||||
* pred[7][7] = (224 + 30*4 + 30*4 + 16) >> 5
|
||||
* = (224 + 120 + 120 + 16) >> 5
|
||||
* = 480 >> 5 = 15
|
||||
* Spot-check those two corners. */
|
||||
{
|
||||
uint8_t buf[ROWS][STRIDE];
|
||||
int t[8], l[8];
|
||||
for (int i = 0; i < 8; i++) { t[i] = i; l[i] = i; }
|
||||
set_ctx(buf, 0, t, l);
|
||||
daedalus_h264_pred_chroma8x8_plane(&buf[1][1], STRIDE);
|
||||
uint8_t tl_actual = buf[1 + 0][1 + 0];
|
||||
uint8_t br_actual = buf[1 + 7][1 + 7];
|
||||
int spot_fail = 0;
|
||||
if (tl_actual != 1) { fprintf(stderr, "Plane gradient pred[0][0] = %u, expected 1\n", tl_actual); spot_fail = 1; }
|
||||
if (br_actual != 15) { fprintf(stderr, "Plane gradient pred[7][7] = %u, expected 15\n", br_actual); spot_fail = 1; }
|
||||
if (!spot_fail) printf(" %-30s PASS (corners 1, 15)\n", "Plane gradient (mode 3)");
|
||||
else printf(" %-30s FAIL\n", "Plane gradient (mode 3)");
|
||||
fail |= spot_fail;
|
||||
}
|
||||
|
||||
if (fail == 0) printf("\nALL Intra_8x8 chroma mode references PASS\n");
|
||||
else fprintf(stderr, "\n%d test(s) FAILED\n", fail);
|
||||
return fail ? 1 : 0;
|
||||
}
|
||||
Reference in New Issue
Block a user