h264: V3D shaders for all 15 avg_ qpel positions — qpel QPU complete
Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except that the final write does an L2 (two-input rounded) average with the existing dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via Python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift), the L2 step only touches dst. Slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03) but at negligible cost. Eliminates 15 wrappers + 15 src_max bound calculations that would otherwise be duplicated.

CMake foreach loops compile + install 15 new SPV files. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                 Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma         ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma         ✓ cycle 7
    Chroma DC Hadamard    CPU only (4 adds + 4 subs; not worth)
    Deblock luma_v        ✓ cycle 8
    Deblock luma_h        ✓ PR #28
    Deblock chroma_v/h    ✓ PR #29
    Deblock *_intra       CPU only (less common, structurally different)
    qpel put_ 15 pos      ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos      ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
This commit is contained in:
+33
-1
@@ -389,7 +389,24 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
set(H264_QPEL_${_mc}_SPV ${_spv})
|
||||
endforeach()
|
||||
|
||||
add_custom_target(daedalus_shaders ALL DEPENDS ${NOOP_SPV} ${IDCT8_SPV} ${LPF_SPV} ${MC_SPV} ${LPF8_SPV} ${CDEF_SPV} ${H264DEBLOCK_SPV} ${H264DEBLOCK_H_SPV} ${H264DEBLOCK_CHROMA_V_SPV} ${H264DEBLOCK_CHROMA_H_SPV} ${H264_IDCT4_SPV} ${H264_IDCT8_SPV} ${H264_QPEL_MC20_SPV} ${H264_QPEL_MC02_SPV} ${H264_QPEL_MC22_SPV} ${H264_QPEL_mc10_SPV} ${H264_QPEL_mc30_SPV} ${H264_QPEL_mc01_SPV} ${H264_QPEL_mc03_SPV} ${H264_QPEL_mc11_SPV} ${H264_QPEL_mc12_SPV} ${H264_QPEL_mc13_SPV} ${H264_QPEL_mc21_SPV} ${H264_QPEL_mc23_SPV} ${H264_QPEL_mc31_SPV} ${H264_QPEL_mc32_SPV} ${H264_QPEL_mc33_SPV})
|
||||
# Biprediction (avg_) shader variants: one SPIR-V module per qpel position,
# compiled from src/v3d_h264_qpel_avg_<pos>.comp. Each is the put_ shader
# plus one extra rounded average (L2) with the existing dst.
# All 15 useful positions are built.
foreach(_mc IN ITEMS
        mc20 mc02 mc22
        mc10 mc30 mc01 mc03
        mc11 mc12 mc13 mc21 mc23 mc31 mc32 mc33)
  # Compiled module lands in the top-level binary dir.
  set(_spv ${CMAKE_BINARY_DIR}/v3d_h264_qpel_avg_${_mc}.spv)

  add_custom_command(
    OUTPUT  ${_spv}
    COMMAND ${GLSLANG_VALIDATOR}
            -V --target-env vulkan1.3
            -o ${_spv}
            ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp
    DEPENDS ${CMAKE_SOURCE_DIR}/src/v3d_h264_qpel_avg_${_mc}.comp
    COMMENT "glslang: v3d_h264_qpel_avg_${_mc}.comp -> .spv"
    VERBATIM
  )

  # Per-position handle (e.g. H264_QPEL_avg_mc20_SPV) referenced by the
  # daedalus_shaders target and the shader install rule.
  set(H264_QPEL_avg_${_mc}_SPV ${_spv})
endforeach()
|
||||
|
||||
# Aggregate target: building it (part of ALL) forces every listed SPIR-V
# module to be compiled. One dependency per line so additions diff cleanly.
add_custom_target(daedalus_shaders ALL
  DEPENDS
    ${NOOP_SPV}
    ${IDCT8_SPV}
    ${LPF_SPV}
    ${MC_SPV}
    ${LPF8_SPV}
    ${CDEF_SPV}
    ${H264DEBLOCK_SPV}
    ${H264DEBLOCK_H_SPV}
    ${H264DEBLOCK_CHROMA_V_SPV}
    ${H264DEBLOCK_CHROMA_H_SPV}
    ${H264_IDCT4_SPV}
    ${H264_IDCT8_SPV}
    # qpel put_ (15 positions)
    ${H264_QPEL_MC20_SPV}
    ${H264_QPEL_MC02_SPV}
    ${H264_QPEL_MC22_SPV}
    ${H264_QPEL_mc10_SPV}
    ${H264_QPEL_mc30_SPV}
    ${H264_QPEL_mc01_SPV}
    ${H264_QPEL_mc03_SPV}
    ${H264_QPEL_mc11_SPV}
    ${H264_QPEL_mc12_SPV}
    ${H264_QPEL_mc13_SPV}
    ${H264_QPEL_mc21_SPV}
    ${H264_QPEL_mc23_SPV}
    ${H264_QPEL_mc31_SPV}
    ${H264_QPEL_mc32_SPV}
    ${H264_QPEL_mc33_SPV}
    # qpel avg_ (15 positions)
    ${H264_QPEL_avg_mc20_SPV}
    ${H264_QPEL_avg_mc02_SPV}
    ${H264_QPEL_avg_mc22_SPV}
    ${H264_QPEL_avg_mc10_SPV}
    ${H264_QPEL_avg_mc30_SPV}
    ${H264_QPEL_avg_mc01_SPV}
    ${H264_QPEL_avg_mc03_SPV}
    ${H264_QPEL_avg_mc11_SPV}
    ${H264_QPEL_avg_mc12_SPV}
    ${H264_QPEL_avg_mc13_SPV}
    ${H264_QPEL_avg_mc21_SPV}
    ${H264_QPEL_avg_mc23_SPV}
    ${H264_QPEL_avg_mc31_SPV}
    ${H264_QPEL_avg_mc32_SPV}
    ${H264_QPEL_avg_mc33_SPV}
)
|
||||
|
||||
# v3d_runner — reusable Vulkan plumbing.
|
||||
add_library(v3d_runner STATIC src/v3d_runner.c)
|
||||
@@ -542,6 +559,21 @@ if (DAEDALUS_BUILD_VULKAN)
|
||||
${H264_QPEL_mc31_SPV}
|
||||
${H264_QPEL_mc32_SPV}
|
||||
${H264_QPEL_mc33_SPV}
|
||||
${H264_QPEL_avg_mc20_SPV}
|
||||
${H264_QPEL_avg_mc02_SPV}
|
||||
${H264_QPEL_avg_mc22_SPV}
|
||||
${H264_QPEL_avg_mc10_SPV}
|
||||
${H264_QPEL_avg_mc30_SPV}
|
||||
${H264_QPEL_avg_mc01_SPV}
|
||||
${H264_QPEL_avg_mc03_SPV}
|
||||
${H264_QPEL_avg_mc11_SPV}
|
||||
${H264_QPEL_avg_mc12_SPV}
|
||||
${H264_QPEL_avg_mc13_SPV}
|
||||
${H264_QPEL_avg_mc21_SPV}
|
||||
${H264_QPEL_avg_mc23_SPV}
|
||||
${H264_QPEL_avg_mc31_SPV}
|
||||
${H264_QPEL_avg_mc32_SPV}
|
||||
${H264_QPEL_avg_mc33_SPV}
|
||||
DESTINATION ${CMAKE_INSTALL_DATADIR}/daedalus-fourier/shaders
|
||||
)
|
||||
endif()
|
||||
|
||||
+86
-30
@@ -72,6 +72,22 @@ struct daedalus_ctx {
|
||||
int h264_qpel_mc31_pipe_ready; v3d_pipeline h264_qpel_mc31_pipe;
|
||||
int h264_qpel_mc32_pipe_ready; v3d_pipeline h264_qpel_mc32_pipe;
|
||||
int h264_qpel_mc33_pipe_ready; v3d_pipeline h264_qpel_mc33_pipe;
|
||||
/* avg_ biprediction pipelines — same shaders + L2 with existing dst. */
|
||||
int h264_qpel_avg_mc20_pipe_ready; v3d_pipeline h264_qpel_avg_mc20_pipe;
|
||||
int h264_qpel_avg_mc02_pipe_ready; v3d_pipeline h264_qpel_avg_mc02_pipe;
|
||||
int h264_qpel_avg_mc22_pipe_ready; v3d_pipeline h264_qpel_avg_mc22_pipe;
|
||||
int h264_qpel_avg_mc10_pipe_ready; v3d_pipeline h264_qpel_avg_mc10_pipe;
|
||||
int h264_qpel_avg_mc30_pipe_ready; v3d_pipeline h264_qpel_avg_mc30_pipe;
|
||||
int h264_qpel_avg_mc01_pipe_ready; v3d_pipeline h264_qpel_avg_mc01_pipe;
|
||||
int h264_qpel_avg_mc03_pipe_ready; v3d_pipeline h264_qpel_avg_mc03_pipe;
|
||||
int h264_qpel_avg_mc11_pipe_ready; v3d_pipeline h264_qpel_avg_mc11_pipe;
|
||||
int h264_qpel_avg_mc12_pipe_ready; v3d_pipeline h264_qpel_avg_mc12_pipe;
|
||||
int h264_qpel_avg_mc13_pipe_ready; v3d_pipeline h264_qpel_avg_mc13_pipe;
|
||||
int h264_qpel_avg_mc21_pipe_ready; v3d_pipeline h264_qpel_avg_mc21_pipe;
|
||||
int h264_qpel_avg_mc23_pipe_ready; v3d_pipeline h264_qpel_avg_mc23_pipe;
|
||||
int h264_qpel_avg_mc31_pipe_ready; v3d_pipeline h264_qpel_avg_mc31_pipe;
|
||||
int h264_qpel_avg_mc32_pipe_ready; v3d_pipeline h264_qpel_avg_mc32_pipe;
|
||||
int h264_qpel_avg_mc33_pipe_ready; v3d_pipeline h264_qpel_avg_mc33_pipe;
|
||||
};
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create(void)
|
||||
@@ -146,6 +162,21 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||
if (ctx->h264_qpel_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc31_pipe);
|
||||
if (ctx->h264_qpel_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc32_pipe);
|
||||
if (ctx->h264_qpel_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc33_pipe);
|
||||
if (ctx->h264_qpel_avg_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc20_pipe);
|
||||
if (ctx->h264_qpel_avg_mc02_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc02_pipe);
|
||||
if (ctx->h264_qpel_avg_mc22_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc22_pipe);
|
||||
if (ctx->h264_qpel_avg_mc10_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc10_pipe);
|
||||
if (ctx->h264_qpel_avg_mc30_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc30_pipe);
|
||||
if (ctx->h264_qpel_avg_mc01_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc01_pipe);
|
||||
if (ctx->h264_qpel_avg_mc03_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc03_pipe);
|
||||
if (ctx->h264_qpel_avg_mc11_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc11_pipe);
|
||||
if (ctx->h264_qpel_avg_mc12_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc12_pipe);
|
||||
if (ctx->h264_qpel_avg_mc13_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc13_pipe);
|
||||
if (ctx->h264_qpel_avg_mc21_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc21_pipe);
|
||||
if (ctx->h264_qpel_avg_mc23_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc23_pipe);
|
||||
if (ctx->h264_qpel_avg_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc31_pipe);
|
||||
if (ctx->h264_qpel_avg_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc32_pipe);
|
||||
if (ctx->h264_qpel_avg_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc33_pipe);
|
||||
v3d_runner_destroy(ctx->runner);
|
||||
}
|
||||
free(ctx);
|
||||
@@ -195,21 +226,21 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc31.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc32.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc33.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonals avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc20.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc02.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc22.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc10.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc30.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc01.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc03.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc11.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc12.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc13.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc21.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc23.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc31.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc32.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc33.spv */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -1837,6 +1868,26 @@ DEFINE_QPEL_DIAG_QPU(mc31)
|
||||
DEFINE_QPEL_DIAG_QPU(mc32)
|
||||
DEFINE_QPEL_DIAG_QPU(mc33)
|
||||
|
||||
/* avg_ variants — same diag-style envelope (10*stride+11 covers any
|
||||
* (r±1, c±1) offset the avg_ shaders use), different SPV file.
|
||||
* Slightly over-allocates for avg_mc20/02/10/30/01/03 (which need
|
||||
* less src context) but the cost is negligible. */
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc20)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc02)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc22)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc10)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc30)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc01)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc03)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc11)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc12)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc13)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc21)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc23)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc31)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc32)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc33)
|
||||
|
||||
#undef DEFINE_QPEL_DIAG_QPU
|
||||
|
||||
/* -------------------- Public dispatch entry points -------------- */
|
||||
@@ -2142,22 +2193,27 @@ DEFINE_QPEL_DIAG_PUBLIC(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
|
||||
|
||||
/* avg_ biprediction dispatchers (15 positions) — same macro, the
|
||||
* underlying _qpu dispatch fns also reuse the diag QPU helper since
|
||||
* the avg_ shaders share the put_ src envelope (the L2 step only
|
||||
* touches dst). */
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||||
#undef DEFINE_QPEL_DIAG_PUBLIC
|
||||
DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||||
|
||||
#undef DEFINE_QPEL_DISPATCH
|
||||
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc01 (biprediction) (8x8, ¼-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "d" position:
|
||||
//
|
||||
// mc01[r,c] = ((clip255(mc02(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// Sibling of v3d_h264_qpel_mc02.comp with L2 step against src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc01_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point for avg_mc01. Each workgroup handles one 8x8 block and each of
// its 64 invocations produces one output byte:
//   1. vertical 6-tap half-pel filter over src rows r-2..r+3, rounded and
//      clipped to 0..255;
//   2. rounded average of that with the integer sample src[r, c] (mc01 value);
//   3. rounded average of the mc01 value with the byte already in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> pixel coordinates inside the block (8 per row).
    uint px  = gl_LocalInvocationID.x;
    uint row = px >> 3;
    uint col = px & 7u;

    // Per-block metadata: .x = dst byte offset, .y = src byte offset.
    uvec4 m      = u_meta.meta[blk];
    uint  pitch  = pc.stride_u8;
    uint  column = m.y + col;

    // (row - 2u) / (row - 1u) wrap for the first rows; the final index is
    // still correct modulo 2^32 as long as the src offset leaves two rows of
    // context above the block (assumed to be guaranteed by the caller —
    // TODO confirm).
    int center = int(u_src.src[column + row * pitch]);
    int below1 = int(u_src.src[column + (row + 1u) * pitch]);
    int outer  = int(u_src.src[column + (row - 2u) * pitch])
               + int(u_src.src[column + (row + 3u) * pitch]);
    int mid    = int(u_src.src[column + (row - 1u) * pitch])
               + int(u_src.src[column + (row + 2u) * pitch]);

    // 6-tap kernel (1, -5, 20, 20, -5, 1), +16 rounding, >>5, clip to u8.
    int half_pel = clamp((outer - 5 * mid + 20 * (center + below1) + 16) >> 5, 0, 255);

    // Quarter-pel: rounded average with src[r, c].
    int quarter = (half_pel + center + 1) >> 1;

    // Biprediction fold: rounded average with what dst already holds.
    uint out_idx  = m.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + quarter + 1) >> 1);
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc02 (biprediction) (8x8, vertical half-pel), V3D 7.1.
|
||||
//
|
||||
// Sibling of cycle 9's v3d_h264_qpel_mc20.comp. Same 6-tap filter,
|
||||
// transposed to vertical direction:
|
||||
//
|
||||
// mc02[r,c] = clip255(
|
||||
// ( s[r-2,c]
|
||||
// - 5 * s[r-1,c]
|
||||
// + 20 * s[r, c]
|
||||
// + 20 * s[r+1,c]
|
||||
// - 5 * s[r+2,c]
|
||||
// + s[r+3,c]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// src+src_off points at row 0 col 0 of the OUTPUT block; the filter
|
||||
// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
|
||||
//
|
||||
// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc02_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Entry point for avg_mc02. Each workgroup handles one 8x8 block and each of
// its 64 invocations produces one output byte: the vertical 6-tap half-pel
// value (rounded, clipped to 0..255) is rounded-averaged with the byte that
// is already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> pixel coordinates inside the block (8 per row).
    uint px  = gl_LocalInvocationID.x;
    uint row = px >> 3;
    uint col = px & 7u;

    // Per-block metadata: .x = dst byte offset, .y = src byte offset.
    uvec4 m     = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    // The src offset addresses row 0 / col 0 of the OUTPUT block; the filter
    // samples rows row-2..row+3 of this column. The unsigned subtraction is
    // safe because the public API contract guarantees src_off >= 2*stride.
    uint column = m.y + col;

    int outer = int(u_src.src[column + (row - 2u) * pitch])
              + int(u_src.src[column + (row + 3u) * pitch]);
    int mid   = int(u_src.src[column + (row - 1u) * pitch])
              + int(u_src.src[column + (row + 2u) * pitch]);
    int inner = int(u_src.src[column + row * pitch])
              + int(u_src.src[column + (row + 1u) * pitch]);

    // 6-tap kernel (1, -5, 20, 20, -5, 1), +16 rounding, >>5, clip to u8.
    int half_pel = clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);

    // Biprediction fold: rounded average with what dst already holds.
    uint out_idx  = m.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + half_pel + 1) >> 1);
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc03 (biprediction) (8x8, ¾-pel vertical),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "n" position:
|
||||
//
|
||||
// mc03[r,c] = ((clip255(mc02(s)[r,c]) + s[r+1, c] + 1) >> 1)
|
||||
//
|
||||
// Same as mc01 but L2-averages with src[r+1, c] instead of src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc03_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point for avg_mc03. Each workgroup handles one 8x8 block and each of
// its 64 invocations produces one output byte:
//   1. vertical 6-tap half-pel filter over src rows r-2..r+3, rounded and
//      clipped to 0..255;
//   2. rounded average of that with the integer sample src[r+1, c]
//      (mc03 value);
//   3. rounded average of the mc03 value with the byte already in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> pixel coordinates inside the block (8 per row).
    uint px  = gl_LocalInvocationID.x;
    uint row = px >> 3;
    uint col = px & 7u;

    // Per-block metadata: .x = dst byte offset, .y = src byte offset.
    uvec4 m      = u_meta.meta[blk];
    uint  pitch  = pc.stride_u8;
    uint  column = m.y + col;

    // (row - 2u) / (row - 1u) wrap for the first rows; the final index is
    // still correct modulo 2^32 as long as the src offset leaves two rows of
    // context above the block (assumed to be guaranteed by the caller —
    // TODO confirm).
    int center = int(u_src.src[column + row * pitch]);
    int below1 = int(u_src.src[column + (row + 1u) * pitch]);
    int outer  = int(u_src.src[column + (row - 2u) * pitch])
               + int(u_src.src[column + (row + 3u) * pitch]);
    int mid    = int(u_src.src[column + (row - 1u) * pitch])
               + int(u_src.src[column + (row + 2u) * pitch]);

    // 6-tap kernel (1, -5, 20, 20, -5, 1), +16 rounding, >>5, clip to u8.
    int half_pel = clamp((outer - 5 * mid + 20 * (center + below1) + 16) >> 5, 0, 255);

    // Three-quarter-pel: rounded average with src[r+1, c].
    int quarter = (half_pel + below1 + 1) >> 1;

    // Biprediction fold: rounded average with what dst already holds.
    uint out_idx  = m.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + quarter + 1) >> 1);
}
|
||||
@@ -0,0 +1,55 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc10 (biprediction) (8x8, ¼-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "a" position:
|
||||
//
|
||||
// mc10[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c] + 1) >> 1)
|
||||
//
|
||||
// = horizontal half-pel filter, clipped to u8, then L2 rounded-averaged
|
||||
// with the integer source pixel at the SAME position. Sibling of
|
||||
// v3d_h264_qpel_mc20.comp with the L2 step added at the tail.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc10_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point for avg_mc10. Each workgroup handles one 8x8 block and each of
// its 64 invocations produces one output byte:
//   1. horizontal 6-tap half-pel filter over src columns c-2..c+3, rounded
//      and clipped to 0..255;
//   2. rounded average of that with the integer sample src[r, c]
//      (mc10 value);
//   3. rounded average of the mc10 value with the byte already in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> pixel coordinates inside the block (8 per row).
    uint px  = gl_LocalInvocationID.x;
    uint row = px >> 3;
    uint col = px & 7u;

    // Per-block metadata: .x = dst byte offset, .y = src byte offset.
    uvec4 m     = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    // Index of the integer sample at this lane's (row, col) in src.
    uint here = m.y + row * pitch + col;

    int center = int(u_src.src[here]);
    int outer  = int(u_src.src[here - 2u]) + int(u_src.src[here + 3u]);
    int mid    = int(u_src.src[here - 1u]) + int(u_src.src[here + 2u]);
    int inner  = center + int(u_src.src[here + 1u]);

    // 6-tap kernel (1, -5, 20, 20, -5, 1), +16 rounding, >>5, clip to u8.
    int half_pel = clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);

    // Quarter-pel: rounded average with the integer sample at the SAME
    // (row, col) position.
    int quarter = (half_pel + center + 1) >> 1;

    // Biprediction fold: rounded average with what dst already holds.
    uint out_idx  = m.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + quarter + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc11 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc11[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc11_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel value at (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over src columns c-2..c+3 of row r, with +16 rounding, >>5 and a clip to
// 0..255. src_off is the byte offset of the block origin, stride the row
// pitch in bytes.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    uint here = src_off + r * stride + c;

    // The kernel is symmetric, so sum the taps in mirrored pairs.
    int outer = int(u_src.src[here - 2u]) + int(u_src.src[here + 3u]);
    int mid   = int(u_src.src[here - 1u]) + int(u_src.src[here + 2u]);
    int inner = int(u_src.src[here])      + int(u_src.src[here + 1u]);

    return clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel value at (r, c): 6-tap kernel (1, -5, 20, 20, -5, 1)
// over src rows r-2..r+3 of column c, with +16 rounding, >>5 and a clip to
// 0..255. src_off is the byte offset of the block origin, stride the row
// pitch in bytes.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint column = src_off + c;

    // (r - 2u) / (r - 1u) wrap for r < 2; the resulting index is still
    // correct modulo 2^32 provided src_off leaves two rows of context above
    // the block (assumed to be guaranteed by the caller — TODO confirm).
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int mid   = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int inner = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);

    return clamp((outer - 5 * mid + 20 * inner + 16) >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c: kernel
// (1, -5, 20, 20, -5, 1) over src columns c-2..c+3 with NO rounding, shift
// or clip. It is the intermediate fed to the vertical pass of hpel_hv.
// NOTE(review): main() in this shader calls only hpel_h and hpel_v, so this
// helper looks unused here — confirm before removing.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint here = src_off + rr * stride + c;

    int outer = int(u_src.src[here - 2u]) + int(u_src.src[here + 3u]);
    int mid   = int(u_src.src[here - 1u]) + int(u_src.src[here + 2u]);
    int inner = int(u_src.src[here])      + int(u_src.src[here + 1u]);

    return outer - 5 * mid + 20 * inner;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-pel value at (r, c): applies the 6-tap
// kernel (1, -5, 20, 20, -5, 1) vertically to the unrounded horizontal sums
// of rows r-2..r+3, then rounds (+512), shifts (>>10) and clips to 0..255.
// NOTE(review): main() in this shader calls only hpel_h and hpel_v, so this
// helper looks unused here — confirm before removing.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // Mirrored row pairs of the symmetric kernel.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int mid   = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int inner = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);

    return clamp((outer - 5 * mid + 20 * inner + 512) >> 10, 0, 255);
}
|
||||
|
||||
// Entry point for avg_mc11. Each workgroup handles one 8x8 block and each of
// its 64 invocations produces one output byte: the horizontal and vertical
// half-pel values at (r, c) are rounded-averaged into the mc11 value, which
// is then rounded-averaged with the byte already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index -> pixel coordinates inside the block (8 per row).
    uint px  = gl_LocalInvocationID.x;
    uint row = px >> 3;
    uint col = px & 7u;

    // Per-block metadata: .x = dst byte offset, .y = src byte offset.
    uvec4 m     = u_meta.meta[blk];
    uint  pitch = pc.stride_u8;

    // Diagonal quarter-pel = rounded average of the two half-pel anchors.
    int diag = (hpel_h(m.y, pitch, row, col) + hpel_v(m.y, pitch, row, col) + 1) >> 1;

    // Biprediction fold: rounded average with what dst already holds.
    uint out_idx  = m.x + row * pitch + col;
    int  existing = int(u_dst.dst[out_idx]);
    u_dst.dst[out_idx] = uint8_t((existing + diag + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc12 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc12[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc12_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
// NOTE(review): main() in this shader does not call this helper.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding, shift or
// clamp is applied here: hpel_hv feeds this intermediate straight into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c)
{
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// 2D half-pel sample at (r, c): vertical 6-tap filter applied to the
// unscaled horizontal sums of rows r-2..r+3, then a single +512 >> 10
// rounding/scale covering both passes, clamped to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c)
{
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    int acc   = outer - 5 * inner + 20 * core + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. The pixel value is the rounded average of the hpel_hv and
// hpel_v anchors at (row, col), which is then rounded-averaged into the
// value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        int centre = hpel_hv(in0, pitch, row, col);
        int vert   = hpel_v(in0, pitch, row, col);
        int pred   = (centre + vert + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc13 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc13[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc13_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding, shift or
// clamp is applied here: hpel_hv feeds this intermediate straight into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c)
{
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// 2D half-pel sample at (r, c): vertical 6-tap filter applied to the
// unscaled horizontal sums of rows r-2..r+3, then a single +512 >> 10
// rounding/scale covering both passes, clamped to 0..255.
// NOTE(review): main() in this shader does not call this helper.
int hpel_hv(uint src_off, uint stride, uint r, uint c)
{
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    int acc   = outer - 5 * inner + 20 * core + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. The pixel value is the rounded average of hpel_h taken one
// row down (row + 1) and hpel_v at (row, col), which is then
// rounded-averaged into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        int horiz = hpel_h(in0, pitch, row + 1u, col);
        int vert  = hpel_v(in0, pitch, row, col);
        int pred  = (horiz + vert + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc20 (biprediction) (8x8, horizontal half-pel), V3D 7.1.
|
||||
//
|
||||
// H.264 spec §8.4.2.2.1 horizontal 6-tap luma interpolation:
|
||||
//
|
||||
// mc20_value[r,c] = clip255(
|
||||
// ( s[r,c-2]
|
||||
// - 5 * s[r,c-1]
|
||||
// + 20 * s[r,c]
|
||||
// + 20 * s[r,c+1]
|
||||
// - 5 * s[r,c+2]
|
||||
// + s[r,c+3]
|
||||
// + 16
|
||||
// ) >> 5)
|
||||
//
|
||||
// Single-stride: dst and src share `stride` (H264QpelContext
|
||||
// convention). src+src_off already points at the leftmost output
|
||||
// column (col 0); the filter reads cols -2..+3. Caller guarantees
|
||||
// edge-padding context per the public API docstring.
|
||||
//
|
||||
// Workgroup layout: 64 invocations = 1 lane per output pixel.
|
||||
// 1 block per WG; n_blocks WGs total. This is the simplest layout
|
||||
// that avoids any inter-lane communication — each lane independently
|
||||
// reads its 6 src samples and writes its 1 dst sample. V3D's L2
|
||||
// cache handles the redundant reads from adjacent lanes.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc20_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src {
|
||||
uint8_t src[];
|
||||
} u_src;
|
||||
|
||||
layout(binding = 1) buffer Dst {
|
||||
uint8_t dst[];
|
||||
} u_dst;
|
||||
|
||||
layout(binding = 2) readonly buffer Meta {
|
||||
uvec4 meta[]; // .x = dst_off, .y = src_off
|
||||
} u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Entry point. One workgroup per block, 64 invocations covering the 8x8
// output (one pixel each). Computes the horizontal 6-tap half-pel value for
// this pixel and rounded-averages it into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;   // 0..7
        uint col   = px & 7u;   // 0..7
        uint pitch = pc.stride_u8;

        uint out_idx = u_meta.meta[blk].x + row * pitch + col;

        // src_off addresses output column 0 of the block, so the taps reach
        // back to column -2. The unsigned subtraction is only safe because
        // the caller guarantees left context (src_off >= 2).
        uint center = u_meta.meta[blk].y + row * pitch + col;

        int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
        int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
        int core  = int(u_src.src[center + 0u]) + int(u_src.src[center + 1u]);

        int acc  = outer - 5 * inner + 20 * core + 16;
        int half = clamp(acc >> 5, 0, 255);

        int prior = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + half + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc21 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc21[r,c] = avg(mc22(r, c),
|
||||
// mc20(r, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc21_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
// NOTE(review): main() in this shader does not call this helper.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding, shift or
// clamp is applied here: hpel_hv feeds this intermediate straight into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c)
{
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// 2D half-pel sample at (r, c): vertical 6-tap filter applied to the
// unscaled horizontal sums of rows r-2..r+3, then a single +512 >> 10
// rounding/scale covering both passes, clamped to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c)
{
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    int acc   = outer - 5 * inner + 20 * core + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. The pixel value is the rounded average of the hpel_hv and
// hpel_h anchors at (row, col), which is then rounded-averaged into the
// value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        int centre = hpel_hv(in0, pitch, row, col);
        int horiz  = hpel_h(in0, pitch, row, col);
        int pred   = (centre + horiz + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,94 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc22 (biprediction) (8x8, 2D half-pel "j" position).
|
||||
// V3D 7.1.
|
||||
//
|
||||
// Cascaded H+V 6-tap per H.264 §8.4.2.2.1 / FFmpeg ff_put_h264_qpel8_mc22_neon:
|
||||
//
|
||||
// tmp[r,c] = src[r,c-2] - 5*src[r,c-1] + 20*src[r,c] + 20*src[r,c+1]
|
||||
// - 5*src[r,c+2] + src[r,c+3] (int16)
|
||||
//
|
||||
// mc22_value[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
|
||||
// + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
|
||||
// + 512) >> 10)
|
||||
//
|
||||
// The +512 >> 10 final scale compensates for both 6-tap scalings.
|
||||
// CANNOT just cascade mc20→mc02 because intermediate must be int16
|
||||
// (no per-stage clip), so this is a dedicated kernel.
|
||||
//
|
||||
// Per-lane structure: each lane computes its own (r, c) output by
|
||||
// running the FULL cascade — 6 horizontal lowpass int16 values for
|
||||
// rows r-2..r+3, then a vertical lowpass on those. ~50 ALU ops per
|
||||
// lane. No shared memory / barriers needed; V3D L2 absorbs the
|
||||
// redundant src reads across lanes.
|
||||
//
|
||||
// WG layout: 64 lanes / 1 block-per-WG / 1 lane-per-output-pixel
|
||||
// (same as mc20 / mc02).
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc22_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
|
||||
layout(push_constant) uniform PC {
|
||||
uint n_blocks;
|
||||
uint stride_u8;
|
||||
uint _pad0, _pad1;
|
||||
} pc;
|
||||
|
||||
// Unscaled horizontal 6-tap sum for the row starting at row_off, centred on
// column c (reads columns c-2..c+3). The result is deliberately left without
// rounding, shift or clamp: main() applies one +512 >> 10 after the vertical
// pass to cover both filter stages.
int hpel_h(uint row_off, uint c)
{
    uint center = row_off + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. Runs the full cascade for this pixel — six unscaled horizontal
// sums (rows row-2..row+3), then the vertical 6-tap over them — and
// rounded-averages the clamped result into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        // (row - 2u) / (row - 1u) may wrap as uint; the wrapped product still
        // resolves to the intended earlier row once added to in0. This relies
        // on the caller contract that src_off leaves at least two rows of
        // context above the block.
        int outer = hpel_h(in0 + (row - 2u) * pitch, col)
                  + hpel_h(in0 + (row + 3u) * pitch, col);
        int inner = hpel_h(in0 + (row - 1u) * pitch, col)
                  + hpel_h(in0 + (row + 2u) * pitch, col);
        int core  = hpel_h(in0 + row * pitch, col)
                  + hpel_h(in0 + (row + 1u) * pitch, col);

        int acc  = outer - 5 * inner + 20 * core + 512;
        int half = clamp(acc >> 10, 0, 255);

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + half + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc23 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc23[r,c] = avg(mc22(r, c),
|
||||
// mc20(r+1, c))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc23_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
// NOTE(review): main() in this shader does not call this helper.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding, shift or
// clamp is applied here: hpel_hv feeds this intermediate straight into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c)
{
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// 2D half-pel sample at (r, c): vertical 6-tap filter applied to the
// unscaled horizontal sums of rows r-2..r+3, then a single +512 >> 10
// rounding/scale covering both passes, clamped to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c)
{
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    int acc   = outer - 5 * inner + 20 * core + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. The pixel value is the rounded average of hpel_hv at
// (row, col) and hpel_h taken one row down (row + 1), which is then
// rounded-averaged into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        int centre = hpel_hv(in0, pitch, row, col);
        int horiz  = hpel_h(in0, pitch, row + 1u, col);
        int pred   = (centre + horiz + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,52 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc30 (biprediction) (8x8, ¾-pel horizontal),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 "c" position:
|
||||
//
|
||||
// mc30_value[r,c] = ((clip255(mc20(s)[r,c]) + s[r,c+1] + 1) >> 1)
|
||||
//
|
||||
// Same as mc10 but L2-averages with src[r, c+1] instead of src[r, c].
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc30_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. Computes the horizontal half-pel value, rounded-averages it
// with the integer sample one column to the right (src[r, c+1]), then
// rounded-averages that result into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;
        uint center = in0 + row * pitch + col;

        int right = int(u_src.src[center + 1u]);  // reused as the averaging partner
        int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
        int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
        int core  = int(u_src.src[center]) + right;

        int acc  = outer - 5 * inner + 20 * core + 16;
        int half = clamp(acc >> 5, 0, 255);
        int pred = (half + right + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc31 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc31[r,c] = avg(mc20(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc31_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding, shift or
// clamp is applied here: hpel_hv feeds this intermediate straight into its
// vertical pass.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c)
{
    uint center = src_off + rr * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    return outer - 5 * inner + 20 * core;
}
|
||||
|
||||
// 2D half-pel sample at (r, c): vertical 6-tap filter applied to the
// unscaled horizontal sums of rows r-2..r+3, then a single +512 >> 10
// rounding/scale covering both passes, clamped to 0..255.
// NOTE(review): main() in this shader does not call this helper.
int hpel_hv(uint src_off, uint stride, uint r, uint c)
{
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int core  = hpel_hv_row(src_off, stride, r, c)
              + hpel_hv_row(src_off, stride, r + 1u, c);
    int acc   = outer - 5 * inner + 20 * core + 512;
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// Entry point. One workgroup per block, one invocation per pixel of the
// 8x8 output. The pixel value is the rounded average of hpel_h at
// (row, col) and hpel_v taken one column to the right (col + 1), which is
// then rounded-averaged into the value already stored in dst.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk < pc.n_blocks) {  // surplus workgroups have nothing to do
        uint px    = gl_LocalInvocationID.x;
        uint row   = px >> 3;
        uint col   = px & 7u;
        uint out0  = u_meta.meta[blk].x;  // dst offset of this block
        uint in0   = u_meta.meta[blk].y;  // src offset of this block
        uint pitch = pc.stride_u8;

        int horiz = hpel_h(in0, pitch, row, col);
        int vert  = hpel_v(in0, pitch, row, col + 1u);
        int pred  = (horiz + vert + 1) >> 1;

        uint out_idx = out0 + row * pitch + col;
        int  prior   = int(u_dst.dst[out_idx]);
        u_dst.dst[out_idx] = uint8_t((prior + pred + 1) >> 1);
    }
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc32 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc32[r,c] = avg(mc22(r, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc32_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over columns c-2..c+3 of row r, rounded (+16), scaled (>> 5) and clamped
// to 0..255.
// NOTE(review): main() in this shader does not call this helper.
int hpel_h(uint src_off, uint stride, uint r, uint c)
{
    uint center = src_off + r * stride + c;
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int core  = int(u_src.src[center]) + int(u_src.src[center + 1u]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-pel sample at (r, c): 6-tap (1, -5, 20, 20, -5, 1) filter
// over rows r-2..r+3 of column c, rounded (+16), scaled (>> 5) and clamped
// to 0..255. (r - 2u) and (r - 1u) may wrap as uint; the wrapped product
// still lands on the intended earlier row once added to the base offset.
int hpel_v(uint src_off, uint stride, uint r, uint c)
{
    uint column = src_off + c;
    int outer = int(u_src.src[column + (r - 2u) * stride])
              + int(u_src.src[column + (r + 3u) * stride]);
    int inner = int(u_src.src[column + (r - 1u) * stride])
              + int(u_src.src[column + (r + 2u) * stride]);
    int core  = int(u_src.src[column + r * stride])
              + int(u_src.src[column + (r + 1u) * stride]);
    int acc   = outer - 5 * inner + 20 * core + 16;
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding offset,
// shift or clamp is applied: hpel_hv() feeds these sums into its vertical
// pass and normalises once at the end.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    uint center = src_off + rr * stride + c;

    // Mirrored tap pairs of the symmetric (1,-5,20,20,-5,1) kernel.
    int outer = int(u_src.src[center - 2u]) + int(u_src.src[center + 3u]);
    int inner = int(u_src.src[center - 1u]) + int(u_src.src[center + 2u]);
    int mid   = int(u_src.src[center]) + int(u_src.src[center + 1u]);

    return outer - 5 * inner + 20 * mid;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-sample value at (r, c): applies the
// 6-tap kernel vertically to the unclipped horizontal sums of rows
// r-2 .. r+3, then rounds (+512), shifts (>>10) and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    const int taps[6] = int[6](1, -5, 20, 20, -5, 1);

    int acc = 512;
    for (uint k = 0u; k < 6u; ++k) {
        // r + k - 2u wraps for r + k < 2, giving the same row index the
        // expression r - 2u / r - 1u would.
        acc += taps[k] * hpel_hv_row(src_off, stride, r + k - 2u, c);
    }
    return clamp(acc >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc32 entry point: one workgroup per block, one invocation per pixel.
// Computes the mc32 value as the rounded average of hpel_hv at (r, c) and
// hpel_v at (r, c+1), then rounded-averages it with the byte already in
// dst and stores the result back.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index 0..63 -> (row, col) with 8 columns per row.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane / 8u;
    uint c = lane % 8u;

    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;          // shared by src and dst addressing

    int centre = hpel_hv(offs.y, pitch, r, c);
    int right  = hpel_v(offs.y, pitch, r, c + 1u);
    int pred   = (centre + right + 1) >> 1;

    // Fold the new prediction into the value already held in dst.
    uint at  = offs.x + r * pitch + c;
    int held = int(u_dst.dst[at]);
    u_dst.dst[at] = uint8_t((held + pred + 1) >> 1);
}
|
||||
@@ -0,0 +1,96 @@
|
||||
// daedalus-fourier — H.264 luma qpel avg_mc33 (biprediction) (8x8, diagonal quarter-pel),
|
||||
// V3D 7.1. Per H.264 §8.4.2.2.1 (table 8-4) — composes two half-pel
|
||||
// anchors via L2 rounded-average:
|
||||
//
|
||||
// mc33[r,c] = avg(mc20(r+1, c),
|
||||
// mc02(r, c+1))
|
||||
//
|
||||
// Per-lane structure: each lane computes BOTH anchor outputs at its
|
||||
// own (r, c) target offset, then L2 averages. No shared memory.
|
||||
// Same WG geometry as the other qpel shaders.
|
||||
//
|
||||
//
|
||||
// avg_ variant for B-slice biprediction per H.264 §8.4.2.3.1:
|
||||
// dst[r,c] = avg(dst[r,c], mc33_value)
|
||||
// Caller pre-loads dst with the list0 prediction; this shader
|
||||
// folds in the list1 contribution.
|
||||
//
|
||||
// License: BSD-2-Clause.
|
||||
|
||||
#version 450
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
// 64 invocations per workgroup; main() maps each one to a (row, col)
// position via lane >> 3 and lane & 7.
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
// Source samples as bytes; read-only, addressed by absolute byte offset.
layout(binding = 0) readonly buffer Src { uint8_t src[]; } u_src;
|
||||
// Destination bytes. Not write-only: main() reads the existing value
// before storing the averaged result.
layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;
|
||||
// Per-block record; main() reads .x as the dst offset and .y as the src
// offset. .z and .w are not read by this shader.
layout(binding = 2) readonly buffer Meta { uvec4 meta[]; } u_meta;
|
||||
// n_blocks bounds the workgroup index; stride_u8 is the row pitch used for
// both src and dst addressing. _p0/_p1 are not referenced in this shader.
layout(push_constant) uniform PC { uint n_blocks, stride_u8, _p0, _p1; } pc;
|
||||
|
||||
// Horizontal half-sample value at (r, c): 6-tap (1,-5,20,20,-5,1) filter
// over columns c-2 .. c+3 of row r, rounded (+16), shifted (>>5) and
// clamped to 0..255.
int hpel_h(uint src_off, uint stride, uint r, uint c) {
    const int taps[6] = int[6](1, -5, 20, 20, -5, 1);

    // Leftmost tap sits two samples before the target column.
    uint p = src_off + r * stride + c - 2u;

    int acc = 16;
    for (int k = 0; k < 6; ++k) {
        acc += taps[k] * int(u_src.src[p + uint(k)]);
    }
    return clamp(acc >> 5, 0, 255);
}
|
||||
|
||||
// Vertical half-sample value at (r, c): 6-tap (1,-5,20,20,-5,1) filter
// over rows r-2 .. r+3 of column c, rounded (+16), shifted (>>5) and
// clamped to 0..255.
int hpel_v(uint src_off, uint stride, uint r, uint c) {
    uint col = src_off + c;

    // The kernel is symmetric, so rows are summed in mirrored pairs.
    // (r - 2u) and (r - 1u) wrap for small r; the wrapped products still
    // resolve to the rows above once added to the base.
    int outer = int(u_src.src[col + (r - 2u) * stride])
              + int(u_src.src[col + (r + 3u) * stride]);
    int inner = int(u_src.src[col + (r - 1u) * stride])
              + int(u_src.src[col + (r + 2u) * stride]);
    int mid   = int(u_src.src[col + r * stride])
              + int(u_src.src[col + (r + 1u) * stride]);

    int sum = outer - 5 * inner + 20 * mid + 16;
    return clamp(sum >> 5, 0, 255);
}
|
||||
|
||||
// Raw horizontal 6-tap sum for row rr at column c. No rounding offset,
// shift or clamp is applied: hpel_hv() feeds these sums into its vertical
// pass and normalises once at the end.
int hpel_hv_row(uint src_off, uint stride, uint rr, uint c) {
    const int taps[6] = int[6](1, -5, 20, 20, -5, 1);

    // Leftmost tap sits two samples before the target column.
    uint p = src_off + rr * stride + c - 2u;

    int acc = 0;
    for (int k = 0; k < 6; ++k) {
        acc += taps[k] * int(u_src.src[p + uint(k)]);
    }
    return acc;
}
|
||||
|
||||
// Centre (horizontal + vertical) half-sample value at (r, c): applies the
// 6-tap kernel vertically to the unclipped horizontal sums of rows
// r-2 .. r+3, then rounds (+512), shifts (>>10) and clamps to 0..255.
int hpel_hv(uint src_off, uint stride, uint r, uint c) {
    // Mirrored row pairs of the symmetric (1,-5,20,20,-5,1) kernel.
    int outer = hpel_hv_row(src_off, stride, r - 2u, c)
              + hpel_hv_row(src_off, stride, r + 3u, c);
    int inner = hpel_hv_row(src_off, stride, r - 1u, c)
              + hpel_hv_row(src_off, stride, r + 2u, c);
    int mid   = hpel_hv_row(src_off, stride, r,      c)
              + hpel_hv_row(src_off, stride, r + 1u, c);

    int sum = outer - 5 * inner + 20 * mid + 512;
    return clamp(sum >> 10, 0, 255);
}
|
||||
|
||||
// avg_mc33 entry point: one workgroup per block, one invocation per pixel.
// Computes the mc33 value as the rounded average of hpel_h at (r+1, c) and
// hpel_v at (r, c+1), then rounded-averages it with the byte already in
// dst and stores the result back.
void main()
{
    uint blk = gl_WorkGroupID.x;
    if (blk >= pc.n_blocks) {
        return;
    }

    // Invocation index 0..63 -> (row, col) with 8 columns per row.
    uint lane = gl_LocalInvocationID.x;
    uint r = lane / 8u;
    uint c = lane % 8u;

    uvec2 offs = u_meta.meta[blk].xy;   // .x = dst offset, .y = src offset
    uint pitch = pc.stride_u8;          // shared by src and dst addressing

    int below = hpel_h(offs.y, pitch, r + 1u, c);
    int right = hpel_v(offs.y, pitch, r, c + 1u);
    int pred  = (below + right + 1) >> 1;

    // Fold the new prediction into the value already held in dst.
    uint at  = offs.x + r * pitch + c;
    int held = int(u_dst.dst[at]);
    u_dst.dst[at] = uint8_t((held + pred + 1) >> 1);
}
|
||||
Reference in New Issue
Block a user