h264: V3D shaders for all 15 avg_ qpel positions — qpel QPU complete
Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except that the final write does an L2 average with the existing dst:

    put_: dst[r,c] = result
    avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): the caller pre-loads dst with the list0 prediction; the avg_ call folds in list1.

Generated via Python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, and writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift); the L2 step only touches dst. This slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03), but the cost is negligible. It eliminates 15 wrappers and 15 src_max bound calculations that would otherwise be duplicated.

CMake foreach loops compile + install the 15 new SPV files. ctx grows 15 pipeline pairs. The recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers are re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replacing the CPU-only DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

    $ ./build/test_api_h264 | grep "qpel avg" | wc -l
    15
    $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
    15

All 15 PASS 2048/2048 bytes bit-exact via QPU.
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

    Layer                 Coverage
    ─────────────────────────────────────────────────────────────
    IDCT 4x4 luma         ✓ cycle 6 (one QPU shader, also handles chroma)
    IDCT 8x8 luma         ✓ cycle 7
    Chroma DC Hadamard    CPU only (4 adds + 4 subs; not worth offloading)
    Deblock luma_v        ✓ cycle 8
    Deblock luma_h        ✓ PR #28
    Deblock chroma_v/h    ✓ PR #29
    Deblock *_intra       CPU only (less common, structurally different)
    qpel put_ 15 pos      ✓ cycle 9 (mc20) + PRs #30-#33
    qpel avg_ 15 pos      ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
This commit is contained in:
+86
-30
@@ -72,6 +72,22 @@ struct daedalus_ctx {
|
||||
int h264_qpel_mc31_pipe_ready; v3d_pipeline h264_qpel_mc31_pipe;
|
||||
int h264_qpel_mc32_pipe_ready; v3d_pipeline h264_qpel_mc32_pipe;
|
||||
int h264_qpel_mc33_pipe_ready; v3d_pipeline h264_qpel_mc33_pipe;
|
||||
/* avg_ biprediction pipelines — same shaders + L2 with existing dst. */
|
||||
int h264_qpel_avg_mc20_pipe_ready; v3d_pipeline h264_qpel_avg_mc20_pipe;
|
||||
int h264_qpel_avg_mc02_pipe_ready; v3d_pipeline h264_qpel_avg_mc02_pipe;
|
||||
int h264_qpel_avg_mc22_pipe_ready; v3d_pipeline h264_qpel_avg_mc22_pipe;
|
||||
int h264_qpel_avg_mc10_pipe_ready; v3d_pipeline h264_qpel_avg_mc10_pipe;
|
||||
int h264_qpel_avg_mc30_pipe_ready; v3d_pipeline h264_qpel_avg_mc30_pipe;
|
||||
int h264_qpel_avg_mc01_pipe_ready; v3d_pipeline h264_qpel_avg_mc01_pipe;
|
||||
int h264_qpel_avg_mc03_pipe_ready; v3d_pipeline h264_qpel_avg_mc03_pipe;
|
||||
int h264_qpel_avg_mc11_pipe_ready; v3d_pipeline h264_qpel_avg_mc11_pipe;
|
||||
int h264_qpel_avg_mc12_pipe_ready; v3d_pipeline h264_qpel_avg_mc12_pipe;
|
||||
int h264_qpel_avg_mc13_pipe_ready; v3d_pipeline h264_qpel_avg_mc13_pipe;
|
||||
int h264_qpel_avg_mc21_pipe_ready; v3d_pipeline h264_qpel_avg_mc21_pipe;
|
||||
int h264_qpel_avg_mc23_pipe_ready; v3d_pipeline h264_qpel_avg_mc23_pipe;
|
||||
int h264_qpel_avg_mc31_pipe_ready; v3d_pipeline h264_qpel_avg_mc31_pipe;
|
||||
int h264_qpel_avg_mc32_pipe_ready; v3d_pipeline h264_qpel_avg_mc32_pipe;
|
||||
int h264_qpel_avg_mc33_pipe_ready; v3d_pipeline h264_qpel_avg_mc33_pipe;
|
||||
};
|
||||
|
||||
daedalus_ctx *daedalus_ctx_create(void)
|
||||
@@ -146,6 +162,21 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||
if (ctx->h264_qpel_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc31_pipe);
|
||||
if (ctx->h264_qpel_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc32_pipe);
|
||||
if (ctx->h264_qpel_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc33_pipe);
|
||||
if (ctx->h264_qpel_avg_mc20_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc20_pipe);
|
||||
if (ctx->h264_qpel_avg_mc02_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc02_pipe);
|
||||
if (ctx->h264_qpel_avg_mc22_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc22_pipe);
|
||||
if (ctx->h264_qpel_avg_mc10_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc10_pipe);
|
||||
if (ctx->h264_qpel_avg_mc30_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc30_pipe);
|
||||
if (ctx->h264_qpel_avg_mc01_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc01_pipe);
|
||||
if (ctx->h264_qpel_avg_mc03_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc03_pipe);
|
||||
if (ctx->h264_qpel_avg_mc11_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc11_pipe);
|
||||
if (ctx->h264_qpel_avg_mc12_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc12_pipe);
|
||||
if (ctx->h264_qpel_avg_mc13_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc13_pipe);
|
||||
if (ctx->h264_qpel_avg_mc21_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc21_pipe);
|
||||
if (ctx->h264_qpel_avg_mc23_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc23_pipe);
|
||||
if (ctx->h264_qpel_avg_mc31_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc31_pipe);
|
||||
if (ctx->h264_qpel_avg_mc32_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc32_pipe);
|
||||
if (ctx->h264_qpel_avg_mc33_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_avg_mc33_pipe);
|
||||
v3d_runner_destroy(ctx->runner);
|
||||
}
|
||||
free(ctx);
|
||||
@@ -195,21 +226,21 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc31.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc32.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc33.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonals avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc20.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc02.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc22.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc10.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc30.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc01.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc03.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc11.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc12.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc13.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc21.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc23.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc31.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc32.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_avg_mc33.spv */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -1837,6 +1868,26 @@ DEFINE_QPEL_DIAG_QPU(mc31)
|
||||
DEFINE_QPEL_DIAG_QPU(mc32)
|
||||
DEFINE_QPEL_DIAG_QPU(mc33)
|
||||
|
||||
/* avg_ variants — same diag-style envelope (10*stride+11 covers any
|
||||
* (r±1, c±1) offset the avg_ shaders use), different SPV file.
|
||||
* Slightly over-allocates for avg_mc20/02/10/30/01/03 (which need
|
||||
* less src context) but the cost is negligible. */
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc20)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc02)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc22)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc10)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc30)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc01)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc03)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc11)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc12)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc13)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc21)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc23)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc31)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc32)
|
||||
DEFINE_QPEL_DIAG_QPU(avg_mc33)
|
||||
|
||||
#undef DEFINE_QPEL_DIAG_QPU
|
||||
|
||||
/* -------------------- Public dispatch entry points -------------- */
|
||||
@@ -2142,22 +2193,27 @@ DEFINE_QPEL_DIAG_PUBLIC(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
|
||||
|
||||
/* avg_ biprediction dispatchers (15 positions) — same macro, the
|
||||
* underlying _qpu dispatch fns also reuse the diag QPU helper since
|
||||
* the avg_ shaders share the put_ src envelope (the L2 step only
|
||||
* touches dst). */
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||||
#undef DEFINE_QPEL_DIAG_PUBLIC
|
||||
DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||||
|
||||
#undef DEFINE_QPEL_DISPATCH
|
||||
|
||||
|
||||
Reference in New Issue
Block a user