2079fe39c6
Generates 15 avg_ shader variants by templating from the existing put_ shaders. Each avg_ shader is identical to its put_ sibling except the final write does an L2 average with the existing dst: put_: dst[r,c] = result avg_: dst[r,c] = (dst[r,c] + result + 1) >> 1 Per H.264 §8.4.2.3.1 (B-slice biprediction): caller pre-loads dst with the list0 prediction; the avg_ call folds in list1. Generated via python (avg-shader-gen.py): reads each v3d_h264_qpel_mcXY.comp, transforms the docstring header + final write hunk, writes v3d_h264_qpel_avg_mcXY.comp. ~88 lines each; 15 new shader files. Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for all 15 — same src envelope (10*stride+11 covers any (r±1, c±1) shift), the L2 step only touches dst. Slightly over-allocates for the simpler positions (avg_mc20/02/10/30/01/03) but negligible cost. Eliminates 15 wrappers + 15 src_max bound calculations that would otherwise duplicate. CMake foreach loops compile + install 15 new SPV files. ctx grows 15 pipeline pairs. Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_* from CPU to QPU. Public dispatchers re-defined via the existing DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only DEFINE_QPEL_DISPATCH instantiations). Verified on hertz: $ ./build/test_api_h264 | grep "qpel avg" | wc -l 15 $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%" 15 All 15 PASS 2048/2048 bytes bit-exact via QPU. 
QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels: Layer Coverage ───────────────────────────────────────────────────────────── IDCT 4x4 luma ✓ cycle 6 (one QPU shader, also handles chroma) IDCT 8x8 luma ✓ cycle 7 Chroma DC Hadamard CPU only (4 adds + 4 subs; not worth) Deblock luma_v ✓ cycle 8 Deblock luma_h ✓ PR #28 Deblock chroma_v/h ✓ PR #29 Deblock *_intra CPU only (less common, structurally different) qpel put_ 15 pos ✓ cycle 9 (mc20) + PRs #30-#33 qpel avg_ 15 pos ✓ THIS PR The H.264 non-intra-deblock hot path is now FULLY on QPU for any consumer that initialises daedalus with a QPU-capable context.
2393 lines
107 KiB
C
2393 lines
107 KiB
C
/*
 * daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
 *
 * Wraps the kernels from cycles 1-5 and the later H.264 cycles behind
 * the public C API in include/daedalus.h.  Recipe dispatch routes
 * per-kernel to the verdict substrate from each cycle's Phase 7 doc.
 *
 * QPU dispatch wiring status:
 *   IDCT 8x8: wired (cycle 1 v4 shader).
 *   Others:   see daedalus_recipe_substrate_for() for the current
 *             per-kernel substrate; the CPU path always works.
 *
 * License: BSD-2-Clause.  Links vendored FFmpeg LGPL-2.1+ +
 * dav1d BSD-2-Clause NEON snapshots.
 */
|
||
#include "../include/daedalus.h"
|
||
#include "v3d_runner.h"
|
||
|
||
#include <stdlib.h>
|
||
#include <stdint.h>
|
||
#include <stddef.h>
|
||
#include <string.h>
|
||
#include <assert.h>
|
||
|
||
/* -------------------- Context -------------------- */
|
||
|
||
struct daedalus_ctx {
|
||
int has_qpu;
|
||
v3d_runner *runner; /* NULL when has_qpu == 0 */
|
||
|
||
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
||
int idct8_pipe_ready;
|
||
v3d_pipeline idct8_pipe;
|
||
int lpf4_pipe_ready;
|
||
v3d_pipeline lpf4_pipe;
|
||
int lpf8_pipe_ready;
|
||
v3d_pipeline lpf8_pipe;
|
||
int mc8h_pipe_ready;
|
||
v3d_pipeline mc8h_pipe;
|
||
int cdef_pipe_ready;
|
||
v3d_pipeline cdef_pipe;
|
||
int h264deblock_pipe_ready;
|
||
v3d_pipeline h264deblock_pipe;
|
||
int h264deblock_h_pipe_ready;
|
||
v3d_pipeline h264deblock_h_pipe;
|
||
int h264deblock_chroma_v_pipe_ready;
|
||
v3d_pipeline h264deblock_chroma_v_pipe;
|
||
int h264deblock_chroma_h_pipe_ready;
|
||
v3d_pipeline h264deblock_chroma_h_pipe;
|
||
int h264_idct4_pipe_ready;
|
||
v3d_pipeline h264_idct4_pipe;
|
||
int h264_idct8_pipe_ready;
|
||
v3d_pipeline h264_idct8_pipe;
|
||
int h264_qpel_mc20_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc20_pipe;
|
||
int h264_qpel_mc02_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc02_pipe;
|
||
int h264_qpel_mc22_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc22_pipe;
|
||
int h264_qpel_mc10_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc10_pipe;
|
||
int h264_qpel_mc30_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc30_pipe;
|
||
int h264_qpel_mc01_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc01_pipe;
|
||
int h264_qpel_mc03_pipe_ready;
|
||
v3d_pipeline h264_qpel_mc03_pipe;
|
||
int h264_qpel_mc11_pipe_ready; v3d_pipeline h264_qpel_mc11_pipe;
|
||
int h264_qpel_mc12_pipe_ready; v3d_pipeline h264_qpel_mc12_pipe;
|
||
int h264_qpel_mc13_pipe_ready; v3d_pipeline h264_qpel_mc13_pipe;
|
||
int h264_qpel_mc21_pipe_ready; v3d_pipeline h264_qpel_mc21_pipe;
|
||
int h264_qpel_mc23_pipe_ready; v3d_pipeline h264_qpel_mc23_pipe;
|
||
int h264_qpel_mc31_pipe_ready; v3d_pipeline h264_qpel_mc31_pipe;
|
||
int h264_qpel_mc32_pipe_ready; v3d_pipeline h264_qpel_mc32_pipe;
|
||
int h264_qpel_mc33_pipe_ready; v3d_pipeline h264_qpel_mc33_pipe;
|
||
/* avg_ biprediction pipelines — same shaders + L2 with existing dst. */
|
||
int h264_qpel_avg_mc20_pipe_ready; v3d_pipeline h264_qpel_avg_mc20_pipe;
|
||
int h264_qpel_avg_mc02_pipe_ready; v3d_pipeline h264_qpel_avg_mc02_pipe;
|
||
int h264_qpel_avg_mc22_pipe_ready; v3d_pipeline h264_qpel_avg_mc22_pipe;
|
||
int h264_qpel_avg_mc10_pipe_ready; v3d_pipeline h264_qpel_avg_mc10_pipe;
|
||
int h264_qpel_avg_mc30_pipe_ready; v3d_pipeline h264_qpel_avg_mc30_pipe;
|
||
int h264_qpel_avg_mc01_pipe_ready; v3d_pipeline h264_qpel_avg_mc01_pipe;
|
||
int h264_qpel_avg_mc03_pipe_ready; v3d_pipeline h264_qpel_avg_mc03_pipe;
|
||
int h264_qpel_avg_mc11_pipe_ready; v3d_pipeline h264_qpel_avg_mc11_pipe;
|
||
int h264_qpel_avg_mc12_pipe_ready; v3d_pipeline h264_qpel_avg_mc12_pipe;
|
||
int h264_qpel_avg_mc13_pipe_ready; v3d_pipeline h264_qpel_avg_mc13_pipe;
|
||
int h264_qpel_avg_mc21_pipe_ready; v3d_pipeline h264_qpel_avg_mc21_pipe;
|
||
int h264_qpel_avg_mc23_pipe_ready; v3d_pipeline h264_qpel_avg_mc23_pipe;
|
||
int h264_qpel_avg_mc31_pipe_ready; v3d_pipeline h264_qpel_avg_mc31_pipe;
|
||
int h264_qpel_avg_mc32_pipe_ready; v3d_pipeline h264_qpel_avg_mc32_pipe;
|
||
int h264_qpel_avg_mc33_pipe_ready; v3d_pipeline h264_qpel_avg_mc33_pipe;
|
||
};
|
||
|
||
daedalus_ctx *daedalus_ctx_create(void)
|
||
{
|
||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||
if (!ctx) return NULL;
|
||
ctx->runner = v3d_runner_create();
|
||
ctx->has_qpu = (ctx->runner != NULL);
|
||
return ctx;
|
||
}
|
||
|
||
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
||
{
|
||
/*
|
||
* Per the "QPU is default substrate" decree 2026-05-23:
|
||
* setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
|
||
* function to a full daedalus_ctx_create(), letting the libavcodec
|
||
* substitution shims (which call create_no_qpu via pthread_once)
|
||
* fire the V3D shaders that exist for cycles 1/2/4/5/8. Without
|
||
* this hook each consumer process (firefox, mpv, daemon) would
|
||
* need its own shim build to opt into QPU.
|
||
*
|
||
* Default behaviour (env var unset / not "1") is unchanged: pure
|
||
* NEON ctx, no implicit Vulkan init. Firefox / mpv consumers
|
||
* that dlopen libavcodec without opting in stay on the
|
||
* Vulkan-free path; the daemon explicitly sets
|
||
* DAEDALUS_FORCE_QPU=1 before loading libavcodec.
|
||
*/
|
||
const char *force = getenv("DAEDALUS_FORCE_QPU");
|
||
if (force && force[0] == '1' && force[1] == 0)
|
||
return daedalus_ctx_create();
|
||
|
||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||
if (!ctx) return NULL;
|
||
ctx->has_qpu = 0;
|
||
ctx->runner = NULL;
|
||
return ctx;
|
||
}
|
||
|
||
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
||
{
|
||
return ctx ? ctx->has_qpu : 0;
|
||
}
|
||
|
||
/*
 * Tear down a context.  NULL is accepted and ignored.
 *
 * When a runner is attached, every pipeline whose *_pipe_ready flag is
 * set is destroyed first, then the runner itself; the context memory
 * is freed last in all cases.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    if (ctx == NULL)
        return;

    v3d_runner *qpu = ctx->runner;

    if (qpu != NULL) {
/* Destroy <name>_pipe only if it was actually created. */
#define RELEASE_PIPE(name)                                              \
        do {                                                            \
            if (ctx->name ## _pipe_ready)                               \
                v3d_runner_destroy_pipeline(qpu, &ctx->name ## _pipe);  \
        } while (0)

        RELEASE_PIPE(idct8);
        RELEASE_PIPE(lpf4);
        RELEASE_PIPE(lpf8);
        RELEASE_PIPE(mc8h);
        RELEASE_PIPE(cdef);

        RELEASE_PIPE(h264deblock);
        RELEASE_PIPE(h264deblock_h);
        RELEASE_PIPE(h264deblock_chroma_v);
        RELEASE_PIPE(h264deblock_chroma_h);

        RELEASE_PIPE(h264_idct4);
        RELEASE_PIPE(h264_idct8);

        RELEASE_PIPE(h264_qpel_mc20);
        RELEASE_PIPE(h264_qpel_mc02);
        RELEASE_PIPE(h264_qpel_mc22);
        RELEASE_PIPE(h264_qpel_mc10);
        RELEASE_PIPE(h264_qpel_mc30);
        RELEASE_PIPE(h264_qpel_mc01);
        RELEASE_PIPE(h264_qpel_mc03);
        RELEASE_PIPE(h264_qpel_mc11);
        RELEASE_PIPE(h264_qpel_mc12);
        RELEASE_PIPE(h264_qpel_mc13);
        RELEASE_PIPE(h264_qpel_mc21);
        RELEASE_PIPE(h264_qpel_mc23);
        RELEASE_PIPE(h264_qpel_mc31);
        RELEASE_PIPE(h264_qpel_mc32);
        RELEASE_PIPE(h264_qpel_mc33);

        RELEASE_PIPE(h264_qpel_avg_mc20);
        RELEASE_PIPE(h264_qpel_avg_mc02);
        RELEASE_PIPE(h264_qpel_avg_mc22);
        RELEASE_PIPE(h264_qpel_avg_mc10);
        RELEASE_PIPE(h264_qpel_avg_mc30);
        RELEASE_PIPE(h264_qpel_avg_mc01);
        RELEASE_PIPE(h264_qpel_avg_mc03);
        RELEASE_PIPE(h264_qpel_avg_mc11);
        RELEASE_PIPE(h264_qpel_avg_mc12);
        RELEASE_PIPE(h264_qpel_avg_mc13);
        RELEASE_PIPE(h264_qpel_avg_mc21);
        RELEASE_PIPE(h264_qpel_avg_mc23);
        RELEASE_PIPE(h264_qpel_avg_mc31);
        RELEASE_PIPE(h264_qpel_avg_mc32);
        RELEASE_PIPE(h264_qpel_avg_mc33);

#undef RELEASE_PIPE

        v3d_runner_destroy(qpu);
    }

    free(ctx);
}
|
||
|
||
/* -------------------- Recipe query -------------------- */
|
||
|
||
/*
 * Recipe table per the "QPU is default substrate" decree 2026-05-23.
 *
 * Any kernel that has a V3D compute shader maps to
 * DAEDALUS_SUBSTRATE_QPU; kernels without one map to
 * DAEDALUS_SUBSTRATE_CPU.  In this table the only CPU entries are the
 * four bS=4 intra deblock variants; a value not listed in the switch
 * also yields CPU.
 *
 * The dispatch wrappers already fall back to CPU automatically when
 * the ctx doesn't have QPU available (daedalus_ctx_has_qpu == 0).
 *
 * The switch deliberately has no default label so that the listed
 * cases stay the single place a kernel is classified.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    switch (k) {
    /* VP9 / AV1 */
    case DAEDALUS_KERNEL_VP9_IDCT8:
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:
    case DAEDALUS_KERNEL_VP9_MC_8H:            /* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:         /* v3d_cdef.spv */
    /* H.264 inverse transforms */
    case DAEDALUS_KERNEL_H264_IDCT4:           /* v3d_h264_idct4.spv */
    case DAEDALUS_KERNEL_H264_IDCT8:           /* v3d_h264_idct8.spv */
    /* H.264 deblock, tc0-driven edges */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:      /* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:      /* v3d_h264deblock_h.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:      /* v3d_h264deblock_chroma_v.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:      /* v3d_h264deblock_chroma_h.spv */
    /* H.264 qpel put_: v3d_h264_qpel_mcXY.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:
    case DAEDALUS_KERNEL_H264_QPEL_MC02:
    case DAEDALUS_KERNEL_H264_QPEL_MC22:
    case DAEDALUS_KERNEL_H264_QPEL_MC10:
    case DAEDALUS_KERNEL_H264_QPEL_MC30:
    case DAEDALUS_KERNEL_H264_QPEL_MC01:
    case DAEDALUS_KERNEL_H264_QPEL_MC03:
    case DAEDALUS_KERNEL_H264_QPEL_MC11:
    case DAEDALUS_KERNEL_H264_QPEL_MC12:
    case DAEDALUS_KERNEL_H264_QPEL_MC13:
    case DAEDALUS_KERNEL_H264_QPEL_MC21:
    case DAEDALUS_KERNEL_H264_QPEL_MC23:
    case DAEDALUS_KERNEL_H264_QPEL_MC31:
    case DAEDALUS_KERNEL_H264_QPEL_MC32:
    case DAEDALUS_KERNEL_H264_QPEL_MC33:
    /* H.264 qpel avg_: v3d_h264_qpel_avg_mcXY.spv */
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33:
        return DAEDALUS_SUBSTRATE_QPU;

    /* bS=4 intra deblock: luma and chroma QPU shaders pending. */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA:
        return DAEDALUS_SUBSTRATE_CPU;
    }

    return DAEDALUS_SUBSTRATE_CPU;
}
|
||
|
||
/* -------------------- NEON externs (per cycle bench links) ----- */
|
||
|
||
/* VP9 (FFmpeg NEON). */
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
                                          int16_t *block, int eob);
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                       const uint8_t *src,
                                       ptrdiff_t src_stride,
                                       int h, int mx, int my);

/* AV1 CDEF (dav1d NEON). */
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint16_t *tmp,
                                         int pri_strength, int sec_strength,
                                         int dir, int damping, int h,
                                         size_t edges);

/* H.264 inverse transforms. */
extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block,
                                  ptrdiff_t stride);
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block,
                                   ptrdiff_t stride);

/* H.264 deblock, tc0-driven edges. */
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta,
                                            int8_t *tc0);
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta,
                                            int8_t *tc0);
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta,
                                              int8_t *tc0);
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta,
                                              int8_t *tc0);

/* H.264 deblock, intra edges (no tc0 argument). */
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix,
                                                  ptrdiff_t stride,
                                                  int alpha, int beta);
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix,
                                                  ptrdiff_t stride,
                                                  int alpha, int beta);
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix,
                                                    ptrdiff_t stride,
                                                    int alpha, int beta);
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix,
                                                    ptrdiff_t stride,
                                                    int alpha, int beta);

/* H.264 qpel put_, 8x8, one entry per fractional position. */
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);

/* H.264 qpel avg_, 8x8, one entry per fractional position. */
extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
|
||
|
||
/* -------------------- CPU dispatch implementations -------------- */
|
||
|
||
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
const int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_idct8_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
int16_t scratch[64];
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
||
ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
scratch, 64);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/*
 * CPU path for the VP9 horizontal loop filters.
 *
 * wd_8 selects the NEON entry point: non-zero uses
 * ff_vp9_loop_filter_h_8_8_neon, zero uses ff_vp9_loop_filter_h_4_8_neon.
 * Each edge is filtered in place at dst + meta[e].dst_off with that
 * edge's E / I / H values.  ctx is unused.  Always returns 0.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Both entry points share one signature, so choose once up front. */
    void (*const filter)(uint8_t *, ptrdiff_t, int, int, int) =
        wd_8 ? ff_vp9_loop_filter_h_8_8_neon
             : ff_vp9_loop_filter_h_4_8_neon;

    (void) ctx;

    for (size_t e = 0; e < n_edges; e++) {
        const daedalus_lpf_meta *edge = &meta[e];

        filter(dst + edge->dst_off, (ptrdiff_t) dst_stride,
               edge->E, edge->I, edge->H);
    }

    return 0;
}
|
||
|
||
/*
 * CPU path for the VP9 8-tap horizontal "regular" motion compensation.
 *
 * For each block the NEON routine is called with h = 8, the block's
 * mx, and my = 0, writing to dst + meta[b].dst_off.  ctx is unused.
 * Always returns 0.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    (void) ctx;

    for (size_t b = 0; b < n_blocks; b++) {
        const daedalus_mc_meta *blk = &meta[b];
        uint8_t       *out = dst + blk->dst_off;
        /* NOTE(review): the +3 presumably moves from the leftmost
         * filter tap to the pixel the NEON routine expects — confirm
         * against the src_off contract in daedalus.h. */
        const uint8_t *in  = src + blk->src_off + 3;

        ff_vp9_put_regular8_h_neon(out, (ptrdiff_t) dst_stride,
                                   in, (ptrdiff_t) src_stride,
                                   8, blk->mx, 0);
    }

    return 0;
}
|
||
|
||
/*
 * CPU path for AV1 CDEF on 8x8 blocks via the dav1d NEON filter.
 *
 * Each block writes to dst + meta[b].dst_off and reads its padded
 * input from tmp + meta[b].tmp_off_u16 (an offset in uint16_t
 * elements, since it is added to a uint16_t pointer).  The routine is
 * always called with h = 8 and edges = 0.  ctx is unused.  Always
 * returns 0.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    (void) ctx;

    for (size_t b = 0; b < n_blocks; b++) {
        const daedalus_cdef_meta *blk = &meta[b];

        dav1d_cdef_filter8_8bpc_neon(dst + blk->dst_off,
                                     (ptrdiff_t) dst_stride,
                                     tmp + blk->tmp_off_u16,
                                     blk->pri_strength,
                                     blk->sec_strength,
                                     blk->dir, blk->damping, 8, 0);
    }

    return 0;
}
|
||
|
||
static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_blocks; i++)
|
||
ff_h264_idct_add_neon(dst + meta[i].dst_off,
|
||
coeffs + i * 16,
|
||
(ptrdiff_t) dst_stride);
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_blocks; i++)
|
||
ff_h264_idct8_add_neon(dst + meta[i].dst_off,
|
||
coeffs + i * 64,
|
||
(ptrdiff_t) dst_stride);
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
/* NEON expects mutable tc0 pointer; copy to a local. */
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
|
||
meta[i].tc0[2], meta[i].tc0[3] };
|
||
ff_h264_v_loop_filter_luma_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
|
||
meta[i].tc0[2], meta[i].tc0[3] };
|
||
ff_h264_h_loop_filter_luma_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
|
||
meta[i].tc0[2], meta[i].tc0[3] };
|
||
ff_h264_v_loop_filter_chroma_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
|
||
meta[i].tc0[2], meta[i].tc0[3] };
|
||
ff_h264_h_loop_filter_chroma_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta
|
||
* struct's tc0[] field is unused for intra (the spec hardcodes the
|
||
* strength). We accept the same meta type so callers can build a
|
||
* single edge-list and route by kernel — saves an extra struct.
|
||
*/
|
||
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
ff_h264_v_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
ff_h264_h_loop_filter_luma_intra_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
ff_h264_v_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta)
|
||
{
|
||
(void) ctx;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
ff_h264_h_loop_filter_chroma_intra_neon(dst + meta[i].dst_off,
|
||
(ptrdiff_t) dst_stride,
|
||
meta[i].alpha, meta[i].beta);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/*
 * All H.264 qpel CPU dispatchers (15 put_ + 15 avg_ positions) are
 * uniform, so one macro generates every
 * dispatch_h264_qpel_<suffix>_cpu():
 *
 *   - ctx is unused;
 *   - block i calls neon_fn(dst + meta[i].dst_off,
 *                           src + meta[i].src_off, stride);
 *   - the function always returns 0.
 *
 * FFmpeg's NEON entry uses a single stride for both dst and src
 * (H264QpelContext convention).  Caller already guarantees this via
 * the public API contract documented in daedalus.h.
 */
#define DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn)                          \
    static int dispatch_h264_qpel_ ## suffix ## _cpu(daedalus_ctx *ctx,    \
            uint8_t *dst, const uint8_t *src, size_t stride,               \
            size_t n_blocks, const daedalus_h264_qpel_meta *meta)          \
    {                                                                      \
        (void) ctx;                                                        \
        for (size_t i = 0; i < n_blocks; i++) {                            \
            neon_fn(dst + meta[i].dst_off, src + meta[i].src_off,          \
                    (ptrdiff_t) stride);                                   \
        }                                                                  \
        return 0;                                                          \
    }

/* put_ variants. */
DEFINE_QPEL_CPU_DISPATCH(mc20, ff_put_h264_qpel8_mc20_neon)
DEFINE_QPEL_CPU_DISPATCH(mc02, ff_put_h264_qpel8_mc02_neon)
DEFINE_QPEL_CPU_DISPATCH(mc22, ff_put_h264_qpel8_mc22_neon)
DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon)
DEFINE_QPEL_CPU_DISPATCH(mc12, ff_put_h264_qpel8_mc12_neon)
DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon)
DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon)
DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon)
DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)

/* avg_ biprediction variants — same dispatch shape as put_, just
 * different NEON entry that L2-averages with the existing dst. */
DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc10, ff_avg_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc30, ff_avg_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc01, ff_avg_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc03, ff_avg_h264_qpel8_mc03_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc11, ff_avg_h264_qpel8_mc11_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc12, ff_avg_h264_qpel8_mc12_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc13, ff_avg_h264_qpel8_mc13_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc21, ff_avg_h264_qpel8_mc21_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc23, ff_avg_h264_qpel8_mc23_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc31, ff_avg_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc32, ff_avg_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc33, ff_avg_h264_qpel8_mc33_neon)

#undef DEFINE_QPEL_CPU_DISPATCH
|
||
|
||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||
|
||
/* Push-constant block handed to v3d_idct8.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;        /* number of 8x8 blocks in the dispatch */
    uint32_t blocks_per_row;  /* the dispatcher always writes 0 here */
    uint32_t dst_stride_u8;   /* dst row stride, in bytes */
    uint32_t _pad;            /* rounds the block up to 16 bytes */
} idct8_pc;
|
||
|
||
/* Build the IDCT 8x8 compute pipeline on first use and remember that it
 * exists.  Returns 0 when the pipeline is available, -1 when creation
 * failed (the ready flag is left clear so a later call retries). */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        const int failed =
            v3d_runner_create_pipeline(ctx->runner,
                                       "v3d_idct8.spv",
                                       /*n_ssbos=*/3,
                                       /*push_const_size=*/sizeof(idct8_pc),
                                       &ctx->idct8_pipe) != 0;
        if (failed)
            return -1;
        ctx->idct8_pipe_ready = 1;
    }
    return 0;
}
|
||
|
||
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
const int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_idct8_meta *meta)
|
||
{
|
||
if (ensure_idct8_pipeline(ctx) != 0) return -1;
|
||
|
||
/* Allocate three SSBOs per call (coeffs, dst, meta). Performance-
|
||
* tuning (buffer pool) is deferred; correctness first. */
|
||
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||
size_t meta_bytes = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */
|
||
/* dst buffer must hold all of dst[0..max_dst_off + 64 + 8*stride].
|
||
* Cheapest correct answer: alloc the smallest contiguous region
|
||
* containing every block's footprint. For Phase 8 we assume the
|
||
* caller's dst surface starts at byte 0 of the buffer and use
|
||
* the full provided extent. We size by scanning meta. */
|
||
size_t max_byte_touched = 0;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
|
||
if (end > max_byte_touched) max_byte_touched = end;
|
||
}
|
||
|
||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||
if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||
if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||
}
|
||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||
}
|
||
|
||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||
* caller's full region (since we'll need to read it back). */
|
||
memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
|
||
memcpy(buf_dst.mapped, dst, max_byte_touched);
|
||
uint32_t *m = buf_meta.mapped;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
m[2*i + 0] = meta[i].block_x;
|
||
m[2*i + 1] = meta[i].block_y;
|
||
}
|
||
|
||
/* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
|
||
v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
|
||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
|
||
goto fail;
|
||
}
|
||
|
||
/* WG geometry: 32 blocks per WG. */
|
||
uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
|
||
idct8_pc pc = {
|
||
.n_blocks = (uint32_t) n_blocks,
|
||
.blocks_per_row = 0, /* unused by v4 shader (meta drives placement) */
|
||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||
._pad = 0,
|
||
};
|
||
|
||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
|
||
VkCommandBuffer cb = ctx->idct8_pipe.cb;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->idct8_pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->idct8_pipe.layout, 0, 1,
|
||
&ctx->idct8_pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
|
||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
|
||
/* Read-back dst. */
|
||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||
|
||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||
return 0;
|
||
|
||
fail:
|
||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||
return -1;
|
||
}
|
||
|
||
/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
|
||
*
|
||
* NOTE: the two LPF shaders disagree on push-constant slot order.
|
||
* v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
|
||
* v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
|
||
*
|
||
* Same total size (16 bytes), different slot 2. Keep separate
|
||
* struct definitions to avoid silent corruption — Phase 8 caught
|
||
* this empirically when test_api_lpf wd=8 reported 95.6 % match.
|
||
*/
|
||
/* Push constants for v3d_lpf_h_4_8: the stride lives in word 1. */
typedef struct {
    uint32_t n_edges;        /* number of edges in the dispatch */
    uint32_t dst_stride_u8;  /* dst row stride, in bytes */
    uint32_t _pad0;          /* padding to 16 bytes */
    uint32_t _pad1;          /* padding to 16 bytes */
} lpf4_pc;
|
||
|
||
/* Push constants for v3d_lpf_h_8_8: same 16-byte size as lpf4_pc, but
 * the stride lives in word 2 because word 1 is a placeholder. */
typedef struct {
    uint32_t n_edges;         /* number of edges in the dispatch */
    uint32_t blocks_per_row;  /* unused by shader, must exist */
    uint32_t dst_stride_u8;   /* dst row stride, in bytes */
    uint32_t _pad;            /* padding to 16 bytes */
} lpf8_pc;
|
||
|
||
/* Create an LPF pipeline on first use.
 *
 *   wd_8  non-zero selects the lpf8_pc push-constant size, zero lpf4_pc
 *   flag  caller-owned "already created" flag; set to 1 on success
 *   pipe  pipeline slot to fill
 *   spv   SPIR-V module name to load
 *
 * Returns 0 if the pipeline is available, -1 if creation failed. */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    size_t pc_bytes;

    if (*flag)
        return 0;

    if (wd_8)
        pc_bytes = sizeof(lpf8_pc);
    else
        pc_bytes = sizeof(lpf4_pc);

    const int failed =
        v3d_runner_create_pipeline(ctx->runner, spv,
                                   /*n_ssbos=*/2,
                                   /*push_const_size=*/(uint32_t) pc_bytes,
                                   pipe) != 0;
    if (failed)
        return -1;

    *flag = 1;
    return 0;
}
|
||
|
||
/* Run one of the two LPF shaders over n_edges edges.
 *
 *   ctx        context owning the runner and the cached pipelines
 *   wd_8       non-zero selects v3d_lpf_h_8_8.spv (lpf8_pc layout),
 *              zero selects v3d_lpf_h_4_8.spv (lpf4_pc layout)
 *   dst        surface filtered in place; only the window [lo, hi)
 *              covering all edges is uploaded and copied back
 *   dst_stride dst row stride in bytes
 *   n_edges    number of edges; 0 is a successful no-op
 *   meta       per-edge dst_off and E/I/H values
 *
 * Returns 0 on success, -1 on failure.  dst is only written back on
 * success.
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    int          *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p    = wd_8 ? &ctx->lpf8_pipe : &ctx->lpf4_pipe;
    const char   *spv  = wd_8 ? "v3d_lpf_h_8_8.spv" : "v3d_lpf_h_4_8.spv";
    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;

    /* Nothing to do: avoid zero-byte buffers and a zero-group dispatch. */
    if (n_edges == 0) return 0;

    /* Push constants and the work-group count are 32-bit; refuse inputs
     * the casts below would silently truncate. */
    if (n_edges > UINT32_MAX || dst_stride > UINT32_MAX) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);  /* uvec4 per edge */

    /* Determine the smallest dst window.  Each edge writes to bytes
     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride.
     * NOTE(review): an edge with dst_off < 4 clamps the window start to
     * 0, so its rebased offset stays below 4 — confirm callers never
     * pass such edges, or that the shader tolerates them. */
    size_t lo = (size_t) -1, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base    = meta[i].dst_off;
        size_t this_lo = base >= 4 ? base - 4 : 0;
        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
        if (this_lo < lo) lo = this_lo;
        if (this_hi > hi) hi = this_hi;
    }
    size_t dst_window_size = hi - lo;

    int rc = -1;
    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) goto out_meta;

    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    /* Offsets are rebased so they index into the uploaded window. */
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);
        m[4*i + 1] = (uint32_t) meta[i].E;
        m[4*i + 2] = (uint32_t) meta[i].I;
        m[4*i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto out_dst;

    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto out_dst;
    VkCommandBuffer cb = p->cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never reached the executable state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_dst;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            p->layout, 0, 1, &p->desc_set, 0, NULL);
    /* The two shaders place the stride in different push-constant
     * slots, so each gets its own struct. */
    if (wd_8) {
        lpf8_pc pc = { .n_edges        = (uint32_t) n_edges,
                       .blocks_per_row = 0,
                       .dst_stride_u8  = (uint32_t) dst_stride,
                       ._pad           = 0 };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    } else {
        lpf4_pc pc = { .n_edges       = (uint32_t) n_edges,
                       .dst_stride_u8 = (uint32_t) dst_stride };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    }
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_dst;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_dst;

    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
    rc = 0;

out_dst:
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
out_meta:
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return rc;
}
|
||
|
||
/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */
|
||
|
||
/* Push-constant block for v3d_mc_8h.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;       /* number of blocks in the dispatch */
    uint32_t dst_stride_u8;  /* dst row stride, in bytes */
    uint32_t src_stride_u8;  /* src row stride, in bytes */
    uint32_t _pad;           /* padding to 16 bytes */
} mc_pc;
|
||
|
||
/* Run the VP9 8-tap horizontal MC shader (v3d_mc_8h.spv) over n_blocks
 * blocks.
 *
 *   ctx        context owning the runner and the cached pipeline
 *   dst        destination surface; bytes [0, dst_max) are uploaded and
 *              copied back
 *   dst_stride dst row stride in bytes
 *   src        source surface; bytes [0, src_max) are uploaded
 *   src_stride src row stride in bytes
 *   n_blocks   number of blocks; 0 is a successful no-op
 *   meta       per-block dst_off, src_off and mx
 *
 * Returns 0 on success, -1 on failure.  dst is only written back on
 * success.
 */
static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    if (!ctx->mc8h_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_mc_8h.spv",
                                       3, sizeof(mc_pc), &ctx->mc8h_pipe) != 0)
            return -1;
        ctx->mc8h_pipe_ready = 1;
    }

    /* Nothing to do: avoid zero-byte buffers and a zero-group dispatch. */
    if (n_blocks == 0) return 0;

    /* Push constants and the work-group count are 32-bit; refuse inputs
     * the casts below would silently truncate. */
    if (n_blocks > UINT32_MAX || dst_stride > UINT32_MAX ||
        src_stride > UINT32_MAX)
        return -1;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, src_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        /* dst: 8 columns on each of 8 rows. */
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
        size_t se = meta[i].src_off + 7 * src_stride + 15;
        if (se > src_max) src_max = se;
    }

    int rc = -1;
    v3d_buffer bm = {0}, bd = {0}, bs = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) goto out_meta;
    if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) goto out_dst;

    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = (uint32_t) meta[i].mx;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bm, bd, bs };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3)) goto out_src;

    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    mc_pc pc = { .n_blocks      = (uint32_t) n_blocks,
                 .dst_stride_u8 = (uint32_t) dst_stride,
                 .src_stride_u8 = (uint32_t) src_stride };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto out_src;
    VkCommandBuffer cb = ctx->mc8h_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never reached the executable state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_src;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->mc8h_pipe.layout, 0, 1,
                            &ctx->mc8h_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->mc8h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_src;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_src;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_src:
    v3d_runner_release_buffer(ctx->runner, &bs);
out_dst:
    v3d_runner_release_buffer(ctx->runner, &bd);
out_meta:
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}
|
||
|
||
/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */
|
||
|
||
/* Push-constant block for v3d_cdef.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;        /* number of blocks in the dispatch */
    uint32_t tmp_stride_u16;  /* tmp row stride in uint16 units (dispatcher passes 16) */
    uint32_t dst_stride_u8;   /* dst row stride, in bytes */
    uint32_t _pad;            /* padding to 16 bytes */
} cdef_pc;
|
||
|
||
/* Run the CDEF shader (v3d_cdef.spv) over n_blocks blocks.
 *
 *   ctx        context owning the runner and the cached pipeline
 *   dst        destination surface; bytes [0, dst_max) are uploaded and
 *              copied back
 *   dst_stride dst row stride in bytes
 *   tmp        uint16 working surface with a row stride of 16 elements;
 *              elements [0, tmp_max_u16) are uploaded
 *   n_blocks   number of blocks; 0 is a successful no-op
 *   meta       per-block dst_off, tmp_off_u16, strengths, damping, dir
 *
 * Returns 0 on success, -1 on failure.  dst is only written back on
 * success.
 */
static int dispatch_cdef_qpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    if (!ctx->cdef_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_cdef.spv",
                                       3, sizeof(cdef_pc), &ctx->cdef_pipe) != 0)
            return -1;
        ctx->cdef_pipe_ready = 1;
    }

    /* Nothing to do: avoid zero-byte buffers and a zero-group dispatch. */
    if (n_blocks == 0) return 0;

    /* Push constants and the work-group count are 32-bit; refuse inputs
     * the casts below would silently truncate. */
    if (n_blocks > UINT32_MAX || dst_stride > UINT32_MAX) return -1;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, tmp_max_u16 = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* Centre 8x8 in the stride-16 tmp.
         * NOTE(review): this ends at the last centre element; if the
         * shader also samples rows/columns past the centre 8x8, those
         * elements are not uploaded — confirm against v3d_cdef.comp. */
        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8;
        if (te > tmp_max_u16) tmp_max_u16 = te;
    }
    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);

    int rc = -1;
    v3d_buffer bm = {0}, bd = {0}, bt = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) goto out_meta;
    if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) goto out_dst;

    /* tmp may carry padding before each block origin (caller-allocated).
     * It is copied as-is; meta[i].tmp_off_u16 is assumed to match the
     * caller's layout. */
    memcpy(bt.mapped, tmp, tmp_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        uint32_t pri     = (uint32_t) meta[i].pri_strength;
        uint32_t sec     = (uint32_t) meta[i].sec_strength;
        uint32_t damping = (uint32_t) meta[i].damping;
        m[4*i+0] = meta[i].dst_off;
        /* Word 1 packs pri in bits 0..7, sec from bit 8, damping from bit 16. */
        m[4*i+1] = pri | (sec << 8) | (damping << 16);
        m[4*i+2] = meta[i].tmp_off_u16;
        m[4*i+3] = (uint32_t) meta[i].dir;
    }

    v3d_buffer binds[3] = { bm, bd, bt };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3)) goto out_tmp;

    /* 4 blocks per work group. */
    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
    cdef_pc pc = { .n_blocks       = (uint32_t) n_blocks,
                   .tmp_stride_u16 = 16,
                   .dst_stride_u8  = (uint32_t) dst_stride };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto out_tmp;
    VkCommandBuffer cb = ctx->cdef_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never reached the executable state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_tmp;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->cdef_pipe.layout, 0, 1,
                            &ctx->cdef_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->cdef_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_tmp;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_tmp;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_tmp:
    v3d_runner_release_buffer(ctx->runner, &bt);
out_dst:
    v3d_runner_release_buffer(ctx->runner, &bd);
out_meta:
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}
|
||
|
||
/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */
|
||
|
||
/* Push-constant block shared by the H.264 deblock shaders in this
 * file: four 32-bit words. */
typedef struct {
    uint32_t n_edges;        /* number of edges in the dispatch */
    uint32_t dst_stride_u8;  /* dst row stride, in bytes */
    uint32_t _pad0;          /* padding to 16 bytes */
    uint32_t _pad1;          /* padding to 16 bytes */
} h264deblock_pc;
|
||
|
||
/* Run the H.264 luma deblock shader v3d_h264deblock.spv over n_edges
 * edges.
 *
 *   ctx        context owning the runner and the cached pipeline
 *   dst        surface filtered in place; bytes [0, dst_max) are
 *              uploaded and copied back
 *   dst_stride dst row stride in bytes
 *   n_edges    number of edges; 0 is a successful no-op
 *   meta       per-edge dst_off, alpha, beta and tc0[4]
 *
 * Returns 0 on success, -1 on failure.  dst is only written back on
 * success.
 */
static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    if (!ctx->h264deblock_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock.spv",
                                       2, sizeof(h264deblock_pc), &ctx->h264deblock_pipe) != 0)
            return -1;
        ctx->h264deblock_pipe_ready = 1;
    }

    /* Nothing to do: avoid zero-byte buffers and a zero-group dispatch. */
    if (n_edges == 0) return 0;

    /* Push constants and the work-group count are 32-bit; refuse inputs
     * the casts below would silently truncate. */
    if (n_edges > UINT32_MAX || dst_stride > UINT32_MAX) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride. */
        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
        if (e > dst_max) dst_max = e;
    }

    int rc = -1;
    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) goto out_meta;

    memcpy(bd.mapped, dst, dst_max);
    /* One uvec4 per edge: dst_off, alpha | beta << 8, the four tc0
     * bytes packed low to high, and a zero word. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        m[4*i+2] =  ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2)) goto out_dst;

    /* 16 edges per work group. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto out_dst;
    VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never reached the executable state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_dst;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264deblock_pipe.layout, 0, 1,
                            &ctx->h264deblock_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264deblock_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_dst;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_dst;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_dst:
    v3d_runner_release_buffer(ctx->runner, &bd);
out_meta:
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}
|
||
|
||
/* -------------------- H.264 luma_h deblock QPU dispatch -------- */
|
||
|
||
/* Run the H.264 luma_h deblock shader v3d_h264deblock_h.spv over
 * n_edges edges.
 *
 *   ctx        context owning the runner and the cached pipeline
 *   dst        surface filtered in place; bytes [0, dst_max) are
 *              uploaded and copied back
 *   dst_stride dst row stride in bytes
 *   n_edges    number of edges; 0 is a successful no-op
 *   meta       per-edge dst_off, alpha, beta and tc0[4]
 *
 * Returns 0 on success, -1 on failure.  dst is only written back on
 * success.
 */
static int dispatch_h264_deblock_h_qpu(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    if (!ctx->h264deblock_h_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock_h.spv",
                                       2, sizeof(h264deblock_pc), &ctx->h264deblock_h_pipe) != 0)
            return -1;
        ctx->h264deblock_h_pipe_ready = 1;
    }

    /* Nothing to do: avoid zero-byte buffers and a zero-group dispatch. */
    if (n_edges == 0) return 0;

    /* Push constants and the work-group count are 32-bit; refuse inputs
     * the casts below would silently truncate. */
    if (n_edges > UINT32_MAX || dst_stride > UINT32_MAX) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
    /* H variant: reads cols [-4..+3] of 16 ROWS.  Each lane processes one
     * row.  Max addressed byte = dst_off + 15*stride + 3 (last row,
     * col +3), so the exclusive end is dst_off + 15*stride + 4. */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t e = meta[i].dst_off + 15 * dst_stride + 4;
        if (e > dst_max) dst_max = e;
    }

    int rc = -1;
    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) goto out_meta;

    memcpy(bd.mapped, dst, dst_max);
    /* One uvec4 per edge: dst_off, alpha | beta << 8, the four tc0
     * bytes packed low to high, and a zero word. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        m[4*i+2] =  ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_h_pipe, binds, 2)) goto out_dst;

    /* 16 edges per work group. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };
    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_h_pipe)) goto out_dst;
    VkCommandBuffer cb = ctx->h264deblock_h_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that never reached the executable state. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_dst;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_h_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264deblock_h_pipe.layout, 0, 1,
                            &ctx->h264deblock_h_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264deblock_h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_dst;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_dst;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_dst:
    v3d_runner_release_buffer(ctx->runner, &bd);
out_meta:
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}
|
||
|
||
/* -------------------- H.264 chroma deblock QPU dispatch -------- */
|
||
|
||
/* Generic chroma QPU dispatch (shared between V and H variants).
|
||
* Both shaders use 8 cells per edge; max-addressed-byte differs:
|
||
* V: dst_off + 1*stride + 7 (-2..+1 rows, cols 0..7 of edge)
|
||
* H: dst_off + 7*stride + 1 (-2..+1 cols, rows 0..7 of edge)
|
||
* Caller passes the precomputed extent.
|
||
*/
|
||
static int dispatch_h264_deblock_chroma_qpu(daedalus_ctx *ctx,
|
||
v3d_pipeline *pipe, int *pipe_ready, const char *spv_name,
|
||
uint8_t *dst, size_t dst_stride, size_t n_edges,
|
||
const daedalus_h264_deblock_meta *meta, int orient_h)
|
||
{
|
||
if (!*pipe_ready) {
|
||
if (v3d_runner_create_pipeline(ctx->runner, spv_name,
|
||
2, sizeof(h264deblock_pc), pipe) != 0)
|
||
return -1;
|
||
*pipe_ready = 1;
|
||
}
|
||
size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
|
||
size_t dst_max = 0;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
size_t e = orient_h ? meta[i].dst_off + 7 * dst_stride + 2
|
||
: meta[i].dst_off + 1 * dst_stride + 8;
|
||
if (e > dst_max) dst_max = e;
|
||
}
|
||
v3d_buffer bm = {0}, bd = {0};
|
||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||
memcpy(bd.mapped, dst, dst_max);
|
||
uint32_t *m = bm.mapped;
|
||
for (size_t i = 0; i < n_edges; i++) {
|
||
m[4*i+0] = meta[i].dst_off;
|
||
m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
|
||
m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
|
||
| (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
|
||
| (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
|
||
| (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
|
||
m[4*i+3] = 0;
|
||
}
|
||
v3d_buffer binds[2] = { bm, bd };
|
||
if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 2)) goto fail;
|
||
uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
|
||
h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
|
||
.dst_stride_u8 = (uint32_t) dst_stride };
|
||
if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, pipe)) goto fail;
|
||
VkCommandBuffer cb = pipe->cb;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
memcpy(dst, bd.mapped, dst_max);
|
||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||
return 0;
|
||
fail:
|
||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||
return -1;
|
||
}
|
||
|
||
/* Chroma V deblock: forwards to the shared chroma dispatcher with the
 * V pipeline slot, the V shader module and orient_h = 0. */
static int dispatch_h264_deblock_chroma_v_qpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    static const char spv_name[] = "v3d_h264deblock_chroma_v.spv";

    return dispatch_h264_deblock_chroma_qpu(
        ctx,
        &ctx->h264deblock_chroma_v_pipe,
        &ctx->h264deblock_chroma_v_pipe_ready,
        spv_name,
        dst, dst_stride, n_edges, meta,
        /*orient_h=*/0);
}
|
||
|
||
/* Chroma H deblock: forwards to the shared chroma dispatcher with the
 * H pipeline slot, the H shader module and orient_h = 1. */
static int dispatch_h264_deblock_chroma_h_qpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    static const char spv_name[] = "v3d_h264deblock_chroma_h.spv";

    return dispatch_h264_deblock_chroma_qpu(
        ctx,
        &ctx->h264deblock_chroma_h_pipe,
        &ctx->h264deblock_chroma_h_pipe_ready,
        spv_name,
        dst, dst_stride, n_edges, meta,
        /*orient_h=*/1);
}
|
||
|
||
/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
|
||
|
||
/* Push-constant block for v3d_h264_idct4.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;       /* number of 4x4 blocks in the dispatch */
    uint32_t dst_stride_u8;  /* dst row stride, in bytes */
    uint32_t _pad0;          /* padding to 16 bytes */
    uint32_t _pad1;          /* padding to 16 bytes */
} h264_idct4_pc;
|
||
|
||
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
if (!ctx->h264_idct4_pipe_ready) {
|
||
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
|
||
3, sizeof(h264_idct4_pc),
|
||
&ctx->h264_idct4_pipe) != 0)
|
||
return -1;
|
||
ctx->h264_idct4_pipe_ready = 1;
|
||
}
|
||
|
||
size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
|
||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */
|
||
size_t dst_max = 0;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
|
||
if (e > dst_max) dst_max = e;
|
||
}
|
||
|
||
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||
}
|
||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||
}
|
||
|
||
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||
memcpy(bd.mapped, dst, dst_max);
|
||
uint32_t *m = bm.mapped;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
m[4*i+0] = meta[i].dst_off;
|
||
m[4*i+1] = 0;
|
||
m[4*i+2] = 0;
|
||
m[4*i+3] = 0;
|
||
}
|
||
|
||
v3d_buffer binds[3] = { bc, bd, bm };
|
||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
|
||
goto fail;
|
||
|
||
uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
|
||
h264_idct4_pc pc = {
|
||
.n_blocks = (uint32_t) n_blocks,
|
||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||
};
|
||
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||
if (cb == VK_NULL_HANDLE) goto fail;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->h264_idct4_pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->h264_idct4_pipe.layout, 0, 1,
|
||
&ctx->h264_idct4_pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
|
||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
|
||
memcpy(dst, bd.mapped, dst_max);
|
||
|
||
/* H.264/FFmpeg convention: zero the coeffs block after the
|
||
* transform (matches the C ref + NEON .S behaviour). */
|
||
memset(coeffs, 0, coeff_bytes);
|
||
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||
return 0;
|
||
fail:
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||
return -1;
|
||
}
|
||
|
||
/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
|
||
|
||
/* Push-constant block for v3d_h264_idct8.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;       /* number of 8x8 blocks in the dispatch */
    uint32_t dst_stride_u8;  /* dst row stride, in bytes */
    uint32_t _pad0;          /* padding to 16 bytes */
    uint32_t _pad1;          /* padding to 16 bytes */
} h264_idct8_pc;
|
||
|
||
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
if (!ctx->h264_idct8_pipe_ready) {
|
||
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
|
||
3, sizeof(h264_idct8_pc),
|
||
&ctx->h264_idct8_pipe) != 0)
|
||
return -1;
|
||
ctx->h264_idct8_pipe_ready = 1;
|
||
}
|
||
|
||
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||
size_t dst_max = 0;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
|
||
if (e > dst_max) dst_max = e;
|
||
}
|
||
|
||
v3d_buffer bc = {0}, bd = {0}, bm = {0};
|
||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
|
||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||
}
|
||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc); return -1;
|
||
}
|
||
|
||
memcpy(bc.mapped, coeffs, coeff_bytes);
|
||
memcpy(bd.mapped, dst, dst_max);
|
||
uint32_t *m = bm.mapped;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
m[4*i+0] = meta[i].dst_off;
|
||
m[4*i+1] = 0;
|
||
m[4*i+2] = 0;
|
||
m[4*i+3] = 0;
|
||
}
|
||
|
||
v3d_buffer binds[3] = { bc, bd, bm };
|
||
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
|
||
goto fail;
|
||
|
||
uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
|
||
h264_idct8_pc pc = {
|
||
.n_blocks = (uint32_t) n_blocks,
|
||
.dst_stride_u8 = (uint32_t) dst_stride,
|
||
};
|
||
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||
if (cb == VK_NULL_HANDLE) goto fail;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->h264_idct8_pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
ctx->h264_idct8_pipe.layout, 0, 1,
|
||
&ctx->h264_idct8_pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
|
||
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
|
||
memcpy(dst, bd.mapped, dst_max);
|
||
memset(coeffs, 0, coeff_bytes);
|
||
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||
return 0;
|
||
fail:
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bc);
|
||
return -1;
|
||
}
|
||
|
||
/* -------------------- H.264 qpel mc20 QPU dispatch (cycle 9) --- */
|
||
|
||
/* Push-constant block for v3d_h264_qpel_mc20.spv: four 32-bit words. */
typedef struct {
    uint32_t n_blocks;   /* number of blocks in the dispatch */
    uint32_t stride_u8;  /* row stride, in bytes */
    uint32_t _pad0;      /* padding to 16 bytes */
    uint32_t _pad1;      /* padding to 16 bytes */
} h264_qpel_mc20_pc;
|
||
|
||
/* Run the mc20 qpel shader ("v3d_h264_qpel_mc20.spv") on the QPU for a
 * batch of blocks.
 *
 * ctx      - dispatch context; holds the runner and the lazily created
 *            mc20 pipeline.
 * dst      - destination plane; meta[i].dst_off is a byte offset into it.
 * src      - source plane; meta[i].src_off is a byte offset into it.
 * stride   - row pitch in bytes, used for both planes.
 * n_blocks - number of entries in meta.
 * meta     - per-block src/dst offsets.
 *
 * Returns 0 on success (an empty batch is a successful no-op) and -1 on
 * any failure.
 *
 * The smallest contiguous src/dst windows that cover every block are
 * staged through runner buffers:
 *   src: the filter reads cols (c-2)..(c+3) for c=0..7 across rows 0..7,
 *        so the highest byte read is src_off + 7*stride + 10 and the
 *        window length is src_off + 7*stride + 11.
 *   dst: cols 0..7 across rows 0..7 are written, so the window length
 *        is dst_off + 7*stride + 8.
 * Both windows start at byte 0 of the plane; the caller guarantees
 * src_off >= 2 so the cols -2 taps stay inside the src window (public
 * API contract). */
static int dispatch_h264_qpel_mc20_qpu(daedalus_ctx *ctx,
        uint8_t *dst, const uint8_t *src, size_t stride,
        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    /* Empty batch: nothing to filter.  Without this guard the code below
     * would request zero-length buffers and record a zero-group dispatch. */
    if (n_blocks == 0)
        return 0;

    /* n_blocks and stride reach the shader as 32-bit values and the meta
     * table is n_blocks * 16 bytes; refuse inputs that would be silently
     * truncated or overflow the size computation.
     * NOTE(review): the device's maxComputeWorkGroupCount may be lower
     * still; it is not queried here. */
    if (n_blocks > UINT32_MAX || stride > UINT32_MAX ||
        n_blocks > SIZE_MAX / (4 * sizeof(uint32_t)))
        return -1;

    v3d_pipeline *pipe = &ctx->h264_qpel_mc20_pipe;
    if (!ctx->h264_qpel_mc20_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc20.spv",
                                       3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
            return -1;
        ctx->h264_qpel_mc20_pipe_ready = 1;
    }

    const size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + (size_t) 7 * stride + 11;
        size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    int rc = -1;
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd))
        goto out_src;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm))
        goto out_dst;

    /* Stage inputs.  dst is uploaded too so that bytes inside the window
     * that no block writes survive the copy-back below. */
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);

    /* Four 32-bit words per block: dst offset, src offset, two zero pads. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3))
        goto out_all;

    h264_qpel_mc20_pc pc = {
        .n_blocks  = (uint32_t) n_blocks,
        .stride_u8 = (uint32_t) stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out_all;

    /* NOTE(review): if recording fails, cb is not handed back to the
     * runner here; confirm whether the runner reclaims it. */
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        goto out_all;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);   /* 1 block per WG */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        goto out_all;
    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto out_all;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_all:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
out_dst:
    v3d_runner_destroy_buffer(ctx->runner, &bd);
out_src:
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return rc;
}
|
||
|
||
/* Run the mc02 qpel shader ("v3d_h264_qpel_mc02.spv") on the QPU for a
 * batch of blocks.
 *
 * Same shape as the mc20 dispatch but with a vertical access pattern:
 * the bottom output row (r=7) reads up to row r+3 = 10 of the source,
 * so the src window length is src_off + 10*stride + 8 (last column + 1
 * for memcpy length semantics).  The dst window length is
 * dst_off + 7*stride + 8.
 *
 * ctx      - dispatch context; holds the runner and the lazily created
 *            mc02 pipeline.
 * dst/src  - destination / source planes addressed by the byte offsets
 *            in meta[i].dst_off / meta[i].src_off.
 * stride   - row pitch in bytes, used for both planes.
 * n_blocks - number of entries in meta.
 *
 * Returns 0 on success (an empty batch is a successful no-op) and -1 on
 * any failure. */
static int dispatch_h264_qpel_mc02_qpu(daedalus_ctx *ctx,
        uint8_t *dst, const uint8_t *src, size_t stride,
        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    /* Empty batch: avoid zero-length buffers and a zero-group dispatch. */
    if (n_blocks == 0)
        return 0;

    /* The shader receives n_blocks and stride as 32-bit values and the
     * meta table is n_blocks * 16 bytes; reject anything that would be
     * truncated or overflow.
     * NOTE(review): maxComputeWorkGroupCount is not checked here. */
    if (n_blocks > UINT32_MAX || stride > UINT32_MAX ||
        n_blocks > SIZE_MAX / (4 * sizeof(uint32_t)))
        return -1;

    v3d_pipeline *pipe = &ctx->h264_qpel_mc02_pipe;
    if (!ctx->h264_qpel_mc02_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc02.spv",
                                       3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
            return -1;
        ctx->h264_qpel_mc02_pipe_ready = 1;
    }

    const size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + (size_t) 10 * stride + 8;
        size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    int rc = -1;
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd))
        goto out_src;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm))
        goto out_dst;

    /* Stage inputs; dst is uploaded so untouched bytes of the window are
     * preserved by the copy-back. */
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);

    /* Four 32-bit words per block: dst offset, src offset, two zero pads. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3))
        goto out_all;

    h264_qpel_mc20_pc pc = {
        .n_blocks  = (uint32_t) n_blocks,
        .stride_u8 = (uint32_t) stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out_all;

    /* NOTE(review): on a recording failure cb is not returned to the
     * runner here; confirm whether the runner reclaims it. */
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        goto out_all;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        goto out_all;
    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto out_all;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_all:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
out_dst:
    v3d_runner_destroy_buffer(ctx->runner, &bd);
out_src:
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return rc;
}
|
||
|
||
/* Run the mc22 qpel shader ("v3d_h264_qpel_mc22.spv") on the QPU for a
 * batch of blocks.
 *
 * 2D HV cascade: the shader reads rows -2..+10 (13 rows of src) and
 * cols -2..+10 per row (8 output cols x taps c-2..c+3), so the src
 * window length is src_off + 10*stride + 11, i.e. the maximum of the
 * mc20 (7*stride + 11) and mc02 (10*stride + 8) envelopes.  The dst
 * window length is dst_off + 7*stride + 8.
 *
 * ctx      - dispatch context; holds the runner and the lazily created
 *            mc22 pipeline.
 * dst/src  - destination / source planes addressed by the byte offsets
 *            in meta[i].dst_off / meta[i].src_off.
 * stride   - row pitch in bytes, used for both planes.
 * n_blocks - number of entries in meta.
 *
 * Returns 0 on success (an empty batch is a successful no-op) and -1 on
 * any failure. */
static int dispatch_h264_qpel_mc22_qpu(daedalus_ctx *ctx,
        uint8_t *dst, const uint8_t *src, size_t stride,
        size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    /* Empty batch: avoid zero-length buffers and a zero-group dispatch. */
    if (n_blocks == 0)
        return 0;

    /* The shader receives n_blocks and stride as 32-bit values and the
     * meta table is n_blocks * 16 bytes; reject anything that would be
     * truncated or overflow.
     * NOTE(review): maxComputeWorkGroupCount is not checked here. */
    if (n_blocks > UINT32_MAX || stride > UINT32_MAX ||
        n_blocks > SIZE_MAX / (4 * sizeof(uint32_t)))
        return -1;

    v3d_pipeline *pipe = &ctx->h264_qpel_mc22_pipe;
    if (!ctx->h264_qpel_mc22_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc22.spv",
                                       3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
            return -1;
        ctx->h264_qpel_mc22_pipe_ready = 1;
    }

    const size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + (size_t) 10 * stride + 11;
        size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    int rc = -1;
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd))
        goto out_src;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm))
        goto out_dst;

    /* Stage inputs; dst is uploaded so untouched bytes of the window are
     * preserved by the copy-back. */
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);

    /* Four 32-bit words per block: dst offset, src offset, two zero pads. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3))
        goto out_all;

    h264_qpel_mc20_pc pc = {
        .n_blocks  = (uint32_t) n_blocks,
        .stride_u8 = (uint32_t) stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out_all;

    /* NOTE(review): on a recording failure cb is not returned to the
     * runner here; confirm whether the runner reclaims it. */
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        goto out_all;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        goto out_all;
    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto out_all;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out_all:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
out_dst:
    v3d_runner_destroy_buffer(ctx->runner, &bd);
out_src:
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return rc;
}
|
||
|
||
/* Generic QPU dispatch for the 4 single-axis quarter-pel shaders
|
||
* (mc10/30 horizontal, mc01/03 vertical). All 4 share the same WG
|
||
* geometry (64 lanes/block, 1 block/WG), push-constant layout, and
|
||
* 3-binding interface (src/dst/meta) as mc20/mc02. Only src_max
|
||
* differs by axis:
|
||
* H variants: src_max = src_off + 7*stride + 11 (cols -2..+10)
|
||
* V variants: src_max = src_off + 10*stride + 8 (rows -2..+10)
|
||
*/
|
||
static int dispatch_h264_qpel_axis_qpu(daedalus_ctx *ctx,
|
||
v3d_pipeline *pipe, int *pipe_ready, const char *spv,
|
||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta,
|
||
int axis_v)
|
||
{
|
||
if (!*pipe_ready) {
|
||
if (v3d_runner_create_pipeline(ctx->runner, spv,
|
||
3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
|
||
return -1;
|
||
*pipe_ready = 1;
|
||
}
|
||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||
size_t src_max = 0, dst_max = 0;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
size_t s_end = axis_v ? meta[i].src_off + (size_t) 10 * stride + 8
|
||
: meta[i].src_off + (size_t) 7 * stride + 11;
|
||
size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
|
||
if (s_end > src_max) src_max = s_end;
|
||
if (d_end > dst_max) dst_max = d_end;
|
||
}
|
||
v3d_buffer bs = {0}, bd = {0}, bm = {0};
|
||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
|
||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||
}
|
||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||
}
|
||
memcpy(bs.mapped, src, src_max);
|
||
memcpy(bd.mapped, dst, dst_max);
|
||
uint32_t *m = bm.mapped;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
m[4*i+0] = meta[i].dst_off;
|
||
m[4*i+1] = meta[i].src_off;
|
||
m[4*i+2] = 0;
|
||
m[4*i+3] = 0;
|
||
}
|
||
v3d_buffer binds[3] = { bs, bd, bm };
|
||
if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3)) goto fail;
|
||
h264_qpel_mc20_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||
.stride_u8 = (uint32_t) stride };
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||
if (cb == VK_NULL_HANDLE) goto fail;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
memcpy(dst, bd.mapped, dst_max);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||
return 0;
|
||
fail:
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||
return -1;
|
||
}
|
||
|
||
#define DEFINE_QPEL_AXIS_QPU(name, spv, axis_v) \
|
||
static int dispatch_h264_qpel_ ## name ## _qpu(daedalus_ctx *ctx, \
|
||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||
{ \
|
||
return dispatch_h264_qpel_axis_qpu(ctx, &ctx->h264_qpel_ ## name ## _pipe, \
|
||
&ctx->h264_qpel_ ## name ## _pipe_ready, spv, dst, src, stride, \
|
||
n_blocks, meta, axis_v); \
|
||
}
|
||
|
||
DEFINE_QPEL_AXIS_QPU(mc10, "v3d_h264_qpel_mc10.spv", 0)
|
||
DEFINE_QPEL_AXIS_QPU(mc30, "v3d_h264_qpel_mc30.spv", 0)
|
||
DEFINE_QPEL_AXIS_QPU(mc01, "v3d_h264_qpel_mc01.spv", 1)
|
||
DEFINE_QPEL_AXIS_QPU(mc03, "v3d_h264_qpel_mc03.spv", 1)
|
||
|
||
#undef DEFINE_QPEL_AXIS_QPU
|
||
|
||
/* Diagonals share the mc22-style src envelope (rows -2..+10, cols
|
||
* -2..+10) because they compose mc22 with mc20/mc02, sometimes
|
||
* with (r+1, c) or (r, c+1) offsets. */
|
||
static int dispatch_h264_qpel_diag_qpu(daedalus_ctx *ctx,
|
||
v3d_pipeline *pipe, int *pipe_ready, const char *spv,
|
||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||
{
|
||
if (!*pipe_ready) {
|
||
if (v3d_runner_create_pipeline(ctx->runner, spv,
|
||
3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
|
||
return -1;
|
||
*pipe_ready = 1;
|
||
}
|
||
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
|
||
size_t src_max = 0, dst_max = 0;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
size_t s_end = meta[i].src_off + (size_t) 10 * stride + 11;
|
||
size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
|
||
if (s_end > src_max) src_max = s_end;
|
||
if (d_end > dst_max) dst_max = d_end;
|
||
}
|
||
v3d_buffer bs = {0}, bd = {0}, bm = {0};
|
||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
|
||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||
}
|
||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
|
||
}
|
||
memcpy(bs.mapped, src, src_max);
|
||
memcpy(bd.mapped, dst, dst_max);
|
||
uint32_t *m = bm.mapped;
|
||
for (size_t i = 0; i < n_blocks; i++) {
|
||
m[4*i+0] = meta[i].dst_off;
|
||
m[4*i+1] = meta[i].src_off;
|
||
m[4*i+2] = 0;
|
||
m[4*i+3] = 0;
|
||
}
|
||
v3d_buffer binds[3] = { bs, bd, bm };
|
||
if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3)) goto fail;
|
||
h264_qpel_mc20_pc pc = { .n_blocks = (uint32_t) n_blocks,
|
||
.stride_u8 = (uint32_t) stride };
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||
if (cb == VK_NULL_HANDLE) goto fail;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||
memcpy(dst, bd.mapped, dst_max);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||
return 0;
|
||
fail:
|
||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||
return -1;
|
||
}
|
||
|
||
#define DEFINE_QPEL_DIAG_QPU(name) \
|
||
static int dispatch_h264_qpel_ ## name ## _qpu(daedalus_ctx *ctx, \
|
||
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||
{ \
|
||
return dispatch_h264_qpel_diag_qpu(ctx, &ctx->h264_qpel_ ## name ## _pipe, \
|
||
&ctx->h264_qpel_ ## name ## _pipe_ready, \
|
||
"v3d_h264_qpel_" #name ".spv", dst, src, stride, n_blocks, meta); \
|
||
}
|
||
|
||
DEFINE_QPEL_DIAG_QPU(mc11)
|
||
DEFINE_QPEL_DIAG_QPU(mc12)
|
||
DEFINE_QPEL_DIAG_QPU(mc13)
|
||
DEFINE_QPEL_DIAG_QPU(mc21)
|
||
DEFINE_QPEL_DIAG_QPU(mc23)
|
||
DEFINE_QPEL_DIAG_QPU(mc31)
|
||
DEFINE_QPEL_DIAG_QPU(mc32)
|
||
DEFINE_QPEL_DIAG_QPU(mc33)
|
||
|
||
/* avg_ variants — same diag-style envelope (10*stride+11 covers any
|
||
* (r±1, c±1) offset the avg_ shaders use), different SPV file.
|
||
* Slightly over-allocates for avg_mc20/02/10/30/01/03 (which need
|
||
* less src context) but the cost is negligible. */
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc20)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc02)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc22)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc10)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc30)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc01)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc03)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc11)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc12)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc13)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc21)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc23)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc31)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc32)
|
||
DEFINE_QPEL_DIAG_QPU(avg_mc33)
|
||
|
||
#undef DEFINE_QPEL_DIAG_QPU
|
||
|
||
/* -------------------- Public dispatch entry points -------------- */
|
||
|
||
/* Function-body helper for kernels that only have a CPU backend.
 * Expects `ctx` and `sub` to be in scope in the expanding function and
 * declares a local named `eff`.  AUTO is resolved through
 * daedalus_recipe_substrate_for(_kernel); a QPU choice on a context
 * without a QPU is treated as CPU.  A resolved CPU choice returns
 * _cpu_fn(ctx, ...); anything else returns -1.  The expansion ends
 * without a semicolon, so the use site supplies it. */
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
    daedalus_substrate eff = (sub == DAEDALUS_SUBSTRATE_AUTO) \
        ? daedalus_recipe_substrate_for(_kernel) : sub; \
    if (eff == DAEDALUS_SUBSTRATE_CPU || \
        (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))) \
        return _cpu_fn(ctx, __VA_ARGS__); \
    return -1 /* QPU path not yet wired for this kernel */
|
||
|
||
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||
uint8_t *dst, size_t dst_stride,
|
||
const int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_idct8_meta *meta)
|
||
{
|
||
daedalus_substrate eff = sub;
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||
return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||
return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_VP9_LPF4_INNER.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  Both backends are the
 * shared dispatch_lpf_* routines, called with 0 as their second
 * argument.  The backend's result is returned. */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu ? dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta)
                   : dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_VP9_LPF8_INNER.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  Both backends are the
 * shared dispatch_lpf_* routines, called with 1 as their second
 * argument.  The backend's result is returned. */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu ? dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta)
                   : dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_VP9_MC_8H.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_mc_8h_cpu(), every other value calls
 * dispatch_mc_8h_qpu().  Arguments are forwarded unchanged and the
 * backend's result is returned. */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
                                uint8_t *dst, size_t dst_stride,
                                const uint8_t *src, size_t src_stride,
                                size_t n_blocks, const daedalus_mc_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H)
            : sub;

    /* CPU requested outright, or QPU requested but not available. */
    if (chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)))
        return dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride,
                                  n_blocks, meta);

    return dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride,
                              n_blocks, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_AV1_CDEF_8X8.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_cdef_cpu(), every other value calls dispatch_cdef_qpu().
 * Arguments are forwarded unchanged and the backend's result is returned. */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               const uint16_t *tmp,
                               size_t n_blocks, const daedalus_cdef_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu ? dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta)
                   : dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
}
|
||
|
||
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
daedalus_substrate eff = sub;
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4);
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||
return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
|
||
coeffs, n_blocks, meta);
|
||
return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
|
||
coeffs, n_blocks, meta);
|
||
}
|
||
|
||
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
daedalus_substrate eff = sub;
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8);
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||
return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
|
||
coeffs, n_blocks, meta);
|
||
return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
|
||
coeffs, n_blocks, meta);
|
||
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_DEBLOCK_LV.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_deblock_cpu(), every other value calls
 * dispatch_h264_deblock_qpu().  The backend's result is returned. */
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu ? dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta)
                   : dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_DEBLOCK_LH.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_deblock_h_cpu(), every other value calls
 * dispatch_h264_deblock_h_qpu().  The backend's result is returned. */
int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu ? dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta)
                   : dispatch_h264_deblock_h_qpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_DEBLOCK_CV.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_deblock_chroma_v_cpu(), every other value calls
 * dispatch_h264_deblock_chroma_v_qpu().  The backend's result is
 * returned. */
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
            : sub;

    /* CPU requested outright, or QPU requested but not available. */
    if (chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)))
        return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);

    return dispatch_h264_deblock_chroma_v_qpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_DEBLOCK_CH.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_deblock_chroma_h_cpu(), every other value calls
 * dispatch_h264_deblock_chroma_h_qpu().  The backend's result is
 * returned. */
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate chosen =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
            : sub;

    /* CPU requested outright, or QPU requested but not available. */
    if (chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)))
        return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);

    return dispatch_h264_deblock_chroma_h_qpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||
|
||
#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
|
||
int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||
daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||
{ \
|
||
daedalus_substrate eff = sub; \
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||
eff = daedalus_recipe_substrate_for(kernel); \
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
|
||
return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \
|
||
}
|
||
|
||
DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
|
||
DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
|
||
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
|
||
DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
|
||
|
||
#undef DEFINE_INTRA_DISPATCH
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_QPEL_MC20.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_qpel_mc20_cpu(), every other value calls
 * dispatch_h264_qpel_mc20_qpu().  The backend's result is returned. */
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu
        ? dispatch_h264_qpel_mc20_cpu(ctx, dst, src, stride, n_blocks, meta)
        : dispatch_h264_qpel_mc20_qpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_QPEL_MC02.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_qpel_mc02_cpu(), every other value calls
 * dispatch_h264_qpel_mc02_qpu().  The backend's result is returned. */
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu
        ? dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta)
        : dispatch_h264_qpel_mc02_qpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||
|
||
/* Public dispatcher for DAEDALUS_KERNEL_H264_QPEL_MC22.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(); a QPU choice
 * falls back to CPU when the context has no QPU.  A resolved CPU choice
 * calls dispatch_h264_qpel_mc22_cpu(), every other value calls
 * dispatch_h264_qpel_mc22_qpu().  The backend's result is returned. */
int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate chosen = sub;
    if (chosen == DAEDALUS_SUBSTRATE_AUTO)
        chosen = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22);

    const int use_cpu =
        chosen == DAEDALUS_SUBSTRATE_CPU ||
        (chosen == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx));

    return use_cpu
        ? dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta)
        : dispatch_h264_qpel_mc22_qpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||
|
||
/* Stamp out a public daedalus_dispatch_h264_qpel_<suffix>() entry point
 * for a qpel position that only has a CPU backend.
 *
 * AUTO is resolved through daedalus_recipe_substrate_for(kernel).  A QPU
 * choice on a context that does have a QPU returns -1; every other
 * case, including QPU requested on a context without one, runs
 * dispatch_h264_qpel_<suffix>_cpu() and returns its result. */
#define DEFINE_QPEL_DISPATCH(suffix, kernel) \
int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \
        daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
        size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
{ \
    const daedalus_substrate chosen = (sub == DAEDALUS_SUBSTRATE_AUTO) \
        ? daedalus_recipe_substrate_for(kernel) : sub; \
    if (chosen == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx)) \
        return -1; \
    return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \
                                                 n_blocks, meta); \
}
|
||
|
||
/* mc10/30/01/03 now have QPU shaders — explicit definitions instead of
|
||
* the no-QPU DEFINE_QPEL_DISPATCH macro. Same routing shape as mc20/02. */
|
||
#define DEFINE_QPEL_DISPATCH_QPU(suffix, kernel) \
|
||
int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \
|
||
daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||
{ \
|
||
daedalus_substrate eff = sub; \
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||
eff = daedalus_recipe_substrate_for(kernel); \
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||
if (eff == DAEDALUS_SUBSTRATE_CPU) \
|
||
return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \
|
||
n_blocks, meta); \
|
||
return dispatch_h264_qpel_ ## suffix ## _qpu(ctx, dst, src, stride, \
|
||
n_blocks, meta); \
|
||
}
|
||
|
||
DEFINE_QPEL_DISPATCH_QPU(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
|
||
DEFINE_QPEL_DISPATCH_QPU(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
|
||
DEFINE_QPEL_DISPATCH_QPU(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
|
||
DEFINE_QPEL_DISPATCH_QPU(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
|
||
#undef DEFINE_QPEL_DISPATCH_QPU
|
||
/* mc11..mc33 diagonals — QPU-capable, same macro shape as mc10/30/01/03. */
|
||
#define DEFINE_QPEL_DIAG_PUBLIC(suffix, kernel) \
|
||
int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \
|
||
daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||
{ \
|
||
daedalus_substrate eff = sub; \
|
||
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||
eff = daedalus_recipe_substrate_for(kernel); \
|
||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||
if (eff == DAEDALUS_SUBSTRATE_CPU) \
|
||
return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \
|
||
n_blocks, meta); \
|
||
return dispatch_h264_qpel_ ## suffix ## _qpu(ctx, dst, src, stride, \
|
||
n_blocks, meta); \
|
||
}
|
||
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
|
||
DEFINE_QPEL_DIAG_PUBLIC(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
|
||
|
||
/* avg_ biprediction dispatchers (15 positions) — same macro, the
|
||
* underlying _qpu dispatch fns also reuse the diag QPU helper since
|
||
* the avg_ shaders share the put_ src envelope (the L2 step only
|
||
* touches dst). */
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||
DEFINE_QPEL_DIAG_PUBLIC(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||
#undef DEFINE_QPEL_DIAG_PUBLIC
|
||
|
||
#undef DEFINE_QPEL_DISPATCH
|
||
|
||
/* -------------------- Recipe convenience wrappers --------------- */
|
||
|
||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
const int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_idct8_meta *meta)
|
||
{
|
||
return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||
dst, dst_stride, coeffs, n_blocks, meta);
|
||
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_vp9_lpf4().
 *
 * Supplies DAEDALUS_SUBSTRATE_AUTO as the substrate and passes dst,
 * dst_stride, n_edges and meta through unchanged.  Returns the
 * dispatcher's result.
 */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges,
                                      const daedalus_lpf_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf4(ctx, substrate, dst, dst_stride,
                                      n_edges, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_vp9_lpf8().
 *
 * Supplies DAEDALUS_SUBSTRATE_AUTO as the substrate and passes dst,
 * dst_stride, n_edges and meta through unchanged.  Returns the
 * dispatcher's result.
 */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges,
                                      const daedalus_lpf_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf8(ctx, substrate, dst, dst_stride,
                                      n_edges, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_vp9_mc_8h().
 *
 * Fixes the substrate argument to DAEDALUS_SUBSTRATE_AUTO; the
 * destination/source pointers, their strides, the block count and the
 * metadata pointer are forwarded verbatim.  Returns the dispatcher's
 * result.
 */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const uint8_t *src, size_t src_stride,
                                       size_t n_blocks,
                                       const daedalus_mc_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_mc_8h(ctx, substrate, dst, dst_stride,
                                       src, src_stride, n_blocks, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_cdef_8x8().
 *
 * Fixes the substrate argument to DAEDALUS_SUBSTRATE_AUTO and forwards
 * dst, dst_stride, tmp, n_blocks and meta unchanged.  Returns the
 * dispatcher's result.
 */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      const uint16_t *tmp,
                                      size_t n_blocks,
                                      const daedalus_cdef_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_cdef_8x8(ctx, substrate, dst, dst_stride,
                                      tmp, n_blocks, meta);
}
|
||
|
||
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||
dst, dst_stride, coeffs, n_blocks, meta);
|
||
}
|
||
|
||
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
|
||
uint8_t *dst, size_t dst_stride,
|
||
int16_t *coeffs, size_t n_blocks,
|
||
const daedalus_h264_block_meta *meta)
|
||
{
|
||
return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||
dst, dst_stride, coeffs, n_blocks, meta);
|
||
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_h264_deblock_luma_v().
 *
 * Passes DAEDALUS_SUBSTRATE_AUTO as the substrate; dst, dst_stride,
 * n_edges and meta go through unchanged.  Returns the dispatcher's
 * result.
 */
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges,
        const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_luma_v(ctx, substrate,
                                                 dst, dst_stride,
                                                 n_edges, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_h264_deblock_luma_h().
 *
 * Passes DAEDALUS_SUBSTRATE_AUTO as the substrate; dst, dst_stride,
 * n_edges and meta go through unchanged.  Returns the dispatcher's
 * result.
 */
int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges,
        const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_luma_h(ctx, substrate,
                                                 dst, dst_stride,
                                                 n_edges, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_h264_deblock_chroma_v().
 *
 * Passes DAEDALUS_SUBSTRATE_AUTO as the substrate; dst, dst_stride,
 * n_edges and meta go through unchanged.  Returns the dispatcher's
 * result.
 */
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges,
        const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_chroma_v(ctx, substrate,
                                                   dst, dst_stride,
                                                   n_edges, meta);
}
|
||
|
||
/*
 * Recipe wrapper around daedalus_dispatch_h264_deblock_chroma_h().
 *
 * Passes DAEDALUS_SUBSTRATE_AUTO as the substrate; dst, dst_stride,
 * n_edges and meta go through unchanged.  Returns the dispatcher's
 * result.
 */
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges,
        const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate substrate = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_chroma_h(ctx, substrate,
                                                   dst, dst_stride,
                                                   n_edges, meta);
}
|
||
|
||
#define DEFINE_INTRA_RECIPE(name) \
|
||
int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||
uint8_t *dst, size_t dst_stride, \
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||
{ \
|
||
return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
|
||
dst, dst_stride, n_edges, meta); \
|
||
}
|
||
|
||
DEFINE_INTRA_RECIPE(luma_v_intra)
|
||
DEFINE_INTRA_RECIPE(luma_h_intra)
|
||
DEFINE_INTRA_RECIPE(chroma_v_intra)
|
||
DEFINE_INTRA_RECIPE(chroma_h_intra)
|
||
|
||
#undef DEFINE_INTRA_RECIPE
|
||
|
||
/*
 * Recipe wrappers for the H.264 qpel dispatchers (put_ and avg_, 15
 * positions each).
 *
 * DEFINE_QPEL_RECIPE(suffix) expands to the definition of
 *   int daedalus_recipe_dispatch_h264_qpel_<suffix>(ctx, dst, src, stride,
 *                                                   n_blocks, meta);
 * which calls daedalus_dispatch_h264_qpel_<suffix>() with
 * DAEDALUS_SUBSTRATE_AUTO and otherwise identical arguments, returning
 * that call's result.
 *
 * mc20/mc02/mc22 were previously written out by hand with exactly this
 * body; they are generated by the macro now so that all 30 wrappers come
 * from a single definition.
 */
#define DEFINE_QPEL_RECIPE(suffix)                                            \
int daedalus_recipe_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx,          \
        uint8_t *dst, const uint8_t *src, size_t stride,                      \
        size_t n_blocks, const daedalus_h264_qpel_meta *meta)                 \
{                                                                             \
    return daedalus_dispatch_h264_qpel_ ## suffix(ctx,                        \
                                                  DAEDALUS_SUBSTRATE_AUTO,    \
                                                  dst, src, stride,           \
                                                  n_blocks, meta);            \
}

/* put_ positions. */
DEFINE_QPEL_RECIPE(mc20)
DEFINE_QPEL_RECIPE(mc02)
DEFINE_QPEL_RECIPE(mc22)
DEFINE_QPEL_RECIPE(mc10)
DEFINE_QPEL_RECIPE(mc30)
DEFINE_QPEL_RECIPE(mc01)
DEFINE_QPEL_RECIPE(mc03)
DEFINE_QPEL_RECIPE(mc11)
DEFINE_QPEL_RECIPE(mc12)
DEFINE_QPEL_RECIPE(mc13)
DEFINE_QPEL_RECIPE(mc21)
DEFINE_QPEL_RECIPE(mc23)
DEFINE_QPEL_RECIPE(mc31)
DEFINE_QPEL_RECIPE(mc32)
DEFINE_QPEL_RECIPE(mc33)

/* avg_ positions. */
DEFINE_QPEL_RECIPE(avg_mc20)
DEFINE_QPEL_RECIPE(avg_mc02)
DEFINE_QPEL_RECIPE(avg_mc22)
DEFINE_QPEL_RECIPE(avg_mc10)
DEFINE_QPEL_RECIPE(avg_mc30)
DEFINE_QPEL_RECIPE(avg_mc01)
DEFINE_QPEL_RECIPE(avg_mc03)
DEFINE_QPEL_RECIPE(avg_mc11)
DEFINE_QPEL_RECIPE(avg_mc12)
DEFINE_QPEL_RECIPE(avg_mc13)
DEFINE_QPEL_RECIPE(avg_mc21)
DEFINE_QPEL_RECIPE(avg_mc23)
DEFINE_QPEL_RECIPE(avg_mc31)
DEFINE_QPEL_RECIPE(avg_mc32)
DEFINE_QPEL_RECIPE(avg_mc33)

#undef DEFINE_QPEL_RECIPE
|