Files
daedalus-fourier/src/daedalus_core.c
T
claude-noether 2079fe39c6 h264: V3D shaders for all 15 avg_ qpel positions — qpel QPU complete
Generates 15 avg_ shader variants by templating from the existing
put_ shaders.  Each avg_ shader is identical to its put_ sibling
except the final write does an L2 average with the existing dst:

  put_:  dst[r,c] = result
  avg_:  dst[r,c] = (dst[r,c] + result + 1) >> 1

Per H.264 §8.4.2.3.1 (B-slice biprediction): caller pre-loads dst
with the list0 prediction; the avg_ call folds in list1.

Generated via python (avg-shader-gen.py): reads each
v3d_h264_qpel_mcXY.comp, transforms the docstring header + final
write hunk, writes v3d_h264_qpel_avg_mcXY.comp.  ~88 lines each;
15 new shader files.

Dispatch reuses the existing dispatch_h264_qpel_diag_qpu helper for
all 15 — same src envelope (10*stride+11 covers any (r±1, c±1)
shift); the L2 step only touches dst.  This slightly over-allocates
for the simpler positions (avg_mc20/02/10/30/01/03), but the cost is
negligible, and it avoids 15 wrappers + 15 src_max bound calculations
that would otherwise be duplicated.

CMake foreach loops compile + install 15 new SPV files.  ctx grows
15 pipeline pairs.  Recipe table flips DAEDALUS_KERNEL_H264_QPEL_AVG_*
from CPU to QPU.  Public dispatchers re-defined via the existing
DEFINE_QPEL_DIAG_PUBLIC macro (replaces the CPU-only
DEFINE_QPEL_DISPATCH instantiations).

Verified on hertz:

  $ ./build/test_api_h264 | grep "qpel avg" | wc -l
  15
  $ ./build/test_api_h264 | grep "qpel avg" | grep -c "100.0000%"
  15

All 15 PASS 2048/2048 bytes bit-exact via QPU.

QPU coverage for the H.264 8-bit 4:2:0 hot-path pixel kernels:

  Layer                Coverage
  ─────────────────────────────────────────────────────────────
  IDCT 4x4 luma        ✓ cycle 6 (one QPU shader, also handles chroma)
  IDCT 8x8 luma        ✓ cycle 7
  Chroma DC Hadamard   CPU only (4 adds + 4 subs; not worth a QPU dispatch)
  Deblock luma_v       ✓ cycle 8
  Deblock luma_h       ✓ PR #28
  Deblock chroma_v/h   ✓ PR #29
  Deblock *_intra      CPU only (less common, structurally different)
  qpel put_ 15 pos     ✓ cycle 9 (mc20) + PRs #30-#33
  qpel avg_ 15 pos     ✓ THIS PR

The H.264 non-intra-deblock hot path is now FULLY on QPU for any
consumer that initialises daedalus with a QPU-capable context.
2026-05-25 20:22:33 +02:00

2393 lines
107 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
*
* Wraps cycles 1-5 kernels behind the public C API in
* include/daedalus.h. Recipe dispatch routes per-kernel to the
* verdict substrate from each cycle's Phase 7 doc.
*
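* QPU dispatch wiring status:
* The per-kernel table in daedalus_recipe_substrate_for() is the
* authoritative list of kernels routed to the QPU.
* The CPU (NEON) path always works and is used for every other
* kernel and for any ctx created without a QPU runner.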
*
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
* dav1d BSD-2-Clause NEON snapshots.
*/
#include "../include/daedalus.h"
#include "v3d_runner.h"
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>
/* -------------------- Context -------------------- */
/*
 * Library context.
 *
 * Holds the optional V3D runner plus one (ready-flag, pipeline) pair
 * per QPU kernel.  Both constructors calloc() the context, so every
 * *_pipe_ready flag starts at 0; daedalus_ctx_destroy() consults the
 * same flags to decide which pipelines need tearing down.
 */
struct daedalus_ctx {
    int         has_qpu;    /* non-zero iff runner != NULL */
    v3d_runner *runner;     /* NULL when has_qpu == 0 */

    /* Per-kernel pipelines, lazy-created on first QPU dispatch. */

    /* VP9 / AV1 */
    int idct8_pipe_ready;                v3d_pipeline idct8_pipe;
    int lpf4_pipe_ready;                 v3d_pipeline lpf4_pipe;
    int lpf8_pipe_ready;                 v3d_pipeline lpf8_pipe;
    int mc8h_pipe_ready;                 v3d_pipeline mc8h_pipe;
    int cdef_pipe_ready;                 v3d_pipeline cdef_pipe;

    /* H.264 deblock */
    int h264deblock_pipe_ready;          v3d_pipeline h264deblock_pipe;
    int h264deblock_h_pipe_ready;        v3d_pipeline h264deblock_h_pipe;
    int h264deblock_chroma_v_pipe_ready; v3d_pipeline h264deblock_chroma_v_pipe;
    int h264deblock_chroma_h_pipe_ready; v3d_pipeline h264deblock_chroma_h_pipe;

    /* H.264 IDCT */
    int h264_idct4_pipe_ready;           v3d_pipeline h264_idct4_pipe;
    int h264_idct8_pipe_ready;           v3d_pipeline h264_idct8_pipe;

    /* H.264 qpel put_ */
    int h264_qpel_mc20_pipe_ready;       v3d_pipeline h264_qpel_mc20_pipe;
    int h264_qpel_mc02_pipe_ready;       v3d_pipeline h264_qpel_mc02_pipe;
    int h264_qpel_mc22_pipe_ready;       v3d_pipeline h264_qpel_mc22_pipe;
    int h264_qpel_mc10_pipe_ready;       v3d_pipeline h264_qpel_mc10_pipe;
    int h264_qpel_mc30_pipe_ready;       v3d_pipeline h264_qpel_mc30_pipe;
    int h264_qpel_mc01_pipe_ready;       v3d_pipeline h264_qpel_mc01_pipe;
    int h264_qpel_mc03_pipe_ready;       v3d_pipeline h264_qpel_mc03_pipe;
    int h264_qpel_mc11_pipe_ready;       v3d_pipeline h264_qpel_mc11_pipe;
    int h264_qpel_mc12_pipe_ready;       v3d_pipeline h264_qpel_mc12_pipe;
    int h264_qpel_mc13_pipe_ready;       v3d_pipeline h264_qpel_mc13_pipe;
    int h264_qpel_mc21_pipe_ready;       v3d_pipeline h264_qpel_mc21_pipe;
    int h264_qpel_mc23_pipe_ready;       v3d_pipeline h264_qpel_mc23_pipe;
    int h264_qpel_mc31_pipe_ready;       v3d_pipeline h264_qpel_mc31_pipe;
    int h264_qpel_mc32_pipe_ready;       v3d_pipeline h264_qpel_mc32_pipe;
    int h264_qpel_mc33_pipe_ready;       v3d_pipeline h264_qpel_mc33_pipe;

    /* H.264 qpel avg_ (biprediction): one pipeline per avg_ shader. */
    int h264_qpel_avg_mc20_pipe_ready;   v3d_pipeline h264_qpel_avg_mc20_pipe;
    int h264_qpel_avg_mc02_pipe_ready;   v3d_pipeline h264_qpel_avg_mc02_pipe;
    int h264_qpel_avg_mc22_pipe_ready;   v3d_pipeline h264_qpel_avg_mc22_pipe;
    int h264_qpel_avg_mc10_pipe_ready;   v3d_pipeline h264_qpel_avg_mc10_pipe;
    int h264_qpel_avg_mc30_pipe_ready;   v3d_pipeline h264_qpel_avg_mc30_pipe;
    int h264_qpel_avg_mc01_pipe_ready;   v3d_pipeline h264_qpel_avg_mc01_pipe;
    int h264_qpel_avg_mc03_pipe_ready;   v3d_pipeline h264_qpel_avg_mc03_pipe;
    int h264_qpel_avg_mc11_pipe_ready;   v3d_pipeline h264_qpel_avg_mc11_pipe;
    int h264_qpel_avg_mc12_pipe_ready;   v3d_pipeline h264_qpel_avg_mc12_pipe;
    int h264_qpel_avg_mc13_pipe_ready;   v3d_pipeline h264_qpel_avg_mc13_pipe;
    int h264_qpel_avg_mc21_pipe_ready;   v3d_pipeline h264_qpel_avg_mc21_pipe;
    int h264_qpel_avg_mc23_pipe_ready;   v3d_pipeline h264_qpel_avg_mc23_pipe;
    int h264_qpel_avg_mc31_pipe_ready;   v3d_pipeline h264_qpel_avg_mc31_pipe;
    int h264_qpel_avg_mc32_pipe_ready;   v3d_pipeline h264_qpel_avg_mc32_pipe;
    int h264_qpel_avg_mc33_pipe_ready;   v3d_pipeline h264_qpel_avg_mc33_pipe;
};
/*
 * Allocate a zero-initialised context and try to bring up the V3D
 * runner.  A failed runner creation is not an error: the context is
 * still returned, with has_qpu == 0 and runner == NULL.
 *
 * Returns NULL only when the context allocation itself fails.
 */
daedalus_ctx *daedalus_ctx_create(void)
{
    daedalus_ctx *self = calloc(1, sizeof *self);

    if (self != NULL) {
        self->runner  = v3d_runner_create();
        self->has_qpu = self->runner ? 1 : 0;
    }
    return self;
}
/*
 * Create a CPU-only (NEON) context, unless the environment opts in
 * to QPU.
 *
 * Per the "QPU is default substrate" decree 2026-05-23: when
 * DAEDALUS_FORCE_QPU is set to exactly "1", this call is escalated
 * to daedalus_ctx_create().  That lets the libavcodec substitution
 * shims (which call create_no_qpu via pthread_once) fire the V3D
 * shaders that exist for cycles 1/2/4/5/8 without every consumer
 * process (firefox, mpv, daemon) needing its own shim build.
 *
 * With the variable unset or holding any other value the behaviour
 * is unchanged: a pure NEON ctx with no implicit Vulkan init.
 * Firefox / mpv consumers that dlopen libavcodec without opting in
 * stay on the Vulkan-free path; the daemon explicitly sets
 * DAEDALUS_FORCE_QPU=1 before loading libavcodec.
 *
 * Returns NULL when the context allocation fails.
 */
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
{
    const char *env = getenv("DAEDALUS_FORCE_QPU");
    const int escalate = (env != NULL) && (strcmp(env, "1") == 0);

    if (escalate)
        return daedalus_ctx_create();

    daedalus_ctx *self = calloc(1, sizeof *self);
    if (self == NULL)
        return NULL;

    /* calloc already zeroed these; spelled out to state the intent. */
    self->runner  = NULL;
    self->has_qpu = 0;
    return self;
}
/* Report the context's has_qpu flag; a NULL context reports 0. */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
{
    if (ctx == NULL)
        return 0;
    return ctx->has_qpu;
}
/*
 * Tear down a context.  NULL is accepted and ignored.
 *
 * When a runner exists, every pipeline whose *_pipe_ready flag is
 * set is destroyed first, then the runner itself; finally the
 * context memory is freed.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
/* Destroy ctx-><name>_pipe if its ready flag is set. */
#define DAEDALUS_DROP_PIPE(name)                                           \
    do {                                                                   \
        if (ctx->name ## _pipe_ready)                                      \
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->name ## _pipe); \
    } while (0)

    if (ctx == NULL)
        return;

    if (ctx->runner != NULL) {
        /* VP9 / AV1 */
        DAEDALUS_DROP_PIPE(idct8);
        DAEDALUS_DROP_PIPE(lpf4);
        DAEDALUS_DROP_PIPE(lpf8);
        DAEDALUS_DROP_PIPE(mc8h);
        DAEDALUS_DROP_PIPE(cdef);

        /* H.264 deblock + IDCT */
        DAEDALUS_DROP_PIPE(h264deblock);
        DAEDALUS_DROP_PIPE(h264deblock_h);
        DAEDALUS_DROP_PIPE(h264deblock_chroma_v);
        DAEDALUS_DROP_PIPE(h264deblock_chroma_h);
        DAEDALUS_DROP_PIPE(h264_idct4);
        DAEDALUS_DROP_PIPE(h264_idct8);

        /* H.264 qpel put_ */
        DAEDALUS_DROP_PIPE(h264_qpel_mc20);
        DAEDALUS_DROP_PIPE(h264_qpel_mc02);
        DAEDALUS_DROP_PIPE(h264_qpel_mc22);
        DAEDALUS_DROP_PIPE(h264_qpel_mc10);
        DAEDALUS_DROP_PIPE(h264_qpel_mc30);
        DAEDALUS_DROP_PIPE(h264_qpel_mc01);
        DAEDALUS_DROP_PIPE(h264_qpel_mc03);
        DAEDALUS_DROP_PIPE(h264_qpel_mc11);
        DAEDALUS_DROP_PIPE(h264_qpel_mc12);
        DAEDALUS_DROP_PIPE(h264_qpel_mc13);
        DAEDALUS_DROP_PIPE(h264_qpel_mc21);
        DAEDALUS_DROP_PIPE(h264_qpel_mc23);
        DAEDALUS_DROP_PIPE(h264_qpel_mc31);
        DAEDALUS_DROP_PIPE(h264_qpel_mc32);
        DAEDALUS_DROP_PIPE(h264_qpel_mc33);

        /* H.264 qpel avg_ */
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc20);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc02);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc22);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc10);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc30);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc01);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc03);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc11);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc12);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc13);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc21);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc23);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc31);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc32);
        DAEDALUS_DROP_PIPE(h264_qpel_avg_mc33);

        /* Pipelines first, runner last. */
        v3d_runner_destroy(ctx->runner);
    }

    free(ctx);
#undef DAEDALUS_DROP_PIPE
}
/* -------------------- Recipe query -------------------- */
/*
 * Recipe table per the "QPU is default substrate" decree 2026-05-23.
 *
 * Every kernel that has a V3D compute shader maps to SUBSTRATE_QPU;
 * the remaining kernels (currently the four bS=4 intra deblock
 * variants) map to SUBSTRATE_CPU.  Values not listed in the switch
 * fall through to the trailing CPU return.
 *
 * This is only the preferred substrate: the dispatch wrappers already
 * fall back to CPU automatically when the ctx doesn't have QPU
 * available (daedalus_ctx_has_qpu == 0).
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    switch (k) {
    /* ---- CPU only: bS=4 luma / chroma intra deblock, QPU pending ---- */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA:
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA:
        return DAEDALUS_SUBSTRATE_CPU;

    /* ---- QPU: VP9 / AV1 ---- */
    case DAEDALUS_KERNEL_VP9_IDCT8:
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:
    case DAEDALUS_KERNEL_VP9_MC_8H:             /* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:          /* v3d_cdef.spv */

    /* ---- QPU: H.264 IDCT + non-intra deblock ---- */
    case DAEDALUS_KERNEL_H264_IDCT4:            /* v3d_h264_idct4.spv */
    case DAEDALUS_KERNEL_H264_IDCT8:            /* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:       /* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:       /* v3d_h264deblock_h.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:       /* v3d_h264deblock_chroma_v.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:       /* v3d_h264deblock_chroma_h.spv */

    /* ---- QPU: H.264 qpel put_, v3d_h264_qpel_mcXY.spv ---- */
    case DAEDALUS_KERNEL_H264_QPEL_MC20:
    case DAEDALUS_KERNEL_H264_QPEL_MC02:
    case DAEDALUS_KERNEL_H264_QPEL_MC22:
    case DAEDALUS_KERNEL_H264_QPEL_MC10:
    case DAEDALUS_KERNEL_H264_QPEL_MC30:
    case DAEDALUS_KERNEL_H264_QPEL_MC01:
    case DAEDALUS_KERNEL_H264_QPEL_MC03:
    case DAEDALUS_KERNEL_H264_QPEL_MC11:
    case DAEDALUS_KERNEL_H264_QPEL_MC12:
    case DAEDALUS_KERNEL_H264_QPEL_MC13:
    case DAEDALUS_KERNEL_H264_QPEL_MC21:
    case DAEDALUS_KERNEL_H264_QPEL_MC23:
    case DAEDALUS_KERNEL_H264_QPEL_MC31:
    case DAEDALUS_KERNEL_H264_QPEL_MC32:
    case DAEDALUS_KERNEL_H264_QPEL_MC33:

    /* ---- QPU: H.264 qpel avg_, v3d_h264_qpel_avg_mcXY.spv ---- */
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32:
    case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33:
        return DAEDALUS_SUBSTRATE_QPU;
    }

    return DAEDALUS_SUBSTRATE_CPU;
}
/* -------------------- NEON externs (per cycle bench links) ----- */
/* VP9 kernels (ff_ prefix: vendored FFmpeg NEON snapshot). */
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int my);
/* AV1 CDEF (dav1d_ prefix: vendored dav1d NEON snapshot). */
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h,
size_t edges);
/* H.264 IDCT.  Note the non-const block pointer. */
extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
/* H.264 deblock, tc0-driven variants.  tc0 is a non-const pointer. */
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
/* H.264 deblock, intra variants: no tc0 argument. */
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
/* H.264 qpel put_, 15 positions.  One stride argument covers both
 * dst and src. */
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* H.264 qpel avg_, 15 positions; same signature as the put_ entries. */
extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* -------------------- CPU dispatch implementations -------------- */
/*
 * CPU path for VP9 IDCT 8x8: one NEON call per block.
 *
 * coeffs holds n_blocks consecutive groups of 64 int16 coefficients;
 * meta[i].dst_off is the byte offset of block i inside dst.  The NEON
 * routine takes a non-const block pointer while our input is const,
 * so each block is handed over through a private stack copy.  eob is
 * always passed as 64.
 *
 * Always returns 0.
 */
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    enum { COEFFS_PER_BLOCK = 64 };
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    int16_t work[COEFFS_PER_BLOCK];

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        const int16_t *in = coeffs + blk * COEFFS_PER_BLOCK;

        memcpy(work, in, sizeof work);
        ff_vp9_idct_idct_8x8_add_neon(dst + meta[blk].dst_off, pitch,
                                      work, COEFFS_PER_BLOCK);
    }
    return 0;
}
/*
 * CPU path for the VP9 loop filters.  wd_8 selects the NEON entry:
 * non-zero -> ff_vp9_loop_filter_h_8_8_neon, zero ->
 * ff_vp9_loop_filter_h_4_8_neon.  Each edge is filtered at
 * dst + meta[i].dst_off with that edge's E / I / H values.
 *
 * Always returns 0.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Both entries share a signature, so choose once up front. */
    void (*const filter)(uint8_t *, ptrdiff_t, int, int, int) =
        wd_8 ? ff_vp9_loop_filter_h_8_8_neon
             : ff_vp9_loop_filter_h_4_8_neon;
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge != n_edges; ++edge) {
        filter(dst + meta[edge].dst_off, pitch,
               meta[edge].E, meta[edge].I, meta[edge].H);
    }
    return 0;
}
/*
 * CPU path for VP9 MC (regular filter, horizontal): one NEON call
 * per block with h = 8, mx = meta[i].mx and my = 0.
 *
 * The source pointer handed to NEON is src + src_off + 3.
 * NOTE(review): presumably src_off addresses the leftmost tap of the
 * 8-tap window while the NEON entry expects the pointer at the
 * output column -- confirm against daedalus.h.
 *
 * Always returns 0.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    const ptrdiff_t dst_pitch = (ptrdiff_t) dst_stride;
    const ptrdiff_t src_pitch = (ptrdiff_t) src_stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        uint8_t       *out = dst + meta[blk].dst_off;
        const uint8_t *in  = src + meta[blk].src_off + 3;

        ff_vp9_put_regular8_h_neon(out, dst_pitch, in, src_pitch,
                                   8, meta[blk].mx, 0);
    }
    return 0;
}
/*
 * CPU path for AV1 CDEF 8x8: one dav1d NEON call per block.
 *
 * tmp is indexed in uint16 units via meta[i].tmp_off_u16; dst is
 * indexed in bytes via meta[i].dst_off.  Height is fixed at 8 and
 * the edges argument is always 0.
 *
 * Always returns 0.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        uint8_t        *out = dst + meta[blk].dst_off;
        const uint16_t *in  = tmp + meta[blk].tmp_off_u16;

        dav1d_cdef_filter8_8bpc_neon(out, pitch, in,
                                     meta[blk].pri_strength,
                                     meta[blk].sec_strength,
                                     meta[blk].dir, meta[blk].damping,
                                     8, 0);
    }
    return 0;
}
/*
 * CPU path for H.264 IDCT 4x4: one NEON call per block.
 *
 * Block i reads its 16 coefficients from coeffs + i * 16 and adds
 * into dst + meta[i].dst_off.  coeffs is passed straight through to
 * NEON as a non-const pointer (no scratch copy is made).
 *
 * Always returns 0.
 */
static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    enum { COEFFS_PER_BLOCK = 16 };
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        ff_h264_idct_add_neon(dst + meta[blk].dst_off,
                              coeffs + blk * COEFFS_PER_BLOCK,
                              pitch);
    }
    return 0;
}
/*
 * CPU path for H.264 IDCT 8x8: one NEON call per block.
 *
 * Block i reads its 64 coefficients from coeffs + i * 64 and adds
 * into dst + meta[i].dst_off.  coeffs is passed straight through to
 * NEON as a non-const pointer (no scratch copy is made).
 *
 * Always returns 0.
 */
static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    enum { COEFFS_PER_BLOCK = 64 };
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        ff_h264_idct8_add_neon(dst + meta[blk].dst_off,
                               coeffs + blk * COEFFS_PER_BLOCK,
                               pitch);
    }
    return 0;
}
/*
 * CPU path for H.264 luma deblock via
 * ff_h264_v_loop_filter_luma_neon, one call per edge.
 *
 * The NEON entry wants a mutable tc0 pointer but meta is const, so
 * the four tc0 values are copied to a local array for each edge.
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge != n_edges; ++edge) {
        int8_t tc0_copy[4];

        for (int k = 0; k < 4; ++k)
            tc0_copy[k] = meta[edge].tc0[k];

        ff_h264_v_loop_filter_luma_neon(dst + meta[edge].dst_off, pitch,
                                        meta[edge].alpha, meta[edge].beta,
                                        tc0_copy);
    }
    return 0;
}
/*
 * CPU path for H.264 luma deblock via
 * ff_h264_h_loop_filter_luma_neon, one call per edge.  tc0 is copied
 * to a local array because the NEON entry takes a non-const pointer.
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge != n_edges; ++edge) {
        int8_t tc0_copy[4];

        for (int k = 0; k < 4; ++k)
            tc0_copy[k] = meta[edge].tc0[k];

        ff_h264_h_loop_filter_luma_neon(dst + meta[edge].dst_off, pitch,
                                        meta[edge].alpha, meta[edge].beta,
                                        tc0_copy);
    }
    return 0;
}
/*
 * CPU path for H.264 chroma deblock via
 * ff_h264_v_loop_filter_chroma_neon, one call per edge.  tc0 is
 * copied to a local array because the NEON entry takes a non-const
 * pointer.
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge != n_edges; ++edge) {
        int8_t tc0_copy[4];

        for (int k = 0; k < 4; ++k)
            tc0_copy[k] = meta[edge].tc0[k];

        ff_h264_v_loop_filter_chroma_neon(dst + meta[edge].dst_off, pitch,
                                          meta[edge].alpha, meta[edge].beta,
                                          tc0_copy);
    }
    return 0;
}
/*
 * CPU path for H.264 chroma deblock via
 * ff_h264_h_loop_filter_chroma_neon, one call per edge.  tc0 is
 * copied to a local array because the NEON entry takes a non-const
 * pointer.
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge != n_edges; ++edge) {
        int8_t tc0_copy[4];

        for (int k = 0; k < 4; ++k)
            tc0_copy[k] = meta[edge].tc0[k];

        ff_h264_h_loop_filter_chroma_neon(dst + meta[edge].dst_off, pitch,
                                          meta[edge].alpha, meta[edge].beta,
                                          tc0_copy);
    }
    return 0;
}
/* --- bS=4 intra variants ------------------------------------------
 * The intra filters ignore daedalus_h264_deblock_meta.tc0[] (the spec
 * hardcodes the strength).  They still take the same meta type so
 * that a caller can build one edge list and route it by kernel,
 * which saves an extra struct.
 */

/*
 * CPU path for H.264 intra luma deblock via
 * ff_h264_v_loop_filter_luma_intra_neon, one call per edge using
 * that edge's alpha and beta.  Always returns 0.
 */
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
                                                  uint8_t *dst, size_t dst_stride,
                                                  size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    const daedalus_h264_deblock_meta *const end = meta + n_edges;

    (void) ctx;

    for (const daedalus_h264_deblock_meta *e = meta; e != end; ++e)
        ff_h264_v_loop_filter_luma_intra_neon(dst + e->dst_off, pitch,
                                              e->alpha, e->beta);
    return 0;
}
/*
 * CPU path for H.264 intra luma deblock via
 * ff_h264_h_loop_filter_luma_intra_neon, one call per edge using
 * that edge's alpha and beta (tc0 is not read).  Always returns 0.
 */
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
                                                  uint8_t *dst, size_t dst_stride,
                                                  size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    const daedalus_h264_deblock_meta *const end = meta + n_edges;

    (void) ctx;

    for (const daedalus_h264_deblock_meta *e = meta; e != end; ++e)
        ff_h264_h_loop_filter_luma_intra_neon(dst + e->dst_off, pitch,
                                              e->alpha, e->beta);
    return 0;
}
/*
 * CPU path for H.264 intra chroma deblock via
 * ff_h264_v_loop_filter_chroma_intra_neon, one call per edge using
 * that edge's alpha and beta (tc0 is not read).  Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
                                                    uint8_t *dst, size_t dst_stride,
                                                    size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    const daedalus_h264_deblock_meta *const end = meta + n_edges;

    (void) ctx;

    for (const daedalus_h264_deblock_meta *e = meta; e != end; ++e)
        ff_h264_v_loop_filter_chroma_intra_neon(dst + e->dst_off, pitch,
                                                e->alpha, e->beta);
    return 0;
}
/*
 * CPU path for H.264 intra chroma deblock via
 * ff_h264_h_loop_filter_chroma_intra_neon, one call per edge using
 * that edge's alpha and beta (tc0 is not read).  Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
                                                    uint8_t *dst, size_t dst_stride,
                                                    size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    const daedalus_h264_deblock_meta *const end = meta + n_edges;

    (void) ctx;

    for (const daedalus_h264_deblock_meta *e = meta; e != end; ++e)
        ff_h264_h_loop_filter_chroma_intra_neon(dst + e->dst_off, pitch,
                                                e->alpha, e->beta);
    return 0;
}
/*
 * CPU path for H.264 qpel put_ mc20: one NEON call per block, with
 * dst and src offset by meta[i].dst_off / meta[i].src_off.
 *
 * FFmpeg's NEON entry takes one stride that applies to both dst and
 * src (H264QpelContext convention); the public API contract in
 * daedalus.h makes the caller responsible for that.
 *
 * Always returns 0.
 */
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        uint8_t       *out = dst + meta[blk].dst_off;
        const uint8_t *in  = src + meta[blk].src_off;

        ff_put_h264_qpel8_mc20_neon(out, in, pitch);
    }
    return 0;
}
/*
 * CPU path for H.264 qpel put_ mc02: one NEON call per block, with
 * dst and src offset by meta[i].dst_off / meta[i].src_off and a
 * single stride shared by both surfaces.  Always returns 0.
 */
static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        uint8_t       *out = dst + meta[blk].dst_off;
        const uint8_t *in  = src + meta[blk].src_off;

        ff_put_h264_qpel8_mc02_neon(out, in, pitch);
    }
    return 0;
}
/*
 * CPU path for H.264 qpel put_ mc22: one NEON call per block, with
 * dst and src offset by meta[i].dst_off / meta[i].src_off and a
 * single stride shared by both surfaces.  Always returns 0.
 */
static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) stride;

    (void) ctx;

    for (size_t blk = 0; blk != n_blocks; ++blk) {
        uint8_t       *out = dst + meta[blk].dst_off;
        const uint8_t *in  = src + meta[blk].src_off;

        ff_put_h264_qpel8_mc22_neon(out, in, pitch);
    }
    return 0;
}
/*
 * Generator for the remaining qpel CPU dispatchers.
 *
 * DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn) emits
 *     static int dispatch_h264_qpel_<suffix>_cpu(ctx, dst, src, stride,
 *                                                n_blocks, meta)
 * which calls neon_fn once per block on dst + meta[i].dst_off and
 * src + meta[i].src_off with the shared stride, then returns 0.
 * Every instantiation below is otherwise identical, so the macro
 * replaces one hand-written loop per position.
 */
#define DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn)                          \
static int dispatch_h264_qpel_ ## suffix ## _cpu(daedalus_ctx *ctx,        \
        uint8_t *dst, const uint8_t *src, size_t stride,                   \
        size_t n_blocks, const daedalus_h264_qpel_meta *meta)              \
{                                                                          \
    const ptrdiff_t pitch = (ptrdiff_t) stride;                            \
    (void) ctx;                                                            \
    for (size_t blk = 0; blk != n_blocks; ++blk)                           \
        neon_fn(dst + meta[blk].dst_off, src + meta[blk].src_off, pitch);  \
    return 0;                                                              \
}

/* put_ variants: the twelve positions without a hand-written dispatcher. */
DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon)
DEFINE_QPEL_CPU_DISPATCH(mc12, ff_put_h264_qpel8_mc12_neon)
DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon)
DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon)
DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon)
DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)

/* avg_ biprediction variants, all 15 positions: same dispatch shape
 * as put_, but the NEON entry L2-averages its result with the
 * existing dst instead of overwriting it. */
DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc10, ff_avg_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc30, ff_avg_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc01, ff_avg_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc03, ff_avg_h264_qpel8_mc03_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc11, ff_avg_h264_qpel8_mc11_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc12, ff_avg_h264_qpel8_mc12_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc13, ff_avg_h264_qpel8_mc13_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc21, ff_avg_h264_qpel8_mc21_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc23, ff_avg_h264_qpel8_mc23_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc31, ff_avg_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc32, ff_avg_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc33, ff_avg_h264_qpel8_mc33_neon)

#undef DEFINE_QPEL_CPU_DISPATCH
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
/*
 * Push-constant block for the IDCT 8x8 pipeline; its sizeof is what
 * ensure_idct8_pipeline() passes as push_const_size.  Four uint32_t
 * fields, 16 bytes in total.
 * NOTE(review): field order presumably has to match the push-constant
 * layout declared in the shader source -- confirm before reordering.
 */
typedef struct {
uint32_t n_blocks; /* number of blocks in this dispatch */
uint32_t blocks_per_row;
uint32_t dst_stride_u8; /* NOTE(review): "_u8" suggests a stride in bytes -- confirm */
uint32_t _pad; /* padding; fills the struct out to four 32-bit words */
} idct8_pc;
/*
 * Lazily create the IDCT 8x8 pipeline ("v3d_idct8.spv", 3 SSBOs,
 * idct8_pc push constants) on first use.
 *
 * Returns 0 when the pipeline is ready (either already created or
 * created by this call), -1 when creation fails.  The ready flag is
 * only set after a successful creation, so a failed attempt is
 * retried on the next call.
 */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        const int failed =
            v3d_runner_create_pipeline(ctx->runner,
                                       "v3d_idct8.spv",
                                       /*n_ssbos=*/3,
                                       /*push_const_size=*/sizeof(idct8_pc),
                                       &ctx->idct8_pipe) != 0;
        if (failed)
            return -1;
        ctx->idct8_pipe_ready = 1;
    }
    return 0;
}
/* Run the IDCT 8x8 shader ("v3d_idct8.spv") over n_blocks coefficient
 * blocks and copy the result back into the caller's dst surface.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - destination surface; bytes [0, largest block footprint)
 *                are uploaded before the dispatch and copied back after it
 *   dst_stride - dst row stride, forwarded to the shader as a push constant
 *   coeffs     - n_blocks * 64 int16 coefficients, uploaded verbatim
 *   n_blocks   - number of blocks; 0 is a successful no-op
 *   meta       - per-block placement: dst_off sizes the dst window,
 *                block_x / block_y are what the shader receives
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (ensure_idct8_pipeline(ctx) != 0) return -1;

    /* Three SSBOs are acquired from the runner per call
     * (coeffs, dst, meta). */
    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */

    /* The dst buffer mirrors the caller's surface from byte 0 up to the
     * end of the furthest block: last row (7 * stride) plus 8 columns. */
    size_t max_byte_touched = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
        if (end > max_byte_touched) max_byte_touched = end;
    }

    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
        v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
        v3d_runner_release_buffer(ctx->runner, &buf_dst);
        v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
        return -1;
    }

    /* Upload. Coeffs are a straight copy; dst is uploaded in full because
     * the whole window is copied back afterwards. */
    memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
    memcpy(buf_dst.mapped, dst, max_byte_touched);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[2*i + 0] = meta[i].block_x;
        m[2*i + 1] = meta[i].block_y;
    }

    /* Bind order expected by src/v3d_idct8.comp: (coeffs, dst, meta). */
    v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
        goto fail;
    }

    /* WG geometry: 32 blocks per WG. */
    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    idct8_pc pc = {
        .n_blocks       = (uint32_t) n_blocks,
        .blocks_per_row = 0,   /* unused by v4 shader (meta drives placement) */
        .dst_stride_u8  = (uint32_t) dst_stride,
        ._pad           = 0,
    };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe)) goto fail;
    VkCommandBuffer cb = ctx->idct8_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->idct8_pipe.layout, 0, 1,
                            &ctx->idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    /* Read-back dst. */
    memcpy(dst, buf_dst.mapped, max_byte_touched);

    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
    return -1;
}
/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
*
* NOTE: the two LPF shaders disagree on push-constant slot order.
* v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
* v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
*
* Same total size (16 bytes), different slot 2. Keep separate
* struct definitions to avoid silent corruption — Phase 8 caught
* this empirically when test_api_lpf wd=8 reported 95.6 % match.
*/
/* Push constants for v3d_lpf_h_4_8.comp: the stride lives in slot 1.
* Not interchangeable with lpf8_pc, which keeps it in slot 2. */
typedef struct {
/* Number of edges in this dispatch. */
uint32_t n_edges;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding words; left zero by the dispatcher's initializer. */
uint32_t _pad0;
uint32_t _pad1;
} lpf4_pc;
/* Push constants for v3d_lpf_h_8_8.comp: same 16-byte size as lpf4_pc,
* but the stride sits in slot 2 behind an unused blocks_per_row word. */
typedef struct {
/* Number of edges in this dispatch. */
uint32_t n_edges;
uint32_t blocks_per_row; /* unused by shader, must exist */
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding word, written as 0. */
uint32_t _pad;
} lpf8_pc;
/* Lazily create one of the two LPF pipelines.
 *
 *   wd_8 - non-zero selects the lpf8_pc push-constant size, zero the
 *          lpf4_pc size
 *   flag - ready flag for the chosen pipeline; set to 1 on success
 *   pipe - pipeline object to fill in
 *   spv  - SPIR-V file name handed to the runner
 *
 * Returns 0 if the pipeline already exists or was created, -1 if the
 * runner could not create it (flag is left untouched in that case).
 */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    if (!*flag) {
        size_t pc_bytes;
        if (wd_8) {
            pc_bytes = sizeof(lpf8_pc);
        } else {
            pc_bytes = sizeof(lpf4_pc);
        }
        if (v3d_runner_create_pipeline(ctx->runner, spv,
                                       2,                   /* SSBO bindings */
                                       (uint32_t) pc_bytes, /* push-constant bytes */
                                       pipe) != 0) {
            return -1;
        }
        *flag = 1;
    }
    return 0;
}
/* Run the 4- or 8-wide LPF shader over n_edges edges.
 *
 *   ctx        - context owning the runner and both LPF pipelines
 *   wd_8       - non-zero selects "v3d_lpf_h_8_8.spv" / lpf8_pc, zero
 *                selects "v3d_lpf_h_4_8.spv" / lpf4_pc
 *   dst        - surface filtered in place; only the window [lo, hi)
 *                covering every edge is uploaded and copied back
 *   dst_stride - dst row stride, forwarded as a push constant
 *   n_edges    - number of edges; 0 is a successful no-op
 *   meta       - per-edge dst_off plus the E / I / H values
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_edges == 0) return 0;

    int          *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p    = wd_8 ? &ctx->lpf8_pipe       : &ctx->lpf4_pipe;
    const char   *spv  = wd_8 ? "v3d_lpf_h_8_8.spv"   : "v3d_lpf_h_4_8.spv";
    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */

    /* Determine smallest dst window. Each edge touches bytes
     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride. An edge
     * closer than 4 bytes to the start clamps the window to byte 0. */
    size_t lo = (size_t) -1, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base    = meta[i].dst_off;
        size_t this_lo = (base >= 4) ? base - 4 : 0;
        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
        if (this_lo < lo) lo = this_lo;
        if (this_hi > hi) hi = this_hi;
    }
    size_t dst_window_size = hi - lo;

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
        v3d_runner_release_buffer(ctx->runner, &buf_meta);
        return -1;
    }

    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        /* Offsets are rebased so they index the uploaded window. */
        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);
        m[4*i + 1] = (uint32_t) meta[i].E;
        m[4*i + 2] = (uint32_t) meta[i].I;
        m[4*i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;

    /* One workgroup per 32 edges, rounded up. */
    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p)) goto fail;
    VkCommandBuffer cb = p->cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            p->layout, 0, 1, &p->desc_set, 0, NULL);
    /* The two shaders keep the stride in different push-constant slots,
     * so each gets its own struct. */
    if (wd_8) {
        lpf8_pc pc = { .n_edges        = (uint32_t) n_edges,
                       .blocks_per_row = 0,
                       .dst_stride_u8  = (uint32_t) dst_stride,
                       ._pad           = 0 };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    } else {
        lpf4_pc pc = { .n_edges       = (uint32_t) n_edges,
                       .dst_stride_u8 = (uint32_t) dst_stride };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    }
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst + lo, buf_dst.mapped, dst_window_size);

    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return -1;
}
/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */
/* Push-constant block recorded by dispatch_mc_8h_qpu() for the
* "v3d_mc_8h.spv" pipeline: four 32-bit words. */
typedef struct {
/* Number of blocks in this dispatch. */
uint32_t n_blocks;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Row stride of the src surface. */
uint32_t src_stride_u8;
/* Padding word; left zero by the dispatcher's initializer. */
uint32_t _pad;
} mc_pc;
/* Run the VP9 horizontal MC shader ("v3d_mc_8h.spv") over n_blocks blocks.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - destination surface; bytes [0, dst_max) are uploaded and
 *                copied back after the dispatch
 *   dst_stride - dst row stride, forwarded as a push constant
 *   src        - source surface; bytes [0, src_max) are uploaded
 *   src_stride - src row stride, forwarded as a push constant
 *   n_blocks   - number of blocks; 0 is a successful no-op
 *   meta       - per-block dst_off, src_off and mx
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->mc8h_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_mc_8h.spv",
                                       3, sizeof(mc_pc), &ctx->mc8h_pipe) != 0)
            return -1;
        ctx->mc8h_pipe_ready = 1;
    }

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, src_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        /* dst footprint: 8 rows of 8 bytes. */
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
        size_t se = meta[i].src_off + 7 * src_stride + 15;
        if (se > src_max) src_max = se;
    }

    v3d_buffer bm = {0}, bd = {0}, bs = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = (uint32_t) meta[i].mx;
        m[4*i+3] = 0;
    }

    /* Bind order: (meta, dst, src). */
    v3d_buffer binds[3] = { bm, bd, bs };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3)) goto fail;

    /* One workgroup per 32 blocks, rounded up. */
    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    mc_pc pc = { .n_blocks      = (uint32_t) n_blocks,
                 .dst_stride_u8 = (uint32_t) dst_stride,
                 .src_stride_u8 = (uint32_t) src_stride };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe)) goto fail;
    VkCommandBuffer cb = ctx->mc8h_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->mc8h_pipe.layout, 0, 1,
                            &ctx->mc8h_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->mc8h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_release_buffer(ctx->runner, &bs);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &bs);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
}
/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */
/* Push-constant block recorded by dispatch_cdef_qpu() for the
* "v3d_cdef.spv" pipeline: four 32-bit words. */
typedef struct {
/* Number of blocks in this dispatch. */
uint32_t n_blocks;
/* Row stride of the tmp buffer in uint16 elements; the dispatcher passes 16. */
uint32_t tmp_stride_u16;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding word; left zero by the dispatcher's initializer. */
uint32_t _pad;
} cdef_pc;
/* Run the CDEF shader ("v3d_cdef.spv") over n_blocks blocks.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - destination surface; bytes [0, dst_max) are uploaded and
 *                copied back after the dispatch
 *   dst_stride - dst row stride, forwarded as a push constant
 *   tmp        - uint16 working buffer with a fixed row stride of 16
 *                elements; elements [0, tmp_max_u16) are uploaded
 *   n_blocks   - number of blocks; 0 is a successful no-op
 *   meta       - per-block dst_off, tmp_off_u16, strengths, damping, dir
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_cdef_qpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->cdef_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_cdef.spv",
                                       3, sizeof(cdef_pc), &ctx->cdef_pipe) != 0)
            return -1;
        ctx->cdef_pipe_ready = 1;
    }

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, tmp_max_u16 = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* Center 8x8 in the stride-16 tmp.
         * NOTE(review): this bound ends at the last center pixel; if the
         * shader taps neighbours below/right of the center block, those
         * are only uploaded when another block's window covers them.
         * Confirm against v3d_cdef.comp and the caller's tmp layout. */
        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8;
        if (te > tmp_max_u16) tmp_max_u16 = te;
    }
    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);

    v3d_buffer bm = {0}, bd = {0}, bt = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    /* tmp is copied from element 0; meta[i].tmp_off_u16 is assumed to be
     * consistent with how the caller laid the buffer out (including any
     * padding ahead of the block origin). */
    memcpy(bt.mapped, tmp, tmp_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        uint32_t pri     = (uint32_t) meta[i].pri_strength;
        uint32_t sec     = (uint32_t) meta[i].sec_strength;
        uint32_t damping = (uint32_t) meta[i].damping;
        m[4*i+0] = meta[i].dst_off;
        /* Packed word: pri in bits 0.., sec from bit 8, damping from bit 16. */
        m[4*i+1] = pri | (sec << 8) | (damping << 16);
        m[4*i+2] = meta[i].tmp_off_u16;
        m[4*i+3] = (uint32_t) meta[i].dir;
    }

    /* Bind order: (meta, dst, tmp). */
    v3d_buffer binds[3] = { bm, bd, bt };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3)) goto fail;

    /* One workgroup per 4 blocks, rounded up. */
    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
    cdef_pc pc = { .n_blocks       = (uint32_t) n_blocks,
                   .tmp_stride_u16 = 16,
                   .dst_stride_u8  = (uint32_t) dst_stride };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe)) goto fail;
    VkCommandBuffer cb = ctx->cdef_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->cdef_pipe.layout, 0, 1,
                            &ctx->cdef_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->cdef_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_release_buffer(ctx->runner, &bt);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &bt);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
}
/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */
/* Push-constant block shared by every H.264 deblock dispatcher in this
* file (luma V, luma H and both chroma variants): four 32-bit words. */
typedef struct {
/* Number of edges in this dispatch. */
uint32_t n_edges;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding words; left zero by the dispatchers' initializers. */
uint32_t _pad0;
uint32_t _pad1;
} h264deblock_pc;
/* Run the H.264 luma deblock shader ("v3d_h264deblock.spv") over
 * n_edges edges, filtering dst in place.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - surface; bytes [0, dst_max) are uploaded and copied back
 *   dst_stride - dst row stride, forwarded as a push constant
 *   n_edges    - number of edges; 0 is a successful no-op
 *   meta       - per-edge dst_off, alpha, beta and four tc0 values
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_edges == 0) return 0;

    if (!ctx->h264deblock_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock.spv",
                                       2, sizeof(h264deblock_pc),
                                       &ctx->h264deblock_pipe) != 0)
            return -1;
        ctx->h264deblock_pipe_ready = 1;
    }

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride. */
        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        /* alpha in the low byte, beta shifted up by 8. */
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        /* Four tc0 values, one per byte; the uint8_t cast keeps a
         * negative tc0 from smearing into the neighbouring bytes. */
        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    /* Bind order: (meta, dst). */
    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2)) goto fail;

    /* One workgroup per 16 edges, rounded up. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe)) goto fail;
    VkCommandBuffer cb = ctx->h264deblock_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264deblock_pipe.layout, 0, 1,
                            &ctx->h264deblock_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264deblock_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
}
/* -------------------- H.264 luma_h deblock QPU dispatch -------- */
/* Run the H.264 luma_h deblock shader ("v3d_h264deblock_h.spv") over
 * n_edges edges, filtering dst in place.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - surface; bytes [0, dst_max) are uploaded and copied back
 *   dst_stride - dst row stride, forwarded as a push constant
 *   n_edges    - number of edges; 0 is a successful no-op
 *   meta       - per-edge dst_off, alpha, beta and four tc0 values
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_h264_deblock_h_qpu(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_edges == 0) return 0;

    if (!ctx->h264deblock_h_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock_h.spv",
                                       2, sizeof(h264deblock_pc),
                                       &ctx->h264deblock_h_pipe) != 0)
            return -1;
        ctx->h264deblock_h_pipe_ready = 1;
    }

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);

    /* H variant: reads cols [-4..+3] of 16 ROWS. Each lane processes one row.
     * Max addressed byte = dst_off + 15*stride + 3 (last row, col +3);
     * the byte count is one more than that. */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t e = meta[i].dst_off + 15 * dst_stride + 4;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        /* alpha in the low byte, beta shifted up by 8. */
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        /* Four tc0 values, one per byte; the uint8_t cast keeps a
         * negative tc0 from smearing into the neighbouring bytes. */
        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    /* Bind order: (meta, dst). */
    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_h_pipe, binds, 2)) goto fail;

    /* One workgroup per 16 edges, rounded up. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_h_pipe)) goto fail;
    VkCommandBuffer cb = ctx->h264deblock_h_pipe.cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_h_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264deblock_h_pipe.layout, 0, 1,
                            &ctx->h264deblock_h_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264deblock_h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
}
/* -------------------- H.264 chroma deblock QPU dispatch -------- */
/* Generic chroma QPU dispatch (shared between V and H variants).
 * Both shaders use 8 cells per edge; max-addressed-byte differs:
 *   V: dst_off + 1*stride + 7   (-2..+1 rows, cols 0..7 of edge)
 *   H: dst_off + 7*stride + 1   (-2..+1 cols, rows 0..7 of edge)
 * The extent is derived here from orient_h; the caller only selects the
 * orientation.
 *
 *   ctx        - context owning the runner
 *   pipe       - pipeline object for the chosen variant (created lazily)
 *   pipe_ready - ready flag for pipe; set to 1 once creation succeeds
 *   spv_name   - SPIR-V file name used when the pipeline must be created
 *   dst        - surface; bytes [0, dst_max) are uploaded and copied back
 *   dst_stride - dst row stride, forwarded as a push constant
 *   n_edges    - number of edges; 0 is a successful no-op
 *   meta       - per-edge dst_off, alpha, beta and four tc0 values
 *   orient_h   - non-zero selects the H extent formula, zero the V one
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_h264_deblock_chroma_qpu(daedalus_ctx *ctx,
        v3d_pipeline *pipe, int *pipe_ready, const char *spv_name,
        uint8_t *dst, size_t dst_stride, size_t n_edges,
        const daedalus_h264_deblock_meta *meta, int orient_h)
{
    /* Empty batch: succeed without requesting zero-byte buffers. */
    if (n_edges == 0) return 0;

    if (!*pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, spv_name,
                                       2, sizeof(h264deblock_pc), pipe) != 0)
            return -1;
        *pipe_ready = 1;
    }

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        /* Byte count = max addressed byte + 1 for the chosen orientation. */
        size_t e = orient_h ? meta[i].dst_off + 7 * dst_stride + 2
                            : meta[i].dst_off + 1 * dst_stride + 8;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        /* alpha in the low byte, beta shifted up by 8. */
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        /* Four tc0 values, one per byte; the uint8_t cast keeps a
         * negative tc0 from smearing into the neighbouring bytes. */
        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    /* Bind order: (meta, dst). */
    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 2)) goto fail;

    /* One workgroup per 16 edges, rounded up. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, pipe)) goto fail;
    VkCommandBuffer cb = pipe->cb;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return 0;

fail:
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return -1;
}
/* Chroma V deblock: forwards to the shared chroma dispatcher with the
 * V pipeline, its ready flag, "v3d_h264deblock_chroma_v.spv" and
 * orient_h = 0. Returns whatever the shared dispatcher returns. */
static int dispatch_h264_deblock_chroma_v_qpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    v3d_pipeline *v_pipe  = &ctx->h264deblock_chroma_v_pipe;
    int          *v_ready = &ctx->h264deblock_chroma_v_pipe_ready;

    return dispatch_h264_deblock_chroma_qpu(ctx, v_pipe, v_ready,
                                            "v3d_h264deblock_chroma_v.spv",
                                            dst, dst_stride, n_edges, meta,
                                            0 /* orient_h */);
}
/* Chroma H deblock: forwards to the shared chroma dispatcher with the
 * H pipeline, its ready flag, "v3d_h264deblock_chroma_h.spv" and
 * orient_h = 1. Returns whatever the shared dispatcher returns. */
static int dispatch_h264_deblock_chroma_h_qpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    v3d_pipeline *h_pipe  = &ctx->h264deblock_chroma_h_pipe;
    int          *h_ready = &ctx->h264deblock_chroma_h_pipe_ready;

    return dispatch_h264_deblock_chroma_qpu(ctx, h_pipe, h_ready,
                                            "v3d_h264deblock_chroma_h.spv",
                                            dst, dst_stride, n_edges, meta,
                                            1 /* orient_h */);
}
/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */
/* Push-constant block recorded by dispatch_h264_idct4_qpu() for the
* "v3d_h264_idct4.spv" pipeline: four 32-bit words. */
typedef struct {
/* Number of 4x4 blocks in this dispatch. */
uint32_t n_blocks;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding words; left zero by the dispatcher's initializer. */
uint32_t _pad0;
uint32_t _pad1;
} h264_idct4_pc;
/* Run the H.264 IDCT 4x4 shader ("v3d_h264_idct4.spv") over n_blocks
 * blocks, then zero the caller's coefficients.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - destination surface; bytes [0, dst_max) are uploaded and
 *                copied back after the dispatch
 *   dst_stride - dst row stride, forwarded as a push constant
 *   coeffs     - n_blocks * 16 int16 coefficients; uploaded, and cleared
 *                to zero on success
 *   n_blocks   - number of blocks; 0 is a successful no-op
 *   meta       - per-block dst_off
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst and coeffs are only written after a successful
 * submit.
 */
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    /* Empty batch: succeed without creating zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->h264_idct4_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct4.spv",
                                       3, sizeof(h264_idct4_pc),
                                       &ctx->h264_idct4_pipe) != 0)
            return -1;
        ctx->h264_idct4_pipe_ready = 1;
    }

    size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t); /* uvec4 per block */

    /* dst footprint per block: 4 rows of 4 bytes. */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }

    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    /* Bind order: (coeffs, dst, meta). */
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
        goto fail;

    uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16); /* 16 blocks/WG */
    h264_idct4_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };

    /* NOTE(review): nothing in this function hands cb back to the runner;
     * confirm v3d_runner_alloc_cmdbuf() returns a runner-owned, reusable
     * command buffer rather than a fresh allocation per call. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto fail;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct4_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct4_pipe.layout, 0, 1,
                            &ctx->h264_idct4_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct4_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);
    /* H.264/FFmpeg convention: zero the coeffs block after the
     * transform (matches the C ref + NEON .S behaviour). */
    memset(coeffs, 0, coeff_bytes);

    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return 0;

fail:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return -1;
}
/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */
/* Push-constant block recorded by dispatch_h264_idct8_qpu() for the
* "v3d_h264_idct8.spv" pipeline: four 32-bit words. */
typedef struct {
/* Number of 8x8 blocks in this dispatch. */
uint32_t n_blocks;
/* Row stride of the dst surface. */
uint32_t dst_stride_u8;
/* Padding words; left zero by the dispatcher's initializer. */
uint32_t _pad0;
uint32_t _pad1;
} h264_idct8_pc;
/* Run the H.264 IDCT 8x8 shader ("v3d_h264_idct8.spv") over n_blocks
 * blocks, then zero the caller's coefficients.
 *
 *   ctx        - context owning the runner and the lazily created pipeline
 *   dst        - destination surface; bytes [0, dst_max) are uploaded and
 *                copied back after the dispatch
 *   dst_stride - dst row stride, forwarded as a push constant
 *   coeffs     - n_blocks * 64 int16 coefficients; uploaded, and cleared
 *                to zero on success
 *   n_blocks   - number of blocks; 0 is a successful no-op
 *   meta       - per-block dst_off
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst and coeffs are only written after a successful
 * submit.
 */
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    /* Empty batch: succeed without creating zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->h264_idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_idct8.spv",
                                       3, sizeof(h264_idct8_pc),
                                       &ctx->h264_idct8_pipe) != 0)
            return -1;
        ctx->h264_idct8_pipe_ready = 1;
    }

    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);

    /* dst footprint per block: 8 rows of 8 bytes. */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }

    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    /* Bind order: (coeffs, dst, meta). */
    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
        goto fail;

    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8); /* 8 blocks/WG */
    h264_idct8_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };

    /* NOTE(review): nothing in this function hands cb back to the runner;
     * confirm v3d_runner_alloc_cmdbuf() returns a runner-owned, reusable
     * command buffer rather than a fresh allocation per call. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto fail;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_idct8_pipe.layout, 0, 1,
                            &ctx->h264_idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);
    /* Coefficients are cleared after the transform, as in the 4x4 path. */
    memset(coeffs, 0, coeff_bytes);

    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return 0;

fail:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return -1;
}
/* -------------------- H.264 qpel mc20 QPU dispatch (cycle 9) --- */
/* Push-constant block for the H.264 qpel pipelines; despite the name it
* is used by both the mc20 and the mc02 dispatcher. Four 32-bit words. */
typedef struct {
/* Number of blocks in this dispatch. */
uint32_t n_blocks;
/* Single row stride shared by src and dst. */
uint32_t stride_u8;
/* Padding words; left zero by the dispatchers' initializers. */
uint32_t _pad0;
uint32_t _pad1;
} h264_qpel_mc20_pc;
/* Run the H.264 qpel mc20 shader ("v3d_h264_qpel_mc20.spv") over
 * n_blocks blocks.
 *
 *   ctx      - context owning the runner and the lazily created pipeline
 *   dst      - destination surface; bytes [0, dst_max) are uploaded and
 *              copied back after the dispatch
 *   src      - source surface; bytes [0, src_max) are uploaded
 *   stride   - row stride shared by src and dst, forwarded as a push
 *              constant
 *   n_blocks - number of blocks; 0 is a successful no-op
 *   meta     - per-block dst_off and src_off
 *
 * Returns 0 on success and -1 on any pipeline, buffer, command-recording
 * or submit failure. dst is only written after a successful submit.
 */
static int dispatch_h264_qpel_mc20_qpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    /* Empty batch: succeed without creating zero-byte buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->h264_qpel_mc20_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc20.spv",
                                       3, sizeof(h264_qpel_mc20_pc),
                                       &ctx->h264_qpel_mc20_pipe) != 0)
            return -1;
        ctx->h264_qpel_mc20_pipe_ready = 1;
    }

    /* Compute the smallest contiguous src/dst window (starting at byte 0)
     * that covers every block's read/write footprint.
     *
     * src: filter reads cols (c-2)..(c+3) for c=0..7 across rows 0..7.
     *      Highest read = src_off + 7*stride + (7 + 3) = src_off + 7*stride + 10.
     *      Plus 1 to turn the last index into a byte count
     *      -> src_max = src_off + 7*stride + 11.
     *
     * dst: writes cols 0..7 across rows 0..7.
     *      Highest write = dst_off + 7*stride + 7; +1 -> dst_off + 7*stride + 8. */
    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + (size_t) 7 * stride + 11;
        size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }

    /* Copy src window (filter needs cols -2..+3, captured by the src_max
     * upper bound above; the lower bound is implicit in src_off >= 2,
     * which the caller guarantees per the public API contract). */
    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    /* Bind order: (src, dst, meta). */
    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc20_pipe, binds, 3))
        goto fail;

    uint32_t wg_count = (uint32_t) n_blocks; /* 1 block per WG */
    h264_qpel_mc20_pc pc = {
        .n_blocks  = (uint32_t) n_blocks,
        .stride_u8 = (uint32_t) stride,
    };

    /* NOTE(review): nothing in this function hands cb back to the runner;
     * confirm v3d_runner_alloc_cmdbuf() returns a runner-owned, reusable
     * command buffer rather than a fresh allocation per call. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto fail;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Begin/End return a VkResult (VK_SUCCESS is 0); a failed recording
     * must not be submitted. */
    if (vkBeginCommandBuffer(cb, &cbbi)) goto fail;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->h264_qpel_mc20_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264_qpel_mc20_pipe.layout, 0, 1,
                            &ctx->h264_qpel_mc20_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264_qpel_mc20_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb)) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    memcpy(dst, bd.mapped, dst_max);

    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return 0;

fail:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return -1;
}
static int dispatch_h264_qpel_mc02_qpu(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
/* Same shape as mc20 but with vertical access pattern. src_max
* reflects the row-wise filter window: bottom output row (r=7)
* reads up to row r+3 = 10 of the src; so src_max = src_off +
* 10*stride + 8 (last col + 1 for memcpy semantics). */
if (!ctx->h264_qpel_mc02_pipe_ready) {
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc02.spv",
3, sizeof(h264_qpel_mc20_pc),
&ctx->h264_qpel_mc02_pipe) != 0)
return -1;
ctx->h264_qpel_mc02_pipe_ready = 1;
}
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
size_t src_max = 0, dst_max = 0;
for (size_t i = 0; i < n_blocks; i++) {
size_t s_end = meta[i].src_off + (size_t) 10 * stride + 8;
size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
if (s_end > src_max) src_max = s_end;
if (d_end > dst_max) dst_max = d_end;
}
v3d_buffer bs = {0}, bd = {0}, bm = {0};
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
}
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
}
memcpy(bs.mapped, src, src_max);
memcpy(bd.mapped, dst, dst_max);
uint32_t *m = bm.mapped;
for (size_t i = 0; i < n_blocks; i++) {
m[4*i+0] = meta[i].dst_off;
m[4*i+1] = meta[i].src_off;
m[4*i+2] = 0;
m[4*i+3] = 0;
}
v3d_buffer binds[3] = { bs, bd, bm };
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc02_pipe, binds, 3))
goto fail;
uint32_t wg_count = (uint32_t) n_blocks;
h264_qpel_mc20_pc pc = {
.n_blocks = (uint32_t) n_blocks,
.stride_u8 = (uint32_t) stride,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
if (cb == VK_NULL_HANDLE) goto fail;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_qpel_mc02_pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_qpel_mc02_pipe.layout, 0, 1,
&ctx->h264_qpel_mc02_pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, ctx->h264_qpel_mc02_pipe.layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
vkCmdDispatch(cb, wg_count, 1, 1);
vkEndCommandBuffer(cb);
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
memcpy(dst, bd.mapped, dst_max);
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs);
return 0;
fail:
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs);
return -1;
}
static int dispatch_h264_qpel_mc22_qpu(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
/* 2D HV cascade: rows -2..+10 (13 rows of src) AND cols -2..+10
* per row (8 output cols × cols c-2..c+3 → up to col 10). So
* src_max = src_off + 10*stride + 11.
* (mc20 needed 7*stride + 11; mc02 needed 10*stride + 8;
* mc22 needs MAX of both = 10*stride + 11.) */
if (!ctx->h264_qpel_mc22_pipe_ready) {
if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc22.spv",
3, sizeof(h264_qpel_mc20_pc),
&ctx->h264_qpel_mc22_pipe) != 0)
return -1;
ctx->h264_qpel_mc22_pipe_ready = 1;
}
size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
size_t src_max = 0, dst_max = 0;
for (size_t i = 0; i < n_blocks; i++) {
size_t s_end = meta[i].src_off + (size_t) 10 * stride + 11;
size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8;
if (s_end > src_max) src_max = s_end;
if (d_end > dst_max) dst_max = d_end;
}
v3d_buffer bs = {0}, bd = {0}, bm = {0};
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1;
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
}
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs); return -1;
}
memcpy(bs.mapped, src, src_max);
memcpy(bd.mapped, dst, dst_max);
uint32_t *m = bm.mapped;
for (size_t i = 0; i < n_blocks; i++) {
m[4*i+0] = meta[i].dst_off;
m[4*i+1] = meta[i].src_off;
m[4*i+2] = 0;
m[4*i+3] = 0;
}
v3d_buffer binds[3] = { bs, bd, bm };
if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc22_pipe, binds, 3))
goto fail;
uint32_t wg_count = (uint32_t) n_blocks;
h264_qpel_mc20_pc pc = {
.n_blocks = (uint32_t) n_blocks,
.stride_u8 = (uint32_t) stride,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
if (cb == VK_NULL_HANDLE) goto fail;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_qpel_mc22_pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
ctx->h264_qpel_mc22_pipe.layout, 0, 1,
&ctx->h264_qpel_mc22_pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, ctx->h264_qpel_mc22_pipe.layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
vkCmdDispatch(cb, wg_count, 1, 1);
vkEndCommandBuffer(cb);
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
memcpy(dst, bd.mapped, dst_max);
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs);
return 0;
fail:
v3d_runner_destroy_buffer(ctx->runner, &bm);
v3d_runner_destroy_buffer(ctx->runner, &bd);
v3d_runner_destroy_buffer(ctx->runner, &bs);
return -1;
}
/* Generic QPU dispatch for the 4 single-axis quarter-pel shaders
 * (mc10/30 horizontal, mc01/03 vertical).  All 4 share the same WG
 * geometry (64 lanes/block, 1 block/WG), push-constant layout, and
 * 3-binding interface (src/dst/meta) as mc20/mc02.  Only src_max
 * differs by axis:
 *   H variants: src_max = src_off + 7*stride + 11  (cols -2..+10)
 *   V variants: src_max = src_off + 10*stride + 8  (rows -2..+10)
 *
 * Parameters:
 *   pipe / pipe_ready  pipeline slot and its "already created" flag;
 *                      the pipeline is built from `spv` on first use.
 *   dst, src           host planes; [0, dst_max) and [0, src_max) are
 *                      copied into device buffers, and dst is copied
 *                      back after the dispatch completes.
 *   stride             row pitch in bytes, shared by src and dst.
 *   meta               n_blocks entries supplying dst_off / src_off.
 *   axis_v             non-zero selects the vertical src envelope.
 *
 * Returns 0 on success (including an empty batch), -1 on any failure.
 */
static int dispatch_h264_qpel_axis_qpu(daedalus_ctx *ctx,
    v3d_pipeline *pipe, int *pipe_ready, const char *spv,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta,
    int axis_v)
{
    int rc = -1;

    /* An empty batch is a successful no-op.  Without this guard every
     * buffer size computed below would be zero. */
    if (n_blocks == 0)
        return 0;

    /* Both values are narrowed to uint32_t for the push constants and
     * the workgroup count; refuse inputs that would be truncated.
     * NOTE(review): the device's maxComputeWorkGroupCount may be far
     * smaller than UINT32_MAX and is not checked here. */
    if (n_blocks > UINT32_MAX || stride > UINT32_MAX)
        return -1;

    if (!*pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, spv,
                                       3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
            return -1;
        *pipe_ready = 1;
    }

    /* Per-block byte extents past the block's offset; loop-invariant. */
    const size_t src_span = axis_v ? (size_t) 10 * stride + 8
                                   : (size_t) 7 * stride + 11;
    const size_t dst_span = (size_t) 7 * stride + 8;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + src_span;
        size_t d_end = meta[i].dst_off + dst_span;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    /* Buffers are created one at a time; each failure unwinds only the
     * buffers that already exist. */
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }

    memcpy(bs.mapped, src, src_max);
    /* dst is uploaded as well so bytes outside the written blocks
     * survive the copy-back below. */
    memcpy(bd.mapped, dst, dst_max);

    /* Meta record: four uint32 per block = { dst_off, src_off, 0, 0 }. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3))
        goto cleanup;

    h264_qpel_mc20_pc pc = { .n_blocks  = (uint32_t) n_blocks,
                             .stride_u8 = (uint32_t) stride };
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto cleanup;

    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that was never successfully begun or ended. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        goto cleanup;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);   /* 1 block per WG */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        goto cleanup;

    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto cleanup;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

cleanup:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return rc;
}
/* Stamps out dispatch_h264_qpel_<pos>_qpu(): a thin adapter that hands
 * the position's own pipeline slot, ready flag and SPIR-V file name to
 * dispatch_h264_qpel_axis_qpu().  `vertical` is forwarded as axis_v. */
#define DEFINE_QPEL_AXIS_QPU(pos, spv_file, vertical)                      \
static int dispatch_h264_qpel_ ## pos ## _qpu(daedalus_ctx *ctx,           \
    uint8_t *dst, const uint8_t *src, size_t stride,                       \
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
{                                                                          \
    return dispatch_h264_qpel_axis_qpu(                                    \
        ctx,                                                               \
        &ctx->h264_qpel_ ## pos ## _pipe,                                  \
        &ctx->h264_qpel_ ## pos ## _pipe_ready,                            \
        spv_file,                                                          \
        dst, src, stride, n_blocks, meta,                                  \
        vertical);                                                         \
}

/* Horizontal quarter-pel positions. */
DEFINE_QPEL_AXIS_QPU(mc10, "v3d_h264_qpel_mc10.spv", 0)
DEFINE_QPEL_AXIS_QPU(mc30, "v3d_h264_qpel_mc30.spv", 0)
/* Vertical quarter-pel positions. */
DEFINE_QPEL_AXIS_QPU(mc01, "v3d_h264_qpel_mc01.spv", 1)
DEFINE_QPEL_AXIS_QPU(mc03, "v3d_h264_qpel_mc03.spv", 1)

#undef DEFINE_QPEL_AXIS_QPU
/* Generic QPU dispatch for shaders that need the mc22-style src
 * envelope (rows -2..+10, cols -2..+10).  Diagonals share it because
 * they compose mc22 with mc20/mc02, sometimes with (r+1, c) or
 * (r, c+1) offsets.  Per block the src window ends at
 * src_off + 10*stride + 11 and the dst window at dst_off + 7*stride + 8.
 *
 * Parameters:
 *   pipe / pipe_ready  pipeline slot and its "already created" flag;
 *                      the pipeline is built from `spv` on first use.
 *   dst, src           host planes; [0, dst_max) and [0, src_max) are
 *                      copied into device buffers, and dst is copied
 *                      back after the dispatch completes.
 *   stride             row pitch in bytes, shared by src and dst.
 *   meta               n_blocks entries supplying dst_off / src_off.
 *
 * Returns 0 on success (including an empty batch), -1 on any failure.
 */
static int dispatch_h264_qpel_diag_qpu(daedalus_ctx *ctx,
    v3d_pipeline *pipe, int *pipe_ready, const char *spv,
    uint8_t *dst, const uint8_t *src, size_t stride,
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    int rc = -1;

    /* An empty batch is a successful no-op.  Without this guard every
     * buffer size computed below would be zero. */
    if (n_blocks == 0)
        return 0;

    /* Both values are narrowed to uint32_t for the push constants and
     * the workgroup count; refuse inputs that would be truncated.
     * NOTE(review): the device's maxComputeWorkGroupCount may be far
     * smaller than UINT32_MAX and is not checked here. */
    if (n_blocks > UINT32_MAX || stride > UINT32_MAX)
        return -1;

    if (!*pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, spv,
                                       3, sizeof(h264_qpel_mc20_pc), pipe) != 0)
            return -1;
        *pipe_ready = 1;
    }

    /* Per-block byte extents past the block's offset; loop-invariant. */
    const size_t src_span = (size_t) 10 * stride + 11;
    const size_t dst_span = (size_t) 7 * stride + 8;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t src_max = 0, dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t s_end = meta[i].src_off + src_span;
        size_t d_end = meta[i].dst_off + dst_span;
        if (s_end > src_max) src_max = s_end;
        if (d_end > dst_max) dst_max = d_end;
    }

    /* Buffers are created one at a time; each failure unwinds only the
     * buffers that already exist. */
    v3d_buffer bs = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bs);
        return -1;
    }

    memcpy(bs.mapped, src, src_max);
    /* dst is uploaded too: the avg_ shaders read it, and bytes outside
     * the written blocks must survive the copy-back below. */
    memcpy(bd.mapped, dst, dst_max);

    /* Meta record: four uint32 per block = { dst_off, src_off, 0, 0 }. */
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bs, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, pipe, binds, 3))
        goto cleanup;

    h264_qpel_mc20_pc pc = { .n_blocks  = (uint32_t) n_blocks,
                             .stride_u8 = (uint32_t) stride };
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto cleanup;

    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* Recording can fail (e.g. out of memory); do not submit a command
     * buffer that was never successfully begun or ended. */
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        goto cleanup;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, (uint32_t) n_blocks, 1, 1);   /* 1 block per WG */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        goto cleanup;

    if (v3d_runner_submit_wait(ctx->runner, cb))
        goto cleanup;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

cleanup:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    return rc;
}
/* Stamps out dispatch_h264_qpel_<pos>_qpu(): a thin adapter that hands
 * the position's own pipeline slot and ready flag to
 * dispatch_h264_qpel_diag_qpu().  The SPIR-V file name is derived from
 * the position: "v3d_h264_qpel_<pos>.spv". */
#define DEFINE_QPEL_DIAG_QPU(pos)                                          \
static int dispatch_h264_qpel_ ## pos ## _qpu(daedalus_ctx *ctx,           \
    uint8_t *dst, const uint8_t *src, size_t stride,                       \
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
{                                                                          \
    return dispatch_h264_qpel_diag_qpu(                                    \
        ctx,                                                               \
        &ctx->h264_qpel_ ## pos ## _pipe,                                  \
        &ctx->h264_qpel_ ## pos ## _pipe_ready,                            \
        "v3d_h264_qpel_" #pos ".spv",                                      \
        dst, src, stride, n_blocks, meta);                                 \
}

/* put_ diagonals. */
DEFINE_QPEL_DIAG_QPU(mc11)
DEFINE_QPEL_DIAG_QPU(mc12)
DEFINE_QPEL_DIAG_QPU(mc13)
DEFINE_QPEL_DIAG_QPU(mc21)
DEFINE_QPEL_DIAG_QPU(mc23)
DEFINE_QPEL_DIAG_QPU(mc31)
DEFINE_QPEL_DIAG_QPU(mc32)
DEFINE_QPEL_DIAG_QPU(mc33)

/* avg_ variants: all 15 positions go through the diag-style envelope
 * (10*stride+11 covers any (r±1, c±1) offset the avg_ shaders use) and
 * differ only in the SPV file.  This over-allocates slightly for
 * avg_mc20/02/10/30/01/03, which need less src context, but the cost
 * is negligible. */
DEFINE_QPEL_DIAG_QPU(avg_mc20)
DEFINE_QPEL_DIAG_QPU(avg_mc02)
DEFINE_QPEL_DIAG_QPU(avg_mc22)
DEFINE_QPEL_DIAG_QPU(avg_mc10)
DEFINE_QPEL_DIAG_QPU(avg_mc30)
DEFINE_QPEL_DIAG_QPU(avg_mc01)
DEFINE_QPEL_DIAG_QPU(avg_mc03)
DEFINE_QPEL_DIAG_QPU(avg_mc11)
DEFINE_QPEL_DIAG_QPU(avg_mc12)
DEFINE_QPEL_DIAG_QPU(avg_mc13)
DEFINE_QPEL_DIAG_QPU(avg_mc21)
DEFINE_QPEL_DIAG_QPU(avg_mc23)
DEFINE_QPEL_DIAG_QPU(avg_mc31)
DEFINE_QPEL_DIAG_QPU(avg_mc32)
DEFINE_QPEL_DIAG_QPU(avg_mc33)

#undef DEFINE_QPEL_DIAG_QPU
/* -------------------- Public dispatch entry points -------------- */
/* ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...): statement sequence forming the
 * body of a dispatcher whose kernel has a CPU implementation only.
 *
 * Requirements at the expansion site:
 *   - `sub` (requested substrate) and `ctx` must be in scope;
 *   - a local named `eff` is declared by the expansion;
 *   - the expansion ends in `return -1` WITHOUT a semicolon, so the
 *     use site must supply the terminating `;`.
 *
 * Routing: AUTO is resolved via daedalus_recipe_substrate_for(_kernel);
 * a QPU request on a context for which daedalus_ctx_has_qpu() is false
 * is downgraded to CPU; CPU calls _cpu_fn(ctx, <varargs>); anything
 * else returns -1.
 *
 * NOTE(review): none of the dispatchers that follow expand this macro
 * and it is not #undef'd nearby -- confirm it is still needed. */
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
daedalus_substrate eff = sub; \
if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
eff = DAEDALUS_SUBSTRATE_CPU; \
if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \
return -1 /* QPU path not yet wired for this kernel */
/* Public dispatcher for the VP9 8x8 IDCT kernel.
 * `sub` selects the substrate; AUTO defers to
 * daedalus_recipe_substrate_for().  Returns the selected backend's
 * result. */
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
                                uint8_t *dst, size_t dst_stride,
                                const int16_t *coeffs, size_t n_blocks,
                                const daedalus_idct8_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
    return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
}
/* Public dispatcher for the VP9 lpf4 loop-filter kernel.
 * Both backends are the shared lpf dispatchers, called with 0 as their
 * second argument (the lpf8 entry point passes 1).  AUTO defers to
 * daedalus_recipe_substrate_for(). */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
    return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
}
/* Public dispatcher for the VP9 lpf8 loop-filter kernel.
 * Both backends are the shared lpf dispatchers, called with 1 as their
 * second argument (the lpf4 entry point passes 0).  AUTO defers to
 * daedalus_recipe_substrate_for(). */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
    return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
}
/* Public dispatcher for the VP9 mc_8h kernel.  Unlike the H.264 qpel
 * entry points, src and dst carry separate strides.  AUTO defers to
 * daedalus_recipe_substrate_for(). */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
                                uint8_t *dst, size_t dst_stride,
                                const uint8_t *src, size_t src_stride,
                                size_t n_blocks, const daedalus_mc_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride,
                                  n_blocks, meta);
    return dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride,
                              n_blocks, meta);
}
/* Public dispatcher for the AV1 CDEF 8x8 kernel.  AUTO defers to
 * daedalus_recipe_substrate_for(); returns the selected backend's
 * result. */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               const uint16_t *tmp,
                               size_t n_blocks, const daedalus_cdef_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
    return dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
}
/* Public dispatcher for the H.264 4x4 IDCT kernel.  `coeffs` is passed
 * through as a non-const pointer to both backends.  AUTO defers to
 * daedalus_recipe_substrate_for(). */
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
                                 uint8_t *dst, size_t dst_stride,
                                 int16_t *coeffs, size_t n_blocks,
                                 const daedalus_h264_block_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct4_qpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct4_cpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
}
/* Public dispatcher for the H.264 8x8 IDCT kernel.  `coeffs` is passed
 * through as a non-const pointer to both backends.  AUTO defers to
 * daedalus_recipe_substrate_for(). */
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
                                 uint8_t *dst, size_t dst_stride,
                                 int16_t *coeffs, size_t n_blocks,
                                 const daedalus_h264_block_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_idct8_qpu(ctx, dst, dst_stride,
                                       coeffs, n_blocks, meta);
    return dispatch_h264_idct8_cpu(ctx, dst, dst_stride,
                                   coeffs, n_blocks, meta);
}
/* Public dispatcher for the H.264 luma_v deblock kernel.  AUTO defers
 * to daedalus_recipe_substrate_for(); returns the selected backend's
 * result. */
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges,
                                          const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
    return dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta);
}
/* Public dispatcher for the H.264 luma_h deblock kernel.  AUTO defers
 * to daedalus_recipe_substrate_for(); returns the selected backend's
 * result. */
int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges,
                                          const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_deblock_h_qpu(ctx, dst, dst_stride, n_edges, meta);
    return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
/* Public dispatcher for the H.264 chroma_v deblock kernel.  AUTO defers
 * to daedalus_recipe_substrate_for(); returns the selected backend's
 * result. */
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges,
                                            const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_deblock_chroma_v_qpu(ctx, dst, dst_stride, n_edges, meta);
    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
}
/* Public dispatcher for the H.264 chroma_h deblock kernel.  AUTO defers
 * to daedalus_recipe_substrate_for(); returns the selected backend's
 * result. */
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges,
                                            const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_deblock_chroma_h_qpu(ctx, dst, dst_stride, n_edges, meta);
    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
/* Stamps out daedalus_dispatch_h264_deblock_<edge>() for the intra
 * deblock variants, which have a CPU backend only.  AUTO is resolved
 * via daedalus_recipe_substrate_for(kernel_id).  If the result is QPU
 * and the context reports a QPU, the call fails with -1 (there is no
 * QPU backend to run); every other case runs cpu_impl. */
#define DEFINE_INTRA_DISPATCH(edge, kernel_id, cpu_impl)                   \
int daedalus_dispatch_h264_deblock_ ## edge (daedalus_ctx *ctx,            \
    daedalus_substrate sub, uint8_t *dst, size_t dst_stride,               \
    size_t n_edges, const daedalus_h264_deblock_meta *meta)                \
{                                                                          \
    const daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)     \
        ? daedalus_recipe_substrate_for(kernel_id)                         \
        : sub;                                                             \
    if (target == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))     \
        return -1;                                                         \
    return cpu_impl(ctx, dst, dst_stride, n_edges, meta);                  \
}

DEFINE_INTRA_DISPATCH(luma_v_intra,
                      DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA,
                      dispatch_h264_deblock_luma_v_intra_cpu)
DEFINE_INTRA_DISPATCH(luma_h_intra,
                      DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA,
                      dispatch_h264_deblock_luma_h_intra_cpu)
DEFINE_INTRA_DISPATCH(chroma_v_intra,
                      DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA,
                      dispatch_h264_deblock_chroma_v_intra_cpu)
DEFINE_INTRA_DISPATCH(chroma_h_intra,
                      DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA,
                      dispatch_h264_deblock_chroma_h_intra_cpu)

#undef DEFINE_INTRA_DISPATCH
/* Public dispatcher for the H.264 qpel mc20 kernel.  src and dst share
 * one stride.  AUTO defers to daedalus_recipe_substrate_for(); returns
 * the selected backend's result. */
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks,
                                     const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_qpel_mc20_qpu(ctx, dst, src, stride,
                                           n_blocks, meta);
    return dispatch_h264_qpel_mc20_cpu(ctx, dst, src, stride,
                                       n_blocks, meta);
}
/* Public dispatcher for the H.264 qpel mc02 kernel.  src and dst share
 * one stride.  AUTO defers to daedalus_recipe_substrate_for(); returns
 * the selected backend's result. */
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks,
                                     const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_qpel_mc02_qpu(ctx, dst, src, stride, n_blocks, meta);
    return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
}
/* Public dispatcher for the H.264 qpel mc22 kernel.  src and dst share
 * one stride.  AUTO defers to daedalus_recipe_substrate_for(); returns
 * the selected backend's result. */
int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks,
                                     const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate target = (sub == DAEDALUS_SUBSTRATE_AUTO)
        ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22)
        : sub;

    /* A QPU request on a context that reports no QPU runs on the CPU. */
    if (target == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        target = DAEDALUS_SUBSTRATE_CPU;

    if (target != DAEDALUS_SUBSTRATE_CPU)
        return dispatch_h264_qpel_mc22_qpu(ctx, dst, src, stride, n_blocks, meta);
    return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
}
/* Public dispatchers for every remaining H.264 qpel position.
 *
 * All of these positions now have both a CPU and a QPU backend, so a
 * single generator macro covers them.  (The former CPU-only
 * DEFINE_QPEL_DISPATCH macro had no instantiations left, and
 * DEFINE_QPEL_DIAG_PUBLIC was token-for-token identical to
 * DEFINE_QPEL_DISPATCH_QPU; both are folded into the macro below.  The
 * set of generated functions is unchanged.)
 *
 * Routing, same shape as mc20/02/22:
 *   - AUTO is resolved via daedalus_recipe_substrate_for(kernel);
 *   - a QPU request on a context for which daedalus_ctx_has_qpu() is
 *     false is downgraded to CPU;
 *   - CPU calls dispatch_h264_qpel_<suffix>_cpu, anything else calls
 *     dispatch_h264_qpel_<suffix>_qpu.
 */
#define DEFINE_QPEL_DISPATCH_QPU(suffix, kernel)                           \
int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx,              \
    daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
{                                                                          \
    daedalus_substrate eff = sub;                                          \
    if (eff == DAEDALUS_SUBSTRATE_AUTO)                                    \
        eff = daedalus_recipe_substrate_for(kernel);                       \
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))       \
        eff = DAEDALUS_SUBSTRATE_CPU;                                      \
    if (eff == DAEDALUS_SUBSTRATE_CPU)                                     \
        return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \
                                                     n_blocks, meta);      \
    return dispatch_h264_qpel_ ## suffix ## _qpu(ctx, dst, src, stride,    \
                                                 n_blocks, meta);          \
}

/* Single-axis quarter-pel positions (mc10/30 horizontal, mc01/03
 * vertical). */
DEFINE_QPEL_DISPATCH_QPU(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
DEFINE_QPEL_DISPATCH_QPU(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
DEFINE_QPEL_DISPATCH_QPU(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
DEFINE_QPEL_DISPATCH_QPU(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)

/* mc11..mc33 diagonals. */
DEFINE_QPEL_DISPATCH_QPU(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11)
DEFINE_QPEL_DISPATCH_QPU(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12)
DEFINE_QPEL_DISPATCH_QPU(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13)
DEFINE_QPEL_DISPATCH_QPU(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21)
DEFINE_QPEL_DISPATCH_QPU(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
DEFINE_QPEL_DISPATCH_QPU(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
DEFINE_QPEL_DISPATCH_QPU(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
DEFINE_QPEL_DISPATCH_QPU(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)

/* avg_ biprediction dispatchers (15 positions).  The underlying _qpu
 * dispatch fns reuse the diag QPU helper since the avg_ shaders share
 * the put_ src envelope (the L2 step only touches dst). */
DEFINE_QPEL_DISPATCH_QPU(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
DEFINE_QPEL_DISPATCH_QPU(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
DEFINE_QPEL_DISPATCH_QPU(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
DEFINE_QPEL_DISPATCH_QPU(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
DEFINE_QPEL_DISPATCH_QPU(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
DEFINE_QPEL_DISPATCH_QPU(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
DEFINE_QPEL_DISPATCH_QPU(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
DEFINE_QPEL_DISPATCH_QPU(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
DEFINE_QPEL_DISPATCH_QPU(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
DEFINE_QPEL_DISPATCH_QPU(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
DEFINE_QPEL_DISPATCH_QPU(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
DEFINE_QPEL_DISPATCH_QPU(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
DEFINE_QPEL_DISPATCH_QPU(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
DEFINE_QPEL_DISPATCH_QPU(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
DEFINE_QPEL_DISPATCH_QPU(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)

#undef DEFINE_QPEL_DISPATCH_QPU
/* -------------------- Recipe convenience wrappers --------------- */
/* Recipe wrapper: runs daedalus_dispatch_vp9_idct8() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const int16_t *coeffs, size_t n_blocks,
                                       const daedalus_idct8_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_idct8(ctx, auto_pick, dst, dst_stride,
                                       coeffs, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_vp9_lpf4() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf4(ctx, auto_pick, dst, dst_stride,
                                      n_edges, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_vp9_lpf8() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf8(ctx, auto_pick, dst, dst_stride,
                                      n_edges, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_vp9_mc_8h() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const uint8_t *src, size_t src_stride,
                                       size_t n_blocks, const daedalus_mc_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_mc_8h(ctx, auto_pick, dst, dst_stride,
                                       src, src_stride, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_cdef_8x8() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      const uint16_t *tmp,
                                      size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_cdef_8x8(ctx, auto_pick, dst, dst_stride,
                                      tmp, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_idct4() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
                                        uint8_t *dst, size_t dst_stride,
                                        int16_t *coeffs, size_t n_blocks,
                                        const daedalus_h264_block_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_idct4(ctx, auto_pick, dst, dst_stride,
                                        coeffs, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_idct8() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
                                        uint8_t *dst, size_t dst_stride,
                                        int16_t *coeffs, size_t n_blocks,
                                        const daedalus_h264_block_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_idct8(ctx, auto_pick, dst, dst_stride,
                                        coeffs, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_deblock_luma_v() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
                                                 uint8_t *dst, size_t dst_stride,
                                                 size_t n_edges,
                                                 const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_luma_v(ctx, auto_pick,
                                                 dst, dst_stride, n_edges, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_deblock_luma_h() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
                                                 uint8_t *dst, size_t dst_stride,
                                                 size_t n_edges,
                                                 const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_luma_h(ctx, auto_pick,
                                                 dst, dst_stride, n_edges, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_deblock_chroma_v() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges,
                                                   const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_chroma_v(ctx, auto_pick,
                                                   dst, dst_stride, n_edges, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_deblock_chroma_h() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges,
                                                   const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_chroma_h(ctx, auto_pick,
                                                   dst, dst_stride, n_edges, meta);
}
/* Stamps out daedalus_recipe_dispatch_h264_deblock_<edge>(): forwards
 * to daedalus_dispatch_h264_deblock_<edge>() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
#define DEFINE_INTRA_RECIPE(edge)                                          \
int daedalus_recipe_dispatch_h264_deblock_ ## edge (daedalus_ctx *ctx,     \
    uint8_t *dst, size_t dst_stride,                                       \
    size_t n_edges, const daedalus_h264_deblock_meta *meta)                \
{                                                                          \
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;          \
    return daedalus_dispatch_h264_deblock_ ## edge (ctx, auto_pick,        \
        dst, dst_stride, n_edges, meta);                                   \
}

DEFINE_INTRA_RECIPE(luma_v_intra)
DEFINE_INTRA_RECIPE(luma_h_intra)
DEFINE_INTRA_RECIPE(chroma_v_intra)
DEFINE_INTRA_RECIPE(chroma_h_intra)

#undef DEFINE_INTRA_RECIPE
/* Recipe wrapper: runs daedalus_dispatch_h264_qpel_mc20() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src,
                                            size_t stride, size_t n_blocks,
                                            const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_qpel_mc20(ctx, auto_pick,
                                            dst, src, stride, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_qpel_mc02() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src,
                                            size_t stride, size_t n_blocks,
                                            const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_qpel_mc02(ctx, auto_pick,
                                            dst, src, stride, n_blocks, meta);
}
/* Recipe wrapper: runs daedalus_dispatch_h264_qpel_mc22() with
 * DAEDALUS_SUBSTRATE_AUTO and returns its result. */
int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src,
                                            size_t stride, size_t n_blocks,
                                            const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_qpel_mc22(ctx, auto_pick,
                                            dst, src, stride, n_blocks, meta);
}
/* Stamps out daedalus_recipe_dispatch_h264_qpel_<pos>(): forwards to
 * daedalus_dispatch_h264_qpel_<pos>() with DAEDALUS_SUBSTRATE_AUTO and
 * returns its result. */
#define DEFINE_QPEL_RECIPE(pos)                                            \
int daedalus_recipe_dispatch_h264_qpel_ ## pos(daedalus_ctx *ctx,          \
    uint8_t *dst, const uint8_t *src, size_t stride,                       \
    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
{                                                                          \
    const daedalus_substrate auto_pick = DAEDALUS_SUBSTRATE_AUTO;          \
    return daedalus_dispatch_h264_qpel_ ## pos(ctx, auto_pick,             \
        dst, src, stride, n_blocks, meta);                                 \
}

/* put_ positions not covered by the hand-written wrappers above. */
DEFINE_QPEL_RECIPE(mc10)
DEFINE_QPEL_RECIPE(mc30)
DEFINE_QPEL_RECIPE(mc01)
DEFINE_QPEL_RECIPE(mc03)
DEFINE_QPEL_RECIPE(mc11)
DEFINE_QPEL_RECIPE(mc12)
DEFINE_QPEL_RECIPE(mc13)
DEFINE_QPEL_RECIPE(mc21)
DEFINE_QPEL_RECIPE(mc23)
DEFINE_QPEL_RECIPE(mc31)
DEFINE_QPEL_RECIPE(mc32)
DEFINE_QPEL_RECIPE(mc33)

/* avg_ positions (all 15). */
DEFINE_QPEL_RECIPE(avg_mc20)
DEFINE_QPEL_RECIPE(avg_mc02)
DEFINE_QPEL_RECIPE(avg_mc22)
DEFINE_QPEL_RECIPE(avg_mc10)
DEFINE_QPEL_RECIPE(avg_mc30)
DEFINE_QPEL_RECIPE(avg_mc01)
DEFINE_QPEL_RECIPE(avg_mc03)
DEFINE_QPEL_RECIPE(avg_mc11)
DEFINE_QPEL_RECIPE(avg_mc12)
DEFINE_QPEL_RECIPE(avg_mc13)
DEFINE_QPEL_RECIPE(avg_mc21)
DEFINE_QPEL_RECIPE(avg_mc23)
DEFINE_QPEL_RECIPE(avg_mc31)
DEFINE_QPEL_RECIPE(avg_mc32)
DEFINE_QPEL_RECIPE(avg_mc33)

#undef DEFINE_QPEL_RECIPE