0a99b16489
Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264
deblock) through the public API. These three kernels have recipe
substrate = CPU, but per Issue 003 the mixed-kernel helper value
is real — the dispatch path must exist so override-mode callers
can request QPU on the side.
Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO
alloc + memcpy + dispatch + readback). Each kernel has its own
push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc
2-field shared with lpf).
Notable bug caught + fixed in test_api_opportunistic_qpu: the
initial dispatch_mc_8h_qpu sized src_max using CPU-side reach
(src_off + 3 + 8 + 7*stride), but the QPU shader reads src[
src_off + row*stride + 0..14] for row=0..7. Last block had 3
uninitialized bytes → 99.8% match → 100% after fix.
After this commit, the public API surface fully covers cycles 1-8:
Cycle 1 (IDCT 8x8): CPU + QPU + AUTO bit-exact
Cycle 2 (LPF wd=4): CPU + QPU + AUTO bit-exact
Cycle 3 (MC 8h): CPU recipe; QPU override bit-exact
Cycle 4 (LPF wd=8): CPU + QPU + AUTO bit-exact
Cycle 5 (CDEF): CPU recipe; QPU override (untested in this
test — bench_v3d_cdef is the authoritative 3-way M1)
Cycle 6 (H.264 IDCT 4x4): CPU only (no QPU shader by recipe)
Cycle 7 (H.264 IDCT 8x8): CPU only
Cycle 8 (H.264 deblock luma-v): CPU recipe; QPU override bit-exact
Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact
comparison for VP9 MC and H.264 deblock through the API.
test_api_idct, test_api_lpf, test_api_h264 still pass.
Per the locked Phase 8 architecture (project_phase8_architecture
memory): next session opens daedalus-v4l2 sibling repo with
Option B (kernel V4L2 shim + userspace daemon), Option γ (dlopen
FFmpeg parser).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
884 lines
34 KiB
C
884 lines
34 KiB
C
/*
|
|
 * daedalus-fourier core library — Phase 8 public API with QPU dispatch wired.
|
|
*
|
|
 * Wraps the cycle 1-8 kernels behind the public C API in
 * include/daedalus.h. Recipe dispatch routes each kernel to the
 * verdict substrate from that cycle's Phase 7 doc.
|
|
*
|
|
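 * QPU dispatch wiring status:
 *   VP9 IDCT 8x8, VP9 LPF wd=4 / wd=8, VP9 MC 8h, AV1 CDEF 8x8 and
 *   H.264 deblock luma-v: wired.
 *   H.264 IDCT 4x4 / 8x8: no QPU path (a QPU request on a QPU-capable
 *   context returns -1); the CPU path always works.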
|
|
*
|
|
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
|
|
* dav1d BSD-2-Clause NEON snapshots.
|
|
*/
|
|
#include "../include/daedalus.h"
|
|
#include "v3d_runner.h"
|
|
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
|
|
/* -------------------- Context -------------------- */
|
|
|
|
struct daedalus_ctx {
|
|
int has_qpu;
|
|
v3d_runner *runner; /* NULL when has_qpu == 0 */
|
|
|
|
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
|
int idct8_pipe_ready;
|
|
v3d_pipeline idct8_pipe;
|
|
int lpf4_pipe_ready;
|
|
v3d_pipeline lpf4_pipe;
|
|
int lpf8_pipe_ready;
|
|
v3d_pipeline lpf8_pipe;
|
|
int mc8h_pipe_ready;
|
|
v3d_pipeline mc8h_pipe;
|
|
int cdef_pipe_ready;
|
|
v3d_pipeline cdef_pipe;
|
|
int h264deblock_pipe_ready;
|
|
v3d_pipeline h264deblock_pipe;
|
|
};
|
|
|
|
daedalus_ctx *daedalus_ctx_create(void)
|
|
{
|
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
|
if (!ctx) return NULL;
|
|
ctx->runner = v3d_runner_create();
|
|
ctx->has_qpu = (ctx->runner != NULL);
|
|
return ctx;
|
|
}
|
|
|
|
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
|
{
|
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
|
if (!ctx) return NULL;
|
|
ctx->has_qpu = 0;
|
|
ctx->runner = NULL;
|
|
return ctx;
|
|
}
|
|
|
|
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
|
{
|
|
return ctx ? ctx->has_qpu : 0;
|
|
}
|
|
|
|
/* Tear down a context: destroy every pipeline that was built, then the
 * runner, then free the context. NULL is accepted and ignored. */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    if (ctx == NULL)
        return;

    v3d_runner *runner = ctx->runner;
    if (runner != NULL) {
        /* Same teardown order as the fields are declared. */
        const struct {
            int           built;
            v3d_pipeline *pipe;
        } owned[] = {
            { ctx->idct8_pipe_ready,       &ctx->idct8_pipe       },
            { ctx->lpf4_pipe_ready,        &ctx->lpf4_pipe        },
            { ctx->lpf8_pipe_ready,        &ctx->lpf8_pipe        },
            { ctx->mc8h_pipe_ready,        &ctx->mc8h_pipe        },
            { ctx->cdef_pipe_ready,        &ctx->cdef_pipe        },
            { ctx->h264deblock_pipe_ready, &ctx->h264deblock_pipe },
        };

        for (size_t k = 0; k < sizeof owned / sizeof owned[0]; k++) {
            if (owned[k].built)
                v3d_runner_destroy_pipeline(runner, owned[k].pipe);
        }
        v3d_runner_destroy(runner);
    }
    free(ctx);
}
|
|
|
|
/* -------------------- Recipe query -------------------- */
|
|
|
|
/* Return the recipe (default) substrate for a kernel. Kernels are
 * grouped by verdict; any value not listed falls through to CPU. */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    switch (k) {
    /* Recipe says QPU. */
    case DAEDALUS_KERNEL_VP9_IDCT8:
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:
        return DAEDALUS_SUBSTRATE_QPU;

    /* Recipe says CPU. */
    case DAEDALUS_KERNEL_VP9_MC_8H:
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:
    case DAEDALUS_KERNEL_H264_IDCT4:
    case DAEDALUS_KERNEL_H264_IDCT8:
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:
        return DAEDALUS_SUBSTRATE_CPU;
    }
    return DAEDALUS_SUBSTRATE_CPU;
}
|
|
|
|
/* -------------------- NEON externs (per cycle bench links) ----- */
|
|
|
|
/* Prototypes for the NEON kernels this file calls; the definitions are
 * not in this translation unit. */

/* VP9 */
void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
                                   int16_t *block, int eob);
void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
                                   int E, int I, int H);
void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
                                   int E, int I, int H);
void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int h, int mx, int my);

/* AV1 (dav1d) */
void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                  const uint16_t *tmp,
                                  int pri_strength, int sec_strength,
                                  int dir, int damping, int h,
                                  size_t edges);

/* H.264 */
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                     int alpha, int beta, int8_t *tc0);
|
|
|
|
/* -------------------- CPU dispatch implementations -------------- */
|
|
|
|
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
const int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_idct8_meta *meta)
|
|
{
|
|
(void) ctx;
|
|
int16_t scratch[64];
|
|
for (size_t i = 0; i < n_blocks; i++) {
|
|
memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
|
|
ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off,
|
|
(ptrdiff_t) dst_stride,
|
|
scratch, 64);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* CPU path for the VP9 horizontal loop filter. wd_8 selects the wd=8
 * NEON kernel when nonzero and the wd=4 kernel otherwise; each edge is
 * filtered at dst + meta[i].dst_off with its own E/I/H thresholds.
 * ctx is unused. Always returns 0. */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Both kernels share one signature, so pick the function once. */
    void (*filter)(uint8_t *, ptrdiff_t, int, int, int) =
        wd_8 ? ff_vp9_loop_filter_h_8_8_neon
             : ff_vp9_loop_filter_h_4_8_neon;
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge < n_edges; edge++) {
        filter(dst + meta[edge].dst_off, pitch,
               meta[edge].E, meta[edge].I, meta[edge].H);
    }
    return 0;
}
|
|
|
|
/* CPU path for VP9 MC (regular 8-tap, horizontal): one NEON call per
 * block with h = 8, mx = meta[i].mx and my = 0. ctx is unused.
 * Always returns 0. */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    const ptrdiff_t out_pitch = (ptrdiff_t) dst_stride;
    const ptrdiff_t in_pitch  = (ptrdiff_t) src_stride;

    (void) ctx;

    for (size_t blk = 0; blk < n_blocks; blk++) {
        uint8_t *out = dst + meta[blk].dst_off;
        /* The NEON kernel is handed src_off + 3 (presumably the centre
         * of the 8-tap window — TODO confirm against the kernel). */
        const uint8_t *in = src + meta[blk].src_off + 3;

        ff_vp9_put_regular8_h_neon(out, out_pitch, in, in_pitch,
                                   8, meta[blk].mx, 0);
    }
    return 0;
}
|
|
|
|
/* CPU path for AV1 CDEF 8x8: one dav1d NEON call per block, reading
 * from tmp + meta[i].tmp_off_u16 with h = 8 and edges = 0.
 * ctx is unused. Always returns 0. */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t blk = 0; blk < n_blocks; blk++) {
        uint8_t        *out = dst + meta[blk].dst_off;
        const uint16_t *in  = tmp + meta[blk].tmp_off_u16;

        dav1d_cdef_filter8_8bpc_neon(out, pitch, in,
                                     meta[blk].pri_strength,
                                     meta[blk].sec_strength,
                                     meta[blk].dir, meta[blk].damping,
                                     8, 0);
    }
    return 0;
}
|
|
|
|
static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
(void) ctx;
|
|
for (size_t i = 0; i < n_blocks; i++)
|
|
ff_h264_idct_add_neon(dst + meta[i].dst_off,
|
|
coeffs + i * 16,
|
|
(ptrdiff_t) dst_stride);
|
|
return 0;
|
|
}
|
|
|
|
static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
(void) ctx;
|
|
for (size_t i = 0; i < n_blocks; i++)
|
|
ff_h264_idct8_add_neon(dst + meta[i].dst_off,
|
|
coeffs + i * 64,
|
|
(ptrdiff_t) dst_stride);
|
|
return 0;
|
|
}
|
|
|
|
/* CPU path for the H.264 vertical luma deblock filter: one NEON call
 * per edge at dst + meta[i].dst_off with that edge's alpha, beta and
 * tc0[4]. ctx is unused. Always returns 0. */
static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;

    (void) ctx;

    for (size_t edge = 0; edge < n_edges; edge++) {
        /* The NEON kernel wants a mutable tc0 pointer but meta is
         * const, so hand it a per-edge scratch copy. */
        int8_t tc0_scratch[4];
        for (int k = 0; k < 4; k++)
            tc0_scratch[k] = meta[edge].tc0[k];

        ff_h264_v_loop_filter_luma_neon(dst + meta[edge].dst_off, pitch,
                                        meta[edge].alpha, meta[edge].beta,
                                        tc0_scratch);
    }
    return 0;
}
|
|
|
|
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
|
|
|
/* Push-constant block for the IDCT 8x8 pipeline: four 32-bit slots,
 * in this exact order. */
typedef struct idct8_pc {
    uint32_t n_blocks;        /* slot 0: number of blocks in the batch */
    uint32_t blocks_per_row;  /* slot 1 */
    uint32_t dst_stride_u8;   /* slot 2: dst row pitch in bytes */
    uint32_t _pad;            /* slot 3: padding */
} idct8_pc;
|
|
|
|
/* Build the IDCT 8x8 compute pipeline on first use ("v3d_idct8.spv",
 * 3 SSBOs, idct8_pc push constants). Returns 0 when the pipeline is
 * ready, -1 if creation fails (the ready flag then stays clear). */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner,
                                       "v3d_idct8.spv",
                                       /*n_ssbos=*/3,
                                       /*push_const_size=*/sizeof(idct8_pc),
                                       &ctx->idct8_pipe) != 0)
            return -1;
        ctx->idct8_pipe_ready = 1;
    }
    return 0;
}
|
|
|
|
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
const int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_idct8_meta *meta)
|
|
{
|
|
if (ensure_idct8_pipeline(ctx) != 0) return -1;
|
|
|
|
/* Allocate three SSBOs per call (coeffs, dst, meta). Performance-
|
|
* tuning (buffer pool) is deferred; correctness first. */
|
|
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
|
size_t meta_bytes = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */
|
|
/* dst buffer must hold all of dst[0..max_dst_off + 64 + 8*stride].
|
|
* Cheapest correct answer: alloc the smallest contiguous region
|
|
* containing every block's footprint. For Phase 8 we assume the
|
|
* caller's dst surface starts at byte 0 of the buffer and use
|
|
* the full provided extent. We size by scanning meta. */
|
|
size_t max_byte_touched = 0;
|
|
for (size_t i = 0; i < n_blocks; i++) {
|
|
size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
|
|
if (end > max_byte_touched) max_byte_touched = end;
|
|
}
|
|
|
|
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
|
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
|
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
|
}
|
|
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
|
}
|
|
|
|
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
|
* caller's full region (since we'll need to read it back). */
|
|
memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
|
|
memcpy(buf_dst.mapped, dst, max_byte_touched);
|
|
uint32_t *m = buf_meta.mapped;
|
|
for (size_t i = 0; i < n_blocks; i++) {
|
|
m[2*i + 0] = meta[i].block_x;
|
|
m[2*i + 1] = meta[i].block_y;
|
|
}
|
|
|
|
/* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
|
|
v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
|
|
if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
|
|
goto fail;
|
|
}
|
|
|
|
/* WG geometry: 32 blocks per WG. */
|
|
uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
|
|
idct8_pc pc = {
|
|
.n_blocks = (uint32_t) n_blocks,
|
|
.blocks_per_row = 0, /* unused by v4 shader (meta drives placement) */
|
|
.dst_stride_u8 = (uint32_t) dst_stride,
|
|
._pad = 0,
|
|
};
|
|
|
|
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
|
if (cb == VK_NULL_HANDLE) goto fail;
|
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
|
vkBeginCommandBuffer(cb, &cbbi);
|
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
|
ctx->idct8_pipe.pipeline);
|
|
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
|
ctx->idct8_pipe.layout, 0, 1,
|
|
&ctx->idct8_pipe.desc_set, 0, NULL);
|
|
vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
|
|
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
|
vkCmdDispatch(cb, wg_count, 1, 1);
|
|
vkEndCommandBuffer(cb);
|
|
|
|
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
|
|
|
/* Read-back dst. */
|
|
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
|
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
|
return 0;
|
|
|
|
fail:
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
|
return -1;
|
|
}
|
|
|
|
/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
|
|
*
|
|
* NOTE: the two LPF shaders disagree on push-constant slot order.
|
|
* v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
|
|
* v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
|
|
*
|
|
 * Same total size (16 bytes), but dst_stride_u8 sits in slot 1 for
 * wd=4 and in slot 2 for wd=8. Keep separate struct definitions to
 * avoid silent corruption — Phase 8 caught this empirically when
 * test_api_lpf wd=8 reported a 95.6 % match.
|
|
*/
|
|
/* Push-constant block for the wd=4 loop-filter pipeline: four 32-bit
 * slots, with the stride in slot 1. */
typedef struct lpf4_pc {
    uint32_t n_edges;         /* slot 0: number of edges in the batch */
    uint32_t dst_stride_u8;   /* slot 1: dst row pitch in bytes */
    uint32_t _pad0;           /* slot 2: padding */
    uint32_t _pad1;           /* slot 3: padding */
} lpf4_pc;
|
|
|
|
/* Push-constant block for the wd=8 loop-filter pipeline: four 32-bit
 * slots, with the stride in slot 2. */
typedef struct lpf8_pc {
    uint32_t n_edges;         /* slot 0: number of edges in the batch */
    uint32_t blocks_per_row;  /* slot 1: unused by shader, must exist */
    uint32_t dst_stride_u8;   /* slot 2: dst row pitch in bytes */
    uint32_t _pad;            /* slot 3: padding */
} lpf8_pc;
|
|
|
|
/* Build one of the loop-filter pipelines on first use.
 *   wd_8  nonzero selects the lpf8_pc push-constant size, else lpf4_pc
 *   flag  ready flag for the chosen pipeline; set to 1 on success
 *   pipe  pipeline slot to fill
 *   spv   shader file name handed to the runner
 * Returns 0 when the pipeline is ready, -1 if creation fails. */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    if (!*flag) {
        size_t pc_bytes = sizeof(lpf4_pc);
        if (wd_8)
            pc_bytes = sizeof(lpf8_pc);

        if (v3d_runner_create_pipeline(ctx->runner, spv,
                                       /*n_ssbos=*/2,
                                       /*push_const_size=*/(uint32_t) pc_bytes,
                                       pipe) != 0)
            return -1;
        *flag = 1;
    }
    return 0;
}
|
|
|
|
/*
 * QPU path for the VP9 horizontal loop filter (wd=4 or wd=8).
 *
 * Per call: builds the selected pipeline if needed, uploads the
 * smallest dst window [lo, hi) covering every edge plus one uvec4 of
 * metadata per edge (window-relative offset, E, I, H), dispatches,
 * waits, and copies the window back into dst.
 *
 *   wd_8        nonzero selects the wd=8 shader and lpf8_pc layout,
 *               zero the wd=4 shader and lpf4_pc layout
 *   dst         caller's surface; only bytes [lo, hi) are touched
 *   dst_stride  dst row pitch in bytes
 *   n_edges     number of edges; 0 is a successful no-op
 *   meta        per-edge dst_off and E/I/H thresholds
 *
 * Returns 0 on success, -1 on any pipeline, buffer, recording or submit
 * failure (dst is left untouched in that case).
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    /* Empty batch: succeed like the CPU path does, and never ask the
     * runner for zero-sized buffers. */
    if (n_edges == 0) return 0;

    int          *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p    = wd_8 ? &ctx->lpf8_pipe : &ctx->lpf4_pipe;
    const char   *spv  = wd_8 ? "v3d_lpf_h_8_8.spv" : "v3d_lpf_h_4_8.spv";
    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);  /* uvec4 per edge */

    /* Determine smallest dst window. Each edge covers bytes
     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride. An edge
     * closer than 4 bytes to the start clamps the window to byte 0. */
    size_t lo = (size_t) -1, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base    = meta[i].dst_off;
        size_t this_lo = (base >= 4) ? base - 4 : 0;
        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
        if (this_lo < lo) lo = this_lo;
        if (this_hi > hi) hi = this_hi;
    }
    size_t dst_window_size = hi - lo;  /* n_edges >= 1, so hi > lo */

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
        v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
        return -1;
    }

    int rc = -1;  /* flipped to 0 only after a successful read-back */

    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);  /* window-relative */
        m[4*i + 1] = (uint32_t) meta[i].E;
        m[4*i + 2] = (uint32_t) meta[i].I;
        m[4*i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto done;

    /* WG geometry: 32 edges per WG. */
    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);

    /* NOTE(review): cb is never released here; presumably the runner
     * owns and recycles it — confirm against v3d_runner. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto done;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto done;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            p->layout, 0, 1, &p->desc_set, 0, NULL);
    /* The two shaders keep the stride in different push-constant slots,
     * so each gets its own struct. */
    if (wd_8) {
        lpf8_pc pc = { .n_edges        = (uint32_t) n_edges,
                       .blocks_per_row = 0,
                       .dst_stride_u8  = (uint32_t) dst_stride,
                       ._pad           = 0 };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    } else {
        lpf4_pc pc = { .n_edges       = (uint32_t) n_edges,
                       .dst_stride_u8 = (uint32_t) dst_stride };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    }
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported here; do not submit a bad buffer. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto done;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto done;

    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
    rc = 0;

done:
    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
    return rc;
}
|
|
|
|
/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */
|
|
|
|
/* Push-constant block for the VP9 MC 8h pipeline: four 32-bit slots,
 * in this exact order. */
typedef struct mc_pc {
    uint32_t n_blocks;        /* slot 0: number of blocks in the batch */
    uint32_t dst_stride_u8;   /* slot 1: dst row pitch in bytes */
    uint32_t src_stride_u8;   /* slot 2: src row pitch in bytes */
    uint32_t _pad;            /* slot 3: padding */
} mc_pc;
|
|
|
|
/*
 * QPU path for VP9 MC (8-tap horizontal).
 *
 * Per call: builds the pipeline if needed, allocates three SSBOs
 * (meta, dst, src), uploads src and dst from byte 0 up to the furthest
 * footprint, dispatches, waits, and copies the dst window back.
 *
 *   dst / dst_stride  destination surface and its row pitch in bytes
 *   src / src_stride  source surface and its row pitch in bytes
 *   n_blocks          number of blocks; 0 is a successful no-op
 *   meta              per-block dst_off, src_off and mx
 *
 * Returns 0 on success, -1 on any pipeline, buffer, recording or submit
 * failure (dst is left untouched in that case).
 */
static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    /* Empty batch: succeed like the CPU path does, and never ask the
     * runner for zero-sized buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->mc8h_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_mc_8h.spv",
                                       3, sizeof(mc_pc), &ctx->mc8h_pipe) != 0)
            return -1;
        ctx->mc8h_pipe_ready = 1;
    }

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);  /* 4 words per block */
    size_t dst_max = 0, src_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        /* dst footprint: 8 rows at dst_stride, 8 bytes wide. */
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
        size_t se = meta[i].src_off + 7 * src_stride + 15;
        if (se > src_max) src_max = se;
    }

    v3d_buffer bm = {0}, bd = {0}, bs = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bm);
        return -1;
    }

    int rc = -1;  /* flipped to 0 only after a successful read-back */

    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = (uint32_t) meta[i].mx;
        m[4*i+3] = 0;
    }

    /* Binding order: (meta, dst, src). */
    v3d_buffer binds[3] = { bm, bd, bs };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3)) goto done;

    /* WG geometry: 32 blocks per WG. */
    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    mc_pc pc = { .n_blocks      = (uint32_t) n_blocks,
                 .dst_stride_u8 = (uint32_t) dst_stride,
                 .src_stride_u8 = (uint32_t) src_stride };

    /* NOTE(review): cb is never released here; presumably the runner
     * owns and recycles it — confirm against v3d_runner. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto done;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto done;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->mc8h_pipe.layout, 0, 1,
                            &ctx->mc8h_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->mc8h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported here; do not submit a bad buffer. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto done;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto done;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

done:
    v3d_runner_destroy_buffer(ctx->runner, &bs);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    return rc;
}
|
|
|
|
/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */
|
|
|
|
/* Push-constant block for the AV1 CDEF pipeline: four 32-bit slots,
 * in this exact order. */
typedef struct cdef_pc {
    uint32_t n_blocks;        /* slot 0: number of blocks in the batch */
    uint32_t tmp_stride_u16;  /* slot 1: tmp row pitch in uint16 units */
    uint32_t dst_stride_u8;   /* slot 2: dst row pitch in bytes */
    uint32_t _pad;            /* slot 3: padding */
} cdef_pc;
|
|
|
|
/*
 * QPU path for AV1 CDEF 8x8.
 *
 * Per call: builds the pipeline if needed, allocates three SSBOs
 * (meta, dst, tmp), uploads dst and tmp from element 0 up to the
 * furthest footprint, dispatches, waits, and copies the dst window
 * back. tmp is treated as a stride-16 uint16 surface.
 *
 *   dst / dst_stride  destination surface and its row pitch in bytes
 *   tmp               uint16 input surface, indexed by meta[i].tmp_off_u16
 *   n_blocks          number of blocks; 0 is a successful no-op
 *   meta              per-block dst_off, tmp_off_u16, strengths,
 *                     damping and direction
 *
 * Returns 0 on success, -1 on any pipeline, buffer, recording or submit
 * failure (dst is left untouched in that case).
 */
static int dispatch_cdef_qpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp,
                             size_t n_blocks, const daedalus_cdef_meta *meta)
{
    /* Empty batch: succeed like the CPU path does, and never ask the
     * runner for zero-sized buffers. */
    if (n_blocks == 0) return 0;

    if (!ctx->cdef_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_cdef.spv",
                                       3, sizeof(cdef_pc), &ctx->cdef_pipe) != 0)
            return -1;
        ctx->cdef_pipe_ready = 1;
    }

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);  /* 4 words per block */
    size_t dst_max = 0, tmp_max_u16 = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        /* dst footprint: 8 rows at dst_stride, 8 bytes wide. */
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max) dst_max = de;
        /* Centre 8x8 in the stride-16 tmp.
         * NOTE(review): this covers only the centre block. If the
         * shader also reads filter taps below or right of the furthest
         * block, those elements are not uploaded — the same class of
         * under-sizing that was fixed for MC. This path is not covered
         * by test_api_opportunistic_qpu; confirm against v3d_cdef.comp. */
        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8;
        if (te > tmp_max_u16) tmp_max_u16 = te;
    }
    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);

    v3d_buffer bm = {0}, bd = {0}, bt = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bm);
        return -1;
    }

    int rc = -1;  /* flipped to 0 only after a successful read-back */

    /* tmp is copied from its first element, so any padding the caller
     * keeps before a block origin is carried along; meta[i].tmp_off_u16
     * is assumed to match the caller's layout. */
    memcpy(bt.mapped, tmp, tmp_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        uint32_t pri     = (uint32_t) meta[i].pri_strength;
        uint32_t sec     = (uint32_t) meta[i].sec_strength;
        uint32_t damping = (uint32_t) meta[i].damping;
        m[4*i+0] = meta[i].dst_off;
        /* Packed word: pri from bit 0, sec from bit 8, damping from bit 16. */
        m[4*i+1] = pri | (sec << 8) | (damping << 16);
        m[4*i+2] = meta[i].tmp_off_u16;
        m[4*i+3] = (uint32_t) meta[i].dir;
    }

    /* Binding order: (meta, dst, tmp). */
    v3d_buffer binds[3] = { bm, bd, bt };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3)) goto done;

    /* WG geometry: 4 blocks per WG. */
    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
    cdef_pc pc = { .n_blocks       = (uint32_t) n_blocks,
                   .tmp_stride_u16 = 16,
                   .dst_stride_u8  = (uint32_t) dst_stride };

    /* NOTE(review): cb is never released here; presumably the runner
     * owns and recycles it — confirm against v3d_runner. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto done;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto done;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->cdef_pipe.layout, 0, 1,
                            &ctx->cdef_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->cdef_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported here; do not submit a bad buffer. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto done;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto done;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

done:
    v3d_runner_destroy_buffer(ctx->runner, &bt);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    return rc;
}
|
|
|
|
/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */
|
|
|
|
/* Push-constant block for the H.264 deblock pipeline: four 32-bit
 * slots, in this exact order. */
typedef struct h264deblock_pc {
    uint32_t n_edges;         /* slot 0: number of edges in the batch */
    uint32_t dst_stride_u8;   /* slot 1: dst row pitch in bytes */
    uint32_t _pad0;           /* slot 2: padding */
    uint32_t _pad1;           /* slot 3: padding */
} h264deblock_pc;
|
|
|
|
/*
 * QPU path for the H.264 vertical luma deblock filter.
 *
 * Per call: builds the pipeline if needed, allocates two SSBOs
 * (meta, dst), uploads dst from byte 0 up to the furthest edge
 * footprint, dispatches, waits, and copies the dst window back.
 *
 *   dst / dst_stride  surface being filtered and its row pitch in bytes
 *   n_edges           number of edges; 0 is a successful no-op
 *   meta              per-edge dst_off, alpha, beta and tc0[4]
 *
 * Returns 0 on success, -1 on any pipeline, buffer, recording or submit
 * failure (dst is left untouched in that case).
 */
static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    /* Empty batch: succeed like the CPU path does, and never ask the
     * runner for zero-sized buffers. */
    if (n_edges == 0) return 0;

    if (!ctx->h264deblock_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock.spv",
                                       2, sizeof(h264deblock_pc),
                                       &ctx->h264deblock_pipe) != 0)
            return -1;
        ctx->h264deblock_pipe_ready = 1;
    }

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);  /* 4 words per edge */
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride.
         * The window starts at byte 0, so only the upper end is tracked. */
        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
        if (e > dst_max) dst_max = e;
    }

    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bm);
        return -1;
    }

    int rc = -1;  /* flipped to 0 only after a successful read-back */

    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        /* alpha from bit 0, beta from bit 8. */
        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
        /* Four tc0 values, one byte each, tc0[0] in the low byte. The
         * uint8_t cast keeps a negative tc0 from sign-extending into
         * its neighbours. */
        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    /* Binding order: (meta, dst). */
    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2)) goto done;

    /* WG geometry: 16 edges per WG. */
    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = { .n_edges       = (uint32_t) n_edges,
                          .dst_stride_u8 = (uint32_t) dst_stride };

    /* NOTE(review): cb is never released here; presumably the runner
     * owns and recycles it — confirm against v3d_runner. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto done;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto done;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->h264deblock_pipe.layout, 0, 1,
                            &ctx->h264deblock_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->h264deblock_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    /* Recording errors are reported here; do not submit a bad buffer. */
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto done;
    if (v3d_runner_submit_wait(ctx->runner, cb)) goto done;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

done:
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    return rc;
}
|
|
|
|
/* -------------------- Public dispatch entry points -------------- */
|
|
|
|
/* Function body for public dispatchers whose kernel has no QPU shader.
 * Expects `ctx` and `sub` to be in scope. AUTO resolves to the recipe
 * substrate; a QPU request on a context without a QPU falls back to
 * CPU; any remaining non-CPU choice returns -1. The call site supplies
 * the trailing semicolon. */
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...)                                 \
    daedalus_substrate eff = (sub == DAEDALUS_SUBSTRATE_AUTO)                 \
                                 ? daedalus_recipe_substrate_for(_kernel)     \
                                 : sub;                                       \
    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))          \
        eff = DAEDALUS_SUBSTRATE_CPU;                                         \
    if (eff != DAEDALUS_SUBSTRATE_CPU)                                        \
        return -1; /* QPU path not yet wired for this kernel */               \
    return _cpu_fn(ctx, __VA_ARGS__)
|
|
|
|
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
|
uint8_t *dst, size_t dst_stride,
|
|
const int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_idct8_meta *meta)
|
|
{
|
|
daedalus_substrate eff = sub;
|
|
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
|
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
|
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
|
return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
|
return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
/* Public entry point for the VP9 wd=4 loop filter. AUTO resolves to
 * the recipe substrate; a QPU request on a context without a QPU
 * silently falls back to CPU. Returns the chosen backend's result. */
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate where =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER)
            : sub;

    if (where == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        where = DAEDALUS_SUBSTRATE_CPU;

    return (where == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta)
        : dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
}
|
|
|
|
/* Public entry point for the VP9 wd=8 loop filter. AUTO resolves to
 * the recipe substrate; a QPU request on a context without a QPU
 * silently falls back to CPU. Returns the chosen backend's result. */
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               size_t n_edges, const daedalus_lpf_meta *meta)
{
    daedalus_substrate where =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER)
            : sub;

    if (where == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        where = DAEDALUS_SUBSTRATE_CPU;

    return (where == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta)
        : dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
}
|
|
|
|
/* Public entry point for VP9 MC 8h. AUTO resolves to the recipe
 * substrate (CPU for this kernel); an explicit QPU request runs the
 * QPU path when the context has one and otherwise falls back to CPU.
 * Returns the chosen backend's result. */
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
                                uint8_t *dst, size_t dst_stride,
                                const uint8_t *src, size_t src_stride,
                                size_t n_blocks, const daedalus_mc_meta *meta)
{
    daedalus_substrate where =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H)
            : sub;

    if (where == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        where = DAEDALUS_SUBSTRATE_CPU;

    return (where == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta)
        : dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta);
}
|
|
|
|
/* Public entry point for AV1 CDEF 8x8. AUTO resolves to the recipe
 * substrate (CPU for this kernel); an explicit QPU request runs the
 * QPU path when the context has one and otherwise falls back to CPU.
 * Returns the chosen backend's result. */
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
                               uint8_t *dst, size_t dst_stride,
                               const uint16_t *tmp,
                               size_t n_blocks, const daedalus_cdef_meta *meta)
{
    daedalus_substrate where =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8)
            : sub;

    if (where == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        where = DAEDALUS_SUBSTRATE_CPU;

    return (where == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta)
        : dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
}
|
|
|
|
int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT4, dispatch_h264_idct4_cpu,
|
|
dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_IDCT8, dispatch_h264_idct8_cpu,
|
|
dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
/* Public entry point for the H.264 vertical luma deblock filter. AUTO
 * resolves to the recipe substrate (CPU for this kernel); an explicit
 * QPU request runs the QPU path when the context has one and otherwise
 * falls back to CPU. Returns the chosen backend's result. */
int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                          uint8_t *dst, size_t dst_stride,
                                          size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    daedalus_substrate where =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV)
            : sub;

    if (where == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
        where = DAEDALUS_SUBSTRATE_CPU;

    return (where == DAEDALUS_SUBSTRATE_CPU)
        ? dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta)
        : dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
}
|
|
|
|
/* -------------------- Recipe convenience wrappers --------------- */
|
|
|
|
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
const int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_idct8_meta *meta)
|
|
{
|
|
return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
|
dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
/* Convenience wrapper: VP9 wd=4 loop filter on the recipe substrate,
 * i.e. the explicit dispatcher called with DAEDALUS_SUBSTRATE_AUTO. */
int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf4(ctx, recipe, dst, dst_stride,
                                      n_edges, meta);
}
|
|
|
|
/* Convenience wrapper: VP9 wd=8 loop filter on the recipe substrate,
 * i.e. the explicit dispatcher called with DAEDALUS_SUBSTRATE_AUTO. */
int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      size_t n_edges, const daedalus_lpf_meta *meta)
{
    const daedalus_substrate recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_lpf8(ctx, recipe, dst, dst_stride,
                                      n_edges, meta);
}
|
|
|
|
/* Convenience wrapper: VP9 MC 8h on the recipe substrate, i.e. the
 * explicit dispatcher called with DAEDALUS_SUBSTRATE_AUTO. */
int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx,
                                       uint8_t *dst, size_t dst_stride,
                                       const uint8_t *src, size_t src_stride,
                                       size_t n_blocks, const daedalus_mc_meta *meta)
{
    const daedalus_substrate recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_vp9_mc_8h(ctx, recipe, dst, dst_stride,
                                       src, src_stride, n_blocks, meta);
}
|
|
|
|
/* Convenience wrapper: AV1 CDEF 8x8 on the recipe substrate, i.e. the
 * explicit dispatcher called with DAEDALUS_SUBSTRATE_AUTO. */
int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx,
                                      uint8_t *dst, size_t dst_stride,
                                      const uint16_t *tmp,
                                      size_t n_blocks, const daedalus_cdef_meta *meta)
{
    const daedalus_substrate recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_cdef_8x8(ctx, recipe, dst, dst_stride,
                                      tmp, n_blocks, meta);
}
|
|
|
|
int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
|
dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx,
|
|
uint8_t *dst, size_t dst_stride,
|
|
int16_t *coeffs, size_t n_blocks,
|
|
const daedalus_h264_block_meta *meta)
|
|
{
|
|
return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
|
dst, dst_stride, coeffs, n_blocks, meta);
|
|
}
|
|
|
|
/* Convenience wrapper: H.264 vertical luma deblock on the recipe
 * substrate, i.e. the explicit dispatcher called with
 * DAEDALUS_SUBSTRATE_AUTO. */
int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
                                                 uint8_t *dst, size_t dst_stride,
                                                 size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate recipe = DAEDALUS_SUBSTRATE_AUTO;

    return daedalus_dispatch_h264_deblock_luma_v(ctx, recipe, dst, dst_stride,
                                                 n_edges, meta);
}
|