/*
 * daedalus-fourier core library — Phase 8 skeleton + QPU dispatch.
 *
 * Wraps the per-cycle kernels behind the public C API in
 * include/daedalus.h.  Recipe dispatch routes per-kernel to the
 * verdict substrate from each cycle's Phase 7 doc.
 *
 * QPU dispatch wiring status (as implemented in this file):
 *   VP9 IDCT 8x8          wired (cycle 1 v4 shader, v3d_idct8.spv)
 *   VP9 LPF wd=4 / wd=8   wired (v3d_lpf_h_4_8.spv / v3d_lpf_h_8_8.spv)
 *   VP9 MC 8h             wired (v3d_mc_8h.spv)
 *   AV1 CDEF 8x8          wired (v3d_cdef.spv)
 *   H.264 deblock luma V  wired (v3d_h264deblock.spv)
 *   H.264 IDCT 4x4 / 8x8  wired (v3d_h264_idct4.spv / v3d_h264_idct8.spv)
 *   H.264 qpel mc20       see its dispatch section further down
 *   Everything else       no QPU dispatch here; CPU (NEON) path only.
 * The CPU path always works.
 *
 * License: BSD-2-Clause.  Links vendored FFmpeg LGPL-2.1+ +
 * dav1d BSD-2-Clause NEON snapshots.
 */

#include "../include/daedalus.h"
#include "v3d_runner.h"

/*
 * NOTE(review): the names of the five system headers were lost from the
 * copy of this file that was reviewed; the list below is reconstructed
 * from the identifiers used in this translation unit (size_t/ptrdiff_t,
 * uintN_t, calloc/free/getenv, memcpy/memset).  Confirm against the
 * repository before merging.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* -------------------- Context -------------------- */

/*
 * Library context.  `runner` owns the V3D/Vulkan state; every
 * `<kernel>_pipe` is created lazily by the first QPU dispatch of that
 * kernel and its `<kernel>_pipe_ready` flag records that it must be
 * destroyed in daedalus_ctx_destroy().
 */
struct daedalus_ctx {
    int          has_qpu;
    v3d_runner  *runner;          /* NULL when has_qpu == 0 */

    /* Per-kernel pipelines, lazy-created on first QPU dispatch. */
    int          idct8_pipe_ready;
    v3d_pipeline idct8_pipe;
    int          lpf4_pipe_ready;
    v3d_pipeline lpf4_pipe;
    int          lpf8_pipe_ready;
    v3d_pipeline lpf8_pipe;
    int          mc8h_pipe_ready;
    v3d_pipeline mc8h_pipe;
    int          cdef_pipe_ready;
    v3d_pipeline cdef_pipe;
    int          h264deblock_pipe_ready;
    v3d_pipeline h264deblock_pipe;
    int          h264_idct4_pipe_ready;
    v3d_pipeline h264_idct4_pipe;
    int          h264_idct8_pipe_ready;
    v3d_pipeline h264_idct8_pipe;
    int          h264_qpel_mc20_pipe_ready;
    v3d_pipeline h264_qpel_mc20_pipe;
};

/*
 * Allocate a zeroed context and try to bring up the V3D runner.
 * A NULL runner is not an error: the context is returned with
 * has_qpu == 0.  Returns NULL only when the allocation itself fails.
 */
daedalus_ctx *daedalus_ctx_create(void)
{
    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
    if (!ctx)
        return NULL;

    ctx->runner  = v3d_runner_create();
    ctx->has_qpu = (ctx->runner != NULL);
    return ctx;
}

/*
 * Allocate a context without a V3D runner, unless the environment
 * variable DAEDALUS_FORCE_QPU is exactly "1", in which case this is
 * the same as daedalus_ctx_create().  Returns NULL on allocation failure.
 */
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
{
    /*
     * Per the "QPU is default substrate" decree 2026-05-23:
     * setting DAEDALUS_FORCE_QPU=1 in the process env escalates this
     * function to a full daedalus_ctx_create(), letting the libavcodec
     * substitution shims (which call create_no_qpu via pthread_once)
     * fire the V3D shaders that exist for cycles 1/2/4/5/8.  Without
     * this hook each consumer process (firefox, mpv, daemon) would
     * need its own shim build to opt into QPU.
     *
     * Default behaviour (env var unset / not "1") is unchanged: pure
     * NEON ctx, no implicit Vulkan init.  Firefox / mpv consumers
     * that dlopen libavcodec without opting in stay on the
     * Vulkan-free path; the daemon explicitly sets
     * DAEDALUS_FORCE_QPU=1 before loading libavcodec.
     */
    const char *force = getenv("DAEDALUS_FORCE_QPU");
    if (force && force[0] == '1' && force[1] == 0)
        return daedalus_ctx_create();

    daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
    if (!ctx)
        return NULL;

    ctx->has_qpu = 0;
    ctx->runner  = NULL;
    return ctx;
}

/* Returns the context's has_qpu flag; a NULL context reports 0. */
int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
{
    return ctx ? ctx->has_qpu : 0;
}

/*
 * Destroy every pipeline that was lazily created, then the runner,
 * then the context itself.  NULL is accepted and ignored.
 */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    if (!ctx)
        return;

    if (ctx->runner) {
        if (ctx->idct8_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
        if (ctx->lpf4_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
        if (ctx->lpf8_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
        if (ctx->mc8h_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
        if (ctx->cdef_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
        if (ctx->h264deblock_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
        if (ctx->h264_idct4_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct4_pipe);
        if (ctx->h264_idct8_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_idct8_pipe);
        if (ctx->h264_qpel_mc20_pipe_ready)
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264_qpel_mc20_pipe);
        v3d_runner_destroy(ctx->runner);
    }
    free(ctx);
}

/* -------------------- Recipe query -------------------- */

/*
 * Map a kernel id to its preferred substrate.  Unknown ids fall
 * through the switch and report CPU.
 */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
{
    /*
     * Recipe table per the "QPU is default substrate" decree
     * 2026-05-23.  Any kernel that has a V3D compute shader returns
     * SUBSTRATE_QPU; CPU is the fallback for kernels without a
     * shader (see the "pending" notes on the individual cases).
     *
     * NOTE(review): an earlier revision of this comment listed H.264
     * IDCT 4x4 / IDCT 8x8 / qpel mc20 as shader-less (follow-on task
     * 165); the table below routes all three to QPU, so that remark
     * was stale.
     *
     * The dispatch wrappers already fall back to CPU automatically
     * when the ctx doesn't have QPU available
     * (daedalus_ctx_has_qpu == 0).
     *
     * There is deliberately no `default:` label: the trailing return
     * covers out-of-range values.
     */
    switch (k) {
    case DAEDALUS_KERNEL_VP9_IDCT8:             return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_LPF4_INNER:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_VP9_MC_8H:             return DAEDALUS_SUBSTRATE_QPU;  /* v3d_mc_8h.spv */
    case DAEDALUS_KERNEL_VP9_LPF8_INNER:        return DAEDALUS_SUBSTRATE_QPU;
    case DAEDALUS_KERNEL_AV1_CDEF_8X8:          return DAEDALUS_SUBSTRATE_QPU;  /* v3d_cdef.spv */
    case DAEDALUS_KERNEL_H264_IDCT4:            return DAEDALUS_SUBSTRATE_QPU;  /* v3d_h264_idct4.spv */
    case DAEDALUS_KERNEL_H264_IDCT8:            return DAEDALUS_SUBSTRATE_QPU;  /* v3d_h264_idct8.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV:       return DAEDALUS_SUBSTRATE_QPU;  /* v3d_h264deblock.spv */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH:       return DAEDALUS_SUBSTRATE_CPU;  /* QPU H shader pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV:       return DAEDALUS_SUBSTRATE_CPU;  /* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH:       return DAEDALUS_SUBSTRATE_CPU;  /* chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU;  /* bS=4 luma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU;  /* bS=4 chroma QPU pending */
    case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
    case DAEDALUS_KERNEL_H264_QPEL_MC20:        return DAEDALUS_SUBSTRATE_QPU;  /* v3d_h264_qpel_mc20.spv */
    case DAEDALUS_KERNEL_H264_QPEL_MC02:        return DAEDALUS_SUBSTRATE_CPU;  /* QPU mc02 shader pending */
    case DAEDALUS_KERNEL_H264_QPEL_MC22:        return DAEDALUS_SUBSTRATE_CPU;  /* QPU mc22 shader pending (hv lowpass) */
    case DAEDALUS_KERNEL_H264_QPEL_MC10:        return DAEDALUS_SUBSTRATE_CPU;  /* ¼-H L2 */
    case DAEDALUS_KERNEL_H264_QPEL_MC30:        return DAEDALUS_SUBSTRATE_CPU;  /* ¾-H L2 */
    case DAEDALUS_KERNEL_H264_QPEL_MC01:        return DAEDALUS_SUBSTRATE_CPU;  /* ¼-V L2 */
    case DAEDALUS_KERNEL_H264_QPEL_MC03:        return DAEDALUS_SUBSTRATE_CPU;  /* ¾-V L2 */
    }
    return DAEDALUS_SUBSTRATE_CPU;
}

/* -------------------- NEON externs (per cycle bench links) ----- */

extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
                                          int16_t *block, int eob);
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                       const uint8_t *src, ptrdiff_t src_stride,
                                       int h, int mx, int my);
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint16_t *tmp,
                                         int pri_strength, int sec_strength,
                                         int dir, int damping, int h,
                                         size_t edges);
extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta, int8_t *tc0);
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
                                              int alpha, int beta, int8_t *tc0);
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta);
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta);
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                    int alpha, int beta);
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
                                                    int alpha, int beta);
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);

/* -------------------- CPU dispatch implementations -------------- */

/*
 * VP9 IDCT 8x8 on the CPU.  Block i reads 64 coefficients at
 * coeffs + 64*i and adds the result at dst + meta[i].dst_off.
 * Always returns 0.
 */
static int dispatch_idct8_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    (void) ctx;
    /* The NEON entry takes a mutable block pointer while `coeffs` is
     * const, so each block goes through a local copy. */
    int16_t scratch[64];
    for (size_t i = 0; i < n_blocks; i++) {
        memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
        ff_vp9_idct_idct_8x8_add_neon(dst + meta[i].dst_off,
                                      (ptrdiff_t) dst_stride,
                                      scratch, 64);
    }
    return 0;
}

/*
 * VP9 horizontal loop filter on the CPU; wd_8 != 0 selects the
 * h_8_8 NEON routine, otherwise h_4_8.  Always returns 0.
 */
static int dispatch_lpf_cpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    (void) ctx;
    for (size_t i = 0; i < n_edges; i++) {
        uint8_t *p = dst + meta[i].dst_off;
        if (wd_8)
            ff_vp9_loop_filter_h_8_8_neon(p, (ptrdiff_t) dst_stride,
                                          meta[i].E, meta[i].I, meta[i].H);
        else
            ff_vp9_loop_filter_h_4_8_neon(p, (ptrdiff_t) dst_stride,
                                          meta[i].E, meta[i].I, meta[i].H);
    }
    return 0;
}

/*
 * VP9 8-tap horizontal MC on the CPU, 8 rows per block, my = 0.
 * Always returns 0.
 */
static int dispatch_mc_8h_cpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    (void) ctx;
    for (size_t i = 0; i < n_blocks; i++) {
        /* NOTE(review): the +3 presumably re-centres the 8-tap window
         * (the QPU path reads src_off + 0..14) — confirm. */
        ff_vp9_put_regular8_h_neon(dst + meta[i].dst_off,
                                   (ptrdiff_t) dst_stride,
                                   src + meta[i].src_off + 3,
                                   (ptrdiff_t) src_stride,
                                   8, meta[i].mx, 0);
    }
    return 0;
}

/*
 * AV1 CDEF 8x8 on the CPU; h = 8 and edges = 0 are passed for every
 * block.  Always returns 0.
 */
static int dispatch_cdef_cpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp, size_t n_blocks,
                             const daedalus_cdef_meta *meta)
{
    (void) ctx;
    for (size_t i = 0; i < n_blocks; i++) {
        dav1d_cdef_filter8_8bpc_neon(dst + meta[i].dst_off,
                                     (ptrdiff_t) dst_stride,
                                     tmp + meta[i].tmp_off_u16,
                                     meta[i].pri_strength,
                                     meta[i].sec_strength,
                                     meta[i].dir,
                                     meta[i].damping,
                                     8, 0);
    }
    return 0;
}

/* H.264 IDCT 4x4 on the CPU: 16 coefficients per block.  Returns 0. */
static int dispatch_h264_idct4_cpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    (void) ctx;
    for (size_t i = 0; i < n_blocks; i++)
        ff_h264_idct_add_neon(dst + meta[i].dst_off,
                              coeffs + i * 16,
                              (ptrdiff_t) dst_stride);
    return 0;
}

/* H.264 IDCT 8x8 on the CPU: 64 coefficients per block.  Returns 0. */
static int dispatch_h264_idct8_cpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    (void) ctx;
    for (size_t i = 0; i < n_blocks; i++)
        ff_h264_idct8_add_neon(dst + meta[i].dst_off,
                               coeffs + i * 64,
                               (ptrdiff_t) dst_stride);
    return 0;
}

/*
 * The four bS<4 deblock CPU dispatches differ only in the NEON routine
 * they call, so they are generated from one template.  NEON expects a
 * mutable tc0 pointer while `meta` is const; each edge therefore copies
 * its four tc0 values to a local.  Every generated function returns 0.
 */
#define DEFINE_H264_DEBLOCK_CPU_DISPATCH(fn_name, neon_fn)                   \
    static int fn_name(daedalus_ctx *ctx,                                    \
                       uint8_t *dst, size_t dst_stride,                      \
                       size_t n_edges,                                       \
                       const daedalus_h264_deblock_meta *meta)               \
    {                                                                        \
        (void) ctx;                                                          \
        for (size_t i = 0; i < n_edges; i++) {                               \
            int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],          \
                                    meta[i].tc0[2], meta[i].tc0[3] };        \
            neon_fn(dst + meta[i].dst_off, (ptrdiff_t) dst_stride,           \
                    meta[i].alpha, meta[i].beta, tc0_local);                 \
        }                                                                    \
        return 0;                                                            \
    }

DEFINE_H264_DEBLOCK_CPU_DISPATCH(dispatch_h264_deblock_cpu,
                                 ff_h264_v_loop_filter_luma_neon)
DEFINE_H264_DEBLOCK_CPU_DISPATCH(dispatch_h264_deblock_h_cpu,
                                 ff_h264_h_loop_filter_luma_neon)
DEFINE_H264_DEBLOCK_CPU_DISPATCH(dispatch_h264_deblock_chroma_v_cpu,
                                 ff_h264_v_loop_filter_chroma_neon)
DEFINE_H264_DEBLOCK_CPU_DISPATCH(dispatch_h264_deblock_chroma_h_cpu,
                                 ff_h264_h_loop_filter_chroma_neon)

#undef DEFINE_H264_DEBLOCK_CPU_DISPATCH

/* --- bS=4 intra variants.  Note: the daedalus_h264_deblock_meta
 * struct's tc0[] field is unused for intra (the spec hardcodes the
 * strength).  We accept the same meta type so callers can build a
 * single edge-list and route by kernel — saves an extra struct.
 * Every generated function returns 0. */
#define DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH(fn_name, neon_fn)             \
    static int fn_name(daedalus_ctx *ctx,                                    \
                       uint8_t *dst, size_t dst_stride,                      \
                       size_t n_edges,                                       \
                       const daedalus_h264_deblock_meta *meta)               \
    {                                                                        \
        (void) ctx;                                                          \
        for (size_t i = 0; i < n_edges; i++) {                               \
            neon_fn(dst + meta[i].dst_off, (ptrdiff_t) dst_stride,           \
                    meta[i].alpha, meta[i].beta);                            \
        }                                                                    \
        return 0;                                                            \
    }

DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH(dispatch_h264_deblock_luma_v_intra_cpu,
                                       ff_h264_v_loop_filter_luma_intra_neon)
DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH(dispatch_h264_deblock_luma_h_intra_cpu,
                                       ff_h264_h_loop_filter_luma_intra_neon)
DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH(dispatch_h264_deblock_chroma_v_intra_cpu,
                                       ff_h264_v_loop_filter_chroma_intra_neon)
DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH(dispatch_h264_deblock_chroma_h_intra_cpu,
                                       ff_h264_h_loop_filter_chroma_intra_neon)

#undef DEFINE_H264_DEBLOCK_INTRA_CPU_DISPATCH

/*
 * The qpel CPU dispatches are uniform; the macro collapses the
 * repetition.
 *
 * FFmpeg's NEON entry uses a single stride for both dst and src
 * (H264QpelContext convention).  Caller already guarantees this
 * via the public API contract documented in daedalus.h.
 * Every generated function returns 0.
 */
#define DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn)                            \
    static int dispatch_h264_qpel_ ## suffix ## _cpu(daedalus_ctx *ctx,      \
            uint8_t *dst, const uint8_t *src, size_t stride,                 \
            size_t n_blocks, const daedalus_h264_qpel_meta *meta)            \
    {                                                                        \
        (void) ctx;                                                          \
        for (size_t i = 0; i < n_blocks; i++) {                              \
            neon_fn(dst + meta[i].dst_off, src + meta[i].src_off,            \
                    (ptrdiff_t) stride);                                     \
        }                                                                    \
        return 0;                                                            \
    }

DEFINE_QPEL_CPU_DISPATCH(mc20, ff_put_h264_qpel8_mc20_neon)
DEFINE_QPEL_CPU_DISPATCH(mc02, ff_put_h264_qpel8_mc02_neon)
DEFINE_QPEL_CPU_DISPATCH(mc22, ff_put_h264_qpel8_mc22_neon)
DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)

#undef DEFINE_QPEL_CPU_DISPATCH

/* -------------------- Shared QPU helpers ------------------------ */

/*
 * Create `pipe` from shader `spv` unless `*ready` says it already
 * exists.  Sets `*ready` on success.  Returns 0 on success, -1 when
 * v3d_runner_create_pipeline() fails.
 */
static int qpu_ensure_pipeline(daedalus_ctx *ctx, int *ready,
                               v3d_pipeline *pipe, const char *spv,
                               uint32_t n_ssbos, uint32_t pc_size)
{
    if (*ready)
        return 0;
    if (v3d_runner_create_pipeline(ctx->runner, spv, n_ssbos, pc_size,
                                   pipe) != 0)
        return -1;
    *ready = 1;
    return 0;
}

/*
 * Record one compute dispatch of `wg_count` x 1 x 1 workgroups into
 * `cb` (pipeline, descriptor set 0, `pc_size` bytes of push constants
 * at offset 0), then submit it and wait.
 *
 * Unlike the open-coded sequences this replaces, the results of
 * vkBeginCommandBuffer / vkEndCommandBuffer are checked, so a failed
 * recording is reported instead of being submitted.
 * Returns 0 on success, -1 on any failure.
 */
static int qpu_record_and_wait(daedalus_ctx *ctx, VkCommandBuffer cb,
                               const v3d_pipeline *pipe,
                               const void *pc, uint32_t pc_size,
                               uint32_t wg_count)
{
    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO
    };

    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS)
        return -1;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe->layout, 0, 1, &pipe->desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, pc_size, pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS)
        return -1;

    return v3d_runner_submit_wait(ctx->runner, cb) ? -1 : 0;
}

/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */

typedef struct {
    uint32_t n_blocks;
    uint32_t blocks_per_row;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} idct8_pc;

/* Lazily create the VP9 IDCT 8x8 pipeline.  Returns 0 or -1. */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    return qpu_ensure_pipeline(ctx, &ctx->idct8_pipe_ready,
                               &ctx->idct8_pipe, "v3d_idct8.spv",
                               /*n_ssbos=*/3,
                               /*push_const_size=*/(uint32_t) sizeof(idct8_pc));
}

/*
 * VP9 IDCT 8x8 on the QPU.  Uploads coefficients, the touched part of
 * dst and per-block (block_x, block_y) metadata, runs the shader with
 * 32 blocks per workgroup and copies dst back.
 * Returns 0 on success (including n_blocks == 0), -1 on failure; dst
 * is not written on failure.
 */
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    int rc = -1;

    if (ensure_idct8_pipeline(ctx) != 0)
        return -1;
    /* Nothing to do; also avoids asking the runner for 0-byte buffers. */
    if (n_blocks == 0)
        return 0;

    /* Allocate three SSBOs per call (coeffs, dst, meta).  Performance-
     * tuning (buffer pool) is deferred; correctness first. */
    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 2 * sizeof(uint32_t);   /* uvec2 per block */

    /* dst buffer must hold every block's footprint.  For Phase 8 we
     * assume the caller's dst surface starts at byte 0 of the buffer,
     * and size the upload by scanning meta for the last byte touched. */
    size_t max_byte_touched = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
        if (end > max_byte_touched)
            max_byte_touched = end;
    }

    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs))
        return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
        v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
        v3d_runner_release_buffer(ctx->runner, &buf_dst);
        v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
        return -1;
    }

    /* Upload.  Coeffs are a straight copy.  Dst we copy the caller's
     * full region (since we'll need to read it back). */
    memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
    memcpy(buf_dst.mapped, dst, max_byte_touched);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[2*i + 0] = meta[i].block_x;
        m[2*i + 1] = meta[i].block_y;
    }

    /* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
    v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3))
        goto out;

    /* WG geometry: 32 blocks per WG. */
    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    idct8_pc pc = {
        .n_blocks       = (uint32_t) n_blocks,
        .blocks_per_row = 0,    /* unused by v4 shader (meta drives placement) */
        .dst_stride_u8  = (uint32_t) dst_stride,
        ._pad           = 0,
    };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->idct8_pipe))
        goto out;
    if (qpu_record_and_wait(ctx, ctx->idct8_pipe.cb, &ctx->idct8_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    /* Read-back dst. */
    memcpy(dst, buf_dst.mapped, max_byte_touched);
    rc = 0;

out:
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
    return rc;
}

/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
 *
 * NOTE: the two LPF shaders disagree on push-constant slot order.
 *   v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
 *   v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
 *
 * Same total size (16 bytes), different slot 2.  Keep separate
 * struct definitions to avoid silent corruption — Phase 8 caught
 * this empirically when test_api_lpf wd=8 reported 95.6 % match.
 */
typedef struct {
    uint32_t n_edges;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} lpf4_pc;

typedef struct {
    uint32_t n_edges;
    uint32_t blocks_per_row;   /* unused by shader, must exist */
    uint32_t dst_stride_u8;
    uint32_t _pad;
} lpf8_pc;

/*
 * Lazily create the LPF pipeline `pipe` from `spv`; the push-constant
 * size follows wd_8.  Returns 0 or -1.
 */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    size_t pc_size = wd_8 ? sizeof(lpf8_pc) : sizeof(lpf4_pc);
    return qpu_ensure_pipeline(ctx, flag, pipe, spv,
                               /*n_ssbos=*/2,
                               /*push_const_size=*/(uint32_t) pc_size);
}

/*
 * VP9 horizontal loop filter on the QPU (wd_8 selects the 8-wide
 * shader).  Only the byte window [lo, hi) of dst that the edges touch
 * is uploaded; per-edge offsets are rebased onto that window.
 * Returns 0 on success (including n_edges == 0), -1 on failure; dst is
 * not written on failure.
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    int rc = -1;
    int          *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p    = wd_8 ? &ctx->lpf8_pipe       : &ctx->lpf4_pipe;
    const char   *spv  = wd_8 ? "v3d_lpf_h_8_8.spv"   : "v3d_lpf_h_4_8.spv";

    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0)
        return -1;
    /* Nothing to do; the window below would be empty. */
    if (n_edges == 0)
        return 0;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);   /* uvec4 per edge */

    /* Determine smallest dst window.  Each edge writes to bytes
     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride. */
    size_t lo = (size_t) -1, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base    = meta[i].dst_off;
        /* Clamp at 0 so an edge closer than 4 bytes to the start of
         * dst does not wrap the unsigned subtraction. */
        size_t this_lo = (base >= 4) ? base - 4 : 0;
        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
        if (this_lo < lo)
            lo = this_lo;
        if (this_hi > hi)
            hi = this_hi;
    }
    size_t dst_window_size = hi - lo;

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta))
        return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
        v3d_runner_release_buffer(ctx->runner, &buf_meta);
        return -1;
    }

    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);
        m[4*i + 1] = (uint32_t) meta[i].E;
        m[4*i + 2] = (uint32_t) meta[i].I;
        m[4*i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2))
        goto out;

    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);

    /* Build both layouts and push the one matching the shader. */
    lpf8_pc pc8 = {
        .n_edges        = (uint32_t) n_edges,
        .blocks_per_row = 0,
        .dst_stride_u8  = (uint32_t) dst_stride,
        ._pad           = 0
    };
    lpf4_pc pc4 = {
        .n_edges       = (uint32_t) n_edges,
        .dst_stride_u8 = (uint32_t) dst_stride
    };
    const void *pc      = wd_8 ? (const void *) &pc8 : (const void *) &pc4;
    uint32_t    pc_size = wd_8 ? (uint32_t) sizeof(pc8)
                               : (uint32_t) sizeof(pc4);

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, p))
        goto out;
    if (qpu_record_and_wait(ctx, p->cb, p, pc, pc_size, wg_count))
        goto out;

    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
    rc = 0;

out:
    v3d_runner_release_buffer(ctx->runner, &buf_dst);
    v3d_runner_release_buffer(ctx->runner, &buf_meta);
    return rc;
}

/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */

typedef struct {
    uint32_t n_blocks;
    uint32_t dst_stride_u8;
    uint32_t src_stride_u8;
    uint32_t _pad;
} mc_pc;

/*
 * VP9 8-tap horizontal MC on the QPU, 32 blocks per workgroup.
 * Per-block metadata is (dst_off, src_off, mx, 0).
 * Returns 0 on success (including n_blocks == 0), -1 on failure; dst
 * is not written on failure.
 */
static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const uint8_t *src, size_t src_stride,
                              size_t n_blocks, const daedalus_mc_meta *meta)
{
    int rc = -1;

    if (qpu_ensure_pipeline(ctx, &ctx->mc8h_pipe_ready, &ctx->mc8h_pipe,
                            "v3d_mc_8h.spv", 3,
                            (uint32_t) sizeof(mc_pc)) != 0)
        return -1;
    if (n_blocks == 0)
        return 0;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, src_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max)
            dst_max = de;
        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
        size_t se = meta[i].src_off + 7 * src_stride + 15;
        if (se > src_max)
            src_max = se;
    }

    v3d_buffer bm = {0}, bd = {0}, bs = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm))
        return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bs.mapped, src, src_max);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = meta[i].src_off;
        m[4*i+2] = (uint32_t) meta[i].mx;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bm, bd, bs };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3))
        goto out;

    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
    mc_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
        .src_stride_u8 = (uint32_t) src_stride
    };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->mc8h_pipe))
        goto out;
    if (qpu_record_and_wait(ctx, ctx->mc8h_pipe.cb, &ctx->mc8h_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out:
    v3d_runner_release_buffer(ctx->runner, &bs);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}

/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */

typedef struct {
    uint32_t n_blocks;
    uint32_t tmp_stride_u16;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} cdef_pc;

/*
 * AV1 CDEF 8x8 on the QPU, 4 blocks per workgroup, tmp stride fixed at
 * 16 uint16 elements.  Per-block metadata is
 * (dst_off, pri | sec << 8 | damping << 16, tmp_off_u16, dir).
 * Returns 0 on success (including n_blocks == 0), -1 on failure; dst
 * is not written on failure.
 */
static int dispatch_cdef_qpu(daedalus_ctx *ctx,
                             uint8_t *dst, size_t dst_stride,
                             const uint16_t *tmp, size_t n_blocks,
                             const daedalus_cdef_meta *meta)
{
    int rc = -1;

    if (qpu_ensure_pipeline(ctx, &ctx->cdef_pipe_ready, &ctx->cdef_pipe,
                            "v3d_cdef.spv", 3,
                            (uint32_t) sizeof(cdef_pc)) != 0)
        return -1;
    if (n_blocks == 0)
        return 0;

    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
    size_t dst_max = 0, tmp_max_u16 = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
        if (de > dst_max)
            dst_max = de;
        /* center 8x8 in stride-16 tmp */
        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8;
        if (te > tmp_max_u16)
            tmp_max_u16 = te;
    }
    /* NOTE(review): the upload ends with the centre 8x8 of the last
     * block.  If the shader also reads border taps below / right of
     * that block they would fall outside this buffer — confirm against
     * the CDEF shader source. */
    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);

    v3d_buffer bm = {0}, bd = {0}, bt = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm))
        return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }
    if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) {
        v3d_runner_release_buffer(ctx->runner, &bd);
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    /* tmp may need padding before block-origin offset (caller-allocated).
     * Just copy from caller; we assume meta[i].tmp_off_u16 is consistent
     * with how caller has the layout set up. */
    memcpy(bt.mapped, tmp, tmp_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        uint32_t pri     = (uint32_t) meta[i].pri_strength;
        uint32_t sec     = (uint32_t) meta[i].sec_strength;
        uint32_t damping = (uint32_t) meta[i].damping;
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = pri | (sec << 8) | (damping << 16);
        m[4*i+2] = meta[i].tmp_off_u16;
        m[4*i+3] = (uint32_t) meta[i].dir;
    }

    v3d_buffer binds[3] = { bm, bd, bt };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3))
        goto out;

    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
    cdef_pc pc = {
        .n_blocks       = (uint32_t) n_blocks,
        .tmp_stride_u16 = 16,
        .dst_stride_u8  = (uint32_t) dst_stride
    };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->cdef_pipe))
        goto out;
    if (qpu_record_and_wait(ctx, ctx->cdef_pipe.cb, &ctx->cdef_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out:
    v3d_runner_release_buffer(ctx->runner, &bt);
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}

/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */

typedef struct {
    uint32_t n_edges;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} h264deblock_pc;

/*
 * H.264 luma vertical-edge deblock on the QPU, 16 edges per workgroup.
 * Per-edge metadata is (dst_off, alpha | beta << 8, four tc0 bytes
 * packed little-end first, 0).
 * Returns 0 on success (including n_edges == 0), -1 on failure; dst is
 * not written on failure.
 */
static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
                                     uint8_t *dst, size_t dst_stride,
                                     size_t n_edges,
                                     const daedalus_h264_deblock_meta *meta)
{
    int rc = -1;

    if (qpu_ensure_pipeline(ctx, &ctx->h264deblock_pipe_ready,
                            &ctx->h264deblock_pipe,
                            "v3d_h264deblock.spv", 2,
                            (uint32_t) sizeof(h264deblock_pc)) != 0)
        return -1;
    if (n_edges == 0)
        return 0;

    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
    size_t dst_max = 0;
    for (size_t i = 0; i < n_edges; i++) {
        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride.
         * NOTE(review): the upload starts at dst byte 0, so this
         * presumably relies on callers never passing
         * dst_off < 4*stride — confirm. */
        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
        if (e > dst_max)
            dst_max = e;
    }

    v3d_buffer bm = {0}, bd = {0};
    if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm))
        return -1;
    if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_release_buffer(ctx->runner, &bm);
        return -1;
    }

    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = ((uint32_t) meta[i].alpha)
                 | (((uint32_t) meta[i].beta) << 8);
        /* Go through uint8_t so a negative tc0 does not sign-extend
         * into the neighbouring bytes. */
        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
        m[4*i+3] = 0;
    }

    v3d_buffer binds[2] = { bm, bd };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2))
        goto out;

    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
    h264deblock_pc pc = {
        .n_edges       = (uint32_t) n_edges,
        .dst_stride_u8 = (uint32_t) dst_stride
    };

    if (v3d_runner_pipeline_cmdbuf_reset(ctx->runner, &ctx->h264deblock_pipe))
        goto out;
    if (qpu_record_and_wait(ctx, ctx->h264deblock_pipe.cb,
                            &ctx->h264deblock_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    memcpy(dst, bd.mapped, dst_max);
    rc = 0;

out:
    v3d_runner_release_buffer(ctx->runner, &bd);
    v3d_runner_release_buffer(ctx->runner, &bm);
    return rc;
}

/* -------------------- H.264 IDCT 4x4 QPU dispatch (cycle 6) ----- */

typedef struct {
    uint32_t n_blocks;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} h264_idct4_pc;

/*
 * H.264 IDCT 4x4 on the QPU, 16 blocks per workgroup.  On success the
 * result is copied back to dst and the caller's coefficients are
 * zeroed.  Returns 0 on success (including n_blocks == 0), -1 on
 * failure; dst and coeffs are untouched on failure.
 *
 * NOTE(review): unlike the VP9/AV1 dispatches above, this path uses
 * v3d_runner_create_buffer/destroy_buffer and a command buffer from
 * v3d_runner_alloc_cmdbuf() that is never visibly freed here.  Confirm
 * the runner owns that command buffer, or switch to the pipeline's
 * reusable one.
 */
static int dispatch_h264_idct4_qpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    int rc = -1;

    if (qpu_ensure_pipeline(ctx, &ctx->h264_idct4_pipe_ready,
                            &ctx->h264_idct4_pipe,
                            "v3d_h264_idct4.spv", 3,
                            (uint32_t) sizeof(h264_idct4_pc)) != 0)
        return -1;
    if (n_blocks == 0)
        return 0;

    size_t coeff_bytes = n_blocks * 16 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);   /* uvec4 per block */

    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 3 * dst_stride + 4;
        if (e > dst_max)
            dst_max = e;
    }

    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }

    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct4_pipe, binds, 3))
        goto out;

    uint32_t wg_count = (uint32_t)((n_blocks + 15) / 16);   /* 16 blocks/WG */
    h264_idct4_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out;
    if (qpu_record_and_wait(ctx, cb, &ctx->h264_idct4_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    memcpy(dst, bd.mapped, dst_max);
    /* H.264/FFmpeg convention: zero the coeffs block after the
     * transform (matches the C ref + NEON .S behaviour). */
    memset(coeffs, 0, coeff_bytes);
    rc = 0;

out:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return rc;
}

/* -------------------- H.264 IDCT 8x8 QPU dispatch (cycle 7) ----- */

typedef struct {
    uint32_t n_blocks;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} h264_idct8_pc;

/*
 * H.264 IDCT 8x8 on the QPU, 8 blocks per workgroup.  On success the
 * result is copied back to dst and the caller's coefficients are
 * zeroed.  Returns 0 on success (including n_blocks == 0), -1 on
 * failure; dst and coeffs are untouched on failure.
 *
 * NOTE(review): same per-call command buffer / create-destroy buffer
 * pattern as the 4x4 variant — confirm who frees the command buffer.
 */
static int dispatch_h264_idct8_qpu(daedalus_ctx *ctx,
                                   uint8_t *dst, size_t dst_stride,
                                   int16_t *coeffs, size_t n_blocks,
                                   const daedalus_h264_block_meta *meta)
{
    int rc = -1;

    if (qpu_ensure_pipeline(ctx, &ctx->h264_idct8_pipe_ready,
                            &ctx->h264_idct8_pipe,
                            "v3d_h264_idct8.spv", 3,
                            (uint32_t) sizeof(h264_idct8_pc)) != 0)
        return -1;
    if (n_blocks == 0)
        return 0;

    size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    size_t meta_bytes  = n_blocks * 4 * sizeof(uint32_t);

    size_t dst_max = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t e = meta[i].dst_off + (size_t) 7 * dst_stride + 8;
        if (e > dst_max)
            dst_max = e;
    }

    v3d_buffer bc = {0}, bd = {0}, bm = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &bc))
        return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) {
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) {
        v3d_runner_destroy_buffer(ctx->runner, &bd);
        v3d_runner_destroy_buffer(ctx->runner, &bc);
        return -1;
    }

    memcpy(bc.mapped, coeffs, coeff_bytes);
    memcpy(bd.mapped, dst, dst_max);
    uint32_t *m = bm.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[4*i+0] = meta[i].dst_off;
        m[4*i+1] = 0;
        m[4*i+2] = 0;
        m[4*i+3] = 0;
    }

    v3d_buffer binds[3] = { bc, bd, bm };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_idct8_pipe, binds, 3))
        goto out;

    uint32_t wg_count = (uint32_t)((n_blocks + 7) / 8);   /* 8 blocks/WG */
    h264_idct8_pc pc = {
        .n_blocks      = (uint32_t) n_blocks,
        .dst_stride_u8 = (uint32_t) dst_stride,
    };

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE)
        goto out;
    if (qpu_record_and_wait(ctx, cb, &ctx->h264_idct8_pipe,
                            &pc, (uint32_t) sizeof(pc), wg_count))
        goto out;

    memcpy(dst, bd.mapped, dst_max);
    /* Same convention as the 4x4 transform: coefficients are consumed. */
    memset(coeffs, 0, coeff_bytes);
    rc = 0;

out:
    v3d_runner_destroy_buffer(ctx->runner, &bm);
    v3d_runner_destroy_buffer(ctx->runner, &bd);
    v3d_runner_destroy_buffer(ctx->runner, &bc);
    return rc;
}

/* -------------------- H.264 qpel mc20 QPU dispatch
(cycle 9) --- */ typedef struct { uint32_t n_blocks; uint32_t stride_u8; uint32_t _pad0; uint32_t _pad1; } h264_qpel_mc20_pc; static int dispatch_h264_qpel_mc20_qpu(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { if (!ctx->h264_qpel_mc20_pipe_ready) { if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264_qpel_mc20.spv", 3, sizeof(h264_qpel_mc20_pc), &ctx->h264_qpel_mc20_pipe) != 0) return -1; ctx->h264_qpel_mc20_pipe_ready = 1; } /* Compute the smallest contiguous src/dst window that covers * every block's read/write footprint. * * src: filter reads cols (c-2)..(c+3) for c=0..7 across rows 0..7. * Highest read = src_off + 7*stride + (7 + 3) = src_off + 7*stride + 10. * Plus 1 for the byte-count semantic of memcpy (length=N copies * indices 0..N-1) → src_max = src_off + 7*stride + 11. * * dst: writes cols 0..7 across rows 0..7. * Highest write = dst_off + 7*stride + 7; +1 → dst_off + 7*stride + 8. */ size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t); size_t src_max = 0, dst_max = 0; for (size_t i = 0; i < n_blocks; i++) { size_t s_end = meta[i].src_off + (size_t) 7 * stride + 11; size_t d_end = meta[i].dst_off + (size_t) 7 * stride + 8; if (s_end > src_max) src_max = s_end; if (d_end > dst_max) dst_max = d_end; } v3d_buffer bs = {0}, bd = {0}, bm = {0}; if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) return -1; if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bs); return -1; } if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bs); return -1; } /* Copy src window (filter needs cols -2..+3, captured by src_max * upper bound above; the lower bound is implicit in src_off >= 2 * which the caller guarantees per the public API contract). 
*/ memcpy(bs.mapped, src, src_max); memcpy(bd.mapped, dst, dst_max); uint32_t *m = bm.mapped; for (size_t i = 0; i < n_blocks; i++) { m[4*i+0] = meta[i].dst_off; m[4*i+1] = meta[i].src_off; m[4*i+2] = 0; m[4*i+3] = 0; } v3d_buffer binds[3] = { bs, bd, bm }; if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264_qpel_mc20_pipe, binds, 3)) goto fail; uint32_t wg_count = (uint32_t) n_blocks; /* 1 block per WG */ h264_qpel_mc20_pc pc = { .n_blocks = (uint32_t) n_blocks, .stride_u8 = (uint32_t) stride, }; VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); if (cb == VK_NULL_HANDLE) goto fail; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264_qpel_mc20_pipe.pipeline); vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264_qpel_mc20_pipe.layout, 0, 1, &ctx->h264_qpel_mc20_pipe.desc_set, 0, NULL); vkCmdPushConstants(cb, ctx->h264_qpel_mc20_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); vkCmdDispatch(cb, wg_count, 1, 1); vkEndCommandBuffer(cb); if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail; memcpy(dst, bd.mapped, dst_max); v3d_runner_destroy_buffer(ctx->runner, &bm); v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bs); return 0; fail: v3d_runner_destroy_buffer(ctx->runner, &bm); v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bs); return -1; } /* -------------------- Public dispatch entry points -------------- */ #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) 
\ daedalus_substrate eff = sub; \ if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \ if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ eff = DAEDALUS_SUBSTRATE_CPU; \ if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \ return -1 /* QPU path not yet wired for this kernel */ int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, const int16_t *coeffs, size_t n_blocks, const daedalus_idct8_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_lpf_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta); return dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta); } int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_lpf_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta); return 
dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta); } int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, const uint8_t *src, size_t src_stride, size_t n_blocks, const daedalus_mc_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta); return dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta); } int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, const uint16_t *tmp, size_t n_blocks, const daedalus_cdef_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta); return dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta); } int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, int16_t *coeffs, size_t n_blocks, const daedalus_h264_block_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_h264_idct4_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); return dispatch_h264_idct4_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_dispatch_h264_idct8(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, int16_t *coeffs, size_t 
n_blocks, const daedalus_h264_block_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_h264_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); return dispatch_h264_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta); return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta); } int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH); /* No QPU shader for the H variant yet — always falls through to * CPU. Mirror the _v shape anyway so the substrate switch is * uniform; QPU just isn't a real option here yet. */ if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_QPU) { /* QPU shader for H deblock isn't implemented yet; recipe * table returns CPU, so AUTO never lands here. An explicit * QPU request fails fast rather than silently degrading to * CPU — matches the principle from the IDCT QPU substrate * (explicit means explicit). 
*/ return -1; } return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta); } int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; /* No chroma QPU shader yet. */ return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta); } int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta); } #define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \ int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \ daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \ size_t n_edges, const daedalus_h264_deblock_meta *meta) \ { \ daedalus_substrate eff = sub; \ if (eff == DAEDALUS_SUBSTRATE_AUTO) \ eff = daedalus_recipe_substrate_for(kernel); \ if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ eff = DAEDALUS_SUBSTRATE_CPU; \ if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \ return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \ } DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu) DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu) 
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu) DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu) #undef DEFINE_INTRA_DISPATCH int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_CPU) return dispatch_h264_qpel_mc20_cpu(ctx, dst, src, stride, n_blocks, meta); return dispatch_h264_qpel_mc20_qpu(ctx, dst, src, stride, n_blocks, meta); } int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; /* No mc02 QPU shader yet — explicit QPU fast-fails. */ return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta); } int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { daedalus_substrate eff = sub; if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22); if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) eff = DAEDALUS_SUBSTRATE_CPU; if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; /* No mc22 QPU shader yet — explicit QPU fast-fails. 
*/ return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta); } #define DEFINE_QPEL_DISPATCH(suffix, kernel) \ int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \ daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \ size_t n_blocks, const daedalus_h264_qpel_meta *meta) \ { \ daedalus_substrate eff = sub; \ if (eff == DAEDALUS_SUBSTRATE_AUTO) \ eff = daedalus_recipe_substrate_for(kernel); \ if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ eff = DAEDALUS_SUBSTRATE_CPU; \ if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \ return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \ n_blocks, meta); \ } DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10) DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30) DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01) DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03) #undef DEFINE_QPEL_DISPATCH /* -------------------- Recipe convenience wrappers --------------- */ int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, const int16_t *coeffs, size_t n_blocks, const daedalus_idct8_meta *meta) { return daedalus_dispatch_vp9_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_recipe_dispatch_vp9_lpf4(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_lpf_meta *meta) { return daedalus_dispatch_vp9_lpf4(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } int daedalus_recipe_dispatch_vp9_lpf8(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_lpf_meta *meta) { return daedalus_dispatch_vp9_lpf8(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } int daedalus_recipe_dispatch_vp9_mc_8h(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, const uint8_t *src, size_t src_stride, size_t n_blocks, const daedalus_mc_meta *meta) { return daedalus_dispatch_vp9_mc_8h(ctx, 
DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, src, src_stride, n_blocks, meta); } int daedalus_recipe_dispatch_cdef_8x8(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, const uint16_t *tmp, size_t n_blocks, const daedalus_cdef_meta *meta) { return daedalus_dispatch_cdef_8x8(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, tmp, n_blocks, meta); } int daedalus_recipe_dispatch_h264_idct4(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, int16_t *coeffs, size_t n_blocks, const daedalus_h264_block_meta *meta) { return daedalus_dispatch_h264_idct4(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_recipe_dispatch_h264_idct8(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, int16_t *coeffs, size_t n_blocks, const daedalus_h264_block_meta *meta) { return daedalus_dispatch_h264_idct8(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { return daedalus_dispatch_h264_deblock_luma_h(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta) { return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, dst_stride, n_edges, meta); } #define 
DEFINE_INTRA_RECIPE(name) \ int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \ uint8_t *dst, size_t dst_stride, \ size_t n_edges, const daedalus_h264_deblock_meta *meta) \ { \ return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \ dst, dst_stride, n_edges, meta); \ } DEFINE_INTRA_RECIPE(luma_v_intra) DEFINE_INTRA_RECIPE(luma_h_intra) DEFINE_INTRA_RECIPE(chroma_v_intra) DEFINE_INTRA_RECIPE(chroma_h_intra) #undef DEFINE_INTRA_RECIPE int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, src, stride, n_blocks, meta); } int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, src, stride, n_blocks, meta); } int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) { return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO, dst, src, stride, n_blocks, meta); } #define DEFINE_QPEL_RECIPE(suffix) \ int daedalus_recipe_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \ uint8_t *dst, const uint8_t *src, size_t stride, \ size_t n_blocks, const daedalus_h264_qpel_meta *meta) \ { \ return daedalus_dispatch_h264_qpel_ ## suffix(ctx, DAEDALUS_SUBSTRATE_AUTO,\ dst, src, stride, n_blocks, meta); \ } DEFINE_QPEL_RECIPE(mc10) DEFINE_QPEL_RECIPE(mc30) DEFINE_QPEL_RECIPE(mc01) DEFINE_QPEL_RECIPE(mc03) #undef DEFINE_QPEL_RECIPE