Phase 8: wire IDCT QPU dispatch through public API

daedalus_ctx now owns a v3d_runner when V3D is available. The
public API's dispatch_vp9_idct8 routes QPU calls through a
new dispatch_idct8_qpu helper that: (1) lazy-creates the cycle 1
v4 pipeline on first use, (2) allocates 3 host-visible SSBOs
per call (coeffs/dst/meta), (3) copies the inputs host->GPU,
(4) dispatches with the v4 32-blocks-per-WG geometry, and
(5) copies the result GPU->host.

Per-call alloc is intentional for Phase 8 correctness-first
scope; buffer-pool perf optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that
know they want CPU only.

test_api_idct extended to a 3-mode matrix: CPU forced, QPU
forced, AUTO recipe. All three deliver 4096/4096 bit-exact
on hertz with V3D 7.1.7.0:

  recipe substrate for VP9_IDCT8: 2 (QPU)
  [CPU] 4096/4096 bit-exact
  [QPU] 4096/4096 bit-exact (real QPU dispatch through the API)
  [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4
and cycle 4 LPF wd=8 (the other two recipe-QPU kernels).
Cycle 3 MC and cycle 5 CDEF only need the dispatch hook
(recipe routes to CPU; QPU stays opportunistic via explicit
override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 13:55:55 +00:00
parent 760f6a4060
commit 1085c5699c
4 changed files with 194 additions and 32 deletions
+6
View File
@@ -282,6 +282,7 @@ endif()
add_library(daedalus_core STATIC
src/daedalus_core.c
src/v3d_runner.c
${FFASM_SOURCES}
${FFASM_LPF_SOURCES}
${FFASM_MC_SOURCES}
@@ -290,7 +291,12 @@ add_library(daedalus_core STATIC
${DAV1D_CDEF_C_SOURCES}
)
target_include_directories(daedalus_core PUBLIC include)
target_include_directories(daedalus_core PRIVATE src)
target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
target_compile_options(daedalus_core PRIVATE -O2)
if (DAEDALUS_BUILD_VULKAN)
add_dependencies(daedalus_core daedalus_shaders)
endif()
add_executable(test_api_idct
tests/test_api_idct.c
+4
View File
@@ -70,6 +70,10 @@ typedef struct daedalus_ctx daedalus_ctx;
* failure. */
daedalus_ctx *daedalus_ctx_create(void);
/* Same as daedalus_ctx_create(), but skips V3D initialisation: the
* resulting context is CPU-only and cheap to create. Returns NULL on
* allocation failure. */
daedalus_ctx *daedalus_ctx_create_no_qpu(void);
/* Returns 1 if QPU dispatch is available on this context, 0 if
* NEON-only. Useful for the integration layer to short-circuit
* QPU dispatch attempts. */
+149 -12
View File
@@ -1,14 +1,19 @@
/*
* daedalus-fourier core library — Phase 8 skeleton.
* daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
*
* Wraps cycles 1-5 kernels behind the public C API in
* include/daedalus.h. Recipe dispatch routes per-kernel to the
* verdict substrate from each cycle's Phase 7 doc.
*
* QPU dispatch wiring status:
* IDCT 8x8: wired (cycle 1 v4 shader).
* Others: stubbed (return -1); CPU path always works.
*
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
* dav1d BSD-2-Clause NEON snapshots.
*/
#include "../include/daedalus.h"
#include "v3d_runner.h"
#include <stdlib.h>
#include <stdint.h>
@@ -19,18 +24,29 @@
/* -------------------- Context -------------------- */
/* Library context: holds the optional V3D runner plus the lazily
 * created per-kernel QPU pipeline state. */
struct daedalus_ctx {
/* 1 when a V3D runner was created for this context, 0 for a
 * CPU-only context. */
int has_qpu;
v3d_runner *runner; /* NULL when has_qpu == 0 */
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
/* Non-zero once idct8_pipe has been created (and therefore has to be
 * destroyed together with the context). */
int idct8_pipe_ready;
/* IDCT 8x8 pipeline handle; only meaningful while idct8_pipe_ready
 * is set. */
v3d_pipeline idct8_pipe;
};
/* Allocate a zero-initialised context and try to attach a V3D runner.
 * has_qpu records whether the runner could be created; a context
 * without a runner is still returned and remains usable for CPU
 * dispatch. Returns NULL only when the context allocation fails. */
daedalus_ctx *daedalus_ctx_create(void)
{
    daedalus_ctx *self = calloc(1, sizeof *self);

    if (self != NULL) {
        self->runner = v3d_runner_create();
        self->has_qpu = self->runner ? 1 : 0;
    }
    return self;
}
/* Allocate a context that never touches V3D: no runner is created and
 * has_qpu is 0. Returns NULL when the allocation fails. */
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
{
    daedalus_ctx *self = calloc(1, sizeof *self);

    if (self != NULL) {
        /* Spelled out even though calloc already zeroed the struct. */
        self->runner = NULL;
        self->has_qpu = 0;
    }
    return self;
}
@@ -41,6 +57,10 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
/* Release a context. NULL is accepted and ignored. When a runner is
 * attached, the IDCT pipeline (if it was ever created) is destroyed
 * before the runner itself; the context memory is freed last. */
void daedalus_ctx_destroy(daedalus_ctx *ctx)
{
    if (ctx == NULL) {
        return;
    }

    if (ctx->runner != NULL) {
        if (ctx->idct8_pipe_ready) {
            v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
        }
        v3d_runner_destroy(ctx->runner);
    }

    free(ctx);
}
@@ -55,7 +75,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
}
return DAEDALUS_SUBSTRATE_CPU; /* defensive default */
return DAEDALUS_SUBSTRATE_CPU;
}
/* -------------------- NEON externs (per cycle bench links) ----- */
@@ -141,23 +161,140 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
return 0;
}
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
/* Push-constant block for the IDCT 8x8 compute pipeline. Its size is
 * passed as push_const_size when the pipeline is created, and one
 * instance is pushed per dispatch. */
typedef struct {
uint32_t n_blocks; /* number of 8x8 blocks in this dispatch */
uint32_t blocks_per_row; /* written as 0 by the host; unused by the v4 shader */
uint32_t dst_stride_u8; /* destination row stride, in bytes */
uint32_t _pad; /* always written as 0; presumably pads the block to 16 bytes */
} idct8_pc;
/* Make sure the IDCT 8x8 compute pipeline exists, creating it from
 * "v3d_idct8.spv" (3 SSBOs, idct8_pc push constants) on the first call
 * and caching it in ctx afterwards.
 *
 * Returns 0 when the pipeline is ready, -1 when creation fails. On
 * failure the ready flag stays clear, so a later call tries again. */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        if (v3d_runner_create_pipeline(ctx->runner,
                                       "v3d_idct8.spv",
                                       /*n_ssbos=*/3,
                                       /*push_const_size=*/sizeof(idct8_pc),
                                       &ctx->idct8_pipe) != 0) {
            return -1;
        }
        ctx->idct8_pipe_ready = 1;
    }
    return 0;
}
/*
 * Run the VP9 8x8 IDCT for n_blocks blocks on the QPU (cycle 1 v4 shader).
 *
 *   ctx        - context whose runner performs all GPU work; the IDCT
 *                pipeline is created lazily on first use.
 *   dst        - destination surface. Bytes [0, extent) are uploaded before
 *                the dispatch and copied back afterwards, where extent is
 *                the end of the furthest 8x8 footprint described by
 *                meta[i].dst_off.
 *   dst_stride - distance in bytes between consecutive rows of dst.
 *   coeffs     - n_blocks * 64 int16_t coefficients.
 *   n_blocks   - number of blocks to transform.
 *   meta       - per-block placement; block_x / block_y are forwarded to
 *                the shader, dst_off is only used to size the dst buffer.
 *
 * Three buffers (coeffs, dst, meta) are allocated per call; pooling is a
 * deferred optimisation.
 *
 * Returns 0 on success (including n_blocks == 0, which is a no-op) and -1
 * on any failure. dst is only written after a successful submit, so it is
 * left untouched when -1 is returned.
 */
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
                              uint8_t *dst, size_t dst_stride,
                              const int16_t *coeffs, size_t n_blocks,
                              const daedalus_idct8_meta *meta)
{
    /* Zero blocks is a no-op; it would otherwise ask for zero-sized
     * buffers, which Vulkan does not allow. */
    if (n_blocks == 0) return 0;
    if (!ctx || !ctx->runner || !dst || !coeffs || !meta) return -1;

    /* The push constants are 32-bit: refuse values that would be silently
     * truncated, and sizes whose byte count would overflow size_t. */
    if (n_blocks > UINT32_MAX || dst_stride > UINT32_MAX) return -1;
    if (n_blocks > SIZE_MAX / (64 * sizeof(int16_t))) return -1;

    if (ensure_idct8_pipeline(ctx) != 0) return -1;

    const size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
    const size_t meta_bytes  = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */

    /* The dst buffer mirrors the caller's surface from byte 0 up to the
     * end of the last row of the furthest block: dst_off + 7 rows + 8
     * bytes. NOTE(review): the shader places blocks from block_x/block_y
     * while this extent comes from dst_off — confirm callers keep the two
     * consistent, otherwise the GPU could write past the buffer. */
    const size_t block_span = (size_t)(8 - 1) * dst_stride + 8;
    size_t dst_bytes = 0;
    for (size_t i = 0; i < n_blocks; i++) {
        size_t end = meta[i].dst_off + block_span;
        if (end > dst_bytes) dst_bytes = end;
    }

    int rc = -1;
    v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
    if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_bytes, &buf_dst)) goto out_coeffs;
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) goto out_dst;

    /* Upload. dst is uploaded in full because the whole region is copied
     * back afterwards, including bytes no block touches. */
    memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
    memcpy(buf_dst.mapped, dst, dst_bytes);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_blocks; i++) {
        m[2*i + 0] = meta[i].block_x;
        m[2*i + 1] = meta[i].block_y;
    }

    /* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
    v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
    if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
        goto out_meta;
    }

    /* WG geometry: 32 blocks per WG, rounded up without risking overflow. */
    uint32_t wg_count = (uint32_t)(n_blocks / 32 + (n_blocks % 32 != 0));
    idct8_pc pc = {
        .n_blocks       = (uint32_t) n_blocks,
        .blocks_per_row = 0,  /* unused by v4 shader (meta drives placement) */
        .dst_stride_u8  = (uint32_t) dst_stride,
        ._pad           = 0,
    };

    /* NOTE(review): cb is never released here; presumably the runner owns
     * and recycles command buffers — confirm against v3d_runner. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto out_meta;

    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto out_meta;
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                      ctx->idct8_pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            ctx->idct8_pipe.layout, 0, 1,
                            &ctx->idct8_pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
                       VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto out_meta;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto out_meta;

    /* Read-back dst. */
    memcpy(dst, buf_dst.mapped, dst_bytes);
    rc = 0;

    /* Single unwind path: release buffers in reverse creation order. */
out_meta:
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
out_dst:
    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
out_coeffs:
    v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
    return rc;
}
/* -------------------- Public dispatch entry points -------------- */
/* Shared body for dispatch entry points whose QPU path is not wired.
 * Expects `ctx` and `sub` to be in scope at the expansion site:
 * AUTO is resolved through daedalus_recipe_substrate_for(_kernel), a QPU
 * request falls back to CPU when the context reports no QPU, the CPU
 * substrate calls _cpu_fn(ctx, ...) and returns its result, and anything
 * else returns -1. */
#define ROUTE(_kernel, _cpu_fn, ...) \
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
daedalus_substrate eff = sub; \
if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
eff = DAEDALUS_SUBSTRATE_CPU; \
if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \
return -1 /* QPU path not yet wired in Phase 8 skeleton */
return -1 /* QPU path not yet wired for this kernel */
/* Public entry point for the VP9 8x8 IDCT.
 *
 * The requested substrate is resolved in three steps: AUTO is replaced by
 * the recipe's choice for DAEDALUS_KERNEL_VP9_IDCT8, a QPU request is
 * downgraded to CPU when the context has no QPU, and the call is then
 * forwarded to dispatch_idct8_cpu or dispatch_idct8_qpu. The helper's
 * return value is passed through unchanged. */
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
const int16_t *coeffs, size_t n_blocks,
const daedalus_idct8_meta *meta)
{
ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
dst, dst_stride, coeffs, n_blocks, meta);
daedalus_substrate eff = sub;
if (eff == DAEDALUS_SUBSTRATE_AUTO)
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
/* No QPU on this context: fall back instead of failing the call. */
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
eff = DAEDALUS_SUBSTRATE_CPU;
if (eff == DAEDALUS_SUBSTRATE_CPU)
return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
}
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -193,7 +330,7 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
const uint8_t *src, size_t src_stride,
size_t n_blocks, const daedalus_mc_meta *meta)
{
ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
dst, dst_stride, src, src_stride, n_blocks, meta);
}
@@ -202,7 +339,7 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
const uint16_t *tmp,
size_t n_blocks, const daedalus_cdef_meta *meta)
{
ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
dst, dst_stride, tmp, n_blocks, meta);
}
+33 -18
View File
@@ -37,14 +37,37 @@ static inline uint64_t xs(void) {
return xs_state = x;
}
int main(void)
/* Run one IDCT dispatch through the public API on a fresh context and
 * compare the result against the reference surface.
 *
 *   force       - substrate passed to daedalus_dispatch_vp9_idct8.
 *   coeffs/meta - inputs for the N_BLOCKS blocks.
 *   dst_initial - starting contents of the destination (DST_BYTES bytes).
 *   dst_ref     - expected destination contents (DST_BYTES bytes).
 *   label       - tag printed with the results.
 *
 * Returns 0 when every byte matches, or when QPU was forced but is not
 * available (reported as SKIP); returns 1 on context creation failure,
 * dispatch failure, or any mismatching byte. */
static int run_once(daedalus_substrate force,
                    const int16_t *coeffs,
                    const daedalus_idct8_meta *meta,
                    const uint8_t *dst_initial,
                    const uint8_t *dst_ref,
                    const char *label)
{
    daedalus_ctx *ctx = daedalus_ctx_create();
    if (ctx == NULL) {
        fprintf(stderr, "ctx create failed\n");
        return 1;
    }

    const int qpu_present = daedalus_ctx_has_qpu(ctx);
    printf(" [%s] has_qpu=%d force=%d\n", label, qpu_present, (int) force);

    int status;
    if (force == DAEDALUS_SUBSTRATE_QPU && !qpu_present) {
        printf(" SKIP — QPU unavailable on this host\n");
        status = 0;
    } else {
        /* Work on a private copy so each mode starts from the same bytes. */
        uint8_t dst[DST_BYTES];
        memcpy(dst, dst_initial, DST_BYTES);

        int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
                                             coeffs, N_BLOCKS, meta);
        if (rc) {
            fprintf(stderr, " dispatch rc=%d\n", rc);
            status = 1;
        } else {
            int mismatches = 0;
            for (int i = 0; i < DST_BYTES; i++) {
                if (dst[i] != dst_ref[i]) {
                    mismatches++;
                }
            }
            printf(" %d / %d bytes bit-exact (%.4f%%)\n",
                   DST_BYTES - mismatches, DST_BYTES,
                   100.0 * (DST_BYTES - mismatches) / DST_BYTES);
            status = (mismatches == 0) ? 0 : 1;
        }
    }

    daedalus_ctx_destroy(ctx);
    return status;
}
int main(void)
{
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
daedalus_ctx_has_qpu(ctx));
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
@@ -61,9 +84,9 @@ int main(void)
}
}
uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
for (int i = 0; i < DST_BYTES; i++)
dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
* by*8*stride + bx*8. */
@@ -87,17 +110,9 @@ int main(void)
DST_STRIDE, scratch, 64);
}
/* Dispatch through the public API. */
int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
coeffs, N_BLOCKS, meta);
if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
/* Compare. */
int diffs = 0;
for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
printf(" bytes bit-exact: %d / %d (%.4f%%)\n",
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
daedalus_ctx_destroy(ctx);
return diffs == 0 ? 0 : 1;
int fail = 0;
fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
return fail;
}