From 1085c5699c3a3d71e623d4d27427e101a34e3104 Mon Sep 17 00:00:00 2001
From: Markus Fritsche
Date: Mon, 18 May 2026 13:55:55 +0000
Subject: [PATCH] Phase 8: wire IDCT QPU dispatch through public API

daedalus_ctx now owns a v3d_runner when V3D is available. The public
API's daedalus_dispatch_vp9_idct8 routes QPU calls through a new
dispatch_idct8_qpu helper that:
  (1) lazy-creates the cycle 1 v4 pipeline on first use,
  (2) allocates 3 host-visible SSBOs per call (coeffs/dst/meta),
  (3) copies host->GPU with memcpy,
  (4) dispatches with the v4 32-blocks-per-WG geometry,
  (5) copies GPU->host with memcpy.

Per-call alloc is intentional for Phase 8 correctness-first scope;
buffer-pool perf optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that know
they want CPU only.

test_api_idct extended to a 3-mode matrix: CPU forced, QPU forced,
AUTO recipe. All three deliver 4096/4096 bit-exact on hertz with
V3D 7.1.7.0:

  recipe substrate for VP9_IDCT8: 2 (QPU)
  [CPU]  4096/4096 bit-exact
  [QPU]  4096/4096 bit-exact (real QPU dispatch through the API)
  [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: same wiring pattern for cycle 2 LPF wd=4 and
cycle 4 LPF wd=8 (the other two recipe-QPU kernels). Cycle 3 MC and
cycle 5 CDEF only need the dispatch hook (recipe routes to CPU; QPU
stays opportunistic via explicit override).
Co-Authored-By: Claude Opus 4.7 (1M context) --- CMakeLists.txt | 6 ++ include/daedalus.h | 4 + src/daedalus_core.c | 165 ++++++++++++++++++++++++++++++++++++++---- tests/test_api_idct.c | 51 ++++++++----- 4 files changed, 194 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff739f6..921be42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,6 +282,7 @@ endif() add_library(daedalus_core STATIC src/daedalus_core.c + src/v3d_runner.c ${FFASM_SOURCES} ${FFASM_LPF_SOURCES} ${FFASM_MC_SOURCES} @@ -290,7 +291,12 @@ add_library(daedalus_core STATIC ${DAV1D_CDEF_C_SOURCES} ) target_include_directories(daedalus_core PUBLIC include) +target_include_directories(daedalus_core PRIVATE src) +target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan) target_compile_options(daedalus_core PRIVATE -O2) +if (DAEDALUS_BUILD_VULKAN) + add_dependencies(daedalus_core daedalus_shaders) +endif() add_executable(test_api_idct tests/test_api_idct.c diff --git a/include/daedalus.h b/include/daedalus.h index 38a9ccf..2e91795 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -70,6 +70,10 @@ typedef struct daedalus_ctx daedalus_ctx; * failure. */ daedalus_ctx *daedalus_ctx_create(void); +/* Same but skip V3D init — for callers that know they want CPU + * only and want a fast-creating context. */ +daedalus_ctx *daedalus_ctx_create_no_qpu(void); + /* Returns 1 if QPU dispatch is available on this context, 0 if * NEON-only. Useful for the integration layer to short-circuit * QPU dispatch attempts. */ diff --git a/src/daedalus_core.c b/src/daedalus_core.c index fd45298..4087dc0 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -1,14 +1,19 @@ /* - * daedalus-fourier core library — Phase 8 skeleton. + * daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired. * * Wraps cycles 1-5 kernels behind the public C API in * include/daedalus.h. Recipe dispatch routes per-kernel to the * verdict substrate from each cycle's Phase 7 doc. 
* + * QPU dispatch wiring status: + * IDCT 8x8: wired (cycle 1 v4 shader). + * Others: stubbed (return -1); CPU path always works. + * * License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ + * dav1d BSD-2-Clause NEON snapshots. */ #include "../include/daedalus.h" +#include "v3d_runner.h" #include #include @@ -19,18 +24,29 @@ /* -------------------- Context -------------------- */ struct daedalus_ctx { - /* For Phase 8 skeleton: just a flag. Real impl would hold the - * v3d_runner + per-kernel pipeline handles. */ int has_qpu; + v3d_runner *runner; /* NULL when has_qpu == 0 */ + + /* Per-kernel pipelines, lazy-created on first QPU dispatch. */ + int idct8_pipe_ready; + v3d_pipeline idct8_pipe; }; daedalus_ctx *daedalus_ctx_create(void) { daedalus_ctx *ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; - /* Phase 8 deferred: real impl probes V3D Vulkan device; for now - * default to CPU-only (NEON paths are always available). */ + ctx->runner = v3d_runner_create(); + ctx->has_qpu = (ctx->runner != NULL); + return ctx; +} + +daedalus_ctx *daedalus_ctx_create_no_qpu(void) +{ + daedalus_ctx *ctx = calloc(1, sizeof(*ctx)); + if (!ctx) return NULL; ctx->has_qpu = 0; + ctx->runner = NULL; return ctx; } @@ -41,6 +57,10 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx) void daedalus_ctx_destroy(daedalus_ctx *ctx) { + if (!ctx) return; + if (ctx->idct8_pipe_ready && ctx->runner) + v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe); + if (ctx->runner) v3d_runner_destroy(ctx->runner); free(ctx); } @@ -55,7 +75,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU; case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU; } - return DAEDALUS_SUBSTRATE_CPU; /* defensive default */ + return DAEDALUS_SUBSTRATE_CPU; } /* -------------------- NEON externs (per cycle bench links) ----- */ @@ -141,23 +161,140 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx, return 0; } +/* 
-------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */ + +typedef struct { + uint32_t n_blocks; + uint32_t blocks_per_row; + uint32_t dst_stride_u8; + uint32_t _pad; +} idct8_pc; + +static int ensure_idct8_pipeline(daedalus_ctx *ctx) +{ + if (ctx->idct8_pipe_ready) return 0; + if (v3d_runner_create_pipeline(ctx->runner, + "v3d_idct8.spv", + /*n_ssbos=*/3, + /*push_const_size=*/sizeof(idct8_pc), + &ctx->idct8_pipe) != 0) { + return -1; + } + ctx->idct8_pipe_ready = 1; + return 0; +} + +static int dispatch_idct8_qpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + const int16_t *coeffs, size_t n_blocks, + const daedalus_idct8_meta *meta) +{ + if (ensure_idct8_pipeline(ctx) != 0) return -1; + + /* Allocate three SSBOs per call (coeffs, dst, meta). Performance- + * tuning (buffer pool) is deferred; correctness first. */ + size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t); + size_t meta_bytes = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */ + /* dst buffer must hold all of dst[0..max_dst_off + 64 + 8*stride]. + * Cheapest correct answer: alloc the smallest contiguous region + * containing every block's footprint. For Phase 8 we assume the + * caller's dst surface starts at byte 0 of the buffer and use + * the full provided extent. We size by scanning meta. 
*/ + size_t max_byte_touched = 0; + for (size_t i = 0; i < n_blocks; i++) { + size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8; + if (end > max_byte_touched) max_byte_touched = end; + } + + v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0}; + if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1; + if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) { + v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1; + } + if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) { + v3d_runner_destroy_buffer(ctx->runner, &buf_dst); + v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1; + } + + /* Upload. Coeffs and meta are straight copies. Dst we copy the + * caller's full region (since we'll need to read it back). */ + memcpy(buf_coeffs.mapped, coeffs, coeff_bytes); + memcpy(buf_dst.mapped, dst, max_byte_touched); + uint32_t *m = buf_meta.mapped; + for (size_t i = 0; i < n_blocks; i++) { + m[2*i + 0] = meta[i].block_x; + m[2*i + 1] = meta[i].block_y; + } + + /* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */ + v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta }; + if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) { + goto fail; + } + + /* WG geometry: 32 blocks per WG. 
*/ + uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32); + idct8_pc pc = { + .n_blocks = (uint32_t) n_blocks, + .blocks_per_row = 0, /* unused by v4 shader (meta drives placement) */ + .dst_stride_u8 = (uint32_t) dst_stride, + ._pad = 0, + }; + + VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner); + if (cb == VK_NULL_HANDLE) goto fail; + VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; + vkBeginCommandBuffer(cb, &cbbi); + vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + ctx->idct8_pipe.pipeline); + vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, + ctx->idct8_pipe.layout, 0, 1, + &ctx->idct8_pipe.desc_set, 0, NULL); + vkCmdPushConstants(cb, ctx->idct8_pipe.layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); + vkCmdDispatch(cb, wg_count, 1, 1); + vkEndCommandBuffer(cb); + + if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail; + + /* Read-back dst. */ + memcpy(dst, buf_dst.mapped, max_byte_touched); + + v3d_runner_destroy_buffer(ctx->runner, &buf_meta); + v3d_runner_destroy_buffer(ctx->runner, &buf_dst); + v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); + return 0; + +fail: + v3d_runner_destroy_buffer(ctx->runner, &buf_meta); + v3d_runner_destroy_buffer(ctx->runner, &buf_dst); + v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); + return -1; +} + /* -------------------- Public dispatch entry points -------------- */ -#define ROUTE(_kernel, _cpu_fn, ...) \ +#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) 
\ daedalus_substrate eff = sub; \ if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \ if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ eff = DAEDALUS_SUBSTRATE_CPU; \ if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \ - return -1 /* QPU path not yet wired in Phase 8 skeleton */ + return -1 /* QPU path not yet wired for this kernel */ int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, size_t dst_stride, const int16_t *coeffs, size_t n_blocks, const daedalus_idct8_meta *meta) { - ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu, - dst, dst_stride, coeffs, n_blocks, meta); + daedalus_substrate eff = sub; + if (eff == DAEDALUS_SUBSTRATE_AUTO) + eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8); + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) + eff = DAEDALUS_SUBSTRATE_CPU; + if (eff == DAEDALUS_SUBSTRATE_CPU) + return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); + return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta); } int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub, @@ -193,8 +330,8 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub, const uint8_t *src, size_t src_stride, size_t n_blocks, const daedalus_mc_meta *meta) { - ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu, - dst, dst_stride, src, src_stride, n_blocks, meta); + ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu, + dst, dst_stride, src, src_stride, n_blocks, meta); } int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, @@ -202,8 +339,8 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub, const uint16_t *tmp, size_t n_blocks, const daedalus_cdef_meta *meta) { - ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu, - dst, dst_stride, tmp, n_blocks, meta); + ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu, 
+ dst, dst_stride, tmp, n_blocks, meta); } /* -------------------- Recipe convenience wrappers --------------- */ diff --git a/tests/test_api_idct.c b/tests/test_api_idct.c index 13896b3..3804fc5 100644 --- a/tests/test_api_idct.c +++ b/tests/test_api_idct.c @@ -37,14 +37,37 @@ static inline uint64_t xs(void) { return xs_state = x; } -int main(void) +static int run_once(daedalus_substrate force, + const int16_t *coeffs, + const daedalus_idct8_meta *meta, + const uint8_t *dst_initial, + const uint8_t *dst_ref, + const char *label) { daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; } + int has_qpu = daedalus_ctx_has_qpu(ctx); + printf(" [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force); + if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) { + printf(" SKIP — QPU unavailable on this host\n"); + daedalus_ctx_destroy(ctx); return 0; + } + uint8_t dst[DST_BYTES]; + memcpy(dst, dst_initial, DST_BYTES); + int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE, + coeffs, N_BLOCKS, meta); + if (rc) { fprintf(stderr, " dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; } + int diffs = 0; + for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++; + printf(" %d / %d bytes bit-exact (%.4f%%)\n", + DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES); + daedalus_ctx_destroy(ctx); + return diffs == 0 ? 
0 : 1; +} +int main(void) +{ printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n"); - printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n", - daedalus_ctx_has_qpu(ctx)); printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8)); @@ -61,9 +84,9 @@ int main(void) } } - uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES]; + uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES]; for (int i = 0; i < DST_BYTES; i++) - dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff); + dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff); /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset * by*8*stride + bx*8. */ @@ -87,17 +110,9 @@ int main(void) DST_STRIDE, scratch, 64); } - /* Dispatch through the public API. */ - int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE, - coeffs, N_BLOCKS, meta); - if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; } - - /* Compare. */ - int diffs = 0; - for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++; - printf(" bytes bit-exact: %d / %d (%.4f%%)\n", - DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES); - - daedalus_ctx_destroy(ctx); - return diffs == 0 ? 0 : 1; + int fail = 0; + fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU"); + fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU"); + fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO"); + return fail; }