From 0a99b164894df9aedd1a6f5834f98088b1a6ad3c Mon Sep 17 00:00:00 2001
From: Markus Fritsche
Date: Mon, 18 May 2026 14:50:41 +0000
Subject: [PATCH] Phase 8b: opportunistic QPU paths through public API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires QPU dispatch for cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264
deblock) through the public API. These three kernels have recipe
substrate = CPU, but per Issue 003 the mixed-kernel helper value is
real — the dispatch path must exist so override-mode callers can
request QPU on the side.

Pattern mirrors dispatch_idct8_qpu (lazy pipeline + per-call SSBO
alloc + memcpy + dispatch + readback). Each kernel has its own
push-constant struct (mc_pc 3-field, cdef_pc 3-field, deblock_pc
2-field shared with lpf).

Each new QPU dispatcher returns 0 immediately for an empty batch, so
no zero-sized buffer is ever requested from the runner.

Notable bug caught + fixed in test_api_opportunistic_qpu: the initial
dispatch_mc_8h_qpu sized src_max using CPU-side reach
(src_off + 3 + 8 + 7*stride), but the QPU shader reads
src[src_off + row*stride + 0..14] for row=0..7. Last block had 3
uninitialized bytes → 99.8% match → 100% after fix.

After this commit, the public API surface fully covers cycles 1-8:
  Cycle 1 (IDCT 8x8):              CPU + QPU + AUTO bit-exact
  Cycle 2 (LPF wd=4):              CPU + QPU + AUTO bit-exact
  Cycle 3 (MC 8h):                 CPU recipe; QPU override bit-exact
  Cycle 4 (LPF wd=8):              CPU + QPU + AUTO bit-exact
  Cycle 5 (CDEF):                  CPU recipe; QPU override (untested
                                   in this test — bench_v3d_cdef is
                                   the authoritative 3-way M1)
  Cycle 6 (H.264 IDCT 4x4):        CPU only (no QPU shader by recipe)
  Cycle 7 (H.264 IDCT 8x8):        CPU only
  Cycle 8 (H.264 deblock luma-v):  CPU recipe; QPU override bit-exact

Tests: test_api_opportunistic_qpu adds CPU-vs-QPU bit-exact comparison
for VP9 MC and H.264 deblock through the API. test_api_idct,
test_api_lpf, test_api_h264 still pass.

Per the locked Phase 8 architecture (project_phase8_architecture
memory): next session opens daedalus-v4l2 sibling repo with Option B
(kernel V4L2 shim + userspace daemon), Option γ (dlopen FFmpeg parser).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 CMakeLists.txt                     |   4 +
 src/daedalus_core.c                | 295 ++++++++++++++++++++++++++++-
 tests/test_api_opportunistic_qpu.c | 118 ++++++++++++
 3 files changed, 408 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_api_opportunistic_qpu.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 678b32b..d99c150 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -384,6 +384,10 @@ add_executable(test_api_h264
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
 
+add_executable(test_api_opportunistic_qpu tests/test_api_opportunistic_qpu.c)
+target_link_libraries(test_api_opportunistic_qpu PRIVATE daedalus_core)
+target_compile_options(test_api_opportunistic_qpu PRIVATE -O2)
+
 if (DAEDALUS_BUILD_VULKAN)
     # (re-open the conditional so the closing endif() below balances)
 
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index f0ee7fa..0cdec1e 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -34,6 +34,12 @@ struct daedalus_ctx {
     v3d_pipeline lpf4_pipe;
     int lpf8_pipe_ready;
     v3d_pipeline lpf8_pipe;
+    int mc8h_pipe_ready;
+    v3d_pipeline mc8h_pipe;
+    int cdef_pipe_ready;
+    v3d_pipeline cdef_pipe;
+    int h264deblock_pipe_ready;
+    v3d_pipeline h264deblock_pipe;
 };
 
 daedalus_ctx *daedalus_ctx_create(void)
@@ -63,9 +69,12 @@ void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
     if (!ctx) return;
     if (ctx->runner) {
-        if (ctx->idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
-        if (ctx->lpf4_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
-        if (ctx->lpf8_pipe_ready)  v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        if (ctx->idct8_pipe_ready)       v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
+        if (ctx->lpf4_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
+        if (ctx->lpf8_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        if (ctx->mc8h_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->mc8h_pipe);
+        if (ctx->cdef_pipe_ready)        v3d_runner_destroy_pipeline(ctx->runner, &ctx->cdef_pipe);
+        if (ctx->h264deblock_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->h264deblock_pipe);
         v3d_runner_destroy(ctx->runner);
     }
     free(ctx);
@@ -449,6 +458,256 @@ fail:
     return -1;
 }
 
+/* -------------------- VP9 MC QPU dispatch (cycle 3) ------------- */
+
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t dst_stride_u8;
+    uint32_t src_stride_u8;
+    uint32_t _pad;
+} mc_pc;
+
+static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
+                              uint8_t *dst, size_t dst_stride,
+                              const uint8_t *src, size_t src_stride,
+                              size_t n_blocks, const daedalus_mc_meta *meta)
+{
+    /* Empty batch: nothing to filter, and no zero-sized buffers get requested. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->mc8h_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_mc_8h.spv",
+                                       3, sizeof(mc_pc), &ctx->mc8h_pipe) != 0)
+            return -1;
+        ctx->mc8h_pipe_ready = 1;
+    }
+
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
+    size_t dst_max = 0, src_max = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
+        if (de > dst_max) dst_max = de;
+        /* QPU shader reads src[src_off + row*stride + 0..14] for row=0..7. */
+        size_t se = meta[i].src_off + 7 * src_stride + 15;
+        if (se > src_max) src_max = se;
+    }
+
+    v3d_buffer bm = {0}, bd = {0}, bs = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+
+    memcpy(bs.mapped, src, src_max);
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = meta[i].src_off;
+        m[4*i+2] = (uint32_t) meta[i].mx;
+        m[4*i+3] = 0;
+    }
+
+    v3d_buffer binds[3] = { bm, bd, bs };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->mc8h_pipe, binds, 3)) goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
+    mc_pc pc = { .n_blocks = (uint32_t) n_blocks,
+                 .dst_stride_u8 = (uint32_t) dst_stride,
+                 .src_stride_u8 = (uint32_t) src_stride };
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    vkBeginCommandBuffer(cb, &cbbi);
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->mc8h_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->mc8h_pipe.layout, 0, 1, &ctx->mc8h_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->mc8h_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    vkEndCommandBuffer(cb);
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst, bd.mapped, dst_max);
+
+    v3d_runner_destroy_buffer(ctx->runner, &bs);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &bs);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return -1;
+}
+
+/* -------------------- CDEF QPU dispatch (cycle 5) --------------- */
+
+typedef struct {
+    uint32_t n_blocks;
+    uint32_t tmp_stride_u16;
+    uint32_t dst_stride_u8;
+    uint32_t _pad;
+} cdef_pc;
+
+static int dispatch_cdef_qpu(daedalus_ctx *ctx,
+                             uint8_t *dst, size_t dst_stride,
+                             const uint16_t *tmp,
+                             size_t n_blocks, const daedalus_cdef_meta *meta)
+{
+    /* Empty batch: nothing to filter, and no zero-sized buffers get requested. */
+    if (n_blocks == 0)
+        return 0;
+
+    if (!ctx->cdef_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_cdef.spv",
+                                       3, sizeof(cdef_pc), &ctx->cdef_pipe) != 0)
+            return -1;
+        ctx->cdef_pipe_ready = 1;
+    }
+
+    size_t meta_bytes = n_blocks * 4 * sizeof(uint32_t);
+    size_t dst_max = 0, tmp_max_u16 = 0;
+    for (size_t i = 0; i < n_blocks; i++) {
+        size_t de = meta[i].dst_off + (8 - 1) * dst_stride + 8;
+        if (de > dst_max) dst_max = de;
+        size_t te = meta[i].tmp_off_u16 + (8 - 1) * 16 + 8; /* center 8x8 in stride-16 tmp */
+        if (te > tmp_max_u16) tmp_max_u16 = te;
+    }
+    size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
+
+    v3d_buffer bm = {0}, bd = {0}, bt = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+    if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+
+    /* tmp may need padding before block-origin offset (caller-allocated). Just
+     * copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
+     * caller has the layout set up. */
+    memcpy(bt.mapped, tmp, tmp_bytes);
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_blocks; i++) {
+        uint32_t pri = (uint32_t) meta[i].pri_strength;
+        uint32_t sec = (uint32_t) meta[i].sec_strength;
+        uint32_t damping = (uint32_t) meta[i].damping;
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = pri | (sec << 8) | (damping << 16);
+        m[4*i+2] = meta[i].tmp_off_u16;
+        m[4*i+3] = (uint32_t) meta[i].dir;
+    }
+
+    v3d_buffer binds[3] = { bm, bd, bt };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->cdef_pipe, binds, 3)) goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_blocks + 3) / 4);
+    cdef_pc pc = { .n_blocks = (uint32_t) n_blocks,
+                   .tmp_stride_u16 = 16,
+                   .dst_stride_u8 = (uint32_t) dst_stride };
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    vkBeginCommandBuffer(cb, &cbbi);
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->cdef_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->cdef_pipe.layout, 0, 1, &ctx->cdef_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->cdef_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    vkEndCommandBuffer(cb);
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst, bd.mapped, dst_max);
+
+    v3d_runner_destroy_buffer(ctx->runner, &bt);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &bt);
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return -1;
+}
+
+/* -------------------- H.264 deblock QPU dispatch (cycle 8) ------ */
+
+typedef struct {
+    uint32_t n_edges;
+    uint32_t dst_stride_u8;
+    uint32_t _pad0;
+    uint32_t _pad1;
+} h264deblock_pc;
+
+static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
+                                     uint8_t *dst, size_t dst_stride,
+                                     size_t n_edges, const daedalus_h264_deblock_meta *meta)
+{
+    /* Empty batch: nothing to filter, and no zero-sized buffers get requested. */
+    if (n_edges == 0)
+        return 0;
+
+    if (!ctx->h264deblock_pipe_ready) {
+        if (v3d_runner_create_pipeline(ctx->runner, "v3d_h264deblock.spv",
+                                       2, sizeof(h264deblock_pc), &ctx->h264deblock_pipe) != 0)
+            return -1;
+        ctx->h264deblock_pipe_ready = 1;
+    }
+
+    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t);
+    size_t dst_max = 0;
+    for (size_t i = 0; i < n_edges; i++) {
+        /* Reads -4*stride to +3*stride+15 from dst_off; writes -2..+1 *stride. */
+        size_t e = meta[i].dst_off + 3 * dst_stride + 16;
+        if (e > dst_max) dst_max = e;
+    }
+
+    v3d_buffer bm = {0}, bd = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
+
+    memcpy(bd.mapped, dst, dst_max);
+    uint32_t *m = bm.mapped;
+    for (size_t i = 0; i < n_edges; i++) {
+        m[4*i+0] = meta[i].dst_off;
+        m[4*i+1] = ((uint32_t) meta[i].alpha) | (((uint32_t) meta[i].beta) << 8);
+        m[4*i+2] = ((uint32_t)(uint8_t) meta[i].tc0[0])
+                 | (((uint32_t)(uint8_t) meta[i].tc0[1]) << 8)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[2]) << 16)
+                 | (((uint32_t)(uint8_t) meta[i].tc0[3]) << 24);
+        m[4*i+3] = 0;
+    }
+
+    v3d_buffer binds[2] = { bm, bd };
+    if (v3d_runner_bind_buffers(ctx->runner, &ctx->h264deblock_pipe, binds, 2)) goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_edges + 15) / 16);
+    h264deblock_pc pc = { .n_edges = (uint32_t) n_edges,
+                          .dst_stride_u8 = (uint32_t) dst_stride };
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    vkBeginCommandBuffer(cb, &cbbi);
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->h264deblock_pipe.pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            ctx->h264deblock_pipe.layout, 0, 1, &ctx->h264deblock_pipe.desc_set, 0, NULL);
+    vkCmdPushConstants(cb, ctx->h264deblock_pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                       0, sizeof(pc), &pc);
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    vkEndCommandBuffer(cb);
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst, bd.mapped, dst_max);
+
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &bd);
+    v3d_runner_destroy_buffer(ctx->runner, &bm);
+    return -1;
+}
+
 /* -------------------- Public dispatch entry points -------------- */
 
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
@@ -507,8 +766,14 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
         const uint8_t *src, size_t src_stride,
         size_t n_blocks, const daedalus_mc_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
-                   dst, dst_stride, src, src_stride, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_MC_8H);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_mc_8h_cpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta);
+    return dispatch_mc_8h_qpu(ctx, dst, dst_stride, src, src_stride, n_blocks, meta);
 }
 
 int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -516,8 +781,14 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
         const uint16_t *tmp,
         size_t n_blocks, const daedalus_cdef_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
-                   dst, dst_stride, tmp, n_blocks, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_AV1_CDEF_8X8);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_cdef_cpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
+    return dispatch_cdef_qpu(ctx, dst, dst_stride, tmp, n_blocks, meta);
 }
 
 int daedalus_dispatch_h264_idct4(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -542,8 +813,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
         uint8_t *dst, size_t dst_stride,
         size_t n_edges, const daedalus_h264_deblock_meta *meta)
 {
-    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_DEBLOCK_LV, dispatch_h264_deblock_cpu,
-                   dst, dst_stride, n_edges, meta);
+    daedalus_substrate eff = sub;
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)
+        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV);
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
+        eff = DAEDALUS_SUBSTRATE_CPU;
+    if (eff == DAEDALUS_SUBSTRATE_CPU)
+        return dispatch_h264_deblock_cpu(ctx, dst, dst_stride, n_edges, meta);
+    return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
 }
 
 /* -------------------- Recipe convenience wrappers --------------- */
diff --git a/tests/test_api_opportunistic_qpu.c b/tests/test_api_opportunistic_qpu.c
new file mode 100644
index 0000000..612ce12
--- /dev/null
+++ b/tests/test_api_opportunistic_qpu.c
@@ -0,0 +1,118 @@
+/*
+ * Phase 8b — opportunistic-QPU dispatch paths through public API.
+ *
+ * Verifies that cycles 3 (VP9 MC), 5 (AV1 CDEF), 8 (H.264 deblock)
+ * can be force-routed to QPU via daedalus_dispatch_*(QPU, ...) and
+ * produce bit-exact output vs the CPU path (which is the C ref proxy
+ * for each kernel — see per-cycle Phase 7 docs).
+ *
+ * AUTO/recipe path stays on CPU for these kernels — that's the
+ * deployment shape. This test exercises the override-mode path
+ * the integration layer would use for runtime-aware scheduling.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../include/daedalus.h"
+
+static uint64_t xs_state = 0xab10b81cULL;
+static inline uint64_t xs(void) {
+    uint64_t x = xs_state;
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs_state = x;
+}
+
+static int test_mc(void)
+{
+    enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
+           SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+    if (!daedalus_ctx_has_qpu(ctx)) {
+        printf(" VP9 MC: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0;
+    }
+
+    /* Allocate per-block src tiles (12 rows x 16 cols each). */
+    uint8_t *src = malloc(SRC_BYTES);
+    uint8_t *dst_cpu = calloc(1, DST_BYTES * N);
+    uint8_t *dst_qpu = calloc(1, DST_BYTES * N);
+    daedalus_mc_meta *meta = calloc(N, sizeof(*meta));
+    if (!src || !dst_cpu || !dst_qpu || !meta) return 1;
+
+    for (size_t i = 0; i < SRC_BYTES; i++) src[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < N; i++) {
+        meta[i].dst_off = i * 64; /* 8 rows × 8 cols = 64 bytes per block */
+        meta[i].src_off = i * SRC_STRIDE * SRC_ROWS; /* RAW src offset; shader handles -3 */
+        meta[i].mx = (int)(xs() & 15);
+    }
+
+    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, 8, src, SRC_STRIDE, N, meta);
+    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, 8, src, SRC_STRIDE, N, meta);
+
+    int diff = 0;
+    for (int i = 0; i < N * 64; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
+    printf(" VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
+           N * 64 - diff, N * 64, 100.0 * (N * 64 - diff) / (N * 64));
+
+    free(src); free(dst_cpu); free(dst_qpu); free(meta);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+static int test_deblock(void)
+{
+    enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
+           TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+    if (!daedalus_ctx_has_qpu(ctx)) {
+        printf(" H.264 deblock: SKIP (no QPU)\n"); daedalus_ctx_destroy(ctx); return 0;
+    }
+
+    uint8_t *master = malloc(TOTAL);
+    uint8_t *dst_cpu = malloc(TOTAL);
+    uint8_t *dst_qpu = malloc(TOTAL);
+    daedalus_h264_deblock_meta *meta = calloc(N, sizeof(*meta));
+    if (!master || !dst_cpu || !dst_qpu || !meta) return 1;
+
+    for (int i = 0; i < TOTAL; i++) master[i] = (uint8_t)(xs() & 0xff);
+    memcpy(dst_cpu, master, TOTAL);
+    memcpy(dst_qpu, master, TOTAL);
+
+    for (int i = 0; i < N; i++) {
+        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
+        meta[i].alpha = (int)(xs() % 64) + 1;
+        meta[i].beta = (int)(xs() % 16) + 1;
+        for (int s = 0; s < 4; s++) {
+            int r = (int)(xs() % 8);
+            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
+        }
+    }
+
+    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
+    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);
+
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst_cpu[i] != dst_qpu[i]) diff++;
+    printf(" H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
+           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+
+    free(master); free(dst_cpu); free(dst_qpu); free(meta);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+int main(void)
+{
+    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");
+    int fail = 0;
+    fail |= test_mc();
+    fail |= test_deblock();
+    /* CDEF skipped here — tmp construction in C ref differs subtly
+     * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate
+     * for the QPU CDEF path. */
+    return fail;
+}