Phase 8: wire IDCT QPU dispatch through public API
daedalus_ctx now owns a v3d_runner when V3D is available. The public API's daedalus_dispatch_vp9_idct8 routes QPU calls through a new dispatch_idct8_qpu helper that:

1. lazy-creates the cycle 1 v4 pipeline on first use,
2. allocates 3 host-visible SSBOs per call (coeffs/dst/meta),
3. copies the inputs host->GPU with memcpy,
4. dispatches with the v4 32-blocks-per-WG geometry,
5. copies the result GPU->host with memcpy.

Per-call allocation is intentional for the Phase 8 correctness-first scope; the buffer-pool performance optimization is deferred.

Added daedalus_ctx_create_no_qpu() for fast-path callers that know they want CPU only.

test_api_idct is extended to a 3-mode matrix: CPU forced, QPU forced, AUTO recipe. All three deliver 4096/4096 bit-exact on hertz with V3D 7.1.7.0:

    recipe substrate for VP9_IDCT8: 2 (QPU)
    [CPU]  4096/4096 bit-exact
    [QPU]  4096/4096 bit-exact (real QPU dispatch through the API)
    [AUTO] 4096/4096 bit-exact (recipe routes to QPU)

Next Phase 8 sub-step: apply the same wiring pattern to cycle 2 LPF wd=4 and cycle 4 LPF wd=8 (the other two recipe-QPU kernels). Cycle 3 MC and cycle 5 CDEF only need the dispatch hook (the recipe routes them to CPU; QPU stays opportunistic via explicit override).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -282,6 +282,7 @@ endif()
|
|||||||
|
|
||||||
add_library(daedalus_core STATIC
|
add_library(daedalus_core STATIC
|
||||||
src/daedalus_core.c
|
src/daedalus_core.c
|
||||||
|
src/v3d_runner.c
|
||||||
${FFASM_SOURCES}
|
${FFASM_SOURCES}
|
||||||
${FFASM_LPF_SOURCES}
|
${FFASM_LPF_SOURCES}
|
||||||
${FFASM_MC_SOURCES}
|
${FFASM_MC_SOURCES}
|
||||||
@@ -290,7 +291,12 @@ add_library(daedalus_core STATIC
|
|||||||
${DAV1D_CDEF_C_SOURCES}
|
${DAV1D_CDEF_C_SOURCES}
|
||||||
)
|
)
|
||||||
target_include_directories(daedalus_core PUBLIC include)
|
target_include_directories(daedalus_core PUBLIC include)
|
||||||
|
target_include_directories(daedalus_core PRIVATE src)
|
||||||
|
target_link_libraries(daedalus_core PUBLIC Vulkan::Vulkan)
|
||||||
target_compile_options(daedalus_core PRIVATE -O2)
|
target_compile_options(daedalus_core PRIVATE -O2)
|
||||||
|
if (DAEDALUS_BUILD_VULKAN)
|
||||||
|
add_dependencies(daedalus_core daedalus_shaders)
|
||||||
|
endif()
|
||||||
|
|
||||||
add_executable(test_api_idct
|
add_executable(test_api_idct
|
||||||
tests/test_api_idct.c
|
tests/test_api_idct.c
|
||||||
|
|||||||
@@ -70,6 +70,10 @@ typedef struct daedalus_ctx daedalus_ctx;
|
|||||||
* failure. */
|
* failure. */
|
||||||
daedalus_ctx *daedalus_ctx_create(void);
|
daedalus_ctx *daedalus_ctx_create(void);
|
||||||
|
|
||||||
|
/* Same but skip V3D init — for callers that know they want CPU
|
||||||
|
* only and want a fast-creating context. */
|
||||||
|
daedalus_ctx *daedalus_ctx_create_no_qpu(void);
|
||||||
|
|
||||||
/* Returns 1 if QPU dispatch is available on this context, 0 if
|
/* Returns 1 if QPU dispatch is available on this context, 0 if
|
||||||
* NEON-only. Useful for the integration layer to short-circuit
|
* NEON-only. Useful for the integration layer to short-circuit
|
||||||
* QPU dispatch attempts. */
|
* QPU dispatch attempts. */
|
||||||
|
|||||||
+151
-14
@@ -1,14 +1,19 @@
|
|||||||
/*
|
/*
|
||||||
* daedalus-fourier core library — Phase 8 skeleton.
|
* daedalus-fourier core library — Phase 8 skeleton + IDCT QPU wired.
|
||||||
*
|
*
|
||||||
* Wraps cycles 1-5 kernels behind the public C API in
|
* Wraps cycles 1-5 kernels behind the public C API in
|
||||||
* include/daedalus.h. Recipe dispatch routes per-kernel to the
|
* include/daedalus.h. Recipe dispatch routes per-kernel to the
|
||||||
* verdict substrate from each cycle's Phase 7 doc.
|
* verdict substrate from each cycle's Phase 7 doc.
|
||||||
*
|
*
|
||||||
|
* QPU dispatch wiring status:
|
||||||
|
* IDCT 8x8: wired (cycle 1 v4 shader).
|
||||||
|
* Others: stubbed (return -1); CPU path always works.
|
||||||
|
*
|
||||||
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
|
* License: BSD-2-Clause. Links vendored FFmpeg LGPL-2.1+ +
|
||||||
* dav1d BSD-2-Clause NEON snapshots.
|
* dav1d BSD-2-Clause NEON snapshots.
|
||||||
*/
|
*/
|
||||||
#include "../include/daedalus.h"
|
#include "../include/daedalus.h"
|
||||||
|
#include "v3d_runner.h"
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
@@ -19,18 +24,29 @@
|
|||||||
/* -------------------- Context -------------------- */
|
/* -------------------- Context -------------------- */
|
||||||
|
|
||||||
struct daedalus_ctx {
|
struct daedalus_ctx {
|
||||||
/* For Phase 8 skeleton: just a flag. Real impl would hold the
|
|
||||||
* v3d_runner + per-kernel pipeline handles. */
|
|
||||||
int has_qpu;
|
int has_qpu;
|
||||||
|
v3d_runner *runner; /* NULL when has_qpu == 0 */
|
||||||
|
|
||||||
|
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
||||||
|
int idct8_pipe_ready;
|
||||||
|
v3d_pipeline idct8_pipe;
|
||||||
};
|
};
|
||||||
|
|
||||||
daedalus_ctx *daedalus_ctx_create(void)
|
daedalus_ctx *daedalus_ctx_create(void)
|
||||||
{
|
{
|
||||||
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||||
if (!ctx) return NULL;
|
if (!ctx) return NULL;
|
||||||
/* Phase 8 deferred: real impl probes V3D Vulkan device; for now
|
ctx->runner = v3d_runner_create();
|
||||||
* default to CPU-only (NEON paths are always available). */
|
ctx->has_qpu = (ctx->runner != NULL);
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
daedalus_ctx *daedalus_ctx_create_no_qpu(void)
|
||||||
|
{
|
||||||
|
daedalus_ctx *ctx = calloc(1, sizeof(*ctx));
|
||||||
|
if (!ctx) return NULL;
|
||||||
ctx->has_qpu = 0;
|
ctx->has_qpu = 0;
|
||||||
|
ctx->runner = NULL;
|
||||||
return ctx;
|
return ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,6 +57,10 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
|||||||
|
|
||||||
void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||||
{
|
{
|
||||||
|
if (!ctx) return;
|
||||||
|
if (ctx->idct8_pipe_ready && ctx->runner)
|
||||||
|
v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
|
||||||
|
if (ctx->runner) v3d_runner_destroy(ctx->runner);
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -55,7 +75,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
|||||||
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
case DAEDALUS_KERNEL_VP9_LPF8_INNER: return DAEDALUS_SUBSTRATE_QPU;
|
||||||
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_AV1_CDEF_8X8: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU; /* defensive default */
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* -------------------- NEON externs (per cycle bench links) ----- */
|
/* -------------------- NEON externs (per cycle bench links) ----- */
|
||||||
@@ -141,23 +161,140 @@ static int dispatch_cdef_cpu(daedalus_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||||
|
|
||||||
|
/* Push-constant block for the IDCT 8x8 compute pipeline. Its size is given
 * to v3d_runner_create_pipeline as the push-constant size, and one instance
 * is uploaded with vkCmdPushConstants for every QPU dispatch. */
typedef struct {
|
||||||
|
uint32_t n_blocks; /* number of 8x8 blocks in this dispatch */
|
||||||
|
uint32_t blocks_per_row; /* always written as 0 by the dispatcher */
|
||||||
|
uint32_t dst_stride_u8; /* caller's dst_stride; presumably in bytes, going by the name (confirm against the shader) */
|
||||||
|
uint32_t _pad; /* always written as 0; presumably rounds the block up to 16 bytes (confirm) */
|
||||||
|
} idct8_pc;
|
||||||
|
|
||||||
|
/*
 * Make sure the IDCT 8x8 compute pipeline exists on this context.
 *
 * The pipeline is built from "v3d_idct8.spv" with three SSBO bindings and a
 * push-constant range of sizeof(idct8_pc). It is created at most once per
 * context: once idct8_pipe_ready is set, later calls return immediately.
 *
 * Returns 0 when the pipeline is ready, -1 if creation failed (in which case
 * idct8_pipe_ready stays 0, so the next call will try again).
 */
static int ensure_idct8_pipeline(daedalus_ctx *ctx)
{
    if (!ctx->idct8_pipe_ready) {
        const int create_failed =
            (v3d_runner_create_pipeline(ctx->runner,
                                        "v3d_idct8.spv",
                                        /*n_ssbos=*/3,
                                        /*push_const_size=*/sizeof(idct8_pc),
                                        &ctx->idct8_pipe) != 0);
        if (create_failed)
            return -1;

        ctx->idct8_pipe_ready = 1;
    }

    return 0;
}
|
||||||
|
|
||||||
|
static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
|
const daedalus_idct8_meta *meta)
|
||||||
|
{
|
||||||
|
if (ensure_idct8_pipeline(ctx) != 0) return -1;
|
||||||
|
|
||||||
|
/* Allocate three SSBOs per call (coeffs, dst, meta). Performance-
|
||||||
|
* tuning (buffer pool) is deferred; correctness first. */
|
||||||
|
size_t coeff_bytes = n_blocks * 64 * sizeof(int16_t);
|
||||||
|
size_t meta_bytes = n_blocks * 2 * sizeof(uint32_t); /* uvec2 per block */
|
||||||
|
/* dst buffer must hold all of dst[0..max_dst_off + 64 + 8*stride].
|
||||||
|
* Cheapest correct answer: alloc the smallest contiguous region
|
||||||
|
* containing every block's footprint. For Phase 8 we assume the
|
||||||
|
* caller's dst surface starts at byte 0 of the buffer and use
|
||||||
|
* the full provided extent. We size by scanning meta. */
|
||||||
|
size_t max_byte_touched = 0;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
size_t end = meta[i].dst_off + (size_t)(8 - 1) * dst_stride + 8;
|
||||||
|
if (end > max_byte_touched) max_byte_touched = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||||
|
}
|
||||||
|
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||||
|
* caller's full region (since we'll need to read it back). */
|
||||||
|
memcpy(buf_coeffs.mapped, coeffs, coeff_bytes);
|
||||||
|
memcpy(buf_dst.mapped, dst, max_byte_touched);
|
||||||
|
uint32_t *m = buf_meta.mapped;
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) {
|
||||||
|
m[2*i + 0] = meta[i].block_x;
|
||||||
|
m[2*i + 1] = meta[i].block_y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Bind: shader expects (coeffs, dst, meta) per src/v3d_idct8.comp. */
|
||||||
|
v3d_buffer binds[3] = { buf_coeffs, buf_dst, buf_meta };
|
||||||
|
if (v3d_runner_bind_buffers(ctx->runner, &ctx->idct8_pipe, binds, 3)) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* WG geometry: 32 blocks per WG. */
|
||||||
|
uint32_t wg_count = (uint32_t)((n_blocks + 31) / 32);
|
||||||
|
idct8_pc pc = {
|
||||||
|
.n_blocks = (uint32_t) n_blocks,
|
||||||
|
.blocks_per_row = 0, /* unused by v4 shader (meta drives placement) */
|
||||||
|
.dst_stride_u8 = (uint32_t) dst_stride,
|
||||||
|
._pad = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
|
||||||
|
if (cb == VK_NULL_HANDLE) goto fail;
|
||||||
|
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||||||
|
vkBeginCommandBuffer(cb, &cbbi);
|
||||||
|
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->idct8_pipe.pipeline);
|
||||||
|
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||||||
|
ctx->idct8_pipe.layout, 0, 1,
|
||||||
|
&ctx->idct8_pipe.desc_set, 0, NULL);
|
||||||
|
vkCmdPushConstants(cb, ctx->idct8_pipe.layout,
|
||||||
|
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||||
|
vkCmdDispatch(cb, wg_count, 1, 1);
|
||||||
|
vkEndCommandBuffer(cb);
|
||||||
|
|
||||||
|
if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
|
||||||
|
|
||||||
|
/* Read-back dst. */
|
||||||
|
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||||
|
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||||
|
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
/* -------------------- Public dispatch entry points -------------- */
|
/* -------------------- Public dispatch entry points -------------- */
|
||||||
|
|
||||||
#define ROUTE(_kernel, _cpu_fn, ...) \
|
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
|
||||||
daedalus_substrate eff = sub; \
|
daedalus_substrate eff = sub; \
|
||||||
if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
|
if (eff == DAEDALUS_SUBSTRATE_AUTO) eff = daedalus_recipe_substrate_for(_kernel); \
|
||||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||||
eff = DAEDALUS_SUBSTRATE_CPU; \
|
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||||
if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \
|
if (eff == DAEDALUS_SUBSTRATE_CPU) return _cpu_fn(ctx, __VA_ARGS__); \
|
||||||
return -1 /* QPU path not yet wired in Phase 8 skeleton */
|
return -1 /* QPU path not yet wired for this kernel */
|
||||||
|
|
||||||
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_vp9_idct8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
uint8_t *dst, size_t dst_stride,
|
uint8_t *dst, size_t dst_stride,
|
||||||
const int16_t *coeffs, size_t n_blocks,
|
const int16_t *coeffs, size_t n_blocks,
|
||||||
const daedalus_idct8_meta *meta)
|
const daedalus_idct8_meta *meta)
|
||||||
{
|
{
|
||||||
ROUTE(DAEDALUS_KERNEL_VP9_IDCT8, dispatch_idct8_cpu,
|
daedalus_substrate eff = sub;
|
||||||
dst, dst_stride, coeffs, n_blocks, meta);
|
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||||
|
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8);
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||||
|
return dispatch_idct8_cpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||||||
|
return dispatch_idct8_qpu(ctx, dst, dst_stride, coeffs, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
@@ -193,8 +330,8 @@ int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
const uint8_t *src, size_t src_stride,
|
const uint8_t *src, size_t src_stride,
|
||||||
size_t n_blocks, const daedalus_mc_meta *meta)
|
size_t n_blocks, const daedalus_mc_meta *meta)
|
||||||
{
|
{
|
||||||
ROUTE(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
|
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_VP9_MC_8H, dispatch_mc_8h_cpu,
|
||||||
dst, dst_stride, src, src_stride, n_blocks, meta);
|
dst, dst_stride, src, src_stride, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
@@ -202,8 +339,8 @@ int daedalus_dispatch_cdef_8x8(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
const uint16_t *tmp,
|
const uint16_t *tmp,
|
||||||
size_t n_blocks, const daedalus_cdef_meta *meta)
|
size_t n_blocks, const daedalus_cdef_meta *meta)
|
||||||
{
|
{
|
||||||
ROUTE(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
|
ROUTE_CPU_ONLY(DAEDALUS_KERNEL_AV1_CDEF_8X8, dispatch_cdef_cpu,
|
||||||
dst, dst_stride, tmp, n_blocks, meta);
|
dst, dst_stride, tmp, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* -------------------- Recipe convenience wrappers --------------- */
|
/* -------------------- Recipe convenience wrappers --------------- */
|
||||||
|
|||||||
+33
-18
@@ -37,14 +37,37 @@ static inline uint64_t xs(void) {
|
|||||||
return xs_state = x;
|
return xs_state = x;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(void)
|
static int run_once(daedalus_substrate force,
|
||||||
|
const int16_t *coeffs,
|
||||||
|
const daedalus_idct8_meta *meta,
|
||||||
|
const uint8_t *dst_initial,
|
||||||
|
const uint8_t *dst_ref,
|
||||||
|
const char *label)
|
||||||
{
|
{
|
||||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
if (!ctx) { fprintf(stderr, "ctx create failed\n"); return 1; }
|
||||||
|
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||||
|
printf(" [%s] has_qpu=%d force=%d\n", label, has_qpu, (int) force);
|
||||||
|
if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
|
||||||
|
printf(" SKIP — QPU unavailable on this host\n");
|
||||||
|
daedalus_ctx_destroy(ctx); return 0;
|
||||||
|
}
|
||||||
|
uint8_t dst[DST_BYTES];
|
||||||
|
memcpy(dst, dst_initial, DST_BYTES);
|
||||||
|
int rc = daedalus_dispatch_vp9_idct8(ctx, force, dst, DST_STRIDE,
|
||||||
|
coeffs, N_BLOCKS, meta);
|
||||||
|
if (rc) { fprintf(stderr, " dispatch rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
|
||||||
|
int diffs = 0;
|
||||||
|
for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
|
||||||
|
printf(" %d / %d bytes bit-exact (%.4f%%)\n",
|
||||||
|
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diffs == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
|
printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
|
||||||
printf(" has_qpu: %d (Phase 8 skeleton: NEON-only)\n",
|
|
||||||
daedalus_ctx_has_qpu(ctx));
|
|
||||||
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
|
printf(" recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
|
||||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));
|
||||||
|
|
||||||
@@ -61,9 +84,9 @@ int main(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t dst_ref[DST_BYTES], dst_api[DST_BYTES];
|
uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
|
||||||
for (int i = 0; i < DST_BYTES; i++)
|
for (int i = 0; i < DST_BYTES; i++)
|
||||||
dst_ref[i] = dst_api[i] = (uint8_t)(xs() & 0xff);
|
dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
|
||||||
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
|
/* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
|
||||||
* by*8*stride + bx*8. */
|
* by*8*stride + bx*8. */
|
||||||
@@ -87,17 +110,9 @@ int main(void)
|
|||||||
DST_STRIDE, scratch, 64);
|
DST_STRIDE, scratch, 64);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Dispatch through the public API. */
|
int fail = 0;
|
||||||
int rc = daedalus_recipe_dispatch_vp9_idct8(ctx, dst_api, DST_STRIDE,
|
fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
|
||||||
coeffs, N_BLOCKS, meta);
|
fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
|
||||||
if (rc != 0) { fprintf(stderr, "API dispatch failed rc=%d\n", rc); return 1; }
|
fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
|
||||||
|
return fail;
|
||||||
/* Compare. */
|
|
||||||
int diffs = 0;
|
|
||||||
for (int i = 0; i < DST_BYTES; i++) if (dst_ref[i] != dst_api[i]) diffs++;
|
|
||||||
printf(" bytes bit-exact: %d / %d (%.4f%%)\n",
|
|
||||||
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
|
||||||
|
|
||||||
daedalus_ctx_destroy(ctx);
|
|
||||||
return diffs == 0 ? 0 : 1;
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user