diff --git a/CMakeLists.txt b/CMakeLists.txt
index 921be42..caecc61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -305,6 +305,14 @@ add_executable(test_api_idct
 target_link_libraries(test_api_idct PRIVATE daedalus_core)
 target_compile_options(test_api_idct PRIVATE -O2)
 
+add_executable(test_api_lpf
+    tests/test_api_lpf.c
+    tests/vp9_lpf_ref.c
+    tests/vp9_lpf8_ref.c
+)
+target_link_libraries(test_api_lpf PRIVATE daedalus_core)
+target_compile_options(test_api_lpf PRIVATE -O2)
+
 if (DAEDALUS_BUILD_VULKAN)
     # (re-open the conditional so the closing endif() below balances)
 
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 4087dc0..9f624cf 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -30,6 +30,10 @@ struct daedalus_ctx {
     /* Per-kernel pipelines, lazy-created on first QPU dispatch. */
     int idct8_pipe_ready;
     v3d_pipeline idct8_pipe;
+    int lpf4_pipe_ready;
+    v3d_pipeline lpf4_pipe;
+    int lpf8_pipe_ready;
+    v3d_pipeline lpf8_pipe;
 };
 
 daedalus_ctx *daedalus_ctx_create(void)
@@ -58,9 +62,12 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
 void daedalus_ctx_destroy(daedalus_ctx *ctx)
 {
     if (!ctx) return;
-    if (ctx->idct8_pipe_ready && ctx->runner)
-        v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
-    if (ctx->runner) v3d_runner_destroy(ctx->runner);
+    if (ctx->runner) {
+        if (ctx->idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
+        if (ctx->lpf4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
+        if (ctx->lpf8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
+        v3d_runner_destroy(ctx->runner);
+    }
     free(ctx);
 }
 
@@ -272,6 +279,137 @@ fail:
     return -1;
 }
 
+/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
+ *
+ * NOTE: the two LPF shaders disagree on push-constant slot order.
+ *   v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
+ *   v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
+ *
+ * Same total size (16 bytes), different slot 2. Keep separate
+ * struct definitions to avoid silent corruption — Phase 8 caught
+ * this empirically when test_api_lpf wd=8 reported 95.6 % match.
+ */
+typedef struct {
+    uint32_t n_edges;
+    uint32_t dst_stride_u8;
+    uint32_t _pad0;
+    uint32_t _pad1;
+} lpf4_pc;
+
+typedef struct {
+    uint32_t n_edges;
+    uint32_t blocks_per_row; /* unused by shader, must exist */
+    uint32_t dst_stride_u8;
+    uint32_t _pad;
+} lpf8_pc;
+
+/* Lazily create the compute pipeline for one LPF width. A non-zero
+ * *flag means the pipeline already exists and nothing is done.
+ * Returns 0 on success, -1 if pipeline creation fails. */
+static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
+                               int *flag, v3d_pipeline *pipe,
+                               const char *spv)
+{
+    if (*flag) return 0;
+    size_t pc_size = wd_8 ? sizeof(lpf8_pc) : sizeof(lpf4_pc);
+    if (v3d_runner_create_pipeline(ctx->runner, spv,
+                                   /*n_ssbos=*/2,
+                                   /*push_const_size=*/(uint32_t) pc_size,
+                                   pipe) != 0) {
+        return -1;
+    }
+    *flag = 1;
+    return 0;
+}
+
+/* Run the wd=4 (wd_8 == 0) or wd=8 (wd_8 != 0) LPF shader over n_edges
+ * edges. The smallest dst window covering every edge is staged into a
+ * runner buffer, one workgroup is dispatched per 32 edges, and the
+ * window is copied back into dst. Returns 0 on success, -1 on failure. */
+static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
+                            uint8_t *dst, size_t dst_stride,
+                            size_t n_edges, const daedalus_lpf_meta *meta)
+{
+    /* Nothing to filter: skip pipeline setup and avoid asking the
+     * runner for zero-byte buffers. */
+    if (n_edges == 0) return 0;
+
+    int *flag = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
+    v3d_pipeline *p = wd_8 ? &ctx->lpf8_pipe : &ctx->lpf4_pipe;
+    const char *spv = wd_8 ? "v3d_lpf_h_8_8.spv" : "v3d_lpf_h_4_8.spv";
+    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;
+
+    size_t meta_bytes = n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */
+    /* Determine smallest dst window. Each edge writes to bytes
+     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride. */
+    size_t lo = (size_t) -1, hi = 0;
+    for (size_t i = 0; i < n_edges; i++) {
+        size_t base = meta[i].dst_off;
+        if (base >= 4) {
+            size_t this_lo = base - 4;
+            if (this_lo < lo) lo = this_lo;
+        } else {
+            lo = 0;
+        }
+        size_t this_hi = base + (size_t)(8 - 1) * dst_stride + 4;
+        if (this_hi > hi) hi = this_hi;
+    }
+    size_t dst_window_size = hi - lo;
+
+    v3d_buffer buf_meta = {0}, buf_dst = {0};
+    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
+    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
+        v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
+    }
+
+    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
+    uint32_t *m = buf_meta.mapped;
+    for (size_t i = 0; i < n_edges; i++) {
+        m[4*i + 0] = (uint32_t)(meta[i].dst_off - lo);
+        m[4*i + 1] = (uint32_t) meta[i].E;
+        m[4*i + 2] = (uint32_t) meta[i].I;
+        m[4*i + 3] = (uint32_t) meta[i].H;
+    }
+
+    v3d_buffer binds[2] = { buf_meta, buf_dst };
+    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;
+
+    uint32_t wg_count = (uint32_t)((n_edges + 31) / 32);
+    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
+    if (cb == VK_NULL_HANDLE) goto fail;
+    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
+    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;
+    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
+    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
+                            p->layout, 0, 1, &p->desc_set, 0, NULL);
+    if (wd_8) {
+        lpf8_pc pc = { .n_edges = (uint32_t) n_edges,
+                       .blocks_per_row = 0,
+                       .dst_stride_u8 = (uint32_t) dst_stride,
+                       ._pad = 0 };
+        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                           0, sizeof(pc), &pc);
+    } else {
+        lpf4_pc pc = { .n_edges = (uint32_t) n_edges,
+                       .dst_stride_u8 = (uint32_t) dst_stride };
+        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
+                           0, sizeof(pc), &pc);
+    }
+    vkCmdDispatch(cb, wg_count, 1, 1);
+    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;
+    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;
+
+    memcpy(dst + lo, buf_dst.mapped, dst_window_size);
+
+    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    return 0;
+fail:
+    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
+    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
+    return -1;
+}
+
 /* -------------------- Public dispatch entry points -------------- */
 
 #define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
@@ -308,7 +446,7 @@ int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
         eff = DAEDALUS_SUBSTRATE_CPU;
     if (eff == DAEDALUS_SUBSTRATE_CPU)
         return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
-    return -1;
+    return dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
 }
 
 int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
@@ -322,7 +460,7 @@ int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
         eff = DAEDALUS_SUBSTRATE_CPU;
     if (eff == DAEDALUS_SUBSTRATE_CPU)
         return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
-    return -1;
+    return dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
 }
 
 int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
diff --git a/tests/test_api_lpf.c b/tests/test_api_lpf.c
new file mode 100644
index 0000000..591659d
--- /dev/null
+++ b/tests/test_api_lpf.c
@@ -0,0 +1,129 @@
+/*
+ * Phase 8 — VP9 LPF wd=4 + wd=8 through the public API.
+ *
+ * Exercises both kernels in CPU / QPU / AUTO modes against the
+ * C reference (tests/vp9_lpf_ref.c, vp9_lpf8_ref.c). Bit-exact
+ * gate per cycle 2 and 4 phase 7 docs.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../include/daedalus.h"
+
+extern void daedalus_vp9_loop_filter_h_4_8_ref(
+    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+extern void daedalus_vp9_loop_filter_h_8_8_ref(
+    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+
+#define N_EDGES 32
+#define EDGE_STRIDE 8
+#define EDGE_H 8
+#define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */
+#define DST_BYTES (N_EDGES * EDGE_BYTES)
+
+/* xorshift64 PRNG with a fixed seed so every run sees the same data. */
+static uint64_t xs_state = 0xa57edbeef5717ULL;
+static inline uint64_t xs(void) {
+    uint64_t x = xs_state;
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs_state = x;
+}
+
+/* Fill one 8x8 block: columns 0-3 and 4-7 get separate base levels
+ * plus bounded noise, clamped to [0, 255]. */
+static void gen_edge_pixels(uint8_t *buf)
+{
+    int side_a_base = (int)(xs() % 200) + 20;
+    int side_b_base = (int)(xs() % 200) + 20;
+    int noise = (int)(xs() % 30);
+    for (int r = 0; r < EDGE_H; r++) {
+        for (int c = 0; c < 8; c++) {
+            int base = (c < 4) ? side_a_base : side_b_base;
+            int n = ((int)(xs() % (2 * noise + 1))) - noise;
+            int v = base + n;
+            buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+        }
+    }
+}
+
+/* Run one kernel on a fresh context with the requested substrate and
+ * compare the output byte-for-byte with dst_ref. Returns 0 on a
+ * bit-exact match or when QPU is forced but unavailable, 1 otherwise. */
+static int run_lpf(int wd_8, daedalus_substrate force,
+                   const uint8_t *dst_initial,
+                   const uint8_t *dst_ref,
+                   const daedalus_lpf_meta *meta,
+                   const char *label)
+{
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+    int has_qpu = daedalus_ctx_has_qpu(ctx);
+    if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
+        printf(" [%s wd=%d] SKIP — QPU unavailable\n", label, wd_8 ? 8 : 4);
+        daedalus_ctx_destroy(ctx); return 0;
+    }
+    uint8_t dst[DST_BYTES];
+    memcpy(dst, dst_initial, DST_BYTES);
+    int rc = wd_8
+        ? daedalus_dispatch_vp9_lpf8(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta)
+        : daedalus_dispatch_vp9_lpf4(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta);
+    if (rc) { fprintf(stderr, " rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
+    int diffs = 0;
+    for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
+    printf(" [%s wd=%d] %d/%d bit-exact (%.4f%%)\n",
+           label, wd_8 ? 8 : 4,
+           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
+    daedalus_ctx_destroy(ctx);
+    return diffs == 0 ? 0 : 1;
+}
+
+/* Build N_EDGES random edges, compute the expected output with the C
+ * reference filter, then check the CPU, QPU and AUTO paths against it. */
+static int run_one_kernel(int wd_8)
+{
+    /* Per-edge layout: edge i occupies bytes [i*64..i*64+63]. Edge
+     * center is at column 4 of row 0 → byte offset i*64 + 4. */
+    uint8_t initial[DST_BYTES];
+    uint8_t ref[DST_BYTES];
+    daedalus_lpf_meta meta[N_EDGES];
+
+    for (int i = 0; i < N_EDGES; i++) {
+        gen_edge_pixels(initial + i * EDGE_BYTES);
+        meta[i].dst_off = (uint32_t)(i * EDGE_BYTES + 4);
+        meta[i].E = (int32_t)(xs() % 81);
+        meta[i].I = (int32_t)(xs() % 41);
+        meta[i].H = (int32_t)(xs() % 11);
+    }
+    memcpy(ref, initial, DST_BYTES);
+    for (int i = 0; i < N_EDGES; i++) {
+        if (wd_8) daedalus_vp9_loop_filter_h_8_8_ref(
+            ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
+        else daedalus_vp9_loop_filter_h_4_8_ref(
+            ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
+    }
+
+    int fail = 0;
+    fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_CPU, initial, ref, meta, "CPU");
+    fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_QPU, initial, ref, meta, "QPU");
+    fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_AUTO, initial, ref, meta, "AUTO");
+    return fail;
+}
+
+int main(void)
+{
+    printf("=== Phase 8 API smoke: VP9 LPF wd=4 + wd=8 ===\n");
+    printf(" recipe for LPF4_INNER: %d (1=CPU, 2=QPU)\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER));
+    printf(" recipe for LPF8_INNER: %d\n",
+           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER));
+
+    int fail = 0;
+    printf("\nLPF wd=4:\n");
+    fail |= run_one_kernel(0);
+    printf("\nLPF wd=8:\n");
+    fail |= run_one_kernel(1);
+    return fail;
+}