Phase 8: wire LPF wd=4 + wd=8 QPU through public API
Mirror the IDCT pattern (lazy pipeline + per-call SSBO alloc + dispatch + readback) for cycles 2 (LPF wd=4) and 4 (LPF wd=8). Important caught-empirically bug: the two LPF shaders disagree on push-constant slot order — wd=4 puts dst_stride_u8 at slot 1, wd=8 puts it at slot 2 (with unused blocks_per_row at slot 1). Initial single-struct attempt silently corrupted wd=8 output (1958/2048 = 95.6 % bit-exact on test_api_lpf). Fixed by keeping separate lpf4_pc and lpf8_pc struct definitions. dst-window calc handles both kernels (same -4..+3 byte footprint per row). test_api_lpf exercises both kernels in CPU / QPU / AUTO modes against the C reference. All 6 mode/kernel combinations pass 2048/2048 bit-exact (32 edges × 8 rows × 8 bytes/edge). Phase 8 status after this commit: 3 of 5 kernels wired through API for QPU dispatch (IDCT, LPF wd=4, LPF wd=8 — i.e., all 3 QPU-default kernels per recipe). Cycle 3 MC and cycle 5 CDEF still need wiring for opportunistic-override mode but aren't needed for recipe-AUTO path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -305,6 +305,14 @@ add_executable(test_api_idct
|
|||||||
target_link_libraries(test_api_idct PRIVATE daedalus_core)
|
target_link_libraries(test_api_idct PRIVATE daedalus_core)
|
||||||
target_compile_options(test_api_idct PRIVATE -O2)
|
target_compile_options(test_api_idct PRIVATE -O2)
|
||||||
|
|
||||||
|
add_executable(test_api_lpf
|
||||||
|
tests/test_api_lpf.c
|
||||||
|
tests/vp9_lpf_ref.c
|
||||||
|
tests/vp9_lpf8_ref.c
|
||||||
|
)
|
||||||
|
target_link_libraries(test_api_lpf PRIVATE daedalus_core)
|
||||||
|
target_compile_options(test_api_lpf PRIVATE -O2)
|
||||||
|
|
||||||
if (DAEDALUS_BUILD_VULKAN)
|
if (DAEDALUS_BUILD_VULKAN)
|
||||||
# (re-open the conditional so the closing endif() below balances)
|
# (re-open the conditional so the closing endif() below balances)
|
||||||
|
|
||||||
|
|||||||
+133
-5
@@ -30,6 +30,10 @@ struct daedalus_ctx {
|
|||||||
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
/* Per-kernel pipelines, lazy-created on first QPU dispatch. */
|
||||||
int idct8_pipe_ready;
|
int idct8_pipe_ready;
|
||||||
v3d_pipeline idct8_pipe;
|
v3d_pipeline idct8_pipe;
|
||||||
|
int lpf4_pipe_ready;
|
||||||
|
v3d_pipeline lpf4_pipe;
|
||||||
|
int lpf8_pipe_ready;
|
||||||
|
v3d_pipeline lpf8_pipe;
|
||||||
};
|
};
|
||||||
|
|
||||||
daedalus_ctx *daedalus_ctx_create(void)
|
daedalus_ctx *daedalus_ctx_create(void)
|
||||||
@@ -58,9 +62,12 @@ int daedalus_ctx_has_qpu(const daedalus_ctx *ctx)
|
|||||||
void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
void daedalus_ctx_destroy(daedalus_ctx *ctx)
|
||||||
{
|
{
|
||||||
if (!ctx) return;
|
if (!ctx) return;
|
||||||
if (ctx->idct8_pipe_ready && ctx->runner)
|
if (ctx->runner) {
|
||||||
v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
|
if (ctx->idct8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->idct8_pipe);
|
||||||
if (ctx->runner) v3d_runner_destroy(ctx->runner);
|
if (ctx->lpf4_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf4_pipe);
|
||||||
|
if (ctx->lpf8_pipe_ready) v3d_runner_destroy_pipeline(ctx->runner, &ctx->lpf8_pipe);
|
||||||
|
v3d_runner_destroy(ctx->runner);
|
||||||
|
}
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -272,6 +279,127 @@ fail:
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -------------------- LPF QPU dispatch (cycles 2 + 4 shaders) --
|
||||||
|
*
|
||||||
|
* NOTE: the two LPF shaders disagree on push-constant slot order.
|
||||||
|
* v3d_lpf_h_4_8.comp: (n_edges, dst_stride_u8, _pad, _pad)
|
||||||
|
* v3d_lpf_h_8_8.comp: (n_edges, blocks_per_row=unused, dst_stride_u8, _pad)
|
||||||
|
*
|
||||||
|
* Same total size (16 bytes), but dst_stride_u8 sits in slot 1 for wd=4 and in slot 2 for wd=8. Keep separate
|
||||||
|
* struct definitions to avoid silent corruption — Phase 8 caught
|
||||||
|
* this empirically when test_api_lpf wd=8 reported 95.6 % match.
|
||||||
|
*/
|
||||||
|
/*
 * Push-constant block for v3d_lpf_h_4_8.comp (LPF wd=4).
 * Slot order must match the shader: (n_edges, dst_stride_u8, pad, pad).
 */
typedef struct {
    uint32_t n_edges;        /* slot 0: number of edges in the dispatch */
    uint32_t dst_stride_u8;  /* slot 1: dst row stride, in bytes        */
    uint32_t _pad0;          /* slot 2: padding                         */
    uint32_t _pad1;          /* slot 3: padding                         */
} lpf4_pc;

/* sizeof(lpf4_pc) is passed as the pipeline's push-constant size, so fail
 * the build if the C layout ever drifts from the 16-byte shader block. */
_Static_assert(sizeof(lpf4_pc) == 4 * sizeof(uint32_t),
               "lpf4_pc must be exactly four 32-bit slots (16 bytes)");
|
||||||
|
|
||||||
|
/*
 * Push-constant block for v3d_lpf_h_8_8.comp (LPF wd=8).
 * Slot order must match the shader:
 * (n_edges, blocks_per_row, dst_stride_u8, pad).
 */
typedef struct {
    uint32_t n_edges;         /* slot 0: number of edges in the dispatch */
    uint32_t blocks_per_row;  /* slot 1: unused by shader, must exist    */
    uint32_t dst_stride_u8;   /* slot 2: dst row stride, in bytes        */
    uint32_t _pad;            /* slot 3: padding                         */
} lpf8_pc;

/* sizeof(lpf8_pc) is passed as the pipeline's push-constant size, so fail
 * the build if the C layout ever drifts from the 16-byte shader block. */
_Static_assert(sizeof(lpf8_pc) == 4 * sizeof(uint32_t),
               "lpf8_pc must be exactly four 32-bit slots (16 bytes)");
|
||||||
|
|
||||||
|
/*
 * Build the compute pipeline for one LPF kernel the first time it is needed.
 *
 * ctx   context whose runner creates the pipeline
 * wd_8  non-zero selects the wd=8 push-constant layout, zero the wd=4 one
 * flag  "already created" marker; set to 1 once creation succeeds
 * pipe  receives the created pipeline
 * spv   shader file name handed to the runner
 *
 * Returns 0 when the pipeline is ready, -1 if creation failed.
 */
static int ensure_lpf_pipeline(daedalus_ctx *ctx, int wd_8,
                               int *flag, v3d_pipeline *pipe,
                               const char *spv)
{
    if (!*flag) {
        uint32_t pc_bytes;

        /* Each kernel has its own push-constant struct. */
        if (wd_8)
            pc_bytes = (uint32_t) sizeof(lpf8_pc);
        else
            pc_bytes = (uint32_t) sizeof(lpf4_pc);

        /* Two SSBOs: per-edge metadata and the dst window. */
        int rc = v3d_runner_create_pipeline(ctx->runner, spv, 2, pc_bytes, pipe);
        if (rc != 0)
            return -1;

        *flag = 1;
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * Run one horizontal loop-filter pass on the QPU (wd=4 when wd_8 == 0,
 * wd=8 otherwise).
 *
 * Flow: lazily create the kernel's pipeline, copy the smallest dst window
 * that covers every edge into a device buffer, upload per-edge metadata
 * rebased to that window, dispatch, wait, then copy the window back.
 *
 * ctx         context owning the runner and the cached pipelines
 * wd_8        kernel selector (0 = wd=4, non-zero = wd=8)
 * dst         pixel buffer, filtered in place
 * dst_stride  row stride of dst in bytes
 * n_edges     number of entries in meta
 * meta        per-edge (dst_off, E, I, H)
 *
 * Returns 0 on success (n_edges == 0 is a successful no-op) and -1 on any
 * failure. dst is only written after the GPU work completed successfully.
 */
static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
                            uint8_t *dst, size_t dst_stride,
                            size_t n_edges, const daedalus_lpf_meta *meta)
{
    enum {
        LPF_ROWS         = 8,   /* rows touched per edge                  */
        LPF_HALF_SPAN    = 4,   /* bytes touched on each side of dst_off  */
        LPF_EDGES_PER_WG = 32,  /* edges handled by one workgroup         */
        LPF_META_WORDS   = 4    /* uvec4 per edge: dst_off, E, I, H       */
    };

    /* Nothing to filter: succeed without creating zero-sized buffers. */
    if (n_edges == 0) return 0;
    if (!ctx || !dst || !meta) return -1;

    /* Push constants and metadata are 32-bit; refuse values that would be
     * silently truncated by the casts below. */
    if ((uint64_t) n_edges > UINT32_MAX || (uint64_t) dst_stride > UINT32_MAX)
        return -1;
    if (n_edges > SIZE_MAX / (LPF_META_WORDS * sizeof(uint32_t)))
        return -1;

    int *flag       = wd_8 ? &ctx->lpf8_pipe_ready : &ctx->lpf4_pipe_ready;
    v3d_pipeline *p = wd_8 ? &ctx->lpf8_pipe : &ctx->lpf4_pipe;
    const char *spv = wd_8 ? "v3d_lpf_h_8_8.spv" : "v3d_lpf_h_4_8.spv";
    if (ensure_lpf_pipeline(ctx, wd_8, flag, p, spv) != 0) return -1;

    size_t meta_bytes = n_edges * LPF_META_WORDS * sizeof(uint32_t);

    /* Determine smallest dst window.  Each edge writes to bytes
     * [dst_off - 4 .. dst_off + 3] for 8 rows at dst_stride. */
    size_t lo = SIZE_MAX, hi = 0;
    for (size_t i = 0; i < n_edges; i++) {
        size_t base = meta[i].dst_off;

        /* An edge closer than 4 bytes to the start of dst would touch bytes
         * before the window; its rebased offset cannot be represented. */
        if (base < (size_t) LPF_HALF_SPAN) return -1;

        size_t this_lo = base - LPF_HALF_SPAN;
        size_t this_hi = base + (size_t)(LPF_ROWS - 1) * dst_stride
                              + LPF_HALF_SPAN;    /* exclusive end */
        if (this_lo < lo) lo = this_lo;
        if (this_hi > hi) hi = this_hi;
    }
    size_t dst_window_size = hi - lo;

    /* Rebased offsets are uploaded as uint32_t. */
    if ((uint64_t) dst_window_size > UINT32_MAX) return -1;

    v3d_buffer buf_meta = {0}, buf_dst = {0};
    if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
    if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
        v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
        return -1;
    }

    /* Upload the pixel window and the metadata rebased to the window. */
    memcpy(buf_dst.mapped, dst + lo, dst_window_size);
    uint32_t *m = buf_meta.mapped;
    for (size_t i = 0; i < n_edges; i++) {
        m[LPF_META_WORDS * i + 0] = (uint32_t)(meta[i].dst_off - lo);
        m[LPF_META_WORDS * i + 1] = (uint32_t) meta[i].E;
        m[LPF_META_WORDS * i + 2] = (uint32_t) meta[i].I;
        m[LPF_META_WORDS * i + 3] = (uint32_t) meta[i].H;
    }

    v3d_buffer binds[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(ctx->runner, p, binds, 2)) goto fail;

    uint32_t wg_count =
        (uint32_t)((n_edges + LPF_EDGES_PER_WG - 1) / LPF_EDGES_PER_WG);

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(ctx->runner);
    if (cb == VK_NULL_HANDLE) goto fail;

    VkCommandBufferBeginInfo cbbi = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO
    };
    if (vkBeginCommandBuffer(cb, &cbbi) != VK_SUCCESS) goto fail;

    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, p->pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            p->layout, 0, 1, &p->desc_set, 0, NULL);

    /* The two shaders place dst_stride_u8 in different push-constant slots,
     * so each kernel fills its own struct. */
    if (wd_8) {
        lpf8_pc pc = { .n_edges        = (uint32_t) n_edges,
                       .blocks_per_row = 0,
                       .dst_stride_u8  = (uint32_t) dst_stride,
                       ._pad           = 0 };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    } else {
        lpf4_pc pc = { .n_edges       = (uint32_t) n_edges,
                       .dst_stride_u8 = (uint32_t) dst_stride };
        vkCmdPushConstants(cb, p->layout, VK_SHADER_STAGE_COMPUTE_BIT,
                           0, sizeof(pc), &pc);
    }
    vkCmdDispatch(cb, wg_count, 1, 1);
    if (vkEndCommandBuffer(cb) != VK_SUCCESS) goto fail;

    if (v3d_runner_submit_wait(ctx->runner, cb)) goto fail;

    /* Read the filtered window back into the caller's buffer. */
    memcpy(dst + lo, buf_dst.mapped, dst_window_size);

    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
    return 0;

fail:
    v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
    v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
    return -1;
}
|
||||||
|
|
||||||
/* -------------------- Public dispatch entry points -------------- */
|
/* -------------------- Public dispatch entry points -------------- */
|
||||||
|
|
||||||
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
|
#define ROUTE_CPU_ONLY(_kernel, _cpu_fn, ...) \
|
||||||
@@ -308,7 +436,7 @@ int daedalus_dispatch_vp9_lpf4(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||||
return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
|
return dispatch_lpf_cpu(ctx, 0, dst, dst_stride, n_edges, meta);
|
||||||
return -1;
|
return dispatch_lpf_qpu(ctx, 0, dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
@@ -322,7 +450,7 @@ int daedalus_dispatch_vp9_lpf8(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||||
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
if (eff == DAEDALUS_SUBSTRATE_CPU)
|
||||||
return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
|
return dispatch_lpf_cpu(ctx, 1, dst, dst_stride, n_edges, meta);
|
||||||
return -1;
|
return dispatch_lpf_qpu(ctx, 1, dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_vp9_mc_8h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
|||||||
@@ -0,0 +1,121 @@
|
|||||||
|
/*
|
||||||
|
* Phase 8 — VP9 LPF wd=4 + wd=8 through the public API.
|
||||||
|
*
|
||||||
|
* Exercises both kernels in CPU / QPU / AUTO modes against the
|
||||||
|
* C reference (tests/vp9_lpf_ref.c, vp9_lpf8_ref.c). Bit-exact
|
||||||
|
* gate per cycle 2 and 4 phase 7 docs.
|
||||||
|
*/
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "../include/daedalus.h"
|
||||||
|
|
||||||
|
extern void daedalus_vp9_loop_filter_h_4_8_ref(
|
||||||
|
uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
|
||||||
|
extern void daedalus_vp9_loop_filter_h_8_8_ref(
|
||||||
|
uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
|
||||||
|
|
||||||
|
#define N_EDGES 32
|
||||||
|
#define EDGE_STRIDE 8
|
||||||
|
#define EDGE_H 8
|
||||||
|
#define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */
|
||||||
|
#define DST_BYTES (N_EDGES * EDGE_BYTES)
|
||||||
|
|
||||||
|
/* Generator state; the fixed seed keeps every run of the test identical. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/* Advance the 64-bit xorshift generator (shifts 13, 7, 17) one step and
 * return the new state. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs_state = s;
    return s;
}
|
||||||
|
|
||||||
|
static void gen_edge_pixels(uint8_t *buf)
|
||||||
|
{
|
||||||
|
int side_a_base = (int)(xs() % 200) + 20;
|
||||||
|
int side_b_base = (int)(xs() % 200) + 20;
|
||||||
|
int noise = (int)(xs() % 30);
|
||||||
|
for (int r = 0; r < EDGE_H; r++) {
|
||||||
|
for (int c = 0; c < 8; c++) {
|
||||||
|
int base = (c < 4) ? side_a_base : side_b_base;
|
||||||
|
int n = ((int)(xs() % (2 * noise + 1))) - noise;
|
||||||
|
int v = base + n;
|
||||||
|
buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int run_lpf(int wd_8, daedalus_substrate force,
|
||||||
|
const uint8_t *dst_initial,
|
||||||
|
const uint8_t *dst_ref,
|
||||||
|
const daedalus_lpf_meta *meta,
|
||||||
|
const char *label)
|
||||||
|
{
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) return 1;
|
||||||
|
int has_qpu = daedalus_ctx_has_qpu(ctx);
|
||||||
|
if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
|
||||||
|
printf(" [%s wd=%d] SKIP — QPU unavailable\n", label, wd_8 ? 8 : 4);
|
||||||
|
daedalus_ctx_destroy(ctx); return 0;
|
||||||
|
}
|
||||||
|
uint8_t dst[DST_BYTES];
|
||||||
|
memcpy(dst, dst_initial, DST_BYTES);
|
||||||
|
int rc = wd_8
|
||||||
|
? daedalus_dispatch_vp9_lpf8(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta)
|
||||||
|
: daedalus_dispatch_vp9_lpf4(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta);
|
||||||
|
if (rc) { fprintf(stderr, " rc=%d\n", rc); daedalus_ctx_destroy(ctx); return 1; }
|
||||||
|
int diffs = 0;
|
||||||
|
for (int i = 0; i < DST_BYTES; i++) if (dst[i] != dst_ref[i]) diffs++;
|
||||||
|
printf(" [%s wd=%d] %d/%d bit-exact (%.4f%%)\n",
|
||||||
|
label, wd_8 ? 8 : 4,
|
||||||
|
DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diffs == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int run_one_kernel(int wd_8)
|
||||||
|
{
|
||||||
|
/* Per-edge layout: edge i occupies bytes [i*64..i*64+63]. Edge
|
||||||
|
* center is at column 4 of row 0 → byte offset i*64 + 4. */
|
||||||
|
uint8_t initial[DST_BYTES];
|
||||||
|
uint8_t ref[DST_BYTES];
|
||||||
|
daedalus_lpf_meta meta[N_EDGES];
|
||||||
|
|
||||||
|
for (int i = 0; i < N_EDGES; i++) {
|
||||||
|
gen_edge_pixels(initial + i * EDGE_BYTES);
|
||||||
|
meta[i].dst_off = (uint32_t)(i * EDGE_BYTES + 4);
|
||||||
|
meta[i].E = (int32_t)(xs() % 81);
|
||||||
|
meta[i].I = (int32_t)(xs() % 41);
|
||||||
|
meta[i].H = (int32_t)(xs() % 11);
|
||||||
|
}
|
||||||
|
memcpy(ref, initial, DST_BYTES);
|
||||||
|
for (int i = 0; i < N_EDGES; i++) {
|
||||||
|
if (wd_8) daedalus_vp9_loop_filter_h_8_8_ref(
|
||||||
|
ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
|
||||||
|
else daedalus_vp9_loop_filter_h_4_8_ref(
|
||||||
|
ref + meta[i].dst_off, EDGE_STRIDE, meta[i].E, meta[i].I, meta[i].H);
|
||||||
|
}
|
||||||
|
|
||||||
|
int fail = 0;
|
||||||
|
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_CPU, initial, ref, meta, "CPU");
|
||||||
|
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_QPU, initial, ref, meta, "QPU");
|
||||||
|
fail |= run_lpf(wd_8, DAEDALUS_SUBSTRATE_AUTO, initial, ref, meta, "AUTO");
|
||||||
|
return fail;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(void)
|
||||||
|
{
|
||||||
|
printf("=== Phase 8 API smoke: VP9 LPF wd=4 + wd=8 ===\n");
|
||||||
|
printf(" recipe for LPF4_INNER: %d (1=CPU, 2=QPU)\n",
|
||||||
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER));
|
||||||
|
printf(" recipe for LPF8_INNER: %d\n",
|
||||||
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER));
|
||||||
|
|
||||||
|
int fail = 0;
|
||||||
|
printf("\nLPF wd=4:\n");
|
||||||
|
fail |= run_one_kernel(0);
|
||||||
|
printf("\nLPF wd=8:\n");
|
||||||
|
fail |= run_one_kernel(1);
|
||||||
|
return fail;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user