diff --git a/src/v3d_runner.c b/src/v3d_runner.c
index 05d34c5..665b6e8 100644
--- a/src/v3d_runner.c
+++ b/src/v3d_runner.c
@@ -8,6 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
+#include <unistd.h>
 
 #define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
     fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
@@ -368,10 +370,76 @@ void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
 
 /* ---- Pipelines -------------------------------------------------- */
 
+/* Join dir and name as "dir/name" and open the result for binary reading.
+ * Returns NULL when the joined path does not fit in PATH_MAX or when
+ * fopen fails. */
+static FILE *open_spv_in(const char *dir, const char *name)
+{
+    char p[PATH_MAX];
+    int len = snprintf(p, sizeof(p), "%s/%s", dir, name);
+    if (len < 0 || (size_t)len >= sizeof(p))
+        return NULL;
+    return fopen(p, "rb");
+}
+
+/* SPV lookup tries a small set of locations. The caller passes a bare
+ * filename (e.g. "v3d_h264_idct4.spv"); we try, in order:
+ *
+ *   1. cwd-relative (legacy contract; works when run from build/)
+ *   2. $DAEDALUS_SHADER_DIR (env override for tests / packaged installs)
+ *   3. <binary-dir>/ (so the bench/test binary finds the SPV next
+ *      to itself regardless of cwd — this is the
+ *      fix for the silent-no-SPV regression that
+ *      made PR #36's bench numbers meaningless)
+ *   4. /opt/fourier/share/daedalus-fourier/ (Pi 5 install layout)
+ *   5. /usr/share/daedalus-fourier/ (system-wide install)
+ *
+ * A candidate whose joined path would not fit in PATH_MAX is skipped
+ * rather than opened truncated, since a truncated path could name a
+ * different file.
+ *
+ * Returns NULL only if every location fails. Nothing is printed here;
+ * the caller reports the failure, naming the bare filename so the user
+ * can grep for it. */
+static FILE *open_spv(const char *name)
+{
+    FILE *f = fopen(name, "rb");
+    if (f) return f;
+
+    const char *envdir = getenv("DAEDALUS_SHADER_DIR");
+    if (envdir && *envdir) {
+        f = open_spv_in(envdir, name);
+        if (f) return f;
+    }
+
+    char exe[PATH_MAX];
+    ssize_t n = readlink("/proc/self/exe", exe, sizeof(exe) - 1);
+    /* A full buffer means readlink may have truncated the link target. */
+    if (n > 0 && (size_t)n < sizeof(exe) - 1) {
+        exe[n] = 0;
+        char *slash = strrchr(exe, '/');
+        if (slash) {
+            *slash = 0;
+            f = open_spv_in(exe, name);
+            if (f) return f;
+        }
+    }
+
+    f = open_spv_in("/opt/fourier/share/daedalus-fourier", name);
+    if (f) return f;
+
+    return open_spv_in("/usr/share/daedalus-fourier", name);
+}
+
 static uint32_t *read_spv(const char *path, size_t *out_size)
 {
-    FILE *f = fopen(path, "rb");
-    if (!f) { perror(path); return NULL; }
+    FILE *f = open_spv(path);
+    if (!f) {
+        fprintf(stderr,
+            "daedalus: SPV not found via cwd / $DAEDALUS_SHADER_DIR / "
+            "binary-dir / /opt/fourier/share / /usr/share: %s\n", path);
+        return NULL;
+    }
     fseek(f, 0, SEEK_END);
     long sz = ftell(f);
     fseek(f, 0, SEEK_SET);
diff --git a/tests/bench_h264_primitives.c b/tests/bench_h264_primitives.c
index 14b8002..94f67e3 100644
--- a/tests/bench_h264_primitives.c
+++ b/tests/bench_h264_primitives.c
@@ -44,12 +44,28 @@ static double now_ms(void) {
 /* Per-1080p-frame counts (8160 MBs at 1920x1088). */
 #define MBS_1080P 8160
 
-/* Standard benchmark loop. fn() is called n times per iteration. */
-typedef void (*bench_fn)(void);
+/* Standard benchmark loop. fn() is called n times per iteration.
+ *
+ * fn() now returns the dispatch's int rc. A single preflight call is
+ * made before the hot loop; if rc != 0 (which on the QPU substrate
+ * almost always means "SPV not found via any search path"), bench_ns
+ * returns -1 and the caller must NOT report the kernel as measured.
+ *
+ * Without this, a missing SPV makes every dispatch fail fast at the
+ * cost of one fprintf+open call (~1-5 µs), and the loop times that
+ * cost as if it were real QPU work — producing absurdly-small ns/op
+ * numbers that look like a QPU speedup. This is exactly what made
+ * PR #36's bench numbers a measurement artifact. */
+typedef int (*bench_fn)(void);
 
 static double bench_ns(const char *name, int iters, int warmup,
                        int ops_per_iter, bench_fn fn)
 {
+    int rc = fn();
+    if (rc != 0) {
+        printf(" %-32s DISPATCH FAILED rc=%d — kernel skipped\n", name, rc);
+        return -1;
+    }
     for (int i = 0; i < warmup; i++) fn();
     double t0 = now_ms();
     for (int i = 0; i < iters; i++) fn();
@@ -76,8 +92,8 @@ static int16_t idct4_coeffs[1024 * 16];
 static daedalus_h264_block_meta idct4_meta[1024];
 static uint8_t idct_dst[64 * 4 * 16 * 16]; /* 64 MB-rows × ... */
 
-static void bench_idct4(void) {
-    daedalus_dispatch_h264_idct4(ctx, g_sub,
+static int bench_idct4(void) {
+    return daedalus_dispatch_h264_idct4(ctx, g_sub,
         idct_dst, 64*16, idct4_coeffs, 1024, idct4_meta);
 }
 
@@ -85,8 +101,8 @@ static void bench_idct4(void) {
 static int16_t idct8_coeffs[256 * 64];
 static daedalus_h264_block_meta idct8_meta[256];
 
-static void bench_idct8(void) {
-    daedalus_dispatch_h264_idct8(ctx, g_sub,
+static int bench_idct8(void) {
+    return daedalus_dispatch_h264_idct8(ctx, g_sub,
         idct_dst, 64*16, idct8_coeffs, 256, idct8_meta);
 }
 
@@ -94,13 +110,13 @@ static void bench_idct8(void) {
 static daedalus_h264_deblock_meta deblock_meta[256];
 static uint8_t deblock_dst[256 * 16 * 16];
 
-static void bench_deblock_v(void) {
-    daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
+static int bench_deblock_v(void) {
+    return daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
         deblock_dst, 16, 256, deblock_meta);
 }
 
-static void bench_deblock_h(void) {
-    daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
+static int bench_deblock_h(void) {
+    return daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
         deblock_dst, 16, 256, deblock_meta);
 }
 
@@ -109,16 +125,16 @@ static uint8_t qpel_src[256 * 16 * 16];
 static uint8_t qpel_dst[256 * 16 * 16];
 static daedalus_h264_qpel_meta qpel_meta[256];
 
-static void bench_qpel_mc20(void) {
-    daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
+static int bench_qpel_mc20(void) {
+    return daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
         qpel_dst, qpel_src, 16, 256, qpel_meta);
 }
-static void bench_qpel_mc02(void) {
-    daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
+static int bench_qpel_mc02(void) {
+    return daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
         qpel_dst, qpel_src, 16, 256, qpel_meta);
 }
-static void bench_qpel_mc22(void) {
-    daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
+static int bench_qpel_mc22(void) {
+    return daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
         qpel_dst, qpel_src, 16, 256, qpel_meta);
 }
 
@@ -197,11 +213,25 @@ int main(int argc, char **argv)
         rows[i].cpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
 
     /* Pass 2: QPU compute (if available). */
+    int qpu_failures = 0;
     if (has_qpu) {
         g_sub = DAEDALUS_SUBSTRATE_QPU;
         printf("\n== QPU V3D7 compute ==\n");
-        for (int i = 0; i < N_ROWS; i++)
+        for (int i = 0; i < N_ROWS; i++) {
             rows[i].qpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
+            if (rows[i].qpu_ns < 0) qpu_failures++;
+        }
+        if (qpu_failures) {
+            fprintf(stderr,
+                "\nbench_h264_primitives: %d of %d QPU dispatches failed.\n"
+                " Almost always means SPV files were not found via any of:\n"
+                " cwd / $DAEDALUS_SHADER_DIR / binary-dir /\n"
+                " /opt/fourier/share/daedalus-fourier / /usr/share/daedalus-fourier\n"
+                " Set DAEDALUS_SHADER_DIR=<dir> or run from a dir where the\n"
+                " .spv files exist (e.g. the cmake build dir).\n",
+                qpu_failures, N_ROWS);
+            return 2;
+        }
     }
 
     /* Summary table — both substrates side by side. */