v3d_runner: SPV path search + bench preflight — RETRACTS PR #36's headline #37
+62
-2
@@ -8,6 +8,8 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
|
#define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
|
||||||
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
|
||||||
@@ -368,10 +370,68 @@ void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)
|
|||||||
|
|
||||||
/* ---- Pipelines -------------------------------------------------- */
|
/* ---- Pipelines -------------------------------------------------- */
|
||||||
|
|
||||||
|
/* SPV lookup tries a small set of locations. The caller passes a bare
|
||||||
|
* filename (e.g. "v3d_h264_idct4.spv"); we try, in order:
|
||||||
|
*
|
||||||
|
* 1. cwd-relative (legacy contract; works when run from build/)
|
||||||
|
* 2. $DAEDALUS_SHADER_DIR (env override for tests / packaged installs)
|
||||||
|
* 3. <binary-dir>/<name> (so the bench/test binary finds the SPV next
|
||||||
|
* to itself regardless of cwd — this is the
|
||||||
|
* fix for the silent-no-SPV regression that
|
||||||
|
* made PR #36's bench numbers meaningless)
|
||||||
|
* 4. /opt/fourier/share/daedalus-fourier/<name> (Pi 5 install layout)
|
||||||
|
* 5. /usr/share/daedalus-fourier/<name> (system-wide install)
|
||||||
|
*
|
||||||
|
* Returns NULL only if every location fails, with a single perror naming
|
||||||
|
* the bare filename so the user can grep for it. */
|
||||||
|
static FILE *open_spv(const char *name)
|
||||||
|
{
|
||||||
|
FILE *f = fopen(name, "rb");
|
||||||
|
if (f) return f;
|
||||||
|
|
||||||
|
const char *envdir = getenv("DAEDALUS_SHADER_DIR");
|
||||||
|
if (envdir && *envdir) {
|
||||||
|
char p[PATH_MAX];
|
||||||
|
snprintf(p, sizeof(p), "%s/%s", envdir, name);
|
||||||
|
f = fopen(p, "rb");
|
||||||
|
if (f) return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
char exe[PATH_MAX];
|
||||||
|
ssize_t n = readlink("/proc/self/exe", exe, sizeof(exe) - 1);
|
||||||
|
if (n > 0) {
|
||||||
|
exe[n] = 0;
|
||||||
|
char *slash = strrchr(exe, '/');
|
||||||
|
if (slash) {
|
||||||
|
*slash = 0;
|
||||||
|
char p[PATH_MAX];
|
||||||
|
snprintf(p, sizeof(p), "%s/%s", exe, name);
|
||||||
|
f = fopen(p, "rb");
|
||||||
|
if (f) return f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char p[PATH_MAX];
|
||||||
|
snprintf(p, sizeof(p), "/opt/fourier/share/daedalus-fourier/%s", name);
|
||||||
|
f = fopen(p, "rb");
|
||||||
|
if (f) return f;
|
||||||
|
|
||||||
|
snprintf(p, sizeof(p), "/usr/share/daedalus-fourier/%s", name);
|
||||||
|
f = fopen(p, "rb");
|
||||||
|
if (f) return f;
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
static uint32_t *read_spv(const char *path, size_t *out_size)
|
static uint32_t *read_spv(const char *path, size_t *out_size)
|
||||||
{
|
{
|
||||||
FILE *f = fopen(path, "rb");
|
FILE *f = open_spv(path);
|
||||||
if (!f) { perror(path); return NULL; }
|
if (!f) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"daedalus: SPV not found via cwd / $DAEDALUS_SHADER_DIR / "
|
||||||
|
"binary-dir / /opt/fourier/share / /usr/share: %s\n", path);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
fseek(f, 0, SEEK_END);
|
fseek(f, 0, SEEK_END);
|
||||||
long sz = ftell(f);
|
long sz = ftell(f);
|
||||||
fseek(f, 0, SEEK_SET);
|
fseek(f, 0, SEEK_SET);
|
||||||
|
|||||||
@@ -44,12 +44,28 @@ static double now_ms(void) {
|
|||||||
/* Per-1080p-frame counts (8160 MBs at 1920x1088). */
|
/* Per-1080p-frame counts (8160 MBs at 1920x1088). */
|
||||||
#define MBS_1080P 8160
|
#define MBS_1080P 8160
|
||||||
|
|
||||||
/* Standard benchmark loop. fn() is called n times per iteration. */
|
/* Standard benchmark loop. fn() is called n times per iteration.
|
||||||
typedef void (*bench_fn)(void);
|
*
|
||||||
|
* fn() now returns the dispatch's int rc. A single preflight call is
|
||||||
|
* made before the hot loop; if rc != 0 (which on the QPU substrate
|
||||||
|
* almost always means "SPV not found via any search path"), bench_ns
|
||||||
|
* returns -1 and the caller must NOT report the kernel as measured.
|
||||||
|
*
|
||||||
|
* Without this, a missing SPV makes every dispatch fail fast at the
|
||||||
|
* cost of one fprintf+open call (~1-5 µs), and the loop times that
|
||||||
|
* cost as if it were real QPU work — producing absurdly-small ns/op
|
||||||
|
* numbers that look like a QPU speedup. This is exactly what made
|
||||||
|
* PR #36's bench numbers a measurement artifact. */
|
||||||
|
typedef int (*bench_fn)(void);
|
||||||
|
|
||||||
static double bench_ns(const char *name, int iters, int warmup,
|
static double bench_ns(const char *name, int iters, int warmup,
|
||||||
int ops_per_iter, bench_fn fn)
|
int ops_per_iter, bench_fn fn)
|
||||||
{
|
{
|
||||||
|
int rc = fn();
|
||||||
|
if (rc != 0) {
|
||||||
|
printf(" %-32s DISPATCH FAILED rc=%d — kernel skipped\n", name, rc);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
for (int i = 0; i < warmup; i++) fn();
|
for (int i = 0; i < warmup; i++) fn();
|
||||||
double t0 = now_ms();
|
double t0 = now_ms();
|
||||||
for (int i = 0; i < iters; i++) fn();
|
for (int i = 0; i < iters; i++) fn();
|
||||||
@@ -76,8 +92,8 @@ static int16_t idct4_coeffs[1024 * 16];
|
|||||||
static daedalus_h264_block_meta idct4_meta[1024];
|
static daedalus_h264_block_meta idct4_meta[1024];
|
||||||
static uint8_t idct_dst[64 * 4 * 16 * 16]; /* 64 MB-rows × ... */
|
static uint8_t idct_dst[64 * 4 * 16 * 16]; /* 64 MB-rows × ... */
|
||||||
|
|
||||||
static void bench_idct4(void) {
|
static int bench_idct4(void) {
|
||||||
daedalus_dispatch_h264_idct4(ctx, g_sub,
|
return daedalus_dispatch_h264_idct4(ctx, g_sub,
|
||||||
idct_dst, 64*16, idct4_coeffs, 1024, idct4_meta);
|
idct_dst, 64*16, idct4_coeffs, 1024, idct4_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,8 +101,8 @@ static void bench_idct4(void) {
|
|||||||
static int16_t idct8_coeffs[256 * 64];
|
static int16_t idct8_coeffs[256 * 64];
|
||||||
static daedalus_h264_block_meta idct8_meta[256];
|
static daedalus_h264_block_meta idct8_meta[256];
|
||||||
|
|
||||||
static void bench_idct8(void) {
|
static int bench_idct8(void) {
|
||||||
daedalus_dispatch_h264_idct8(ctx, g_sub,
|
return daedalus_dispatch_h264_idct8(ctx, g_sub,
|
||||||
idct_dst, 64*16, idct8_coeffs, 256, idct8_meta);
|
idct_dst, 64*16, idct8_coeffs, 256, idct8_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,13 +110,13 @@ static void bench_idct8(void) {
|
|||||||
static daedalus_h264_deblock_meta deblock_meta[256];
|
static daedalus_h264_deblock_meta deblock_meta[256];
|
||||||
static uint8_t deblock_dst[256 * 16 * 16];
|
static uint8_t deblock_dst[256 * 16 * 16];
|
||||||
|
|
||||||
static void bench_deblock_v(void) {
|
static int bench_deblock_v(void) {
|
||||||
daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
|
return daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
|
||||||
deblock_dst, 16, 256, deblock_meta);
|
deblock_dst, 16, 256, deblock_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bench_deblock_h(void) {
|
static int bench_deblock_h(void) {
|
||||||
daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
|
return daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
|
||||||
deblock_dst, 16, 256, deblock_meta);
|
deblock_dst, 16, 256, deblock_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,16 +125,16 @@ static uint8_t qpel_src[256 * 16 * 16];
|
|||||||
static uint8_t qpel_dst[256 * 16 * 16];
|
static uint8_t qpel_dst[256 * 16 * 16];
|
||||||
static daedalus_h264_qpel_meta qpel_meta[256];
|
static daedalus_h264_qpel_meta qpel_meta[256];
|
||||||
|
|
||||||
static void bench_qpel_mc20(void) {
|
static int bench_qpel_mc20(void) {
|
||||||
daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
|
return daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
|
||||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||||
}
|
}
|
||||||
static void bench_qpel_mc02(void) {
|
static int bench_qpel_mc02(void) {
|
||||||
daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
|
return daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
|
||||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||||
}
|
}
|
||||||
static void bench_qpel_mc22(void) {
|
static int bench_qpel_mc22(void) {
|
||||||
daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
|
return daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
|
||||||
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
qpel_dst, qpel_src, 16, 256, qpel_meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,11 +213,25 @@ int main(int argc, char **argv)
|
|||||||
rows[i].cpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
rows[i].cpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
||||||
|
|
||||||
/* Pass 2: QPU compute (if available). */
|
/* Pass 2: QPU compute (if available). */
|
||||||
|
int qpu_failures = 0;
|
||||||
if (has_qpu) {
|
if (has_qpu) {
|
||||||
g_sub = DAEDALUS_SUBSTRATE_QPU;
|
g_sub = DAEDALUS_SUBSTRATE_QPU;
|
||||||
printf("\n== QPU V3D7 compute ==\n");
|
printf("\n== QPU V3D7 compute ==\n");
|
||||||
for (int i = 0; i < N_ROWS; i++)
|
for (int i = 0; i < N_ROWS; i++) {
|
||||||
rows[i].qpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
rows[i].qpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
|
||||||
|
if (rows[i].qpu_ns < 0) qpu_failures++;
|
||||||
|
}
|
||||||
|
if (qpu_failures) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"\nbench_h264_primitives: %d of %d QPU dispatches failed.\n"
|
||||||
|
" Almost always means SPV files were not found via any of:\n"
|
||||||
|
" cwd / $DAEDALUS_SHADER_DIR / binary-dir /\n"
|
||||||
|
" /opt/fourier/share/daedalus-fourier / /usr/share/daedalus-fourier\n"
|
||||||
|
" Set DAEDALUS_SHADER_DIR=<path> or run from a dir where the\n"
|
||||||
|
" .spv files exist (e.g. the cmake build dir).\n",
|
||||||
|
qpu_failures, N_ROWS);
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Summary table — both substrates side by side. */
|
/* Summary table — both substrates side by side. */
|
||||||
|
|||||||
Reference in New Issue
Block a user