Files
daedalus-fourier/tests/bench_concurrent_mixed.c
T
marfrit 373f63a910 Cycle 8 closed: H.264 deblock R8=0.061 RED, opportunistic helper
Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
  RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
  RED-2: bench-enforced m.x >= 4*stride contract

M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
    Lower than prediction; H.264 deblock has 4 early-return paths +
    2 conditional writes that hurt V3D branchy execution more than
    expected.

M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
  (neutral).

M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
  gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
  QPU contribution is essentially unchanged from isolation —
  the cross-substrate contention is gentle (consistent with
  Issue 003's V4 finding).

Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.

Cycles 1-8 deployment recipe complete:
  Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
  Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
  Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)

Phase 9 lessons added:
  - Branchy kernels underperform V3D vs straight-line ones
  - Mixed-kernel helper value scales with isolation M2, not
    same-kernel M4
  - R prediction needs branchiness weight, not just compute density

- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
  + h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)

Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:44:21 +00:00

630 lines
25 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Issue 003 — Mixed-kernel M4 bench.
*
* Runs N NEON pthread workers (pinned 0..N-1) doing CPU kernel A,
* plus one QPU worker doing kernel B concurrently. Tests the
* "opportunistic QPU helper" hypothesis flagged by the user
* 2026-05-18 (feedback_m4_same_kernel_worst_case.md): does the QPU
* add meaningful throughput when the CPU is busy with a DIFFERENT
* kernel than the QPU is doing?
*
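* CLI:
* --cpu-kernel mc|lpf4|lpf8|cdef|idct|h264deblock (default: mc)
* --qpu-kernel mc|lpf4|lpf8|cdef|idct|h264deblock (default: cdef)
* --neon-threads N (default: 3; workers are pinned to cores 0..N-1)
* --qpu-core CORE (default: 3; host core the QPU worker is pinned to)
* --qpu-units N (default: 65536; blocks/edges per QPU dispatch)
* --duration SECS (default: 8)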
*
* Interpretation: compare mixed-mode throughput (sum of CPU side
* and QPU side, normalised) against the cycle-N M4 same-kernel
* baseline for the relevant kernel. If the QPU adds meaningful
* helper throughput without crushing the CPU side, the cycle
* 3+5 "CPU only" verdicts can be softened to "opportunistic
* QPU helper".
*
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot (MC, LPF)
* and dav1d BSD-2-Clause snapshot (CDEF).
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <time.h>
#include <getopt.h>
#include <pthread.h>
#include <sched.h>
#include <assert.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/* External NEON kernels, defined outside this file. Per the file header
 * they come from the vendored FFmpeg (MC, LPF) and dav1d (CDEF) snapshots;
 * the IDCT symbol carries the same ff_vp9_ prefix as the FFmpeg ones.
 * The notes below record only how this file calls each symbol. */
/* MC kernel: neon_run_mc calls it with dst stride 8, src stride 16, h = 8,
 * a random mx in 0..15 and my = 0. */
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my);
/* LPF kernels: neon_run_lpf calls them in place on an 8x8 tile (stride 8),
 * pointing 4 bytes into the tile, with per-edge E/I/H thresholds. */
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
/* IDCT kernel: neon_run_idct passes a 64-coefficient block, its eob and a
 * dst with stride 8. NOTE(review): the caller re-copies the coefficients
 * before every pass, presumably because the kernel modifies them — confirm. */
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
/* CDEF kernel: called with dst stride 8, h = 8 and edges = 0; tmp points
 * into a 192-element uint16_t buffer laid out as rows of 16. */
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength, int sec_strength,
int dir, int damping, int h, size_t edges);
/* --- Common helpers --- */
/* Stop flag: set to 1 once by timer_thread and polled by every worker's
 * outer loop.
 * NOTE(review): volatile is not a thread-synchronisation primitive in C11;
 * an atomic_int would make this cross-thread access well-defined — confirm
 * the toolchain's C level before switching. */
static volatile int g_stop = 0;
/* Start barrier shared by main, the timer thread and all workers; it is
 * initialised in main() with the total participant count. */
static pthread_barrier_t g_start;
/* Advance a 64-bit xorshift generator (shift triple 13/7/17) by one step.
 * Stores the new state back through `s` and returns it. A zero state
 * stays zero, which is why seeds go through xs_init() first. */
static inline uint64_t xs_step(uint64_t *s) {
    uint64_t state = *s;

    state ^= state << 13;
    state ^= state >> 7;
    state ^= state << 17;

    *s = state;
    return state;
}
/* Turn a caller-supplied seed into a usable xorshift state: any non-zero
 * value is kept as-is, zero is replaced by a fixed non-zero constant. */
static uint64_t xs_init(uint64_t s) {
    if (s != 0)
        return s;
    return 0xa57edbeef5717ULL;
}
/* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double. */
static double now_s(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);

    const double whole = ts.tv_sec;
    const double frac = ts.tv_nsec * 1e-9;
    return whole + frac;
}
/* --- Kernel selectors --- */
/* Kernels that can be selected for either side of the bench;
 * parse_kernel() maps the CLI spelling onto these values. */
enum kernel {
    K_MC,
    K_LPF4,
    K_LPF8,
    K_CDEF,
    K_IDCT,
    K_H264DEBLOCK
};

/* External NEON H.264 luma loop filter, used for K_H264DEBLOCK on the CPU side. */
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta, int8_t *tc0);

/* CLI/report spelling of a kernel; "?" for a value outside the enum. */
static const char *kernel_name(enum kernel k) {
    static const char *const names[] = {
        [K_MC]          = "mc",
        [K_LPF4]        = "lpf4",
        [K_LPF8]        = "lpf8",
        [K_CDEF]        = "cdef",
        [K_IDCT]        = "idct",
        [K_H264DEBLOCK] = "h264deblock",
    };
    const size_t count = sizeof names / sizeof names[0];

    if ((size_t) k < count)
        return names[k];
    return "?";
}

/* Throughput unit printed for a kernel: the edge filters report Medge/s,
 * everything else reports Mblock/s. */
static const char *kernel_unit(enum kernel k) {
    switch (k) {
    case K_LPF4:
    case K_LPF8:
    case K_H264DEBLOCK:
        return "Medge/s";
    default:
        return "Mblock/s";
    }
}
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
/* Number of pre-generated blocks/edges each NEON worker cycles through. */
#define NEON_BATCH 8192

/* Per-thread input and result record for neon_worker(). */
typedef struct {
    int worker_id;        /* in: scales the worker's RNG seed */
    int affinity_core;    /* in: CPU the worker pins itself to */
    enum kernel kernel;   /* in: which NEON kernel to run */
    uint64_t units_done;  /* out: blocks/edges processed */
    double elapsed_s;     /* out: seconds measured by the worker */
} neon_args;
/*
 * NEON MC hot loop.
 *
 * Pre-generates NEON_BATCH random source blocks and filter phases, then
 * runs ff_vp9_put_regular8_h_neon over the whole batch until g_stop is set.
 *
 *   seed      xorshift state; advanced while generating inputs.
 *   out_done  incremented by NEON_BATCH after each completed pass.
 *
 * If an allocation fails the function logs to stderr and returns without
 * touching *out_done (the original dereferenced the NULL pointers).
 */
static void neon_run_mc(uint64_t *seed, uint64_t *out_done) {
    /* MC: SRC_BYTES=128 (8 rows x 16-byte stride) per block; DST_BYTES=64. */
    uint8_t *src = malloc((size_t) NEON_BATCH * 128);
    uint8_t *dst = malloc((size_t) NEON_BATCH * 64);
    int *mx = malloc(NEON_BATCH * sizeof *mx);

    if (!src || !dst || !mx) {
        fprintf(stderr, "neon_run_mc: out of memory\n");
        goto out;
    }

    for (int i = 0; i < NEON_BATCH; i++) {
        for (int j = 0; j < 128; j++)
            src[i*128 + j] = (uint8_t)(xs_step(seed) & 0xff);
        mx[i] = (int)(xs_step(seed) & 15);
    }

    while (!g_stop) {
        for (int i = 0; i < NEON_BATCH; i++) {
            /* +3: the source pointer starts 3 bytes into each 16-byte row. */
            ff_vp9_put_regular8_h_neon(dst + i*64, 8,
                                       src + i*128 + 3, 16, 8, mx[i], 0);
        }
        *out_done += NEON_BATCH;
    }

out:
    free(src);
    free(dst);
    free(mx);
}
/*
 * NEON VP9 loop-filter hot loop.
 *
 * Pre-generates NEON_BATCH random 8x8 tiles (stride 8) plus E/I/H
 * thresholds. Each pass restores the working tiles from the master copy
 * (the filter runs in place) and filters every tile.
 *
 *   seed      xorshift state; advanced while generating inputs.
 *   out_done  incremented by NEON_BATCH after each completed pass.
 *   wd_8      non-zero selects ff_vp9_loop_filter_h_8_8_neon, zero selects
 *             ff_vp9_loop_filter_h_4_8_neon.
 *
 * If an allocation fails the function logs to stderr and returns without
 * touching *out_done (the original dereferenced the NULL pointers).
 */
static void neon_run_lpf(uint64_t *seed, uint64_t *out_done, int wd_8) {
    const size_t tile_bytes = (size_t) NEON_BATCH * 64;
    uint8_t *master = malloc(tile_bytes);
    uint8_t *work = malloc(tile_bytes);
    int *Es = malloc(NEON_BATCH * sizeof *Es);
    int *Is = malloc(NEON_BATCH * sizeof *Is);
    int *Hs = malloc(NEON_BATCH * sizeof *Hs);

    if (!master || !work || !Es || !Is || !Hs) {
        fprintf(stderr, "neon_run_lpf: out of memory\n");
        goto out;
    }

    for (int i = 0; i < NEON_BATCH; i++) {
        for (int j = 0; j < 64; j++)
            master[i*64 + j] = (uint8_t)(xs_step(seed) & 0xff);
        Es[i] = (int)(xs_step(seed) % 81);
        Is[i] = (int)(xs_step(seed) % 41);
        Hs[i] = (int)(xs_step(seed) % 11);
    }

    while (!g_stop) {
        memcpy(work, master, tile_bytes);
        for (int i = 0; i < NEON_BATCH; i++) {
            /* +4: the filtered edge sits in the middle of each 8-byte row. */
            uint8_t *edge = work + i*64 + 4;
            if (wd_8)
                ff_vp9_loop_filter_h_8_8_neon(edge, 8, Es[i], Is[i], Hs[i]);
            else
                ff_vp9_loop_filter_h_4_8_neon(edge, 8, Es[i], Is[i], Hs[i]);
        }
        *out_done += NEON_BATCH;
    }

out:
    free(master);
    free(work);
    free(Es);
    free(Is);
    free(Hs);
}
/*
 * NEON CDEF hot loop.
 *
 * Each block owns a 192-element uint16_t tmp buffer indexed as rows of 16,
 * with the 8x8 block origin at row 2, column 2; dst starts as a copy of
 * that 8x8 region. The kernel is then run over the batch until g_stop.
 *
 *   seed      xorshift state; advanced while generating inputs.
 *   out_done  incremented by the batch size after each completed pass.
 *
 * If an allocation fails the function logs to stderr and returns without
 * touching *out_done (the original dereferenced the NULL pointers).
 */
static void neon_run_cdef(uint64_t *seed, uint64_t *out_done) {
    int n = NEON_BATCH;
    uint16_t *tmps = malloc((size_t) n * 192 * sizeof *tmps);
    uint8_t *dsts = malloc((size_t) n * 64);
    int *pris = malloc(n * sizeof *pris);
    int *secs = malloc(n * sizeof *secs);
    int *dirs = malloc(n * sizeof *dirs);
    int *damps = malloc(n * sizeof *damps);

    if (!tmps || !dsts || !pris || !secs || !dirs || !damps) {
        fprintf(stderr, "neon_run_cdef: out of memory\n");
        goto out;
    }

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < 192; j++)
            tmps[i*192 + j] = (uint16_t)(xs_step(seed) & 0xff);
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 8; c++)
                dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
        }
        pris[i] = (int)(xs_step(seed) % 7) + 1;
        secs[i] = (int)(xs_step(seed) % 4) + 1;
        dirs[i] = (int)(xs_step(seed) & 7);
        damps[i] = (int)(xs_step(seed) % 6) + 1;
    }

    while (!g_stop) {
        for (int i = 0; i < n; i++) {
            dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
                                         tmps + i*192 + (2*16+2),
                                         pris[i], secs[i], dirs[i], damps[i], 8, 0);
        }
        *out_done += n;
    }

out:
    free(tmps);
    free(dsts);
    free(pris);
    free(secs);
    free(dirs);
    free(damps);
}
/*
 * NEON VP9 8x8 IDCT+add hot loop.
 *
 * Builds NEON_BATCH sparse coefficient blocks (1..16 random positions,
 * values in [-4096, 4095]) with eob = highest written position + 1. Each
 * pass restores the working coefficients from the master copy before
 * running the kernel.
 *
 *   seed      xorshift state; advanced while generating inputs.
 *   out_done  incremented by NEON_BATCH after each completed pass.
 *
 * Changes from the original: allocation results are checked (log to stderr
 * and return, leaving *out_done untouched), and dsts is zeroed so the
 * "add" kernel never reads uninitialised heap bytes.
 */
static void neon_run_idct(uint64_t *seed, uint64_t *out_done) {
    const size_t coef_bytes = (size_t) NEON_BATCH * 64 * sizeof(int16_t);
    int16_t *blocks_master = malloc(coef_bytes);
    int16_t *blocks_work = malloc(coef_bytes);
    uint8_t *dsts = malloc((size_t) NEON_BATCH * 64);
    int *eobs = malloc(NEON_BATCH * sizeof *eobs);

    if (!blocks_master || !blocks_work || !dsts || !eobs) {
        fprintf(stderr, "neon_run_idct: out of memory\n");
        goto out;
    }

    memset(blocks_master, 0, coef_bytes);
    memset(dsts, 0, (size_t) NEON_BATCH * 64);

    for (int i = 0; i < NEON_BATCH; i++) {
        int n = 1 + (int)(xs_step(seed) % 16);
        int eob = 0;
        for (int j = 0; j < n; j++) {
            int pos = (int)(xs_step(seed) % 64);
            int16_t coef = (int16_t)((int)(xs_step(seed) % 8192) - 4096);
            blocks_master[i*64 + pos] = coef;
            if (pos + 1 > eob)
                eob = pos + 1;
        }
        eobs[i] = eob ? eob : 1;
    }

    while (!g_stop) {
        memcpy(blocks_work, blocks_master, coef_bytes);
        for (int i = 0; i < NEON_BATCH; i++)
            ff_vp9_idct_idct_8x8_add_neon(dsts + i*64, 8, blocks_work + i*64, eobs[i]);
        *out_done += NEON_BATCH;
    }

out:
    free(blocks_master);
    free(blocks_work);
    free(dsts);
    free(eobs);
}
/*
 * NEON H.264 luma deblock hot loop (previously inlined in neon_worker).
 *
 * Each edge owns a 16x16 tile (stride 16); the filter is pointed at row 4
 * (EDGE_OFF = 4*16). Each pass restores the tiles from the master copy
 * because the filter runs in place.
 *
 *   seed      xorshift state; advanced while generating inputs.
 *   out_done  incremented by the batch size after each completed pass.
 *
 * If an allocation fails the function logs to stderr and returns without
 * touching *out_done.
 */
static void neon_run_h264deblock(uint64_t *seed, uint64_t *out_done) {
    int n = NEON_BATCH;
    uint8_t *master = malloc((size_t) n * 256);
    uint8_t *work = malloc((size_t) n * 256);
    int *alphas = malloc(n * sizeof *alphas);
    int *betas = malloc(n * sizeof *betas);
    int8_t (*tc0s)[4] = malloc(n * sizeof *tc0s);

    if (!master || !work || !alphas || !betas || !tc0s) {
        fprintf(stderr, "neon_run_h264deblock: out of memory\n");
        goto out;
    }

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < 256; j++)
            master[i*256 + j] = (uint8_t)(xs_step(seed) & 0xff);
        alphas[i] = (int)(xs_step(seed) % 64) + 1;
        betas[i] = (int)(xs_step(seed) % 16) + 1;
        for (int s = 0; s < 4; s++) {
            /* tc0 is -1 one time in eight, otherwise 0..6. */
            int r = (int)(xs_step(seed) % 8);
            tc0s[i][s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    while (!g_stop) {
        memcpy(work, master, (size_t) n * 256);
        for (int i = 0; i < n; i++) {
            ff_h264_v_loop_filter_luma_neon(work + i*256 + 4*16, 16,
                                            alphas[i], betas[i], tc0s[i]);
        }
        *out_done += n;
    }

out:
    free(master);
    free(work);
    free(alphas);
    free(betas);
    free(tc0s);
}

/*
 * Thread entry point for one NEON worker.
 *
 * Pins itself to a->affinity_core, waits on the start barrier, runs the
 * kernel selected by a->kernel until g_stop is set, then stores the unit
 * count and elapsed seconds in *a. Always returns NULL.
 *
 * The timed window opens right after the barrier, so the input generation
 * done inside the neon_run_* helpers is included in elapsed_s.
 */
static void *neon_worker(void *p) {
    neon_args *a = p;

    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(a->affinity_core, &cs);
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
    if (rc != 0) {
        /* An unpinned worker still runs, but the per-core figures no longer
         * mean what the report says, so make the failure visible. */
        fprintf(stderr, "neon_worker %d: cannot pin to core %d (error %d)\n",
                a->worker_id, a->affinity_core, rc);
    }

    /* Worker 0 multiplies out to 0; xs_init swaps in its fixed fallback seed. */
    uint64_t seed = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL);

    pthread_barrier_wait(&g_start);
    double t0 = now_s();
    uint64_t done = 0;

    switch (a->kernel) {
    case K_MC:          neon_run_mc(&seed, &done); break;
    case K_LPF4:        neon_run_lpf(&seed, &done, 0); break;
    case K_LPF8:        neon_run_lpf(&seed, &done, 1); break;
    case K_IDCT:        neon_run_idct(&seed, &done); break;
    case K_CDEF:        neon_run_cdef(&seed, &done); break;
    case K_H264DEBLOCK: neon_run_h264deblock(&seed, &done); break;
    default: fprintf(stderr, "bad NEON kernel\n"); break;
    }

    a->elapsed_s = now_s() - t0;
    a->units_done = done;
    return NULL;
}
/* --- QPU worker (CDEF / MC / LPF4 / LPF8 / IDCT) --- */
/* Input and result record for qpu_cdef_neon_fallback(). */
typedef struct {
    int affinity_core;    /* in: CPU the worker pins itself to */
    int n_units;          /* in: blocks processed per pass */
    enum kernel kernel;   /* in: requested kernel */
    uint64_t units_done;  /* out: blocks processed */
    double elapsed_s;     /* out: seconds measured by the worker */
} qpu_args;
/* Push-constant blocks, one layout per QPU kernel. Every layout is four
 * uint32_t words; qpu_real_worker pushes sizeof(the matching struct). */

/* VP9 loop filter (the H.264 deblock dispatch reuses this layout). */
typedef struct {
    uint32_t n;
    uint32_t dst_stride_u8;
    uint32_t _pad0;
    uint32_t _pad1;
} pc_lpf;

/* Motion compensation. */
typedef struct {
    uint32_t n;
    uint32_t dst_stride_u8;
    uint32_t src_stride_u8;
    uint32_t _pad;
} pc_mc;

/* 8x8 IDCT. */
typedef struct {
    uint32_t n_blocks;
    uint32_t blocks_per_row;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} pc_idct;

/* CDEF. */
typedef struct {
    uint32_t n_blocks;
    uint32_t tmp_stride_u16;
    uint32_t dst_stride_u8;
    uint32_t _pad;
} pc_cdef;
/* CDEF NEON fallback: stands in for the QPU by running dav1d NEON CDEF on
 * a single thread pinned to the QPU host core. It was written while the
 * QPU CDEF kernel was still deferred in cycle 5 — a degenerate "QPU
 * helper". main() now dispatches CDEF through qpu_real_worker
 * (v3d_cdef.spv) and leaves use_neon_fallback_for_cdef at 0, so this
 * worker stays compiled but is not selected. */
/*
 * Thread entry point: NEON CDEF on the QPU host core, used as a proxy for
 * a QPU CDEF dispatch.
 *
 * Written when cycle 5 had no working QPU CDEF kernel (M1 deferred). For
 * Issue 003 it measures "the QPU host core running NEON CDEF" as a proxy
 * for the QPU contribution, intended as a lower bound on the helper
 * value. main() currently keeps this path unselected.
 *
 * p points to a qpu_args; units_done and elapsed_s are filled in on
 * success. Always returns NULL.
 *
 * Fixes relative to the original:
 *  - the kernel is handed tmp + (2*16+2), the block origin that the dst
 *    initialisation below, neon_run_cdef and qpu_real_worker's CDEF meta
 *    all use; the original passed the raw buffer base, so the filter's
 *    neighbour taps reached before the block (and, for block 0, before
 *    the allocation);
 *  - allocation failure or a non-positive n_units is reported instead of
 *    dereferencing NULL, and the start barrier is still honoured so the
 *    other threads are not left waiting.
 */
static void *qpu_cdef_neon_fallback(void *p)
{
    qpu_args *a = p;

    cpu_set_t cs;
    CPU_ZERO(&cs);
    CPU_SET(a->affinity_core, &cs);
    pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);

    int n_blocks = a->n_units;
    uint64_t seed = 0xcdef00000beefcULL;

    uint16_t *tmps = NULL;
    uint8_t *dsts = NULL;
    int *pris = NULL, *secs = NULL, *dirs = NULL, *damps = NULL;
    int ready = 0;

    if (n_blocks > 0) {
        tmps = malloc((size_t) n_blocks * 192 * sizeof *tmps);
        dsts = malloc((size_t) n_blocks * 64);
        pris = malloc((size_t) n_blocks * sizeof *pris);
        secs = malloc((size_t) n_blocks * sizeof *secs);
        dirs = malloc((size_t) n_blocks * sizeof *dirs);
        damps = malloc((size_t) n_blocks * sizeof *damps);
        ready = tmps && dsts && pris && secs && dirs && damps;
    }

    if (!ready) {
        fprintf(stderr, "qpu_cdef_neon_fallback: setup failed "
                        "(n_units=%d, or out of memory)\n", n_blocks);
    } else {
        for (int i = 0; i < n_blocks; i++) {
            for (int j = 0; j < 192; j++)
                tmps[i*192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
            for (int r = 0; r < 8; r++) {
                for (int c = 0; c < 8; c++)
                    dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
            }
            pris[i] = (int)(xs_step(&seed) % 7) + 1;
            secs[i] = (int)(xs_step(&seed) % 4) + 1;
            dirs[i] = (int)(xs_step(&seed) & 7);
            damps[i] = (int)(xs_step(&seed) % 4) + 3;
        }
    }

    /* Every thread counted in the barrier must arrive, even after a failure. */
    pthread_barrier_wait(&g_start);

    if (ready) {
        double t0 = now_s();
        uint64_t done = 0;
        while (!g_stop) {
            for (int i = 0; i < n_blocks; i++) {
                dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
                                             tmps + i*192 + (2*16+2),
                                             pris[i], secs[i], dirs[i], damps[i], 8, 0);
            }
            done += n_blocks;
        }
        a->elapsed_s = now_s() - t0;
        a->units_done = done;
    }

    free(tmps);
    free(dsts);
    free(pris);
    free(secs);
    free(dirs);
    free(damps);
    return NULL;
}
/* QPU dispatch worker — generic for kernels with working shaders. */
/* Input and result record for qpu_real_worker(); same fields as qpu_args. */
typedef struct {
    int affinity_core;    /* in: CPU the submitting thread pins itself to */
    int n_units;          /* in: blocks/edges per dispatch */
    enum kernel kernel;   /* in: which shader to run */
    uint64_t units_done;  /* out: blocks/edges processed */
    double elapsed_s;     /* out: seconds measured by the worker */
} qpu_real_args;
static void *qpu_real_worker(void *p)
{
qpu_real_args *a = p;
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
v3d_runner *r = v3d_runner_create();
if (!r) return NULL;
int n_units = a->n_units;
const char *spv = NULL;
uint32_t bpw = 32; /* blocks/edges per WG */
size_t dst_bytes = 0, meta_bytes = 0, src_bytes = 0;
int has_src = 0;
size_t per_unit = 0;
switch (a->kernel) {
case K_LPF4:
case K_LPF8: {
spv = (a->kernel == K_LPF4) ? "v3d_lpf_h_4_8.spv" : "v3d_lpf_h_8_8.spv";
per_unit = 64;
dst_bytes = (size_t) n_units * per_unit;
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
break;
}
case K_MC:
spv = "v3d_mc_8h.spv";
dst_bytes = (size_t) n_units * 64;
src_bytes = (size_t) n_units * 128;
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 1;
break;
case K_IDCT:
spv = "v3d_idct8.spv";
dst_bytes = (size_t) n_units * 64;
src_bytes = (size_t) n_units * 64 * sizeof(int16_t);
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 1;
break;
case K_CDEF:
spv = "v3d_cdef.spv";
bpw = 4;
dst_bytes = (size_t) n_units * 64;
src_bytes = (size_t) n_units * 192 * sizeof(uint16_t);
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 1;
break;
case K_H264DEBLOCK:
spv = "v3d_h264deblock.spv";
bpw = 16; /* 16 edges/WG */
dst_bytes = (size_t) n_units * 256; /* 16x16 tile */
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 0;
break;
default:
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
v3d_runner_destroy(r);
return NULL;
}
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0};
v3d_runner_create_buffer(r, meta_bytes, &buf_meta);
v3d_runner_create_buffer(r, dst_bytes, &buf_dst);
if (has_src) v3d_runner_create_buffer(r, src_bytes, &buf_src);
/* Synthesise meta + src + dst content based on kernel. */
uint64_t seed = 0xfeed00000beefULL;
uint32_t *meta = buf_meta.mapped;
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64 + 4); /* dst_off */
meta[4*i+1] = (uint32_t)(xs_step(&seed) % 81); /* E */
meta[4*i+2] = (uint32_t)(xs_step(&seed) % 41); /* I */
meta[4*i+3] = (uint32_t)(xs_step(&seed) % 11); /* H */
}
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_MC) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */
meta[4*i+1] = (uint32_t)((size_t)i * 128); /* src_off (RAW) */
meta[4*i+2] = (uint32_t)(xs_step(&seed) & 15); /* mx */
meta[4*i+3] = 0;
}
for (size_t i = 0; i < src_bytes; i++)
((uint8_t *) buf_src.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_IDCT) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64);
meta[4*i+1] = (uint32_t)((i * 64) / 64);
meta[4*i+2] = 0;
meta[4*i+3] = 0;
}
int16_t *cf = (int16_t *) buf_src.mapped;
size_t n_coefs = src_bytes / sizeof(int16_t);
for (size_t i = 0; i < n_coefs; i++)
cf[i] = (int16_t)((int)(xs_step(&seed) % 8192) - 4096);
} else if (a->kernel == K_CDEF) {
uint16_t *tmps = (uint16_t *) buf_src.mapped;
for (int i = 0; i < n_units; i++) {
uint32_t pri = (uint32_t)((xs_step(&seed) % 7) + 1);
uint32_t sec = (uint32_t)((xs_step(&seed) % 4) + 1);
uint32_t damping = (uint32_t)((xs_step(&seed) % 6) + 1);
meta[4*i+0] = (uint32_t)((size_t)i * 64);
meta[4*i+1] = pri | (sec << 8) | (damping << 16);
meta[4*i+2] = (uint32_t)((size_t)i * 192 + (2*16 + 2));
meta[4*i+3] = (uint32_t)(xs_step(&seed) & 7);
for (int j = 0; j < 192; j++)
tmps[(size_t)i * 192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
}
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_H264DEBLOCK) {
for (int i = 0; i < n_units; i++) {
uint32_t alpha = (uint32_t)(xs_step(&seed) % 64) + 1;
uint32_t beta = (uint32_t)(xs_step(&seed) % 16) + 1;
uint32_t tc0p = 0;
for (int s = 0; s < 4; s++) {
int rr = (int)(xs_step(&seed) % 8);
int8_t v = (int8_t)(rr == 0 ? -1 : (rr - 1));
tc0p |= ((uint32_t)(uint8_t)v) << (s * 8);
}
meta[4*i+0] = (uint32_t)((size_t)i * 256 + 4 * 16); /* EDGE_OFF = 4*stride */
meta[4*i+1] = alpha | (beta << 8);
meta[4*i+2] = tc0p;
meta[4*i+3] = 0;
}
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
}
v3d_pipeline pipe = {0};
int n_ssbos = has_src ? 3 : 2;
/* K_H264DEBLOCK reuses pc_lpf layout (n + dst_stride_u8 + 2 pads). */
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
(a->kernel == K_IDCT) ? sizeof(pc_idct) :
(a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
v3d_runner_create_pipeline(r, spv, n_ssbos, pc_size, &pipe);
v3d_buffer bind_bufs[3];
bind_bufs[0] = buf_meta;
bind_bufs[1] = buf_dst;
if (has_src) bind_bufs[2] = buf_src;
v3d_runner_bind_buffers(r, &pipe, bind_bufs, n_ssbos);
uint32_t gc = (uint32_t)((n_units + bpw - 1) / bpw);
union { pc_lpf lpf; pc_mc mc; pc_idct idct; pc_cdef cdef; } pc = {0};
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 8 };
} else if (a->kernel == K_MC) {
pc.mc = (pc_mc){ .n = n_units, .dst_stride_u8 = 8, .src_stride_u8 = 16 };
} else if (a->kernel == K_IDCT) {
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
} else if (a->kernel == K_CDEF) {
pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
} else if (a->kernel == K_H264DEBLOCK) {
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 16 };
}
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, pc_size, &pc);
vkCmdDispatch(cb, gc, 1, 1);
vkEndCommandBuffer(cb);
for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb);
pthread_barrier_wait(&g_start);
double t0 = now_s();
uint64_t done = 0;
while (!g_stop) {
v3d_runner_submit_wait(r, cb);
done += n_units;
}
a->elapsed_s = now_s() - t0;
a->units_done = done;
v3d_runner_destroy_pipeline(r, &pipe);
if (has_src) v3d_runner_destroy_buffer(r, &buf_src);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return NULL;
}
/* --- Timer --- */
/* Argument for timer_thread(): how long the measured window lasts. */
typedef struct {
    double duration_s;
} timer_args;

/* Thread entry point: waits on the start barrier, sleeps in 1 ms steps
 * until duration_s seconds have passed, then raises g_stop. Returns NULL. */
static void *timer_thread(void *p) {
    const timer_args *cfg = p;
    const struct timespec tick = { .tv_sec = 0, .tv_nsec = 1000000 };

    pthread_barrier_wait(&g_start);

    const double deadline = now_s() + cfg->duration_s;
    for (;;) {
        if (!(now_s() < deadline))
            break;
        nanosleep(&tick, NULL);
    }

    g_stop = 1;
    return NULL;
}
/* --- Main --- */
static enum kernel parse_kernel(const char *s) {
if (!strcmp(s, "mc")) return K_MC;
if (!strcmp(s, "lpf4")) return K_LPF4;
if (!strcmp(s, "lpf8")) return K_LPF8;
if (!strcmp(s, "cdef")) return K_CDEF;
if (!strcmp(s, "idct")) return K_IDCT;
if (!strcmp(s, "h264deblock")) return K_H264DEBLOCK;
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
}
/* Hard limits enforced on the CLI.
 * MAX_NEON_THREADS sizes the per-worker arrays in main().
 * MAX_QPU_UNITS keeps the per-unit byte offsets that qpu_real_worker
 * stores in uint32_t metadata (up to 256 bytes per unit) from wrapping. */
enum { MAX_NEON_THREADS = 16, MAX_QPU_UNITS = 1 << 24 };

/* Parse a decimal integer option that must lie in [lo, hi].
 * Prints a diagnostic and exits with status 2 on malformed or
 * out-of-range input (atoi silently returned 0 or garbage). */
static int parse_int_opt(const char *name, const char *s, long lo, long hi)
{
    char *end = NULL;
    long v = strtol(s, &end, 10);

    /* strtol saturates on overflow, which the range test also rejects. */
    if (end == s || *end != '\0' || v < lo || v > hi) {
        fprintf(stderr, "%s: expected an integer in [%ld, %ld], got '%s'\n",
                name, lo, hi, s);
        exit(2);
    }
    return (int) v;
}

/* Parse --duration: a positive number of seconds, capped at one day so a
 * typo cannot park the bench indefinitely. Exits with status 2 on error. */
static double parse_duration_opt(const char *s)
{
    char *end = NULL;
    double v = strtod(s, &end);

    if (end == s || *end != '\0' || !(v > 0.0) || v > 86400.0) {
        fprintf(stderr, "--duration: expected seconds in (0, 86400], got '%s'\n", s);
        exit(2);
    }
    return v;
}

/* Start a thread, or abort the bench: a thread that never starts would
 * never reach the start barrier and everything else would block on it. */
static void spawn_or_die(pthread_t *tid, void *(*fn)(void *), void *arg,
                         const char *what)
{
    int rc = pthread_create(tid, NULL, fn, arg);
    if (rc != 0) {
        fprintf(stderr, "pthread_create(%s) failed (error %d)\n", what, rc);
        exit(1);
    }
}

/* Units per second in millions; 0 when no time was measured. */
static double mega_rate(uint64_t units, double secs)
{
    return secs > 0.0 ? (double) units / secs / 1e6 : 0.0;
}

/*
 * Entry point: parses the CLI, starts the timer, the NEON workers and the
 * QPU worker behind a common start barrier, waits for them, and prints
 * per-core, aggregate-CPU and QPU throughput.
 *
 * Returns 0 on success, 1 if the QPU worker produced no measurement, and
 * 2 on a CLI error.
 *
 * Fixes relative to the original: --neon-threads is limited to the size
 * of the worker arrays (values above 16 overflowed them), numeric options
 * are validated instead of going through atoi/atof, thread and barrier
 * creation are checked, and rates no longer divide by a zero elapsed time.
 */
int main(int argc, char **argv)
{
    enum kernel cpu_k = K_MC, qpu_k = K_CDEF;
    int n_neon = 3, qpu_core = 3, qpu_n_units = 65536;
    double duration = 8.0;

    static struct option opts[] = {
        {"cpu-kernel",   required_argument, 0, 'c'},
        {"qpu-kernel",   required_argument, 0, 'q'},
        {"neon-threads", required_argument, 0, 'n'},
        {"qpu-core",     required_argument, 0, 'C'},
        {"qpu-units",    required_argument, 0, 'u'},
        {"duration",     required_argument, 0, 'd'},
        {0,0,0,0}
    };

    int c;
    while ((c = getopt_long(argc, argv, "c:q:n:C:u:d:", opts, 0)) != -1) {
        switch (c) {
        case 'c': cpu_k = parse_kernel(optarg); break;
        case 'q': qpu_k = parse_kernel(optarg); break;
        case 'n': n_neon = parse_int_opt("--neon-threads", optarg, 0, MAX_NEON_THREADS); break;
        case 'C': qpu_core = parse_int_opt("--qpu-core", optarg, 0, CPU_SETSIZE - 1); break;
        case 'u': qpu_n_units = parse_int_opt("--qpu-units", optarg, 1, MAX_QPU_UNITS); break;
        case 'd': duration = parse_duration_opt(optarg); break;
        default: return 2;
        }
    }

    /* Cycle 5 Phase 6 landed — v3d_cdef.spv is M1-PASS. Use real
     * QPU dispatch for CDEF too. The NEON-fallback worker remains
     * compiled but is unselected. */
    int use_neon_fallback_for_cdef = 0;

    int barrier_count = n_neon + 1 /* QPU */ + 1 /* timer */ + 1 /* main */;

    printf("=== Issue 003 mixed-kernel M4 bench ===\n");
    printf(" cpu kernel: %s × %d threads (cores 0..%d)\n",
           kernel_name(cpu_k), n_neon, n_neon - 1);
    printf(" qpu kernel: %s on core %d (%s)\n",
           kernel_name(qpu_k), qpu_core,
           use_neon_fallback_for_cdef ?
               "dav1d NEON fallback — real QPU CDEF deferred to cycle 5 Phase 6" :
               "QPU dispatch");
    printf(" duration: %.1fs\n\n", duration);

    int rc = pthread_barrier_init(&g_start, NULL, (unsigned) barrier_count);
    if (rc != 0) {
        fprintf(stderr, "pthread_barrier_init failed (error %d)\n", rc);
        return 1;
    }

    pthread_t timer_tid;
    timer_args ta = { .duration_s = duration };
    spawn_or_die(&timer_tid, timer_thread, &ta, "timer");

    pthread_t neon_tids[MAX_NEON_THREADS] = {0};
    neon_args n_args[MAX_NEON_THREADS] = {0};
    for (int i = 0; i < n_neon; i++) {
        n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i, .kernel = cpu_k };
        spawn_or_die(&neon_tids[i], neon_worker, &n_args[i], "neon worker");
    }

    pthread_t qpu_tid;
    qpu_args q_args = {0};
    qpu_real_args qr_args = {0};
    if (use_neon_fallback_for_cdef) {
        q_args = (qpu_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
        spawn_or_die(&qpu_tid, qpu_cdef_neon_fallback, &q_args, "qpu fallback");
    } else {
        qr_args = (qpu_real_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
        spawn_or_die(&qpu_tid, qpu_real_worker, &qr_args, "qpu worker");
    }

    /* Release everyone at once, then wait for the window to close. */
    pthread_barrier_wait(&g_start);
    pthread_join(timer_tid, NULL);
    for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL);
    pthread_join(qpu_tid, NULL);

    uint64_t cpu_total = 0;
    double cpu_max_e = 0;
    printf("NEON workers (%s):\n", kernel_name(cpu_k));
    for (int i = 0; i < n_neon; i++) {
        double r = mega_rate(n_args[i].units_done, n_args[i].elapsed_s);
        printf(" core %d: %.3f %s\n", n_args[i].affinity_core, r, kernel_unit(cpu_k));
        cpu_total += n_args[i].units_done;
        if (n_args[i].elapsed_s > cpu_max_e) cpu_max_e = n_args[i].elapsed_s;
    }
    /* Aggregate uses the slowest worker's window as the common denominator. */
    double cpu_rate = mega_rate(cpu_total, cpu_max_e);
    printf(" CPU aggregate: %.3f %s\n\n", cpu_rate, kernel_unit(cpu_k));

    uint64_t qpu_done = use_neon_fallback_for_cdef ? q_args.units_done : qr_args.units_done;
    double qpu_elapsed = use_neon_fallback_for_cdef ? q_args.elapsed_s : qr_args.elapsed_s;
    double qpu_rate = mega_rate(qpu_done, qpu_elapsed);
    printf("QPU worker (%s on core %d):\n", kernel_name(qpu_k), qpu_core);
    printf(" %.3f %s (%llu units / %.3f s)\n",
           qpu_rate, kernel_unit(qpu_k),
           (unsigned long long) qpu_done, qpu_elapsed);

    int status = 0;
    if (!(qpu_elapsed > 0.0)) {
        /* The worker never recorded a timed window. */
        fprintf(stderr, "QPU worker produced no measurement (setup failed?)\n");
        status = 1;
    }

    pthread_barrier_destroy(&g_start);
    return status;
}