Files
daedalus-fourier/tests/bench_concurrent_mixed.c
T
marfrit 2dd774a9ab Issue 003 closed: mixed-kernel M4 validates V4 deployment shape
bench_concurrent_mixed runs NEON-N on kernel A + QPU on kernel B
concurrently. Matrix on hertz:

  V3 (CPU MC + QPU MC same-kernel): CPU 22.64 + QPU 0.39 Mblock/s
  V4 (CPU MC + QPU LPF4):            CPU 27.87 + QPU 12.74 Medge/s
  V1 (CPU MC + NEON-fb CDEF):        CPU 24.49 + 1.75 Mblock/s CDEF
  V2 (CPU LPF4 + NEON-fb CDEF):      CPU 27.28 Medge + 1.70 Mblock/s

V4 is the daedalus-fourier deployment shape (CPU runs MC; QPU runs
LPF4 via cycle 2 GREEN offload). Both substrates productive; CPU
MC +23% per-core vs same-kernel V3 control. Same-kernel M4 in
cycles 1-5 was a worst-case contention bound, not a deployment
number — user's "5%/50%" framing was correct.

Cycle 3 MC verdict unchanged (QPU MC contributes ~0.4 under any
contention); cycle 5 CDEF deferred verdict softened to
opportunistic helper (NEON-fallback proxy used since cycle 5
Phase 6 not yet built).

- tests/bench_concurrent_mixed.c (configurable cpu-kernel /
  qpu-kernel matrix; supports MC, LPF4, LPF8, IDCT real QPU
  dispatch; CDEF uses NEON-on-core-3 fallback)
- CMakeLists.txt: build target wired with all FFmpeg + dav1d sources
- docs/issues/003-mixed-kernel-m4-bench.md: closure + matrix
- docs/k3_mc_phase7.md: M4 methodology caveat extended with V3/V4
- docs/k5_cdef_phase3_partial.md: deployment recommendation updated

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:44:08 +00:00

521 lines
20 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Issue 003 — Mixed-kernel M4 bench.
*
* Runs N NEON pthread workers (pinned 0..N-1) doing CPU kernel A,
* plus one QPU worker doing kernel B concurrently. Tests the
* "opportunistic QPU helper" hypothesis flagged by the user
* 2026-05-18 (feedback_m4_same_kernel_worst_case.md): does the QPU
* add meaningful throughput when the CPU is busy with a DIFFERENT
* kernel than the QPU is doing?
*
* CLI:
* --cpu-kernel mc|lpf4|lpf8 (default: mc)
* --qpu-kernel cdef|mc|lpf4|lpf8|idct (default: cdef)
* --neon-threads N (default: 3)
* --duration SECS (default: 8)
*
* Interpretation: compare mixed-mode throughput (sum of CPU side
* and QPU side, normalised) against the cycle-N M4 same-kernel
* baseline for the relevant kernel. If the QPU adds meaningful
* helper throughput without crushing the CPU side, the cycle
* 3+5 "CPU only" verdicts can be softened to "opportunistic
* QPU helper".
*
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot (MC, LPF)
* and dav1d BSD-2-Clause snapshot (CDEF).
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <time.h>
#include <getopt.h>
#include <pthread.h>
#include <sched.h>
#include <assert.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/* External NEON refs (vendored FFmpeg + dav1d). */
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my);
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
int E, int I, int H);
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp, int pri_strength, int sec_strength,
int dir, int damping, int h, size_t edges);
/* --- Common helpers --- */
static volatile int g_stop = 0;
static pthread_barrier_t g_start;
static inline uint64_t xs_step(uint64_t *s) {
uint64_t x = *s; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return *s = x;
}
static uint64_t xs_init(uint64_t s) { return s ? s : 0xa57edbeef5717ULL; }
static double now_s(void) {
struct timespec t; clock_gettime(CLOCK_MONOTONIC_RAW, &t);
return t.tv_sec + t.tv_nsec * 1e-9;
}
/* --- Kernel selectors --- */
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT };
static const char *kernel_name(enum kernel k) {
switch (k) {
case K_MC: return "mc";
case K_LPF4: return "lpf4";
case K_LPF8: return "lpf8";
case K_CDEF: return "cdef";
case K_IDCT: return "idct";
}
return "?";
}
static const char *kernel_unit(enum kernel k) {
return (k == K_LPF4 || k == K_LPF8) ? "Medge/s" : "Mblock/s";
}
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
#define NEON_BATCH 8192
typedef struct {
int worker_id, affinity_core;
enum kernel kernel;
uint64_t units_done;
double elapsed_s;
} neon_args;
static void neon_run_mc(uint64_t *seed, uint64_t *out_done) {
/* MC: SRC_BYTES=128 (8x16) per block; DST_BYTES=64. */
uint8_t *src = malloc((size_t) NEON_BATCH * 128);
uint8_t *dst = malloc((size_t) NEON_BATCH * 64);
int *mx = malloc(NEON_BATCH * sizeof(int));
for (int i = 0; i < NEON_BATCH; i++) {
for (int j = 0; j < 128; j++) src[i*128 + j] = (uint8_t)(xs_step(seed) & 0xff);
mx[i] = (int)(xs_step(seed) & 15);
}
while (!g_stop) {
for (int i = 0; i < NEON_BATCH; i++)
ff_vp9_put_regular8_h_neon(dst + i*64, 8,
src + i*128 + 3, 16, 8, mx[i], 0);
*out_done += NEON_BATCH;
}
free(src); free(dst); free(mx);
}
static void neon_run_lpf(uint64_t *seed, uint64_t *out_done, int wd_8) {
uint8_t *master = malloc((size_t) NEON_BATCH * 64);
uint8_t *work = malloc((size_t) NEON_BATCH * 64);
int *Es = malloc(NEON_BATCH*sizeof(int)), *Is = malloc(NEON_BATCH*sizeof(int)), *Hs = malloc(NEON_BATCH*sizeof(int));
for (int i = 0; i < NEON_BATCH; i++) {
for (int j = 0; j < 64; j++) master[i*64+j] = (uint8_t)(xs_step(seed) & 0xff);
Es[i] = (int)(xs_step(seed) % 81);
Is[i] = (int)(xs_step(seed) % 41);
Hs[i] = (int)(xs_step(seed) % 11);
}
while (!g_stop) {
memcpy(work, master, (size_t) NEON_BATCH * 64);
for (int i = 0; i < NEON_BATCH; i++) {
if (wd_8) ff_vp9_loop_filter_h_8_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
else ff_vp9_loop_filter_h_4_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
}
*out_done += NEON_BATCH;
}
free(master); free(work); free(Es); free(Is); free(Hs);
}
static void neon_run_idct(uint64_t *seed, uint64_t *out_done) {
int16_t *blocks_master = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
int16_t *blocks_work = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
uint8_t *dsts = malloc((size_t) NEON_BATCH * 64);
int *eobs = malloc(NEON_BATCH * sizeof(int));
for (int i = 0; i < NEON_BATCH; i++) {
memset(blocks_master + i*64, 0, 64*sizeof(int16_t));
int n = 1 + (int)(xs_step(seed) % 16);
int eob = 0;
for (int j = 0; j < n; j++) {
int pos = (int)(xs_step(seed) % 64);
int16_t coef = (int16_t)((int)(xs_step(seed) % 8192) - 4096);
blocks_master[i*64 + pos] = coef;
if (pos + 1 > eob) eob = pos + 1;
}
eobs[i] = eob ? eob : 1;
}
while (!g_stop) {
memcpy(blocks_work, blocks_master, (size_t) NEON_BATCH * 64 * sizeof(int16_t));
for (int i = 0; i < NEON_BATCH; i++)
ff_vp9_idct_idct_8x8_add_neon(dsts + i*64, 8, blocks_work + i*64, eobs[i]);
*out_done += NEON_BATCH;
}
free(blocks_master); free(blocks_work); free(dsts); free(eobs);
}
static void *neon_worker(void *p) {
neon_args *a = p;
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
uint64_t seed = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL);
pthread_barrier_wait(&g_start);
double t0 = now_s();
uint64_t done = 0;
switch (a->kernel) {
case K_MC: neon_run_mc(&seed, &done); break;
case K_LPF4: neon_run_lpf(&seed, &done, 0); break;
case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
case K_IDCT: neon_run_idct(&seed, &done); break;
default: fprintf(stderr, "bad NEON kernel\n"); break;
}
a->elapsed_s = now_s() - t0;
a->units_done = done;
return NULL;
}
/* --- QPU worker (CDEF / MC / LPF4 / LPF8 / IDCT) --- */
typedef struct {
int affinity_core, n_units;
enum kernel kernel;
uint64_t units_done;
double elapsed_s;
} qpu_args;
/* Each QPU kernel has its own push-constant layout. */
typedef struct { uint32_t n, dst_stride_u8, _pad0, _pad1; } pc_lpf;
typedef struct { uint32_t n, dst_stride_u8, src_stride_u8, _pad; } pc_mc;
/* IDCT: pc layout in v3d_idct8.comp = (n_blocks, blocks_per_row, dst_stride_u8, _pad) */
typedef struct { uint32_t n_blocks, blocks_per_row, dst_stride_u8, _pad; } pc_idct;
/* CDEF: not yet — QPU CDEF kernel not implemented. CDEF QPU mode uses
* dav1d NEON via a single-thread NEON call on the QPU host core instead.
* That's a degenerate "QPU helper" but matches the deferred state of
* cycle 5. Real QPU CDEF kernel would replace this once cycle 5 closes. */
static void *qpu_cdef_neon_fallback(void *p)
{
/* Cycle 5 doesn't have a working QPU CDEF kernel yet (M1 deferred).
* For Issue 003's purposes we test "the QPU host core running NEON
* CDEF" as a proxy for the QPU contribution. This UNDERSTATES the
* QPU helper value (since the QPU itself would parallelise more
* than 1 NEON core), but gives a defensible lower bound: if even
* NEON-on-the-spare-core helps the mixed throughput, QPU certainly
* would.
*
* TODO: once cycle 5 Phase 6 lands, swap this for the QPU dispatch. */
qpu_args *a = p;
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
int n_blocks = a->n_units;
uint64_t seed = 0xcdef00000beefcULL;
uint16_t *tmps = malloc((size_t) n_blocks * 192 * sizeof(uint16_t));
uint8_t *dsts = malloc((size_t) n_blocks * 64);
int *pris = malloc(n_blocks*sizeof(int));
int *secs = malloc(n_blocks*sizeof(int));
int *dirs = malloc(n_blocks*sizeof(int));
int *damps = malloc(n_blocks*sizeof(int));
for (int i = 0; i < n_blocks; i++) {
for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++)
dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
pris[i] = (int)(xs_step(&seed) % 7) + 1;
secs[i] = (int)(xs_step(&seed) % 4) + 1;
dirs[i] = (int)(xs_step(&seed) & 7);
damps[i] = (int)(xs_step(&seed) % 4) + 3;
}
pthread_barrier_wait(&g_start);
double t0 = now_s();
uint64_t done = 0;
while (!g_stop) {
for (int i = 0; i < n_blocks; i++)
dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
tmps + i*192,
pris[i], secs[i], dirs[i], damps[i], 8, 0);
done += n_blocks;
}
a->elapsed_s = now_s() - t0;
a->units_done = done;
free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps);
return NULL;
}
/* QPU dispatch worker — generic for kernels with working shaders. */
typedef struct {
int affinity_core, n_units;
enum kernel kernel;
uint64_t units_done;
double elapsed_s;
} qpu_real_args;
static void *qpu_real_worker(void *p)
{
qpu_real_args *a = p;
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
v3d_runner *r = v3d_runner_create();
if (!r) return NULL;
int n_units = a->n_units;
const char *spv = NULL;
uint32_t bpw = 32; /* blocks/edges per WG */
size_t dst_bytes = 0, meta_bytes = 0, src_bytes = 0;
int has_src = 0;
size_t per_unit = 0;
switch (a->kernel) {
case K_LPF4:
case K_LPF8: {
spv = (a->kernel == K_LPF4) ? "v3d_lpf_h_4_8.spv" : "v3d_lpf_h_8_8.spv";
per_unit = 64;
dst_bytes = (size_t) n_units * per_unit;
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
break;
}
case K_MC:
spv = "v3d_mc_8h.spv";
dst_bytes = (size_t) n_units * 64;
src_bytes = (size_t) n_units * 128;
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 1;
break;
case K_IDCT:
spv = "v3d_idct8.spv";
dst_bytes = (size_t) n_units * 64;
src_bytes = (size_t) n_units * 64 * sizeof(int16_t); /* coeffs */
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t); /* per-block pos */
has_src = 1;
break;
default:
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
v3d_runner_destroy(r);
return NULL;
}
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0};
v3d_runner_create_buffer(r, meta_bytes, &buf_meta);
v3d_runner_create_buffer(r, dst_bytes, &buf_dst);
if (has_src) v3d_runner_create_buffer(r, src_bytes, &buf_src);
/* Synthesise meta + src + dst content based on kernel. */
uint64_t seed = 0xfeed00000beefULL;
uint32_t *meta = buf_meta.mapped;
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64 + 4); /* dst_off */
meta[4*i+1] = (uint32_t)(xs_step(&seed) % 81); /* E */
meta[4*i+2] = (uint32_t)(xs_step(&seed) % 41); /* I */
meta[4*i+3] = (uint32_t)(xs_step(&seed) % 11); /* H */
}
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_MC) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */
meta[4*i+1] = (uint32_t)((size_t)i * 128); /* src_off (RAW) */
meta[4*i+2] = (uint32_t)(xs_step(&seed) & 15); /* mx */
meta[4*i+3] = 0;
}
for (size_t i = 0; i < src_bytes; i++)
((uint8_t *) buf_src.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_IDCT) {
for (int i = 0; i < n_units; i++) {
meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */
meta[4*i+1] = (uint32_t)((i * 64) / 64); /* coeff_off (in blocks) */
meta[4*i+2] = 0; /* eob (not used by our shader) */
meta[4*i+3] = 0;
}
/* Fill coeffs with random VP9-ish values. */
int16_t *cf = (int16_t *) buf_src.mapped;
size_t n_coefs = src_bytes / sizeof(int16_t);
for (size_t i = 0; i < n_coefs; i++)
cf[i] = (int16_t)((int)(xs_step(&seed) % 8192) - 4096);
}
v3d_pipeline pipe = {0};
int n_ssbos = has_src ? 3 : 2;
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
(a->kernel == K_IDCT) ? sizeof(pc_idct) : sizeof(pc_lpf);
v3d_runner_create_pipeline(r, spv, n_ssbos, pc_size, &pipe);
v3d_buffer bind_bufs[3];
bind_bufs[0] = buf_meta;
bind_bufs[1] = buf_dst;
if (has_src) bind_bufs[2] = buf_src;
v3d_runner_bind_buffers(r, &pipe, bind_bufs, n_ssbos);
uint32_t gc = (uint32_t)((n_units + bpw - 1) / bpw);
union { pc_lpf lpf; pc_mc mc; pc_idct idct; } pc = {0};
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 8 };
} else if (a->kernel == K_MC) {
pc.mc = (pc_mc){ .n = n_units, .dst_stride_u8 = 8, .src_stride_u8 = 16 };
} else if (a->kernel == K_IDCT) {
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
}
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, pc_size, &pc);
vkCmdDispatch(cb, gc, 1, 1);
vkEndCommandBuffer(cb);
for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb);
pthread_barrier_wait(&g_start);
double t0 = now_s();
uint64_t done = 0;
while (!g_stop) {
v3d_runner_submit_wait(r, cb);
done += n_units;
}
a->elapsed_s = now_s() - t0;
a->units_done = done;
v3d_runner_destroy_pipeline(r, &pipe);
if (has_src) v3d_runner_destroy_buffer(r, &buf_src);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return NULL;
}
/* --- Timer --- */
typedef struct { double duration_s; } timer_args;
static void *timer_thread(void *p) {
timer_args *a = p;
pthread_barrier_wait(&g_start);
double end = now_s() + a->duration_s;
while (now_s() < end) {
struct timespec ts = {0, 1000000}; nanosleep(&ts, NULL);
}
g_stop = 1;
return NULL;
}
/* --- Main --- */
static enum kernel parse_kernel(const char *s) {
if (!strcmp(s, "mc")) return K_MC;
if (!strcmp(s, "lpf4")) return K_LPF4;
if (!strcmp(s, "lpf8")) return K_LPF8;
if (!strcmp(s, "cdef")) return K_CDEF;
if (!strcmp(s, "idct")) return K_IDCT;
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
}
int main(int argc, char **argv)
{
enum kernel cpu_k = K_MC, qpu_k = K_CDEF;
int n_neon = 3, qpu_core = 3, qpu_n_units = 65536;
double duration = 8.0;
static struct option opts[] = {
{"cpu-kernel", required_argument, 0, 'c'},
{"qpu-kernel", required_argument, 0, 'q'},
{"neon-threads", required_argument, 0, 'n'},
{"qpu-core", required_argument, 0, 'C'},
{"qpu-units", required_argument, 0, 'u'},
{"duration", required_argument, 0, 'd'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "c:q:n:C:u:d:", opts, 0)) != -1;) {
switch (c) {
case 'c': cpu_k = parse_kernel(optarg); break;
case 'q': qpu_k = parse_kernel(optarg); break;
case 'n': n_neon = atoi(optarg); break;
case 'C': qpu_core = atoi(optarg); break;
case 'u': qpu_n_units = atoi(optarg); break;
case 'd': duration = atof(optarg); break;
default: return 2;
}
}
/* CDEF on QPU side currently uses dav1d NEON fallback (cycle 5
* Phase 6 not yet implemented). Real QPU CDEF would replace
* qpu_cdef_neon_fallback with qpu_real_worker. */
int use_neon_fallback_for_cdef = (qpu_k == K_CDEF);
int barrier_count = n_neon + 1 /* QPU */ + 1 /* timer */ + 1 /* main */;
printf("=== Issue 003 mixed-kernel M4 bench ===\n");
printf(" cpu kernel: %s × %d threads (cores 0..%d)\n",
kernel_name(cpu_k), n_neon, n_neon - 1);
printf(" qpu kernel: %s on core %d (%s)\n",
kernel_name(qpu_k), qpu_core,
use_neon_fallback_for_cdef ?
"dav1d NEON fallback — real QPU CDEF deferred to cycle 5 Phase 6" :
"QPU dispatch");
printf(" duration: %.1fs\n\n", duration);
pthread_barrier_init(&g_start, NULL, barrier_count);
pthread_t timer_tid; timer_args ta = { .duration_s = duration };
pthread_create(&timer_tid, NULL, timer_thread, &ta);
pthread_t neon_tids[16] = {0};
neon_args n_args[16] = {0};
for (int i = 0; i < n_neon; i++) {
n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i, .kernel = cpu_k };
pthread_create(&neon_tids[i], NULL, neon_worker, &n_args[i]);
}
pthread_t qpu_tid = 0;
qpu_args q_args = {0};
qpu_real_args qr_args = {0};
if (use_neon_fallback_for_cdef) {
q_args = (qpu_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
pthread_create(&qpu_tid, NULL, qpu_cdef_neon_fallback, &q_args);
} else {
qr_args = (qpu_real_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
pthread_create(&qpu_tid, NULL, qpu_real_worker, &qr_args);
}
pthread_barrier_wait(&g_start);
pthread_join(timer_tid, NULL);
for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL);
pthread_join(qpu_tid, NULL);
uint64_t cpu_total = 0; double cpu_max_e = 0;
printf("NEON workers (%s):\n", kernel_name(cpu_k));
for (int i = 0; i < n_neon; i++) {
double r = n_args[i].units_done / n_args[i].elapsed_s / 1e6;
printf(" core %d: %.3f %s\n", n_args[i].affinity_core, r, kernel_unit(cpu_k));
cpu_total += n_args[i].units_done;
if (n_args[i].elapsed_s > cpu_max_e) cpu_max_e = n_args[i].elapsed_s;
}
double cpu_rate = cpu_total / cpu_max_e / 1e6;
printf(" CPU aggregate: %.3f %s\n\n", cpu_rate, kernel_unit(cpu_k));
uint64_t qpu_done = use_neon_fallback_for_cdef ? q_args.units_done : qr_args.units_done;
double qpu_elapsed = use_neon_fallback_for_cdef ? q_args.elapsed_s : qr_args.elapsed_s;
double qpu_rate = qpu_done / qpu_elapsed / 1e6;
printf("QPU worker (%s on core %d):\n", kernel_name(qpu_k), qpu_core);
printf(" %.3f %s (%llu units / %.3f s)\n",
qpu_rate, kernel_unit(qpu_k),
(unsigned long long) qpu_done, qpu_elapsed);
pthread_barrier_destroy(&g_start);
return 0;
}