373f63a910
Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
RED-2: bench-enforced m.x >= 4*stride contract
M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
Lower than prediction; H.264 deblock has 4 early-return paths +
2 conditional writes that hurt V3D branchy execution more than
expected.
M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
(neutral).
M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
QPU contribution is essentially unchanged from isolation —
the cross-substrate contention is gentle (consistent with
Issue 003's V4 finding).
Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.
Cycles 1-8 deployment recipe complete:
Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)
Phase 9 lessons added:
- Branchy kernels underperform on V3D relative to straight-line ones
- Mixed-kernel helper value scales with isolation M2, not
same-kernel M4
- R prediction needs branchiness weight, not just compute density
- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
+ h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)
Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
630 lines
25 KiB
C
630 lines
25 KiB
C
/*
|
||
* Issue 003 — Mixed-kernel M4 bench.
|
||
*
|
||
* Runs N NEON pthread workers (pinned 0..N-1) doing CPU kernel A,
|
||
* plus one QPU worker doing kernel B concurrently. Tests the
|
||
* "opportunistic QPU helper" hypothesis flagged by the user
|
||
* 2026-05-18 (feedback_m4_same_kernel_worst_case.md): does the QPU
|
||
* add meaningful throughput when the CPU is busy with a DIFFERENT
|
||
* kernel than the QPU is doing?
|
||
*
|
||
* CLI:
*   --cpu-kernel   mc|lpf4|lpf8|cdef|idct|h264deblock   (default: mc)
*   --qpu-kernel   mc|lpf4|lpf8|cdef|idct|h264deblock   (default: cdef)
*   --neon-threads N                                    (default: 3)
*   --qpu-core     CORE                                 (default: 3)
*   --qpu-units    N                                    (default: 65536)
*   --duration     SECS                                 (default: 8)
|
||
*
|
||
* Interpretation: compare mixed-mode throughput (sum of CPU side
|
||
* and QPU side, normalised) against the cycle-N M4 same-kernel
|
||
* baseline for the relevant kernel. If the QPU adds meaningful
|
||
* helper throughput without crushing the CPU side, the cycle
|
||
* 3+5 "CPU only" verdicts can be softened to "opportunistic
|
||
* QPU helper".
|
||
*
|
||
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot (MC, LPF)
|
||
* and dav1d BSD-2-Clause snapshot (CDEF).
|
||
*/
|
||
#define _GNU_SOURCE
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <stdint.h>
|
||
#include <string.h>
|
||
#include <stddef.h>
|
||
#include <time.h>
|
||
#include <getopt.h>
|
||
#include <pthread.h>
|
||
#include <sched.h>
|
||
#include <assert.h>
|
||
#include <vulkan/vulkan.h>
|
||
|
||
#include "v3d_runner.h"
|
||
|
||
/* External NEON refs (vendored FFmpeg + dav1d). */
|
||
/* Called by neon_run_mc() with dst stride 8, src stride 16, h = 8, my = 0. */
extern void ff_vp9_put_regular8_h_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                       const uint8_t *src, ptrdiff_t src_stride,
                                       int h, int mx, int my);

/* Called by neon_run_lpf() (wd_8 == 0) on 64-byte blocks with stride 8. */
extern void ff_vp9_loop_filter_h_4_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);

/* Called by neon_run_lpf() (wd_8 != 0) on 64-byte blocks with stride 8. */
extern void ff_vp9_loop_filter_h_8_8_neon(uint8_t *dst, ptrdiff_t stride,
                                          int E, int I, int H);

/* Called by neon_run_idct() with stride 8 and 64 int16_t coefficients per block. */
extern void ff_vp9_idct_idct_8x8_add_neon(uint8_t *dst, ptrdiff_t stride,
                                          int16_t *block, int eob);

/* Called by the CDEF paths with dst stride 8, h = 8, edges = 0; `tmp` is
 * handed in as a pointer into a 192-element uint16_t scratch block. */
extern void dav1d_cdef_filter8_8bpc_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                         const uint16_t *tmp, int pri_strength,
                                         int sec_strength, int dir, int damping,
                                         int h, size_t edges);
|
||
|
||
/* --- Common helpers --- */
|
||
|
||
/* Stop flag: written once (to 1) by timer_thread() and polled by every
 * worker's hot loop.
 * NOTE(review): `volatile` only prevents the compiler from caching the load;
 * it is not a C11 synchronisation primitive. Consider atomic_int if this file
 * may depend on <stdatomic.h>. */
static volatile int g_stop = 0;

/* Start-line barrier: NEON workers, the QPU worker, the timer thread and
 * main() all wait on it, so every participant must reach it exactly once. */
static pthread_barrier_t g_start;
|
||
|
||
/* Advance the 64-bit xorshift state `*s` in place (shifts 13 / 7 / 17) and
 * return the new state. A zero state stays zero forever, so callers seed
 * through xs_init(). */
static inline uint64_t xs_step(uint64_t *s) {
    uint64_t x = *s;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    *s = x;
    return x;
}
|
||
/* Turn a caller-chosen seed into a usable generator state: a zero seed is
 * replaced by a fixed non-zero constant, any other seed is used as-is. */
static uint64_t xs_init(uint64_t s) {
    return s ? s : 0xa57edbeef5717ULL;
}
|
||
/* Current CLOCK_MONOTONIC_RAW reading, in seconds, as a double. */
static double now_s(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
    return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}
|
||
|
||
/* --- Kernel selectors --- */
|
||
|
||
/* Kernels that can be selected for the CPU side and/or the QPU side. */
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT, K_H264DEBLOCK };

/* Called by the NEON H.264 deblock path on 256-byte tiles with stride 16. */
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta, int8_t *tc0);

/* CLI / report name of a kernel; "?" for a value outside the enum. */
static const char *kernel_name(enum kernel k) {
    switch (k) {
    case K_MC:          return "mc";
    case K_LPF4:        return "lpf4";
    case K_LPF8:        return "lpf8";
    case K_CDEF:        return "cdef";
    case K_IDCT:        return "idct";
    case K_H264DEBLOCK: return "h264deblock";
    }
    return "?";
}

/* Throughput unit used when reporting a kernel: the loop-filter / deblock
 * kernels are counted in edges, everything else in blocks. */
static const char *kernel_unit(enum kernel k) {
    const int counts_edges =
        (k == K_LPF4 || k == K_LPF8 || k == K_H264DEBLOCK);
    return counts_edges ? "Medge/s" : "Mblock/s";
}
|
||
|
||
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
|
||
|
||
#define NEON_BATCH 8192
|
||
|
||
typedef struct {
|
||
int worker_id, affinity_core;
|
||
enum kernel kernel;
|
||
uint64_t units_done;
|
||
double elapsed_s;
|
||
} neon_args;
|
||
|
||
static void neon_run_mc(uint64_t *seed, uint64_t *out_done) {
|
||
/* MC: SRC_BYTES=128 (8x16) per block; DST_BYTES=64. */
|
||
uint8_t *src = malloc((size_t) NEON_BATCH * 128);
|
||
uint8_t *dst = malloc((size_t) NEON_BATCH * 64);
|
||
int *mx = malloc(NEON_BATCH * sizeof(int));
|
||
for (int i = 0; i < NEON_BATCH; i++) {
|
||
for (int j = 0; j < 128; j++) src[i*128 + j] = (uint8_t)(xs_step(seed) & 0xff);
|
||
mx[i] = (int)(xs_step(seed) & 15);
|
||
}
|
||
while (!g_stop) {
|
||
for (int i = 0; i < NEON_BATCH; i++)
|
||
ff_vp9_put_regular8_h_neon(dst + i*64, 8,
|
||
src + i*128 + 3, 16, 8, mx[i], 0);
|
||
*out_done += NEON_BATCH;
|
||
}
|
||
free(src); free(dst); free(mx);
|
||
}
|
||
|
||
static void neon_run_lpf(uint64_t *seed, uint64_t *out_done, int wd_8) {
|
||
uint8_t *master = malloc((size_t) NEON_BATCH * 64);
|
||
uint8_t *work = malloc((size_t) NEON_BATCH * 64);
|
||
int *Es = malloc(NEON_BATCH*sizeof(int)), *Is = malloc(NEON_BATCH*sizeof(int)), *Hs = malloc(NEON_BATCH*sizeof(int));
|
||
for (int i = 0; i < NEON_BATCH; i++) {
|
||
for (int j = 0; j < 64; j++) master[i*64+j] = (uint8_t)(xs_step(seed) & 0xff);
|
||
Es[i] = (int)(xs_step(seed) % 81);
|
||
Is[i] = (int)(xs_step(seed) % 41);
|
||
Hs[i] = (int)(xs_step(seed) % 11);
|
||
}
|
||
while (!g_stop) {
|
||
memcpy(work, master, (size_t) NEON_BATCH * 64);
|
||
for (int i = 0; i < NEON_BATCH; i++) {
|
||
if (wd_8) ff_vp9_loop_filter_h_8_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
|
||
else ff_vp9_loop_filter_h_4_8_neon(work + i*64 + 4, 8, Es[i], Is[i], Hs[i]);
|
||
}
|
||
*out_done += NEON_BATCH;
|
||
}
|
||
free(master); free(work); free(Es); free(Is); free(Hs);
|
||
}
|
||
|
||
static void neon_run_cdef(uint64_t *seed, uint64_t *out_done) {
|
||
int n = NEON_BATCH;
|
||
uint16_t *tmps = malloc((size_t) n * 192 * sizeof(uint16_t));
|
||
uint8_t *dsts = malloc((size_t) n * 64);
|
||
int *pris = malloc(n*sizeof(int)), *secs = malloc(n*sizeof(int));
|
||
int *dirs = malloc(n*sizeof(int)), *damps = malloc(n*sizeof(int));
|
||
for (int i = 0; i < n; i++) {
|
||
for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(seed) & 0xff);
|
||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++)
|
||
dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
|
||
pris[i] = (int)(xs_step(seed) % 7) + 1;
|
||
secs[i] = (int)(xs_step(seed) % 4) + 1;
|
||
dirs[i] = (int)(xs_step(seed) & 7);
|
||
damps[i] = (int)(xs_step(seed) % 6) + 1;
|
||
}
|
||
while (!g_stop) {
|
||
for (int i = 0; i < n; i++)
|
||
dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
|
||
tmps + i*192 + (2*16+2),
|
||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||
*out_done += n;
|
||
}
|
||
free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps);
|
||
}
|
||
|
||
static void neon_run_idct(uint64_t *seed, uint64_t *out_done) {
|
||
int16_t *blocks_master = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||
int16_t *blocks_work = malloc((size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||
uint8_t *dsts = malloc((size_t) NEON_BATCH * 64);
|
||
int *eobs = malloc(NEON_BATCH * sizeof(int));
|
||
for (int i = 0; i < NEON_BATCH; i++) {
|
||
memset(blocks_master + i*64, 0, 64*sizeof(int16_t));
|
||
int n = 1 + (int)(xs_step(seed) % 16);
|
||
int eob = 0;
|
||
for (int j = 0; j < n; j++) {
|
||
int pos = (int)(xs_step(seed) % 64);
|
||
int16_t coef = (int16_t)((int)(xs_step(seed) % 8192) - 4096);
|
||
blocks_master[i*64 + pos] = coef;
|
||
if (pos + 1 > eob) eob = pos + 1;
|
||
}
|
||
eobs[i] = eob ? eob : 1;
|
||
}
|
||
while (!g_stop) {
|
||
memcpy(blocks_work, blocks_master, (size_t) NEON_BATCH * 64 * sizeof(int16_t));
|
||
for (int i = 0; i < NEON_BATCH; i++)
|
||
ff_vp9_idct_idct_8x8_add_neon(dsts + i*64, 8, blocks_work + i*64, eobs[i]);
|
||
*out_done += NEON_BATCH;
|
||
}
|
||
free(blocks_master); free(blocks_work); free(dsts); free(eobs);
|
||
}
|
||
|
||
static void *neon_worker(void *p) {
|
||
neon_args *a = p;
|
||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||
|
||
uint64_t seed = xs_init((uint64_t) a->worker_id * 0xc01dbeefULL);
|
||
|
||
pthread_barrier_wait(&g_start);
|
||
double t0 = now_s();
|
||
uint64_t done = 0;
|
||
switch (a->kernel) {
|
||
case K_MC: neon_run_mc(&seed, &done); break;
|
||
case K_LPF4: neon_run_lpf(&seed, &done, 0); break;
|
||
case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
|
||
case K_IDCT: neon_run_idct(&seed, &done); break;
|
||
case K_CDEF: neon_run_cdef(&seed, &done); break;
|
||
case K_H264DEBLOCK: {
|
||
/* H.264 deblock: 16-row × 16-col tile per edge, EDGE_OFF = 4*16. */
|
||
int n = NEON_BATCH;
|
||
uint8_t *master = malloc((size_t) n * 256);
|
||
uint8_t *work = malloc((size_t) n * 256);
|
||
int *alphas = malloc(n*sizeof(int)), *betas = malloc(n*sizeof(int));
|
||
int8_t (*tc0s)[4] = malloc(n*4);
|
||
for (int i = 0; i < n; i++) {
|
||
for (int j = 0; j < 256; j++) master[i*256+j] = (uint8_t)(xs_step(&seed) & 0xff);
|
||
alphas[i] = (int)(xs_step(&seed) % 64) + 1;
|
||
betas[i] = (int)(xs_step(&seed) % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs_step(&seed) % 8);
|
||
tc0s[i][s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
while (!g_stop) {
|
||
memcpy(work, master, (size_t) n * 256);
|
||
for (int i = 0; i < n; i++)
|
||
ff_h264_v_loop_filter_luma_neon(work + i*256 + 4*16, 16,
|
||
alphas[i], betas[i], tc0s[i]);
|
||
done += n;
|
||
}
|
||
free(master); free(work); free(alphas); free(betas); free(tc0s);
|
||
break;
|
||
}
|
||
default: fprintf(stderr, "bad NEON kernel\n"); break;
|
||
}
|
||
a->elapsed_s = now_s() - t0;
|
||
a->units_done = done;
|
||
return NULL;
|
||
}
|
||
|
||
/* --- QPU worker (CDEF / MC / LPF4 / LPF8 / IDCT / H.264 deblock) --- */
|
||
|
||
typedef struct {
|
||
int affinity_core, n_units;
|
||
enum kernel kernel;
|
||
uint64_t units_done;
|
||
double elapsed_s;
|
||
} qpu_args;
|
||
|
||
/* Each QPU kernel has its own push-constant layout. All four are exactly
 * four uint32_t words; trailing _pad fields fill the unused words. */
typedef struct {
    uint32_t n, dst_stride_u8, _pad0, _pad1;
} pc_lpf;

typedef struct {
    uint32_t n, dst_stride_u8, src_stride_u8, _pad;
} pc_mc;

typedef struct {
    uint32_t n_blocks, blocks_per_row, dst_stride_u8, _pad;
} pc_idct;

typedef struct {
    uint32_t n_blocks, tmp_stride_u16, dst_stride_u8, _pad;
} pc_cdef;
|
||
/* CDEF: not yet — QPU CDEF kernel not implemented. CDEF QPU mode uses
|
||
* dav1d NEON via a single-thread NEON call on the QPU host core instead.
|
||
* That's a degenerate "QPU helper" but matches the deferred state of
|
||
* cycle 5. Real QPU CDEF kernel would replace this once cycle 5 closes. */
|
||
|
||
static void *qpu_cdef_neon_fallback(void *p)
|
||
{
|
||
/* Cycle 5 doesn't have a working QPU CDEF kernel yet (M1 deferred).
|
||
* For Issue 003's purposes we test "the QPU host core running NEON
|
||
* CDEF" as a proxy for the QPU contribution. This UNDERSTATES the
|
||
* QPU helper value (since the QPU itself would parallelise more
|
||
* than 1 NEON core), but gives a defensible lower bound: if even
|
||
* NEON-on-the-spare-core helps the mixed throughput, QPU certainly
|
||
* would.
|
||
*
|
||
* TODO: once cycle 5 Phase 6 lands, swap this for the QPU dispatch. */
|
||
qpu_args *a = p;
|
||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||
|
||
int n_blocks = a->n_units;
|
||
uint64_t seed = 0xcdef00000beefcULL;
|
||
|
||
uint16_t *tmps = malloc((size_t) n_blocks * 192 * sizeof(uint16_t));
|
||
uint8_t *dsts = malloc((size_t) n_blocks * 64);
|
||
int *pris = malloc(n_blocks*sizeof(int));
|
||
int *secs = malloc(n_blocks*sizeof(int));
|
||
int *dirs = malloc(n_blocks*sizeof(int));
|
||
int *damps = malloc(n_blocks*sizeof(int));
|
||
for (int i = 0; i < n_blocks; i++) {
|
||
for (int j = 0; j < 192; j++) tmps[i*192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
|
||
for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++)
|
||
dsts[i*64 + r*8 + c] = (uint8_t) tmps[i*192 + (r+2)*16 + (c+2)];
|
||
pris[i] = (int)(xs_step(&seed) % 7) + 1;
|
||
secs[i] = (int)(xs_step(&seed) % 4) + 1;
|
||
dirs[i] = (int)(xs_step(&seed) & 7);
|
||
damps[i] = (int)(xs_step(&seed) % 4) + 3;
|
||
}
|
||
|
||
pthread_barrier_wait(&g_start);
|
||
double t0 = now_s();
|
||
uint64_t done = 0;
|
||
while (!g_stop) {
|
||
for (int i = 0; i < n_blocks; i++)
|
||
dav1d_cdef_filter8_8bpc_neon(dsts + i*64, 8,
|
||
tmps + i*192,
|
||
pris[i], secs[i], dirs[i], damps[i], 8, 0);
|
||
done += n_blocks;
|
||
}
|
||
a->elapsed_s = now_s() - t0;
|
||
a->units_done = done;
|
||
|
||
free(tmps); free(dsts); free(pris); free(secs); free(dirs); free(damps);
|
||
return NULL;
|
||
}
|
||
|
||
/* QPU dispatch worker — generic for kernels with working shaders. */
|
||
|
||
typedef struct {
|
||
int affinity_core, n_units;
|
||
enum kernel kernel;
|
||
uint64_t units_done;
|
||
double elapsed_s;
|
||
} qpu_real_args;
|
||
|
||
static void *qpu_real_worker(void *p)
|
||
{
|
||
qpu_real_args *a = p;
|
||
cpu_set_t cs; CPU_ZERO(&cs); CPU_SET(a->affinity_core, &cs);
|
||
pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
|
||
|
||
v3d_runner *r = v3d_runner_create();
|
||
if (!r) return NULL;
|
||
|
||
int n_units = a->n_units;
|
||
const char *spv = NULL;
|
||
uint32_t bpw = 32; /* blocks/edges per WG */
|
||
size_t dst_bytes = 0, meta_bytes = 0, src_bytes = 0;
|
||
int has_src = 0;
|
||
size_t per_unit = 0;
|
||
|
||
switch (a->kernel) {
|
||
case K_LPF4:
|
||
case K_LPF8: {
|
||
spv = (a->kernel == K_LPF4) ? "v3d_lpf_h_4_8.spv" : "v3d_lpf_h_8_8.spv";
|
||
per_unit = 64;
|
||
dst_bytes = (size_t) n_units * per_unit;
|
||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||
break;
|
||
}
|
||
case K_MC:
|
||
spv = "v3d_mc_8h.spv";
|
||
dst_bytes = (size_t) n_units * 64;
|
||
src_bytes = (size_t) n_units * 128;
|
||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||
has_src = 1;
|
||
break;
|
||
case K_IDCT:
|
||
spv = "v3d_idct8.spv";
|
||
dst_bytes = (size_t) n_units * 64;
|
||
src_bytes = (size_t) n_units * 64 * sizeof(int16_t);
|
||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||
has_src = 1;
|
||
break;
|
||
case K_CDEF:
|
||
spv = "v3d_cdef.spv";
|
||
bpw = 4;
|
||
dst_bytes = (size_t) n_units * 64;
|
||
src_bytes = (size_t) n_units * 192 * sizeof(uint16_t);
|
||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||
has_src = 1;
|
||
break;
|
||
case K_H264DEBLOCK:
|
||
spv = "v3d_h264deblock.spv";
|
||
bpw = 16; /* 16 edges/WG */
|
||
dst_bytes = (size_t) n_units * 256; /* 16x16 tile */
|
||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||
has_src = 0;
|
||
break;
|
||
default:
|
||
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
|
||
v3d_runner_destroy(r);
|
||
return NULL;
|
||
}
|
||
|
||
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0};
|
||
v3d_runner_create_buffer(r, meta_bytes, &buf_meta);
|
||
v3d_runner_create_buffer(r, dst_bytes, &buf_dst);
|
||
if (has_src) v3d_runner_create_buffer(r, src_bytes, &buf_src);
|
||
|
||
/* Synthesise meta + src + dst content based on kernel. */
|
||
uint64_t seed = 0xfeed00000beefULL;
|
||
uint32_t *meta = buf_meta.mapped;
|
||
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
|
||
for (int i = 0; i < n_units; i++) {
|
||
meta[4*i+0] = (uint32_t)((size_t)i * 64 + 4); /* dst_off */
|
||
meta[4*i+1] = (uint32_t)(xs_step(&seed) % 81); /* E */
|
||
meta[4*i+2] = (uint32_t)(xs_step(&seed) % 41); /* I */
|
||
meta[4*i+3] = (uint32_t)(xs_step(&seed) % 11); /* H */
|
||
}
|
||
for (size_t i = 0; i < dst_bytes; i++)
|
||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||
} else if (a->kernel == K_MC) {
|
||
for (int i = 0; i < n_units; i++) {
|
||
meta[4*i+0] = (uint32_t)((size_t)i * 64); /* dst_off */
|
||
meta[4*i+1] = (uint32_t)((size_t)i * 128); /* src_off (RAW) */
|
||
meta[4*i+2] = (uint32_t)(xs_step(&seed) & 15); /* mx */
|
||
meta[4*i+3] = 0;
|
||
}
|
||
for (size_t i = 0; i < src_bytes; i++)
|
||
((uint8_t *) buf_src.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||
} else if (a->kernel == K_IDCT) {
|
||
for (int i = 0; i < n_units; i++) {
|
||
meta[4*i+0] = (uint32_t)((size_t)i * 64);
|
||
meta[4*i+1] = (uint32_t)((i * 64) / 64);
|
||
meta[4*i+2] = 0;
|
||
meta[4*i+3] = 0;
|
||
}
|
||
int16_t *cf = (int16_t *) buf_src.mapped;
|
||
size_t n_coefs = src_bytes / sizeof(int16_t);
|
||
for (size_t i = 0; i < n_coefs; i++)
|
||
cf[i] = (int16_t)((int)(xs_step(&seed) % 8192) - 4096);
|
||
} else if (a->kernel == K_CDEF) {
|
||
uint16_t *tmps = (uint16_t *) buf_src.mapped;
|
||
for (int i = 0; i < n_units; i++) {
|
||
uint32_t pri = (uint32_t)((xs_step(&seed) % 7) + 1);
|
||
uint32_t sec = (uint32_t)((xs_step(&seed) % 4) + 1);
|
||
uint32_t damping = (uint32_t)((xs_step(&seed) % 6) + 1);
|
||
meta[4*i+0] = (uint32_t)((size_t)i * 64);
|
||
meta[4*i+1] = pri | (sec << 8) | (damping << 16);
|
||
meta[4*i+2] = (uint32_t)((size_t)i * 192 + (2*16 + 2));
|
||
meta[4*i+3] = (uint32_t)(xs_step(&seed) & 7);
|
||
for (int j = 0; j < 192; j++)
|
||
tmps[(size_t)i * 192 + j] = (uint16_t)(xs_step(&seed) & 0xff);
|
||
}
|
||
for (size_t i = 0; i < dst_bytes; i++)
|
||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||
} else if (a->kernel == K_H264DEBLOCK) {
|
||
for (int i = 0; i < n_units; i++) {
|
||
uint32_t alpha = (uint32_t)(xs_step(&seed) % 64) + 1;
|
||
uint32_t beta = (uint32_t)(xs_step(&seed) % 16) + 1;
|
||
uint32_t tc0p = 0;
|
||
for (int s = 0; s < 4; s++) {
|
||
int rr = (int)(xs_step(&seed) % 8);
|
||
int8_t v = (int8_t)(rr == 0 ? -1 : (rr - 1));
|
||
tc0p |= ((uint32_t)(uint8_t)v) << (s * 8);
|
||
}
|
||
meta[4*i+0] = (uint32_t)((size_t)i * 256 + 4 * 16); /* EDGE_OFF = 4*stride */
|
||
meta[4*i+1] = alpha | (beta << 8);
|
||
meta[4*i+2] = tc0p;
|
||
meta[4*i+3] = 0;
|
||
}
|
||
for (size_t i = 0; i < dst_bytes; i++)
|
||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||
}
|
||
|
||
v3d_pipeline pipe = {0};
|
||
int n_ssbos = has_src ? 3 : 2;
|
||
/* K_H264DEBLOCK reuses pc_lpf layout (n + dst_stride_u8 + 2 pads). */
|
||
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
|
||
(a->kernel == K_IDCT) ? sizeof(pc_idct) :
|
||
(a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
|
||
v3d_runner_create_pipeline(r, spv, n_ssbos, pc_size, &pipe);
|
||
|
||
v3d_buffer bind_bufs[3];
|
||
bind_bufs[0] = buf_meta;
|
||
bind_bufs[1] = buf_dst;
|
||
if (has_src) bind_bufs[2] = buf_src;
|
||
v3d_runner_bind_buffers(r, &pipe, bind_bufs, n_ssbos);
|
||
|
||
uint32_t gc = (uint32_t)((n_units + bpw - 1) / bpw);
|
||
union { pc_lpf lpf; pc_mc mc; pc_idct idct; pc_cdef cdef; } pc = {0};
|
||
if (a->kernel == K_LPF4 || a->kernel == K_LPF8) {
|
||
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 8 };
|
||
} else if (a->kernel == K_MC) {
|
||
pc.mc = (pc_mc){ .n = n_units, .dst_stride_u8 = 8, .src_stride_u8 = 16 };
|
||
} else if (a->kernel == K_IDCT) {
|
||
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
|
||
} else if (a->kernel == K_CDEF) {
|
||
pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
|
||
} else if (a->kernel == K_H264DEBLOCK) {
|
||
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 16 };
|
||
}
|
||
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, pc_size, &pc);
|
||
vkCmdDispatch(cb, gc, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
|
||
for (int i = 0; i < 5; i++) v3d_runner_submit_wait(r, cb);
|
||
|
||
pthread_barrier_wait(&g_start);
|
||
double t0 = now_s();
|
||
uint64_t done = 0;
|
||
while (!g_stop) {
|
||
v3d_runner_submit_wait(r, cb);
|
||
done += n_units;
|
||
}
|
||
a->elapsed_s = now_s() - t0;
|
||
a->units_done = done;
|
||
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
if (has_src) v3d_runner_destroy_buffer(r, &buf_src);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
return NULL;
|
||
}
|
||
|
||
/* --- Timer --- */
|
||
|
||
typedef struct { double duration_s; } timer_args;
|
||
static void *timer_thread(void *p) {
|
||
timer_args *a = p;
|
||
pthread_barrier_wait(&g_start);
|
||
double end = now_s() + a->duration_s;
|
||
while (now_s() < end) {
|
||
struct timespec ts = {0, 1000000}; nanosleep(&ts, NULL);
|
||
}
|
||
g_stop = 1;
|
||
return NULL;
|
||
}
|
||
|
||
/* --- Main --- */
|
||
|
||
static enum kernel parse_kernel(const char *s) {
|
||
if (!strcmp(s, "mc")) return K_MC;
|
||
if (!strcmp(s, "lpf4")) return K_LPF4;
|
||
if (!strcmp(s, "lpf8")) return K_LPF8;
|
||
if (!strcmp(s, "cdef")) return K_CDEF;
|
||
if (!strcmp(s, "idct")) return K_IDCT;
|
||
if (!strcmp(s, "h264deblock")) return K_H264DEBLOCK;
|
||
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
|
||
}
|
||
|
||
int main(int argc, char **argv)
|
||
{
|
||
enum kernel cpu_k = K_MC, qpu_k = K_CDEF;
|
||
int n_neon = 3, qpu_core = 3, qpu_n_units = 65536;
|
||
double duration = 8.0;
|
||
|
||
static struct option opts[] = {
|
||
{"cpu-kernel", required_argument, 0, 'c'},
|
||
{"qpu-kernel", required_argument, 0, 'q'},
|
||
{"neon-threads", required_argument, 0, 'n'},
|
||
{"qpu-core", required_argument, 0, 'C'},
|
||
{"qpu-units", required_argument, 0, 'u'},
|
||
{"duration", required_argument, 0, 'd'},
|
||
{0,0,0,0}
|
||
};
|
||
for (int c; (c = getopt_long(argc, argv, "c:q:n:C:u:d:", opts, 0)) != -1;) {
|
||
switch (c) {
|
||
case 'c': cpu_k = parse_kernel(optarg); break;
|
||
case 'q': qpu_k = parse_kernel(optarg); break;
|
||
case 'n': n_neon = atoi(optarg); break;
|
||
case 'C': qpu_core = atoi(optarg); break;
|
||
case 'u': qpu_n_units = atoi(optarg); break;
|
||
case 'd': duration = atof(optarg); break;
|
||
default: return 2;
|
||
}
|
||
}
|
||
|
||
/* Cycle 5 Phase 6 landed — v3d_cdef.spv is M1-PASS. Use real
|
||
* QPU dispatch for CDEF too. The NEON-fallback worker remains
|
||
* compiled but is unselected. */
|
||
int use_neon_fallback_for_cdef = 0;
|
||
|
||
int barrier_count = n_neon + 1 /* QPU */ + 1 /* timer */ + 1 /* main */;
|
||
printf("=== Issue 003 mixed-kernel M4 bench ===\n");
|
||
printf(" cpu kernel: %s × %d threads (cores 0..%d)\n",
|
||
kernel_name(cpu_k), n_neon, n_neon - 1);
|
||
printf(" qpu kernel: %s on core %d (%s)\n",
|
||
kernel_name(qpu_k), qpu_core,
|
||
use_neon_fallback_for_cdef ?
|
||
"dav1d NEON fallback — real QPU CDEF deferred to cycle 5 Phase 6" :
|
||
"QPU dispatch");
|
||
printf(" duration: %.1fs\n\n", duration);
|
||
|
||
pthread_barrier_init(&g_start, NULL, barrier_count);
|
||
|
||
pthread_t timer_tid; timer_args ta = { .duration_s = duration };
|
||
pthread_create(&timer_tid, NULL, timer_thread, &ta);
|
||
|
||
pthread_t neon_tids[16] = {0};
|
||
neon_args n_args[16] = {0};
|
||
for (int i = 0; i < n_neon; i++) {
|
||
n_args[i] = (neon_args){ .worker_id = i, .affinity_core = i, .kernel = cpu_k };
|
||
pthread_create(&neon_tids[i], NULL, neon_worker, &n_args[i]);
|
||
}
|
||
|
||
pthread_t qpu_tid = 0;
|
||
qpu_args q_args = {0};
|
||
qpu_real_args qr_args = {0};
|
||
if (use_neon_fallback_for_cdef) {
|
||
q_args = (qpu_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
|
||
pthread_create(&qpu_tid, NULL, qpu_cdef_neon_fallback, &q_args);
|
||
} else {
|
||
qr_args = (qpu_real_args){ .affinity_core = qpu_core, .n_units = qpu_n_units, .kernel = qpu_k };
|
||
pthread_create(&qpu_tid, NULL, qpu_real_worker, &qr_args);
|
||
}
|
||
|
||
pthread_barrier_wait(&g_start);
|
||
|
||
pthread_join(timer_tid, NULL);
|
||
for (int i = 0; i < n_neon; i++) pthread_join(neon_tids[i], NULL);
|
||
pthread_join(qpu_tid, NULL);
|
||
|
||
uint64_t cpu_total = 0; double cpu_max_e = 0;
|
||
printf("NEON workers (%s):\n", kernel_name(cpu_k));
|
||
for (int i = 0; i < n_neon; i++) {
|
||
double r = n_args[i].units_done / n_args[i].elapsed_s / 1e6;
|
||
printf(" core %d: %.3f %s\n", n_args[i].affinity_core, r, kernel_unit(cpu_k));
|
||
cpu_total += n_args[i].units_done;
|
||
if (n_args[i].elapsed_s > cpu_max_e) cpu_max_e = n_args[i].elapsed_s;
|
||
}
|
||
double cpu_rate = cpu_total / cpu_max_e / 1e6;
|
||
printf(" CPU aggregate: %.3f %s\n\n", cpu_rate, kernel_unit(cpu_k));
|
||
|
||
uint64_t qpu_done = use_neon_fallback_for_cdef ? q_args.units_done : qr_args.units_done;
|
||
double qpu_elapsed = use_neon_fallback_for_cdef ? q_args.elapsed_s : qr_args.elapsed_s;
|
||
double qpu_rate = qpu_done / qpu_elapsed / 1e6;
|
||
printf("QPU worker (%s on core %d):\n", kernel_name(qpu_k), qpu_core);
|
||
printf(" %.3f %s (%llu units / %.3f s)\n",
|
||
qpu_rate, kernel_unit(qpu_k),
|
||
(unsigned long long) qpu_done, qpu_elapsed);
|
||
|
||
pthread_barrier_destroy(&g_start);
|
||
return 0;
|
||
}
|