Files
daedalus-fourier/tests/bench_neon_idct.c
T
marfrit dcbbc77038 Path B pivot + Phase 0-3 closed with first baseline numbers
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.

Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.

Phases closed:
  Phase 0 — substrate audit; Path A blocked, Path B open;
            codec-back-end-fits-QPU finding (docs/phase0.md)
  Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
            publish-before-measure R = M2/M3 decision rules
            (docs/phase1.md)
  Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
            under external/ffmpeg-snapshot/ (PROVENANCE.md pins
            commit f46e514 + per-file SHA-256s) (docs/phase2.md)
  Phase 3 — real baseline measurements on hertz (docs/phase3.md):
              M1 bit-exact            100.0000 % (10000/10000)
              M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
              M5a empty Vulkan submit 22.66 us
              M5b 1-WG noop dispatch  55.60 us
              M5 delta                32.95 us/dispatch
            => per-dispatch overhead is ~455x per-NEON-block cost;
               Phase 4 must batch at frame level or close to it.

Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.

Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 11:30:12 +00:00

249 lines
8.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Phase 3 — NEON baseline microbench for VP9 8×8 DCT_DCT IDCT add.
*
* Reports two numbers:
* M1 (correctness): bit-exact match rate, our C reference vs
* FFmpeg's NEON, across N random blocks.
* M3 (throughput): NEON sustained Mblock/s on this host.
*
* Both are gating measurements for Phase 1 (see docs/phase1.md).
* NO QPU work happens here — that's later phases.
*
* Build: see CMakeLists.txt at project root.
* Run: ./bench_neon_idct [--blocks N] [--iters K] [--seed S] [--no-correctness]
*
* License: BSD-2-Clause (daedalus-fourier), but this binary
* statically links the LGPL-2.1+ FFmpeg NEON snapshot
* — distribute the binary under LGPL-2.1+ in that case.
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Our C reference (tests/vp9_idct8_ref.c).
 * In this file it is called with a 64-byte destination laid out as
 * 8 rows of 8 pixels (stride 8), a 64-entry int16_t coefficient block,
 * and the eob value returned by gen_block(). */
extern void daedalus_vp9_idct_idct_8x8_add_ref(
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* FFmpeg NEON entry point (vendored vp9itxfm_neon.S).
 * Same signature and call pattern as the reference above. The
 * throughput benchmark treats `block` as clobbered by the call and
 * restores it from a master copy before every pass. */
extern void ff_vp9_idct_idct_8x8_add_neon(
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* ---- Random-block generation ----------------------------------- */
/* xorshift64 generator with shift triple (13, 7, 17).
 * The sequence is fully determined by xs64_state, which callers seed
 * before use. A state of 0 is a fixed point (every output is 0), so
 * callers substitute a non-zero default seed. Cheap enough that it
 * does not dominate block generation. */
static uint64_t xs64_state;

static inline uint64_t xs64(void)
{
    uint64_t s = xs64_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs64_state = s;
    return s;
}
/* Fill `block` with a sparse, VP9-plausible coefficient set and return
 * its eob: one past the highest index written, always in [1, 64].
 *
 * Between 1 and 16 coefficients are placed; later draws may overwrite
 * earlier ones, and a drawn coefficient may itself be 0. Each position
 * is drawn as xs64() % span with span drawn from [1, 64], which skews
 * positions (and therefore eob) toward the low-index end of the block
 * instead of spreading them uniformly over all 64 entries.
 *
 * For the Phase 3 baseline this is not load-balanced against a real
 * bitstream distribution; Phase 7 may revisit.
 *
 * Consumes 1 + 3 * n draws from xs64(), where n is the coefficient
 * count, so the output is deterministic for a given xs64_state. */
static int gen_block(int16_t block[64])
{
    memset(block, 0, 64 * sizeof(*block));

    int eob = 0;
    const int n_coefs = 1 + (int)(xs64() % 16);

    for (int i = 0; i < n_coefs; i++) {
        /* Draw span and position in two separate statements: inside a
         * single `xs64() % (xs64() % 64 + 1)` expression the order of
         * the two calls is unspecified, which would make the block
         * stream compiler-dependent for a given seed. */
        const uint64_t span = xs64() % 64 + 1;
        const int pos = (int)(xs64() % span);

        /* Signed value in [-4096, 4095]. */
        const int16_t coef = (int16_t)((int)(xs64() % 8192) - 4096);

        block[pos] = coef;
        if (pos + 1 > eob)
            eob = pos + 1;
    }

    /* The loop runs at least once, so eob >= 1 here. */
    return eob;
}
/* Fill the 64-byte prediction buffer with pseudo-random pixel values,
 * taking the low 8 bits of one xs64() draw per byte, in index order. */
static void gen_pred(uint8_t pred[64])
{
    uint8_t *cursor = pred;
    const uint8_t *const stop = pred + 64;

    while (cursor != stop) {
        *cursor = (uint8_t)(xs64() & 0xff);
        cursor++;
    }
}
/* ---- Wall-clock timing (CLOCK_MONOTONIC_RAW) ------------------- */
/* Return the current CLOCK_MONOTONIC_RAW reading in seconds.
 *
 * If the clock cannot be read the process prints the reason and exits
 * with status 1, so an uninitialised timestamp can never leak into a
 * reported measurement. */
static double now_seconds(void)
{
    struct timespec ts;

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
        perror("clock_gettime(CLOCK_MONOTONIC_RAW)");
        exit(1);
    }
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}
/* ---- Phase 1 M1: bit-exact gate -------------------------------- */
/* M1 gate: run the C reference and the FFmpeg NEON kernel on the same
 * `n_blocks` generated (coefficients, prediction) pairs and compare the
 * 64 output bytes of each.
 *
 * seed     - xorshift seed; 0 selects a fixed built-in default.
 * n_blocks - number of blocks to compare.
 *
 * The first four mismatching blocks are dumped row by row to stderr.
 * A summary, plus the share of blocks with eob == 1, goes to stdout.
 * Returns the number of mismatching blocks (0 means bit-exact). */
static int correctness_check(uint64_t seed, int n_blocks)
{
    int16_t coef_ref[64], coef_neon[64];
    uint8_t pred[64];
    uint8_t out_ref[64], out_neon[64];
    int n_bad = 0;
    int n_dc_only = 0;

    xs64_state = (seed != 0) ? seed : 0xdeadbeefcafebabeULL;

    for (int blk = 0; blk < n_blocks; blk++) {
        /* Each kernel gets its own copy of the coefficients and of the
         * prediction, so neither sees the other's writes. */
        const int eob = gen_block(coef_ref);
        memcpy(coef_neon, coef_ref, sizeof(coef_ref));

        gen_pred(pred);
        memcpy(out_ref, pred, 64);
        memcpy(out_neon, pred, 64);

        daedalus_vp9_idct_idct_8x8_add_ref(out_ref, 8, coef_ref, eob);
        ff_vp9_idct_idct_8x8_add_neon(out_neon, 8, coef_neon, eob);

        if (eob == 1)
            n_dc_only++;

        if (memcmp(out_ref, out_neon, 64) == 0)
            continue;

        /* Only the first few mismatches are dumped to keep stderr readable. */
        if (n_bad < 4) {
            fprintf(stderr, "MISMATCH block %d eob=%d:\n", blk, eob);
            for (int row = 0; row < 8; row++) {
                const uint8_t *ref_row = out_ref + row * 8;
                const uint8_t *neon_row = out_neon + row * 8;

                fprintf(stderr, " row %d ref ", row);
                for (int col = 0; col < 8; col++)
                    fprintf(stderr, "%3u ", ref_row[col]);
                fprintf(stderr, " neon ");
                for (int col = 0; col < 8; col++)
                    fprintf(stderr, "%3u ", neon_row[col]);
                fprintf(stderr, "\n");
            }
        }
        n_bad++;
    }

    const int n_ok = n_blocks - n_bad;
    printf("M1 correctness: %d / %d blocks bit-exact match (%.4f%%)\n",
           n_ok, n_blocks, 100.0 * n_ok / n_blocks);
    printf(" dc-only path frequency: %d / %d (%.2f%%)\n",
           n_dc_only, n_blocks, 100.0 * n_dc_only / n_blocks);
    return n_bad;
}
/* ---- Phase 1 M3: NEON throughput ------------------------------- */
/* M3: measure sustained throughput of ff_vp9_idct_idct_8x8_add_neon.
 *
 * seed     - xorshift seed; 0 selects a fixed built-in default.
 * n_blocks - number of distinct pre-generated blocks (must be > 0).
 * iters    - number of timed passes over all blocks (must be > 0).
 *
 * All blocks and predictions are generated up front so generation cost
 * stays outside the timed region. Every pass re-copies coefficients and
 * predictions from their master copies (the NEON path zeroes the block,
 * so it must be restored). The cost of those copies is estimated with a
 * second, copy-only loop and subtracted from the timed total.
 *
 * Results are printed to stdout. The process exits with status 1 on
 * invalid arguments or allocation failure. */
static void throughput_neon(uint64_t seed, int n_blocks, int iters)
{
    enum { COEFS_PER_BLOCK = 64, DST_STRIDE = 8 };

    if (n_blocks <= 0 || iters <= 0) {
        fprintf(stderr, "M3: blocks and iters must both be > 0\n");
        exit(1);
    }

    /* Do every size and offset computation in size_t: `n_blocks * 64`
     * evaluated in int overflows once n_blocks exceeds INT_MAX / 64. */
    const size_t count = (size_t)n_blocks;
    const size_t coef_block_bytes = COEFS_PER_BLOCK * sizeof(int16_t);
    if (count > SIZE_MAX / coef_block_bytes) {
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }
    const size_t coef_bytes = count * coef_block_bytes;
    const size_t pixel_bytes = count * COEFS_PER_BLOCK;

    xs64_state = seed ? seed : 0xfeedfacecafebeefULL;

    int16_t *blocks_master = malloc(coef_bytes);
    int16_t *blocks_work = malloc(coef_bytes);
    uint8_t *preds = malloc(pixel_bytes);
    uint8_t *dsts = malloc(pixel_bytes);
    int *eobs = malloc(count * sizeof *eobs);
    if (!blocks_master || !blocks_work || !preds || !dsts || !eobs) {
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }

    for (size_t i = 0; i < count; i++) {
        eobs[i] = gen_block(blocks_master + i * COEFS_PER_BLOCK);
        gen_pred(preds + i * COEFS_PER_BLOCK);
    }

    /* Warm-up pass, not timed. */
    memcpy(blocks_work, blocks_master, coef_bytes);
    memcpy(dsts, preds, pixel_bytes);
    for (size_t i = 0; i < count; i++) {
        ff_vp9_idct_idct_8x8_add_neon(dsts + i * COEFS_PER_BLOCK, DST_STRIDE,
                                      blocks_work + i * COEFS_PER_BLOCK,
                                      eobs[i]);
    }

    /* Timed region: restore inputs, then run the kernel on every block. */
    const double t0 = now_seconds();
    for (int it = 0; it < iters; it++) {
        memcpy(blocks_work, blocks_master, coef_bytes);
        memcpy(dsts, preds, pixel_bytes);
        for (size_t i = 0; i < count; i++) {
            ff_vp9_idct_idct_8x8_add_neon(dsts + i * COEFS_PER_BLOCK, DST_STRIDE,
                                          blocks_work + i * COEFS_PER_BLOCK,
                                          eobs[i]);
        }
    }
    const double t1 = now_seconds();

    /* Copy-only run, used to subtract the restore overhead.
     * NOTE(review): nothing reads these buffers again before free(), so
     * an optimizing compiler may be entitled to drop these copies, which
     * would make the setup estimate too small. Verify in the generated
     * code. */
    const double s0 = now_seconds();
    for (int it = 0; it < iters; it++) {
        memcpy(blocks_work, blocks_master, coef_bytes);
        memcpy(dsts, preds, pixel_bytes);
    }
    const double s1 = now_seconds();

    const double setup_seconds = s1 - s0;
    const double total_seconds = (t1 - t0) - setup_seconds;
    const double total_blocks = (double)n_blocks * iters;
    const double mblocks_s = total_blocks / total_seconds / 1e6;

    if (total_seconds <= 0.0) {
        /* Timer noise exceeded the kernel time; the figures below are
         * not meaningful for such a short run. */
        fprintf(stderr,
                "M3: setup-subtracted time is not positive; "
                "increase --blocks or --iters\n");
    }

    printf("M3 NEON throughput:\n");
    printf(" blocks=%d iters=%d total=%.0f\n", n_blocks, iters, total_blocks);
    printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
    printf(" elapsed (setup) =%.6f s\n", setup_seconds);
    printf(" throughput = %.3f Mblock/s\n", mblocks_s);
    printf(" per-block = %.1f ns\n", total_seconds / total_blocks * 1e9);
    /* Equivalent at 1920x1080: (1920/8) * (1080/8) = 32400 blocks/frame. */
    printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
           mblocks_s * 1e6 / 32400.0);

    free(blocks_master);
    free(blocks_work);
    free(preds);
    free(dsts);
    free(eobs);
}
/* ---- CLI ------------------------------------------------------- */
/* Print the command-line synopsis and the default option values to
 * stderr. `p` is the program name shown in the synopsis. */
static void usage(const char *p)
{
    static const char defaults[] =
        "Defaults: N=1000000, K=10, S=0 (uses fixed default).\n";

    fprintf(stderr,
            "Usage: %s [--blocks N] [--iters K] [--seed S] [--no-correctness]\n",
            p);
    fputs(defaults, stderr);
}
/* Parse `text` as a base-10 integer in [1, INT_MAX].
 * Returns 0 and stores the value in *out on success; returns -1 and
 * leaves *out untouched on empty input, trailing garbage, overflow, or
 * a value below 1. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;

    errno = 0;
    const long value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < 1 || value > INT_MAX)
        return -1;

    *out = (int)value;
    return 0;
}

/* Entry point: parse options, run the M1 bit-exact gate unless
 * --no-correctness was given, then run the M3 throughput measurement.
 *
 * Exit status: 0 on success (or after --help), 1 if the correctness
 * gate found a mismatch, 2 on a command-line error. */
int main(int argc, char **argv)
{
    /* Block count of the M1 gate; used by both the banner and the call. */
    enum { M1_BLOCKS = 10000 };

    int n_blocks = 1000000;
    int iters = 10;
    uint64_t seed = 0;
    int do_correctness = 1;
    static struct option opts[] = {
        {"blocks", required_argument, 0, 'b'},
        {"iters", required_argument, 0, 'i'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {"help", no_argument, 0, 'h'},
        {0, 0, 0, 0}
    };

    for (int c; (c = getopt_long(argc, argv, "b:i:s:Ch", opts, 0)) != -1;) {
        switch (c) {
        case 'b':
            if (parse_positive_int(optarg, &n_blocks) != 0) {
                fprintf(stderr, "%s: invalid --blocks value '%s'\n",
                        argv[0], optarg);
                usage(argv[0]);
                return 2;
            }
            break;
        case 'i':
            if (parse_positive_int(optarg, &iters) != 0) {
                fprintf(stderr, "%s: invalid --iters value '%s'\n",
                        argv[0], optarg);
                usage(argv[0]);
                return 2;
            }
            break;
        case 's': {
            /* Base 0 keeps accepting decimal, 0x-hex and 0-octal seeds. */
            char *end = NULL;

            errno = 0;
            const unsigned long long parsed = strtoull(optarg, &end, 0);
            if (end == optarg || *end != '\0' || errno == ERANGE) {
                fprintf(stderr, "%s: invalid --seed value '%s'\n",
                        argv[0], optarg);
                usage(argv[0]);
                return 2;
            }
            seed = parsed;
            break;
        }
        case 'C':
            do_correctness = 0;
            break;
        case 'h':
            usage(argv[0]);
            return 0;
        default:
            usage(argv[0]);
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1: bit-exact correctness (%d random blocks) ===\n",
               M1_BLOCKS);
        const int miss = correctness_check(seed, M1_BLOCKS);
        if (miss != 0) {
            fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3: NEON throughput ===\n");
    throughput_neon(seed, n_blocks, iters);
    return 0;
}