Files
daedalus-fourier/tests/bench_neon_mc.c
T
marfrit 356e446a49 Cycle 3 (MC interpolation) closure: M1'''=100%, R'''=0.067 RED, M4=-19.5%
Third daedalus-fourier kernel — VP9 8-tap regular subpel filter,
horizontal direction, 8-wide output. Multiply-heavy by design to
stress V3D's no-DP4A deficit. Full cycle Phase 1-7 + M4'''.

Phase 5''' second-model review delivered cleanly — caught 1 RED
bug pre-implementation (src_off off-by-3 indexing convention) and
2 YELLOW gaps (assert MUST language, shaderdb filter-LUT gate).
Without the review, M1''' would have failed silently on first run
with cryptic "high-index source pixels wrong" symptoms.

Phase 6 v1 first-light: M1''' 100.0000% bit-exact (65536/65536
blocks across all 16 mx phases). Phase 5''' filter-LUT prediction
materialised exactly: 197 uniforms (gate was 144), 2 threads (down
from cycle-2's 4 due to register pressure).

Performance:

  M2''' = 1.413 Mblock/s     (707.9 ns/block)
  M3''' = 20.997 Mblock/s    (NEON baseline phase3)
  R'''  = 0.067              (RED band — structural mismatch)
  shaderdb: 488 inst, 2 threads, 197 uniforms, 25 max-temps, 0 spills

M4''' concurrent matrix (8s windows):

  NEON 1-core           14.479 Mblock/s
  NEON 4-core           15.248 Mblock/s   <- baseline (compute-bound,
                                              not bandwidth-saturated
                                              like cycles 1+2!)
  QPU only               1.380 Mblock/s
  MIXED NEON-3 + QPU    12.277 Mblock/s   <- -19.5% (FAIL gate)
  MIXED NEON-4 + QPU    12.158 Mblock/s   <- -20.3%

NEW cross-cycle finding (Phase 9 lesson 2): compute-bound CPU
workloads make the QPU-offload story collapse. Cycles 1+2 were
bandwidth-saturated (4-core scaling 0.56-0.82x of 1-core), so
freeing a CPU core via QPU offload added throughput. Cycle 3 MC
is compute-bound (4-core scaling 1.05x of 1-core — near-linear),
so there are no idle cycles to reclaim. The QPU contribution (0.45 Mblock/s in
contention) doesn't compensate for losing 1 NEON core delivering
~3.8 Mblock/s.

But 30fps@1080p floor: PASS in every config (1.4x to 15.7x
isolation margin). Per project_30fps_floor_is_fine.md, user-facing
test never fails — daily YouTube playback works fine on any CPU/QPU
split.

DEPLOYMENT RECIPE for higgs (cycle 3 confirmed split):

  IDCT (k1)  -> QPU   (R=0.92, +7% mixed, frees CPU core)
  LPF  (k2)  -> QPU   (R=0.41, +7% mixed, frees CPU core)
  MC   (k3)  -> CPU   (R=0.067, -19.5% mixed — stays on CPU)
  Entropy    -> CPU   (structurally serial)

Mixed-substrate deployment, not "QPU does everything". Realistic for
higgs: entropy + MC on 2-3 ARM cores; IDCT + LPF dispatched to QPU
concurrently; 1-2 ARM cores left for vscode etc.

New artifacts:
- src/v3d_mc_8h.comp               — GLSL kernel
- tests/vp9_mc_ref.c               — standalone C ref (REGULAR filter
                                     embedded; clean transcription)
- tests/bench_neon_mc.c            — M1'''_c + M3''' bench
- tests/bench_v3d_mc.c             — M1''' + M2''' bench with contract
                                     asserts + 30fps margin display
- tests/bench_concurrent_mc.c      — M4''' pthread bench
- external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S    (vendored)
- external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c
                                     (hand-extracted; provides
                                      ff_vp9_subpel_filters symbol
                                      without dragging in full vp9dsp.c)
- docs/k3_mc_phase{1,2,3,4,5,7}.md — full cycle documentation

Memory updates: project_30fps_floor_is_fine.md (user's 30fps target
recalibration), MEMORY.md index updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:51:43 +00:00

221 lines
7.5 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 3 Phase 3 — NEON M3''' baseline for VP9 8-tap regular
* horizontal MC interpolation, 8×8 block.
*
* Reports:
* M1'''_c (correctness): C-ref ↔ NEON bit-exact rate, N random
* 8×8 blocks with random source pixels and
* random subpel phase mx ∈ [0, 15]
* M3''' (throughput): NEON sustained Mblock/s, single-thread,
* time-based
*
* License: LGPL-2.1+ (statically links FFmpeg NEON snapshot).
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Kernels under test. Neither is defined in this file; both must be supplied
 * at link time.
 *
 * The two prototypes are identical, so the same arguments can be handed to
 * either routine and the outputs compared byte for byte. Every call in this
 * file passes h = DST_H and my = 0; mx is a subpel phase in [0, 15].
 */
/* Reference side of the comparison (the "C-ref" named in the file header). */
extern void daedalus_vp9_put_regular_8h_ref(
uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int my);
/* NEON side of the comparison; also the routine timed for M3'''. */
extern void ff_vp9_put_regular8_h_neon(
uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int my);
/* RNG ------------------------------------------------------------ */
/* Generator state; the benchmark routines store a non-zero seed here. */
static uint64_t xs_state;

/*
 * Advance the generator by one xorshift step (shifts 13 / 7 / 17) and return
 * the new 64-bit state, which is also written back to xs_state.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/* Block layout: each block gets its own source buffer of 8 rows × 16 columns
 * and an 8×8 destination.
 * - The source buffer is SRC_W = 16 columns wide. The filter is called with
 *   src = block_src + 3, so the 8-tap window for output columns 0..7 spans
 *   src[-3] .. src[7 + 4], i.e. columns [0..14] of the buffer. Column 15 is
 *   unused padding.
 *   NOTE(review): that is the filter's logical footprint; whether the NEON
 *   routine's vector loads also touch column 15 is not visible from here.
 * - dst is 8 columns × 8 rows, passed to the kernels with stride DST_W.
 */
#define SRC_W 16
#define SRC_H 8
#define DST_W 8
#define DST_H 8
#define SRC_BYTES (SRC_H * SRC_W) /* 128 */
#define DST_BYTES (DST_H * DST_W) /* 64 */
/*
 * Fill one source block (SRC_BYTES bytes at buf) with pseudo-random pixels,
 * using the low 8 bits of one xs() draw per byte.
 */
static void gen_src(uint8_t *buf)
{
    uint8_t *cursor = buf;
    uint8_t *const limit = buf + SRC_BYTES;

    while (cursor != limit) {
        *cursor++ = (uint8_t)(xs() & 0xff);
    }
}
/* Current CLOCK_MONOTONIC_RAW reading, expressed in seconds as a double. */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = stamp.tv_sec;
    const double frac = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
/* M1'''_c correctness gate -------------------------------------- */
/*
 * Run both kernels on n_blocks random inputs and count the blocks whose
 * outputs differ.
 *
 *   seed      RNG seed; 0 selects a built-in default.
 *   n_blocks  number of random blocks to compare.
 *
 * Each iteration draws a fresh source block, then a subpel phase mx in
 * [0, 15], and filters the block with both implementations into zeroed
 * output buffers. The first three mismatching blocks are dumped to stderr;
 * a summary line and the min/max of the per-phase histogram go to stdout.
 *
 * Returns the number of mismatching blocks (0 means every block matched).
 */
static int correctness_check(uint64_t seed, int n_blocks)
{
    enum { MAX_DUMPS = 3, N_PHASES = 16 };

    uint8_t pixels[SRC_BYTES];
    uint8_t out_ref[DST_BYTES];
    uint8_t out_neon[DST_BYTES];
    int phase_count[N_PHASES] = {0};
    int bad = 0;

    xs_state = (seed != 0) ? seed : 0xabcdef1234567890ULL;

    for (int blk = 0; blk < n_blocks; blk++) {
        /* Pixels are drawn before the phase; the order fixes the RNG stream. */
        gen_src(pixels);
        const int mx = (int)(xs() & 15);
        phase_count[mx] += 1;

        memset(out_ref, 0, DST_BYTES);
        memset(out_neon, 0, DST_BYTES);
        daedalus_vp9_put_regular_8h_ref(out_ref, DST_W, pixels + 3, SRC_W, DST_H, mx, 0);
        ff_vp9_put_regular8_h_neon(out_neon, DST_W, pixels + 3, SRC_W, DST_H, mx, 0);

        if (memcmp(out_ref, out_neon, DST_BYTES) == 0) {
            continue;
        }

        if (bad < MAX_DUMPS) {
            /* Same dump layout for both outputs: title, then 8 rows of 8. */
            const char *const titles[2] = { " ref:", "\n neon:" };
            const uint8_t *const outputs[2] = { out_ref, out_neon };

            fprintf(stderr, "MISMATCH block %d mx=%d:\n", blk, mx);
            for (int which = 0; which < 2; which++) {
                fputs(titles[which], stderr);
                for (int row = 0; row < 8; row++) {
                    const uint8_t *line = outputs[which] + row * 8;

                    fprintf(stderr, "\n r%d ", row);
                    for (int col = 0; col < 8; col++) {
                        fprintf(stderr, "%3u ", line[col]);
                    }
                }
            }
            fprintf(stderr, "\n");
        }
        bad++;
    }

    const int ok = n_blocks - bad;
    printf("M1'''_c correctness: %d / %d blocks bit-exact (%.4f%%)\n",
           ok, n_blocks, 100.0 * ok / n_blocks);

    /* Histogram extremes show how evenly the 16 phases were exercised. */
    int lo = phase_count[0];
    int hi = phase_count[0];
    for (int p = 1; p < N_PHASES; p++) {
        const int cnt = phase_count[p];

        if (cnt < lo) {
            lo = cnt;
        }
        if (cnt > hi) {
            hi = cnt;
        }
    }
    printf(" mx phase coverage: min=%d max=%d (16 phases sampled)\n", lo, hi);

    return bad;
}
/* M3''' throughput ---------------------------------------------- */
/*
 * Measure sustained NEON throughput, single-threaded and time-based.
 *
 *   seed        RNG seed; 0 selects a built-in default.
 *   n_blocks    blocks per batch; must be positive.
 *   duration_s  how long to keep issuing batches, in seconds.
 *
 * A master set of random source blocks and phases is generated once. Every
 * timed batch re-copies the master into a working buffer and then filters all
 * blocks. Afterwards the same number of copies is timed on its own and
 * subtracted, so the reported "kernel" time excludes the per-batch memcpy.
 * NOTE(review): nothing reads work_src after the setup-only copies, so an
 * optimizer may be able to drop them; confirm in the generated code.
 *
 * Results go to stdout. A non-positive or oversized block count and any
 * allocation failure print to stderr and exit(1). If no batch completes (for
 * example a non-positive duration) the rate lines are replaced by a note
 * instead of dividing by zero.
 */
static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
{
    if (n_blocks <= 0) {
        fprintf(stderr, "throughput: block count must be positive (got %d)\n", n_blocks);
        exit(1);
    }

    const size_t count = (size_t)n_blocks;
    /* Guard the byte-count multiplications (relevant where size_t is 32-bit). */
    if (count > SIZE_MAX / (size_t)SRC_BYTES || count > SIZE_MAX / sizeof(int)) {
        fprintf(stderr, "throughput: block count %d is too large\n", n_blocks);
        exit(1);
    }
    const size_t src_total = count * (size_t)SRC_BYTES;
    const size_t dst_total = count * (size_t)DST_BYTES;

    xs_state = seed ? seed : 0xdeadbeef12345678ULL;

    uint8_t *master_src = malloc(src_total);
    uint8_t *work_src = malloc(src_total);
    uint8_t *dsts = malloc(dst_total);
    int *mxs = malloc(count * sizeof *mxs);
    if (!master_src || !work_src || !dsts || !mxs) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }

    /* Same draw order as the correctness gate: block pixels, then its phase. */
    for (size_t i = 0; i < count; i++) {
        gen_src(master_src + i * SRC_BYTES);
        mxs[i] = (int)(xs() & 15);
    }

    /* Warm. */
    memcpy(work_src, master_src, src_total);
    for (size_t i = 0; i < count; i++) {
        ff_vp9_put_regular8_h_neon(dsts + i * DST_BYTES, DST_W,
                                   work_src + i * SRC_BYTES + 3, SRC_W,
                                   DST_H, mxs[i], 0);
    }

    /* Timed region: whole batches until the deadline passes. */
    double t0 = now_seconds();
    double t_end = t0 + duration_s;
    uint64_t batches = 0;
    while (now_seconds() < t_end) {
        memcpy(work_src, master_src, src_total);
        for (size_t i = 0; i < count; i++) {
            ff_vp9_put_regular8_h_neon(dsts + i * DST_BYTES, DST_W,
                                       work_src + i * SRC_BYTES + 3, SRC_W,
                                       DST_H, mxs[i], 0);
        }
        batches++;
    }
    double elapsed = now_seconds() - t0;
    const uint64_t done = batches * (uint64_t)count;

    /* setup-only subtraction */
    double s0 = now_seconds();
    for (uint64_t it = 0; it < batches; it++) {
        memcpy(work_src, master_src, src_total);
    }
    double s1 = now_seconds();
    double setup_seconds = s1 - s0;
    double kernel_seconds = elapsed - setup_seconds;

    printf("M3''' NEON throughput:\n");
    printf(" blocks/batch: %d\n", n_blocks);
    printf(" batches done: %llu\n", (unsigned long long) batches);
    printf(" total blocks: %llu\n", (unsigned long long) done);
    printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
    printf(" elapsed (setup) =%.6f s\n", setup_seconds);
    if (done == 0 || !(kernel_seconds > 0.0)) {
        /* Rates would be 0/0 or x/0 here; say so rather than print NaN/inf. */
        printf(" no measurable kernel time; throughput not reported\n");
    } else {
        double mbps = done / kernel_seconds / 1e6;

        printf(" throughput = %.3f Mblock/s\n", mbps);
        printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9);
        /* 1080p: 1920*1080 / 64 pixels per block = 32400 blocks/frame */
        printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n",
               mbps * 1e6 / 32400.0);
    }

    free(master_src);
    free(work_src);
    free(dsts);
    free(mxs);
}
/* Parse a whole-string base-10 integer in [1, INT_MAX]. Returns 0 on success. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;

    errno = 0;
    long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || v < 1 || v > INT_MAX) {
        return -1;
    }
    *out = (int)v;
    return 0;
}

/* Parse a whole-string number of seconds that is > 0. Returns 0 on success. */
static int parse_positive_seconds(const char *text, double *out)
{
    char *end = NULL;

    errno = 0;
    double v = strtod(text, &end);
    /* !(v > 0.0) also rejects NaN. */
    if (end == text || *end != '\0' || errno == ERANGE || !(v > 0.0)) {
        return -1;
    }
    *out = v;
    return 0;
}

/* Parse a whole-string 64-bit seed (base auto-detected). Returns 0 on success. */
static int parse_seed(const char *text, uint64_t *out)
{
    char *end = NULL;

    errno = 0;
    unsigned long long v = strtoull(text, &end, 0);
    if (end == text || *end != '\0' || errno == ERANGE) {
        return -1;
    }
    *out = (uint64_t)v;
    return 0;
}

/* Print the command-line synopsis to stderr. */
static void print_usage(const char *prog)
{
    fprintf(stderr,
            "usage: %s [-b|--blocks N] [-d|--duration SECONDS] "
            "[-s|--seed SEED] [-C|--no-correctness]\n",
            prog);
}

/*
 * Entry point.
 *
 * Options:
 *   -b, --blocks N          blocks per throughput batch (default 65536, N >= 1)
 *   -d, --duration SECONDS  throughput measurement time (default 5.0, > 0)
 *   -s, --seed SEED         RNG seed; 0 keeps the built-in defaults
 *   -C, --no-correctness    skip the bit-exactness gate
 *
 * Unless disabled, the correctness gate runs first and throughput is only
 * measured if every block matched.
 *
 * Exit status: 0 on success, 1 if the correctness gate found a mismatch,
 * 2 on a bad command line.
 */
int main(int argc, char **argv)
{
    enum { CORRECTNESS_BLOCKS = 10000 };

    int n_blocks = 65536;
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;
    static struct option opts[] = {
        {"blocks", required_argument, 0, 'b'},
        {"duration", required_argument, 0, 'd'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {0, 0, 0, 0}
    };

    int c;
    while ((c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1) {
        switch (c) {
        case 'b':
            /* Zero or negative counts would divide by zero / over-allocate. */
            if (parse_positive_int(optarg, &n_blocks) != 0) {
                fprintf(stderr, "invalid --blocks value '%s' (need an integer >= 1)\n", optarg);
                return 2;
            }
            break;
        case 'd':
            if (parse_positive_seconds(optarg, &duration) != 0) {
                fprintf(stderr, "invalid --duration value '%s' (need seconds > 0)\n", optarg);
                return 2;
            }
            break;
        case 's':
            if (parse_seed(optarg, &seed) != 0) {
                fprintf(stderr, "invalid --seed value '%s'\n", optarg);
                return 2;
            }
            break;
        case 'C':
            do_correctness = 0;
            break;
        default:
            print_usage(argv[0]);
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1'''_c bit-exact (%d random blocks) ===\n", CORRECTNESS_BLOCKS);
        if (correctness_check(seed, CORRECTNESS_BLOCKS) != 0) {
            fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3''' NEON throughput ===\n");
    throughput_neon(seed, n_blocks, duration);
    return 0;
}