Files
daedalus-fourier/tests/bench_neon_h264idct4.c
T
marfrit f92dc40f43 Cycle 6 (H.264) opened — IDCT 4x4 Phase 1+3, M3 = 175 Mblock/s
H.264 scope added 2026-05-18 per user direction. Pi 5's VideoCore
VII has no hardware H.264 decoder block (only HEVC), so a
QPU-accelerated H.264 path fills the most impactful codec gap.
Cycle 6 = first H.264 kernel (4x4 IDCT + add, smallest H.264
transform, simplest first cycle).

Phase 1: goal doc + 1080p30 floor analysis (5.85 Mblock/s
worst-case, 2.0 Mblock/s realistic since most MBs use 8x8 or
P-skip).

Phase 3: NEON M3 baseline captured. ff_h264_idct_add_neon on
hertz delivers 175 Mblock/s (5.7 ns per block) = 30x worst-case
floor margin. H.264 IDCT 4x4 is dramatically lighter than VP9
IDCT 8x8 (21x faster per block).

Phase 3 closure also caught the key Phase 9 lesson: H.264/FFmpeg
blocks are COLUMN-MAJOR (block[c*4 + r] = (row=r, col=c)). NEON
ld1 with 4 registers interleaves loading, and the FFmpeg C ref
indexing makes this convention explicit. Initial C ref assumed
row-major, M1 was 5% bit-exact; after fix, M1 = 100%.

Convention encoded for all subsequent H.264 cycles (cycle 7+).

- external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S
  (vendored verbatim from FFmpeg n7.1.3, 415 lines)
- external/ffmpeg-snapshot/PROVENANCE.md: updated
- tests/h264_idct4_ref.c: column-major C ref
- tests/bench_neon_h264idct4.c: M1 + M3 bench
- CMakeLists.txt: cycle 6 NEON bench wiring
- docs/k6_h264idct4_phase1.md, phase3.md

Phase 4 next: QPU shader for cycle 6. Predicted R6 = 0.01 (deep
RED — kernel too small relative to QPU dispatch overhead) but
worth building for cycle-completeness + the opportunistic-helper
hypothesis (cycle 6 may stay CPU per recipe).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:14:43 +00:00

211 lines
7.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 6 Phase 3 — NEON M3 baseline for H.264 IDCT 4x4 + add.
*
* Calls FFmpeg `ff_h264_idct_add_neon`. Reports M1 bit-exact vs
* the standalone C reference, plus M3 throughput.
*
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
#define DST_STRIDE 16 /* arbitrary stride for the test surface */
#define DST_ROWS 4
#define DST_BYTES (DST_ROWS * DST_STRIDE)
static uint64_t xs_state;
static inline uint64_t xs(void) {
uint64_t x = xs_state;
x ^= x << 13; x ^= x >> 7; x ^= x << 17;
return xs_state = x;
}
static void gen_block(int16_t b[16])
{
/* Realistic H.264 residual: small coefficients, mostly zero,
* a few non-zero in low-frequency positions. */
memset(b, 0, 16 * sizeof(int16_t));
int n_nonzero = 1 + (int)(xs() % 8);
for (int i = 0; i < n_nonzero; i++) {
int pos = (int)(xs() % 16);
int16_t v = (int16_t)((int)(xs() % 1024) - 512);
b[pos] = v;
}
}
static double now_seconds(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
return ts.tv_sec + ts.tv_nsec * 1e-9;
}
static int correctness_check(uint64_t seed, int n)
{
xs_state = seed ? seed : 0xc0de264cULL;
int mismatches = 0;
int prints = 0;
int16_t block_a[16], block_b[16], block_saved[16];
uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES], dst_initial[DST_BYTES];
for (int i = 0; i < n; i++) {
gen_block(block_a);
memcpy(block_b, block_a, sizeof(block_a));
memcpy(block_saved, block_a, sizeof(block_a));
/* Random initial dst (4×4 region at offset 0, row stride DST_STRIDE). */
for (int r = 0; r < 4; r++)
for (int c = 0; c < 4; c++)
dst_a[r * DST_STRIDE + c] = dst_b[r * DST_STRIDE + c] = (uint8_t)(xs() & 0xff);
memcpy(dst_initial, dst_a, DST_BYTES);
daedalus_h264_idct_add_ref(dst_a, block_a, DST_STRIDE);
ff_h264_idct_add_neon(dst_b, block_b, DST_STRIDE);
int diff = 0;
for (int r = 0; r < 4; r++)
for (int c = 0; c < 4; c++)
if (dst_a[r*DST_STRIDE + c] != dst_b[r*DST_STRIDE + c]) diff++;
if (diff) {
if (prints < 3) {
fprintf(stderr, "MISMATCH block %d (%d/16 pix diff):\n", i, diff);
fprintf(stderr, " input block (row-major):");
for (int r = 0; r < 4; r++) {
fprintf(stderr, "\n r%d ", r);
for (int c = 0; c < 4; c++) fprintf(stderr, "%6d ", block_saved[r*4 + c]);
}
fprintf(stderr, "\n initial dst:");
for (int r = 0; r < 4; r++) {
fprintf(stderr, "\n r%d ", r);
for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_initial[r*DST_STRIDE + c]);
}
fprintf(stderr, "\n");
fprintf(stderr, " ref:");
for (int r = 0; r < 4; r++) {
fprintf(stderr, "\n r%d ", r);
for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_a[r*DST_STRIDE+c]);
}
fprintf(stderr, "\n neon:");
for (int r = 0; r < 4; r++) {
fprintf(stderr, "\n r%d ", r);
for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_b[r*DST_STRIDE+c]);
}
fprintf(stderr, "\n");
prints++;
}
mismatches++;
}
}
printf("M1₆ correctness: %d / %d blocks bit-exact (%.4f%%)\n",
n - mismatches, n, 100.0 * (n - mismatches) / n);
return mismatches;
}
static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
{
xs_state = seed ? seed : 0xc0de264cULL;
int16_t *master_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t));
int16_t *work_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t));
uint8_t *master_dst = malloc((size_t) n_blocks * 16);
uint8_t *work_dst = malloc((size_t) n_blocks * 16);
if (!master_blocks || !work_blocks || !master_dst || !work_dst) {
fprintf(stderr, "alloc fail\n"); exit(1);
}
for (int i = 0; i < n_blocks; i++) {
gen_block(master_blocks + i * 16);
for (int j = 0; j < 16; j++) master_dst[i * 16 + j] = (uint8_t)(xs() & 0xff);
}
/* Warm-up. */
memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t));
memcpy(work_dst, master_dst, (size_t) n_blocks * 16);
for (int i = 0; i < n_blocks; i++)
ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4);
double t0 = now_seconds();
double t_end = t0 + duration_s;
uint64_t done = 0;
while (now_seconds() < t_end) {
memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t));
memcpy(work_dst, master_dst, (size_t) n_blocks * 16);
for (int i = 0; i < n_blocks; i++)
ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4);
done += n_blocks;
}
double elapsed = now_seconds() - t0;
/* Subtract setup cost. */
int iters = (int)(done / n_blocks);
double s0 = now_seconds();
for (int i = 0; i < iters; i++) {
memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t));
memcpy(work_dst, master_dst, (size_t) n_blocks * 16);
}
double s1 = now_seconds();
double kernel_seconds = elapsed - (s1 - s0);
double mbps = done / kernel_seconds / 1e6;
printf("M3₆ NEON throughput:\n");
printf(" blocks/batch: %d\n", n_blocks);
printf(" batches done: %d\n", iters);
printf(" total blocks: %llu\n", (unsigned long long) done);
printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
printf(" throughput = %.3f Mblock/s\n", mbps);
printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9);
/* H.264 1080p 4×4 floor: ~5.85 Mblock/s worst-case, ~2 realistic. */
printf(" H.264 1080p30 worst-case floor: %.2fx margin (5.85 Mblock/s req'd)\n", mbps / 5.85);
printf(" H.264 1080p30 realistic floor: %.2fx margin (2.0 Mblock/s req'd)\n", mbps / 2.0);
free(master_blocks); free(work_blocks); free(master_dst); free(work_dst);
}
int main(int argc, char **argv)
{
int n_blocks = 65536;
double duration = 5.0;
uint64_t seed = 0;
int do_correctness = 1;
static struct option opts[] = {
{"blocks", required_argument, 0, 'b'},
{"duration", required_argument, 0, 'd'},
{"seed", required_argument, 0, 's'},
{"no-correctness", no_argument, 0, 'C'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) {
switch (c) {
case 'b': n_blocks = atoi(optarg); break;
case 'd': duration = atof(optarg); break;
case 's': seed = strtoull(optarg, 0, 0); break;
case 'C': do_correctness = 0; break;
default: return 2;
}
}
if (do_correctness) {
printf("=== M1₆ bit-exact (10000 random 4x4 blocks) ===\n");
int mis = correctness_check(seed, 10000);
if (mis != 0) {
fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n");
return 1;
}
printf("\n");
}
printf("=== M3₆ NEON throughput ===\n");
throughput_neon(seed, n_blocks, duration);
return 0;
}