Files
daedalus-fourier/tests/bench_neon_h264qpel_mc20.c
marfrit 5c8b09349c Cycle 9 closed: H.264 luma qpel mc20 = 131 Mblock/s NEON, CPU-only
Last unmeasured H.264 kernel. mc20 picked as representative
(horizontal half-pel, 6-tap filter; canonical for the H.264 luma
qpel family). M1 PASS 10000/10000 first try, M3 = 131.477
Mblock/s on a single core (7.6 ns/block), 135x the 1080p30 floor.

Per the cycles 6+7 lightweight-kernel rationale, Phase 4 deferred:
QPU dispatch floor (~250 ns/block) is 33x above the NEON per-block
cost; R9 ≈ 0.03 deep RED. No realistic QPU offload value.

Generalization: all H.264 luma MC variants (mc02, mc11, mc22,
etc.) will share this verdict. No need to measure each variant
individually.

H.264 NEON is dramatically faster than VP9 NEON across the board:
- IDCT 4x4: 175 vs N/A    (no VP9 analog)
- IDCT 8x8: 151 vs 8.2 Mblock/s (18x faster)
- MC 6/8-tap: 131 vs 7.0   (19x faster)
- Deblock: 92 vs 48 Medge/s (2x faster)

H.264 deployment recipe: all CPU NEON except deblock (opportunistic
QPU). On a Pi 5 running H.264-only, the QPU is mostly idle.

Cycles 1-9 complete. Public API exposes all 9.
Next: daedalus-v4l2 sibling repo per locked Phase 8 architecture
(B + γ + sibling), then README polish.

- external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
  vendored (1467 lines, all qpel variants)
- tests/h264_qpel8_mc20_ref.c: 40-line C ref (clip255 of
  6-tap convolution)
- tests/bench_neon_h264qpel_mc20.c: M1 + M3 bench
- docs/k9_h264qpel_mc20.md: cycle 9 closure with comparison
  matrix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:53:21 +00:00

177 lines
6.1 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
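/*
 * Cycle 9 Phase 3 — NEON M3 baseline for H.264 luma qpel mc20 (8x8,
 * horizontal half-pel, 6-tap filter).
 *
 * Runs two stages: M1, a bit-exactness gate comparing the NEON kernel against
 * the C reference, followed by M3, a NEON throughput measurement.
 * License: BSD-2-Clause.
 */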
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* C reference implementation of the 8x8 mc20 "put" kernel; its output is the
 * baseline the NEON result is compared against. Defined outside this file. */
extern void daedalus_put_h264_qpel8_mc20_ref(
uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* NEON kernel under test. It is called with the same (dst, src, stride)
 * arguments as the reference. Defined outside this file. */
extern void ff_put_h264_qpel8_mc20_neon(
uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* Tile geometry shared by the correctness and throughput paths. */
#define TILE_STRIDE 16 /* bytes per tile row; passed to both kernels as the stride argument */
#define TILE_ROWS 12 /* room for src[-2..+8] + dst[0..7] in one tile. NOTE(review): only rows 0..7 are compared by the correctness check; confirm how much row margin the kernels really need */
#define TILE_BYTES (TILE_ROWS * TILE_STRIDE) /* size in bytes of one tile buffer */
#define SRC_COL 3 /* src points at col SRC_COL of tile = leftmost output col */
#define DST_COL 3 /* dst pointer offset inside its tile; same column as SRC_COL, but src and dst always live in separate tiles */
/* State of the pseudo-random generator below. Callers assign it before use;
 * a zero state stays zero forever, because every step only XORs shifted
 * copies of the state into itself. */
static uint64_t xs_state;

/*
 * Advance the generator by one xorshift step (shifts 13, 7, 17) and return
 * the new 64-bit state, which is also stored back into xs_state.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    xs_state = s;
    return s;
}
/*
 * Fill one tile (TILE_BYTES bytes starting at `tile`) with the low byte of
 * successive xs() outputs, in ascending address order.
 */
static void gen_tile(uint8_t *tile)
{
    uint8_t *cursor = tile;
    uint8_t *const stop = tile + TILE_BYTES;

    while (cursor != stop)
        *cursor++ = (uint8_t)(xs() & 0xff);
}
/*
 * Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus nanoseconds scaled by 1e-9).
 * The return value of clock_gettime() is not checked.
 */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = stamp.tv_sec;
    const double frac = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
/*
 * M1 gate: run the C reference and the NEON kernel on `n` random source tiles
 * and compare what each wrote.
 *
 * seed  PRNG seed; 0 selects the built-in default seed.
 * n     number of random tiles to test.
 *
 * Both destination tiles are zeroed before each call and then compared over
 * the WHOLE tile, not just the 8x8 output window. A byte that differs outside
 * the window means one kernel wrote where the other did not, so the block is
 * counted as a mismatch as well.
 *
 * Prints a summary line to stdout, and up to three mismatch diagnostics to
 * stderr. Returns the number of mismatching blocks (0 means the gate passed).
 */
static int correctness_check(uint64_t seed, int n)
{
    xs_state = seed ? seed : 0xc0de9264cULL;
    int mismatches = 0, prints = 0;

    /* One SRC tile (input) and two DST tiles (one for ref, one for NEON). */
    uint8_t src_tile[TILE_BYTES];
    uint8_t dst_a[TILE_BYTES], dst_b[TILE_BYTES];

    for (int i = 0; i < n; i++) {
        gen_tile(src_tile);
        memset(dst_a, 0, sizeof(dst_a));
        memset(dst_b, 0, sizeof(dst_b));

        daedalus_put_h264_qpel8_mc20_ref(dst_a + DST_COL, src_tile + SRC_COL, TILE_STRIDE);
        ff_put_h264_qpel8_mc20_neon(dst_b + DST_COL, src_tile + SRC_COL, TILE_STRIDE);

        int diff = 0;   /* differing bytes inside the 8x8 output window */
        int stray = 0;  /* differing bytes anywhere else in the tile */
        for (int idx = 0; idx < TILE_BYTES; idx++) {
            if (dst_a[idx] == dst_b[idx])
                continue;
            const int row = idx / TILE_STRIDE;
            const int col = idx % TILE_STRIDE;
            if (row < 8 && col >= DST_COL && col < DST_COL + 8)
                diff++;
            else
                stray++;
        }

        if (diff || stray) {
            if (prints < 3) {
                fprintf(stderr, "MISMATCH block %d (%d/64 pix diff):\n", i, diff);
                if (stray)
                    fprintf(stderr, "  %d byte(s) differ outside the 8x8 output window\n", stray);
                prints++;
            }
            mismatches++;
        }
    }

    /* Avoid 0/0 (NaN) in the percentage when no blocks were requested. */
    const double pct = n > 0 ? 100.0 * (n - mismatches) / n : 0.0;
    printf("M1₉ correctness: %d / %d blocks bit-exact (%.4f%%)\n",
           n - mismatches, n, pct);
    return mismatches;
}
/* Run the NEON kernel once over every tile of a batch of `blocks` tiles. */
static void neon_batch(uint8_t *dst, const uint8_t *src, size_t blocks)
{
    /* Offsets are computed in size_t so large batches cannot overflow int. */
    for (size_t i = 0; i < blocks; i++)
        ff_put_h264_qpel8_mc20_neon(dst + i * TILE_BYTES + DST_COL,
                                    src + i * TILE_BYTES + SRC_COL, TILE_STRIDE);
}

/*
 * M3: measure NEON kernel throughput.
 *
 * seed        PRNG seed for the source tiles; 0 selects the built-in default.
 * n_blocks    tiles per batch; must be > 0.
 * duration_s  wall-clock budget for the timed loop, in seconds.
 *
 * Method: after one warm-up batch, repeat "reset dst_work by memcpy, run the
 * kernel over the whole batch" until the budget expires. Then time the same
 * number of memcpy resets alone and subtract that from the elapsed time, so
 * the reported figure covers the kernel calls only.
 *
 * Prints the results to stdout. Exits with status 1 on an invalid batch size
 * or allocation failure; prints a diagnostic and returns without results if
 * no batch completed.
 */
static void throughput_neon(uint64_t seed, int n_blocks, double duration_s)
{
    /* A zero batch size would otherwise divide by zero when counting batches. */
    if (n_blocks <= 0) {
        fprintf(stderr, "throughput_neon: n_blocks must be > 0 (got %d)\n", n_blocks);
        exit(1);
    }
    const size_t blocks = (size_t) n_blocks;
    if (blocks > SIZE_MAX / TILE_BYTES) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }
    const size_t total_bytes = blocks * TILE_BYTES;

    xs_state = seed ? seed : 0xc0de9264cULL;
    uint8_t *src_master = malloc(total_bytes);
    uint8_t *dst_master = malloc(total_bytes);
    uint8_t *dst_work = malloc(total_bytes);
    if (!src_master || !dst_master || !dst_work) { fprintf(stderr, "alloc fail\n"); exit(1); }

    /* Random source tiles; destination master is all zeros. */
    for (size_t k = 0; k < total_bytes; k++) {
        src_master[k] = (uint8_t)(xs() & 0xff);
        dst_master[k] = 0;
    }

    /* Warm-up pass, not timed. */
    memcpy(dst_work, dst_master, total_bytes);
    neon_batch(dst_work, src_master, blocks);

    const double t0 = now_seconds();
    const double t_end = t0 + duration_s;
    uint64_t done = 0;     /* total kernel calls in the timed loop */
    uint64_t batches = 0;  /* 64-bit: long runs with tiny batches can exceed INT_MAX */
    while (now_seconds() < t_end) {
        memcpy(dst_work, dst_master, total_bytes);
        neon_batch(dst_work, src_master, blocks);
        done += blocks;
        batches++;
    }
    const double elapsed = now_seconds() - t0;

    /* Time the per-batch reset on its own so it can be subtracted.
     * NOTE(review): dst_work is not read again before free(), so an optimizing
     * compiler might be able to drop these copies; confirm this loop survives
     * at the optimization level the bench is built with. */
    const double s0 = now_seconds();
    for (uint64_t b = 0; b < batches; b++)
        memcpy(dst_work, dst_master, total_bytes);
    const double s1 = now_seconds();
    const double kernel_seconds = elapsed - (s1 - s0);

    /* Without a completed batch the ratios below would be inf/NaN or negative. */
    if (done == 0 || !(kernel_seconds > 0.0)) {
        fprintf(stderr, "throughput_neon: no usable measurement (duration %.6f s)\n", duration_s);
        free(src_master); free(dst_master); free(dst_work);
        return;
    }

    const double mbps = done / kernel_seconds / 1e6;
    printf("M3₉ NEON throughput:\n");
    printf(" blocks/batch: %d\n", n_blocks);
    printf(" batches done: %llu\n", (unsigned long long) batches);
    printf(" total blocks: %llu\n", (unsigned long long) done);
    printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
    printf(" throughput = %.3f Mblock/s\n", mbps);
    printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9);
    /* 1080p H.264 luma MC: ~32400 blocks/frame × 30 fps ≈ 0.972 Mblock/s
     * for 8x8 blocks. For 16x16 (typical macroblock-mode MC) it's
     * ~0.243 Mblock/s. Use the conservative 8x8 estimate. */
    printf(" H.264 1080p30 8x8 MC floor: %.2fx margin (0.972 Mblock/s req'd)\n", mbps / 0.972);

    free(src_master); free(dst_master); free(dst_work);
}
/* Parse a strictly positive decimal int; returns 0 on success, -1 on bad input. */
static int parse_positive_int(const char *text, int *out)
{
    char *end;

    errno = 0;
    const long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || v <= 0 || v > INT_MAX)
        return -1;
    *out = (int) v;
    return 0;
}

/* Parse a strictly positive, finite number of seconds; 0 on success, -1 on bad input. */
static int parse_positive_seconds(const char *text, double *out)
{
    char *end;

    errno = 0;
    const double v = strtod(text, &end);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    /* !(v > 0) also rejects NaN; v - v is non-zero only for inf/NaN. */
    if (!(v > 0.0) || v - v != 0.0)
        return -1;
    *out = v;
    return 0;
}

/* Parse a 64-bit seed (base auto-detected: decimal, 0x hex, 0 octal); 0 on success. */
static int parse_seed(const char *text, uint64_t *out)
{
    char *end;

    errno = 0;
    const unsigned long long v = strtoull(text, &end, 0);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    *out = (uint64_t) v;
    return 0;
}

/*
 * Entry point.
 *
 * Options:
 *   -b, --blocks N         tiles per throughput batch (default 65536, must be > 0)
 *   -d, --duration SEC     throughput time budget in seconds (default 5.0, must be > 0)
 *   -s, --seed S           PRNG seed; 0 selects the built-in default
 *   -C, --no-correctness   skip the M1 bit-exactness gate
 *
 * Exit status: 0 on success, 1 if the M1 gate fails, 2 on a bad option or
 * option value.
 */
int main(int argc, char **argv)
{
    int n_blocks = 65536;
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;
    static const struct option opts[] = {
        {"blocks", required_argument, 0, 'b'},
        {"duration", required_argument, 0, 'd'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {0,0,0,0}
    };

    for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) {
        switch (c) {
        case 'b':
            /* Zero or negative batch sizes would break the throughput loop. */
            if (parse_positive_int(optarg, &n_blocks) != 0) {
                fprintf(stderr, "invalid --blocks value '%s' (expected integer > 0)\n", optarg);
                return 2;
            }
            break;
        case 'd':
            if (parse_positive_seconds(optarg, &duration) != 0) {
                fprintf(stderr, "invalid --duration value '%s' (expected seconds > 0)\n", optarg);
                return 2;
            }
            break;
        case 's':
            if (parse_seed(optarg, &seed) != 0) {
                fprintf(stderr, "invalid --seed value '%s'\n", optarg);
                return 2;
            }
            break;
        case 'C':
            do_correctness = 0;
            break;
        default:
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1₉ bit-exact (10000 random 8x8 blocks) ===\n");
        int mis = correctness_check(seed, 10000);
        if (mis != 0) {
            /* Throughput of a kernel that produces wrong output is meaningless. */
            fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3₉ NEON throughput ===\n");
    throughput_neon(seed, n_blocks, duration);
    return 0;
}