Files
daedalus-fourier/tests/bench_neon_h264deblock.c
marfrit 436a5c4f74 Cycle 8 Phase 3 closed: H.264 deblock NEON = 92 Medge/s
M1: 10000/10000 bit-exact (after orientation fix:
ff_h264_v_loop_filter is "vertical filtering of horizontal edges",
not "vertical edge"; 16 columns process the edge horizontally with
8 rows of vertical context).

M3: 91.947 Medge/s per core. Per-edge 10.9 ns. 11x worst-case
1080p30 floor, 30x realistic floor. Filter triggers on 25 % of
edges (random alpha/beta/tc0 covers both gating paths).

Cycle 8 Phase 9 lesson: H.264/FFmpeg "v_loop_filter" naming uses
filter DIRECTION (vertical) not edge orientation. Edge is
horizontal; filter operates vertically across it. Distinct from
cycle 6's column-major-block lesson but related discovery
pattern. Encoded for future cycles.

R8 prediction revised: 0.09-0.14 ORANGE (down from Phase 1's
0.3-0.8 estimate). H.264 deblock is 2x faster on NEON than VP9
LPF wd=4 (cycle 2) but H.264 deblock has more per-edge branches
that hurt QPU more. Worth building anyway:
- ORANGE in cycle 1's "M4 may rescue" band
- Mixed-kernel deployment helper value (Issue 003) matters more
  than isolation R
- 25%-trigger rate gives 4x effective contribution multiplier
  on QPU side

- tests/h264_deblock_ref.c (column-walking C ref per row segment)
- tests/bench_neon_h264deblock.c (M1 + M3 bench)
- CMakeLists.txt: cycle 8 NEON bench wiring + h264dsp_neon.S
- docs/k8_h264deblock_phase3.md (closure)

Next: Phase 4 plan QPU shader, Phase 5 Sonnet review.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:39:36 +00:00

255 lines
9.3 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 8 Phase 3 — NEON M3 baseline for H.264 luma vertical
* deblock (non-intra, bS<4).
*
* M1 against the standalone C reference, M3 throughput.
*
* License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot.
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Standalone C reference for the luma deblock kernel. It is defined
 * outside this file (presumably tests/h264_deblock_ref.c — TODO confirm).
 * This file calls it with pix = tile + EDGE_ROW * TILE_STRIDE, stride =
 * TILE_STRIDE, and a four-entry tc0 array; the tile is modified in place. */
extern void daedalus_h264_v_loop_filter_luma_ref(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* NEON kernel under test, also defined outside this file (presumably the
 * FFmpeg h264dsp NEON assembly — TODO confirm). It is invoked with exactly
 * the same arguments as the reference above so the two outputs can be
 * compared byte for byte. */
extern void ff_h264_v_loop_filter_luma_neon(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
/* Edge layout: 8 rows x 16 cols (rows -4..+3 around the edge). The
 * edge lies between rows -1 and 0 (= a HORIZONTAL edge filtered
 * VERTICALLY per H.264 v_loop_filter convention).
 *
 * Tile: 16 rows x 16 cols. The edge sits between tile rows 3 and 4:
 * rows 0..3 are above it, rows 4..7 are below it, and rows 8..15 are
 * halo. pix points to tile + EDGE_ROW*stride, i.e. the first row
 * below the edge. */
#define TILE_STRIDE 16
#define TILE_ROWS 16
#define TILE_BYTES (TILE_ROWS * TILE_STRIDE)
#define EDGE_ROW 4
/* State word of the xorshift64 generator below. The benchmark entry
 * points assign it a non-zero seed before drawing any numbers. */
static uint64_t xs_state;

/* Advance the generator by one step (shift triple 13 / 7 / 17), store
 * the new state in xs_state and return it. A zero state maps to zero. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    xs_state = s;
    return s;
}
/* Fill one TILE_ROWS x TILE_STRIDE tile with synthetic pixels.
 *
 * Rows EDGE_ROW-4 .. EDGE_ROW+3 model a horizontal edge: the first four
 * of them are scattered around one random base level (20..219) and the
 * last four around another, using a per-tile noise amplitude of 1..30.
 * Every other row is halo and receives uniformly random bytes.
 *
 * Three xs() draws are consumed per tile, then exactly one per pixel in
 * row-major order, so the output depends only on xs_state at entry. */
static void gen_tile(uint8_t *tile)
{
    const int level_above = (int)(xs() % 200) + 20;
    const int level_below = (int)(xs() % 200) + 20;
    const int amplitude = (int)(xs() % 30) + 1;
    const int edge_first = EDGE_ROW - 4;
    const int edge_end = EDGE_ROW + 4;

    for (int row = 0; row < TILE_ROWS; row++) {
        uint8_t *out = tile + row * TILE_STRIDE;
        const int in_edge = (row >= edge_first) && (row < edge_end);
        /* Only meaningful when in_edge is true. */
        const int level = (row - edge_first < 4) ? level_above : level_below;

        for (int col = 0; col < TILE_STRIDE; col++) {
            const uint64_t draw = xs();
            int px;

            if (in_edge) {
                /* Symmetric noise in [-amplitude, +amplitude]. */
                px = level + ((int)(draw % (2 * amplitude + 1)) - amplitude);
            } else {
                px = (int)(draw & 0xff);
            }

            /* Saturate to the 8-bit pixel range. */
            if (px < 0) {
                px = 0;
            } else if (px > 255) {
                px = 255;
            }
            out[col] = (uint8_t)px;
        }
    }
}
/* Draw one random set of deblock thresholds from the xs() stream.
 *
 *   *alpha  uniform in 1..64 (wider than typical values, to stress the
 *           alpha/beta gating)
 *   *beta   uniform in 1..16
 *   tc0[s]  uniform in -1..6 for each of the four segments; -1 is the
 *           "no filter for this segment" marker.
 *
 * Consumes six xs() draws: alpha, beta, then tc0[0..3] in order. */
static void gen_thresholds(int *alpha, int *beta, int8_t tc0[4])
{
    *alpha = 1 + (int)(xs() % 64);
    *beta = 1 + (int)(xs() % 16);

    for (int seg = 0; seg < 4; seg++) {
        /* A draw of 0 becomes -1 (segment disabled); 1..7 become 0..6. */
        const int draw = (int)(xs() % 8);

        tc0[seg] = (int8_t)(draw - 1);
    }
}
/* Read CLOCK_MONOTONIC_RAW and return the reading as seconds in a double
 * (whole seconds plus the nanosecond part scaled by 1e-9). */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
    return now.tv_sec + now.tv_nsec * 1e-9;
}
/* Print rows [row_begin, row_end) of a tile to stderr, one text line per
 * row, each prefixed with its row index. */
static void dump_tile_rows(const uint8_t *tile, int row_begin, int row_end)
{
    for (int r = row_begin; r < row_end; r++) {
        fprintf(stderr, "\n r%2d ", r);
        for (int c = 0; c < TILE_STRIDE; c++) {
            fprintf(stderr, "%3u ", tile[r * TILE_STRIDE + c]);
        }
    }
}

/* M1 gate: run the C reference and the NEON kernel on n random edges and
 * compare their outputs.
 *
 * seed  PRNG seed; 0 selects the built-in default.
 * n     number of random edges to test.
 *
 * An edge counts as a mismatch when either
 *   - any pixel in the four rows around the edge (EDGE_ROW-2 .. EDGE_ROW+1)
 *     differs between reference and NEON output, or
 *   - the NEON kernel changed any pixel outside those four rows, which a
 *     window-only comparison would not notice.
 * The first three mismatches are dumped to stderr. A summary goes to
 * stdout. Returns the number of mismatching edges (0 = pass). */
static int correctness_check(uint64_t seed, int n)
{
    const int win_begin = EDGE_ROW - 2;
    const int win_end = EDGE_ROW + 2;
    int mismatches = 0, prints = 0;
    int filtered_count = 0;
    uint8_t tile_a[TILE_BYTES], tile_b[TILE_BYTES], tile_saved[TILE_BYTES];

    xs_state = seed ? seed : 0xdeb1ec500dULL;

    for (int i = 0; i < n; i++) {
        int alpha, beta;
        int8_t tc0[4];

        /* Same input for both kernels, plus a pristine copy for reporting
         * and for detecting writes outside the filter window. */
        gen_tile(tile_a);
        memcpy(tile_b, tile_a, TILE_BYTES);
        memcpy(tile_saved, tile_a, TILE_BYTES);
        gen_thresholds(&alpha, &beta, tc0);

        daedalus_h264_v_loop_filter_luma_ref(tile_a + EDGE_ROW * TILE_STRIDE,
                                             TILE_STRIDE, alpha, beta, tc0);
        ff_h264_v_loop_filter_luma_neon(tile_b + EDGE_ROW * TILE_STRIDE,
                                        TILE_STRIDE, alpha, beta, tc0);

        int diff = 0;  /* ref vs NEON inside the modifiable window */
        int stray = 0; /* NEON vs input outside the window */
        for (int r = 0; r < TILE_ROWS; r++) {
            const int in_window = (r >= win_begin) && (r < win_end);

            for (int c = 0; c < TILE_STRIDE; c++) {
                const int idx = r * TILE_STRIDE + c;

                if (in_window) {
                    if (tile_a[idx] != tile_b[idx]) {
                        diff++;
                    }
                } else if (tile_b[idx] != tile_saved[idx]) {
                    stray++;
                }
            }
        }

        /* Count whether the reference filter changed anything at all. */
        if (memcmp(tile_a, tile_saved, TILE_BYTES) != 0) {
            filtered_count++;
        }

        if (diff || stray) {
            if (prints < 3) {
                fprintf(stderr, "MISMATCH edge %d (%d/64 modifiable pixels differ), alpha=%d beta=%d, tc0=[%d,%d,%d,%d]:\n",
                        i, diff, alpha, beta, tc0[0], tc0[1], tc0[2], tc0[3]);
                if (stray) {
                    fprintf(stderr, " NEON altered %d pixels outside edge rows %d..%d\n",
                            stray, win_begin, win_end - 1);
                }
                fprintf(stderr, " input tile (cols 0..15):");
                dump_tile_rows(tile_saved, 0, TILE_ROWS);
                fprintf(stderr, "\n ref out (edge rows 2..5, all cols):");
                dump_tile_rows(tile_a, win_begin, win_end);
                fprintf(stderr, "\n neon out (edge rows 2..5, all cols):");
                dump_tile_rows(tile_b, win_begin, win_end);
                fprintf(stderr, "\n");
                prints++;
            }
            mismatches++;
        }
    }

    printf("M1₈ correctness: %d / %d edges bit-exact (%.4f%%)\n",
           n - mismatches, n, 100.0 * (n - mismatches) / n);
    printf(" filter triggered on %d/%d edges (%.2f%%)\n",
           filtered_count, n, 100.0 * filtered_count / n);
    return mismatches;
}
/* M3 measurement: time the NEON kernel over batches of random edges.
 *
 * seed        PRNG seed; 0 selects the built-in default.
 * n_edges     edges per batch; must be positive.
 * duration_s  wall-clock time to keep running batches, in seconds.
 *
 * Each timed batch first restores the working tiles from a master copy
 * and then filters every edge. The cost of the restoring memcpy is
 * measured separately afterwards (same number of batches) and subtracted,
 * so the reported figures approximate kernel time only. Results are
 * printed to stdout. Invalid sizes and allocation failure terminate the
 * process with exit status 1. */
static void throughput_neon(uint64_t seed, int n_edges, double duration_s)
{
    /* A non-positive batch size would otherwise divide by zero below. */
    if (n_edges <= 0) {
        fprintf(stderr, "edges per batch must be positive (got %d)\n", n_edges);
        exit(1);
    }
    const size_t count = (size_t) n_edges;
    /* Reject sizes whose byte count does not fit in size_t. */
    if (count > SIZE_MAX / TILE_BYTES) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }
    const size_t pool_bytes = count * TILE_BYTES;

    xs_state = seed ? seed : 0xdeb1ec500dULL;
    uint8_t *master = malloc(pool_bytes);
    uint8_t *work = malloc(pool_bytes);
    int *alphas = malloc(count * sizeof *alphas);
    int *betas = malloc(count * sizeof *betas);
    int8_t (*tc0s)[4] = malloc(count * sizeof *tc0s);
    if (!master || !work || !alphas || !betas || !tc0s) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }

    /* size_t indices keep i * TILE_BYTES from overflowing int on large
     * batches. */
    for (size_t i = 0; i < count; i++) {
        gen_tile(master + i * TILE_BYTES);
        gen_thresholds(&alphas[i], &betas[i], tc0s[i]);
    }

    /* Untimed warm-up pass. */
    memcpy(work, master, pool_bytes);
    for (size_t i = 0; i < count; i++) {
        ff_h264_v_loop_filter_luma_neon(work + i * TILE_BYTES + EDGE_ROW * TILE_STRIDE,
                                        TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
    }

    double t0 = now_seconds();
    double t_end = t0 + duration_s;
    uint64_t done = 0;
    while (now_seconds() < t_end) {
        memcpy(work, master, pool_bytes);
        for (size_t i = 0; i < count; i++) {
            ff_h264_v_loop_filter_luma_neon(work + i * TILE_BYTES + EDGE_ROW * TILE_STRIDE,
                                            TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
        }
        done += count;
    }
    double elapsed = now_seconds() - t0;

    if (done == 0) {
        /* Nothing was timed, so every derived figure would be meaningless. */
        fprintf(stderr, "no batch completed within %.6f s; nothing to report\n", duration_s);
        free(master); free(work); free(alphas); free(betas); free(tc0s);
        return;
    }

    /* Time the restore-only part for the same number of batches so it can
     * be subtracted from the total. */
    int iters = (int)(done / count);
    double s0 = now_seconds();
    for (int i = 0; i < iters; i++) {
        memcpy(work, master, pool_bytes);
    }
    double s1 = now_seconds();

    double kernel_seconds = elapsed - (s1 - s0);
    double medges = done / kernel_seconds / 1e6;
    printf("M3₈ NEON throughput:\n");
    printf(" edges/batch: %d\n", n_edges);
    printf(" batches done: %d\n", iters);
    printf(" total edges: %llu\n", (unsigned long long) done);
    printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
    printf(" throughput = %.3f Medge/s\n", medges);
    printf(" per-edge = %.1f ns\n", kernel_seconds / done * 1e9);
    /* 1080p H.264 worst-case: ~8 Medge/s (luma v+h). Realistic: 2-4. */
    printf(" H.264 1080p30 worst-case floor: %.2fx margin (8.0 Medge/s req'd)\n", medges / 8.0);
    printf(" H.264 1080p30 realistic floor: %.2fx margin (3.0 Medge/s req'd)\n", medges / 3.0);
    free(master); free(work); free(alphas); free(betas); free(tc0s);
}
/* Entry point.
 *
 * Options:
 *   -e, --edges N          edges per throughput batch (positive integer,
 *                          default 65536)
 *   -d, --duration S       throughput run time in seconds (> 0, default 5.0)
 *   -s, --seed X           PRNG seed, any base accepted by strtoull; 0 keeps
 *                          the built-in default
 *   -C, --no-correctness   skip the M1 bit-exact gate
 *
 * Returns 0 on success, 1 when the M1 gate fails (throughput is then not
 * measured), 2 on an unknown option or a malformed option value. */
int main(int argc, char **argv)
{
    int n_edges = 65536;
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;
    static const struct option opts[] = {
        {"edges", required_argument, 0, 'e'},
        {"duration", required_argument, 0, 'd'},
        {"seed", required_argument, 0, 's'},
        {"no-correctness", no_argument, 0, 'C'},
        {0,0,0,0}
    };

    for (int c; (c = getopt_long(argc, argv, "e:d:s:C", opts, 0)) != -1;) {
        char *end = NULL;

        switch (c) {
        case 'e': {
            /* strtol instead of atoi: reject garbage, overflow and
             * non-positive counts up front. */
            errno = 0;
            long v = strtol(optarg, &end, 10);
            if (end == optarg || *end != '\0' || errno == ERANGE ||
                v < 1 || v > INT_MAX) {
                fprintf(stderr, "invalid --edges value '%s' (need a positive integer)\n", optarg);
                return 2;
            }
            n_edges = (int)v;
            break;
        }
        case 'd': {
            errno = 0;
            double v = strtod(optarg, &end);
            /* !(v > 0.0) also rejects NaN. */
            if (end == optarg || *end != '\0' || errno == ERANGE || !(v > 0.0)) {
                fprintf(stderr, "invalid --duration value '%s' (need seconds > 0)\n", optarg);
                return 2;
            }
            duration = v;
            break;
        }
        case 's': {
            errno = 0;
            unsigned long long v = strtoull(optarg, &end, 0);
            if (end == optarg || *end != '\0' || errno == ERANGE) {
                fprintf(stderr, "invalid --seed value '%s'\n", optarg);
                return 2;
            }
            seed = v;
            break;
        }
        case 'C':
            do_correctness = 0;
            break;
        default:
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1₈ bit-exact (10000 random edges) ===\n");
        int mis = correctness_check(seed, 10000);
        if (mis != 0) {
            fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3₈ NEON throughput ===\n");
    throughput_neon(seed, n_edges, duration);
    return 0;
}