/*
 * Cycle-2 Phase 3 — NEON baseline microbench for VP9 4-tap loop filter
 * (horizontal, 8-pixel edge).
 *
 * Reports:
 *   M1''_c (correctness): C-ref ↔ NEON bit-exact rate across N random edges
 *   M3''   (throughput):  NEON sustained Medge/s, single-thread, time-based
 *
 * License: LGPL-2.1+ (statically links FFmpeg n7.1.3 NEON snapshot).
 */
#define _POSIX_C_SOURCE 200809L

/*
 * NOTE(review): the header names were missing from the reviewed copy of this
 * file; the list below is reconstructed from the identifiers the code uses
 * (errno.h, float.h and limits.h are needed by the option parsers).
 */
#include <errno.h>
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Kernels under test; both are defined in other translation units. */
extern void daedalus_vp9_loop_filter_h_4_8_ref(
    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
extern void ff_vp9_loop_filter_h_4_8_neon(
    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);

/* --- RNG (matches bench_neon_idct.c shape) ----------------------- */
static uint64_t xs_state;

/*
 * Advance the global xorshift state (shifts 13 / 7 / 17) and return it.
 * A zero state stays zero forever, which is why every caller that seeds
 * xs_state substitutes a non-zero default when the user seed is 0.
 */
static inline uint64_t xs(void)
{
    uint64_t x = xs_state;

    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    xs_state = x;
    return x;
}

/* Per-edge memory layout: 8 rows × 8 cols (the 4 cols on each side of
 * the edge). The "center" is column 4. Edge stride between rows = 8.
 * Per edge: 64 bytes of pixel data. */
#define EDGE_W      8
#define EDGE_H      8
#define EDGE_STRIDE 8
#define EDGE_BYTES  (EDGE_H * EDGE_STRIDE)

enum {
    EDGE_CENTER_COL         = 4,     /* column handed to the kernels as dst */
    MAX_REPORTED_MISMATCHES = 3,     /* full dumps printed before going quiet */
    CORRECTNESS_EDGES       = 10000  /* edges checked by the M1''_c gate */
};

/*
 * Fill one EDGE_BYTES buffer with synthetic "edge-like" pixels.
 *
 * Columns 0..3 share one base level and columns 4..7 another (each base
 * is 20..219); every pixel gets independent uniform noise in
 * [-noise_scale, +noise_scale] with noise_scale in 0..29, then is clamped
 * to 0..255. A different mean on each side of the edge is intended to make
 * the filter mask / high-edge-variance paths trigger more often than purely
 * uniform noise would.
 */
static void gen_edge_pixels(uint8_t *buf)
{
    const int side_a_base = (int)(xs() % 200) + 20;
    const int side_b_base = (int)(xs() % 200) + 20;
    const int noise_scale = (int)(xs() % 30);
    const uint64_t noise_span = (uint64_t)(2 * noise_scale + 1);

    for (int r = 0; r < EDGE_H; r++) {
        for (int c = 0; c < EDGE_W; c++) {
            const int base = (c < EDGE_CENTER_COL) ? side_a_base : side_b_base;
            const int noise = (int)(xs() % noise_span) - noise_scale;
            int v = base + noise;

            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            buf[r * EDGE_STRIDE + c] = (uint8_t)v;
        }
    }
}

/*
 * Draw one random threshold triple for the filter.
 * Typical VP9 ranges for the inner filter at low/mid qp.
 */
static void gen_thresholds(int *E, int *I, int *H)
{
    *E = (int)(xs() % 81);   /* mb_lim: 0..80 */
    *I = (int)(xs() % 41);   /* lim:    0..40 */
    *H = (int)(xs() % 11);   /* hev:    0..10 */
}

/*
 * Monotonic wall-clock time in seconds. Uses CLOCK_MONOTONIC_RAW where the
 * platform defines it and CLOCK_MONOTONIC otherwise. A failing clock makes
 * every measurement meaningless, so that case aborts the run.
 */
static double now_seconds(void)
{
#ifdef CLOCK_MONOTONIC_RAW
    const clockid_t clock_id = CLOCK_MONOTONIC_RAW;
#else
    const clockid_t clock_id = CLOCK_MONOTONIC;
#endif
    struct timespec ts;

    if (clock_gettime(clock_id, &ts) != 0) {
        perror("clock_gettime");
        exit(1);
    }
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
}

/* --- Correctness gate -------------------------------------------- */

/* Print the EDGE_H × EDGE_W pixel grid of one edge buffer, one row per line. */
static void dump_edge_rows(FILE *out, const uint8_t *buf)
{
    for (int r = 0; r < EDGE_H; r++) {
        fprintf(out, "\n r%d ", r);
        for (int c = 0; c < EDGE_W; c++)
            fprintf(out, "%3u ", (unsigned)buf[r * EDGE_STRIDE + c]);
    }
}

/*
 * Run the C reference and the NEON kernel on identical copies of n_edges
 * random edges and compare the resulting buffers byte for byte.
 *
 * seed     RNG seed; 0 selects a fixed built-in default.
 * n_edges  number of random edges to test.
 *
 * The first MAX_REPORTED_MISMATCHES differing edges are dumped to stderr.
 * Prints the M1''_c summary line to stdout and returns the mismatch count
 * (0 means the two implementations agreed on every edge).
 */
static int correctness_check(uint64_t seed, int n_edges)
{
    uint8_t buf_ref[EDGE_BYTES];
    uint8_t buf_neon[EDGE_BYTES];
    int mismatches = 0;

    xs_state = seed ? seed : 0xa57edbeef5717ULL;

    for (int i = 0; i < n_edges; i++) {
        int E, I, H;

        gen_edge_pixels(buf_ref);
        memcpy(buf_neon, buf_ref, EDGE_BYTES);
        gen_thresholds(&E, &I, &H);

        /* Call both implementations on independent copies. */
        daedalus_vp9_loop_filter_h_4_8_ref(buf_ref + EDGE_CENTER_COL,
                                           EDGE_STRIDE, E, I, H);
        ff_vp9_loop_filter_h_4_8_neon(buf_neon + EDGE_CENTER_COL,
                                      EDGE_STRIDE, E, I, H);

        if (memcmp(buf_ref, buf_neon, EDGE_BYTES) == 0)
            continue;

        if (mismatches < MAX_REPORTED_MISMATCHES) {
            fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d):\n", i, E, I, H);
            fprintf(stderr, " ref:");
            dump_edge_rows(stderr, buf_ref);
            fprintf(stderr, "\n neon:");
            dump_edge_rows(stderr, buf_neon);
            fprintf(stderr, "\n");
        }
        mismatches++;
    }

    /* Avoid printing NaN when asked to check zero edges. */
    const double exact_pct =
        n_edges > 0 ? 100.0 * (n_edges - mismatches) / n_edges : 0.0;

    printf("M1''_c correctness: %d / %d edges bit-exact (%.4f%%)\n",
           n_edges - mismatches, n_edges, exact_pct);
    return mismatches;
}

/* --- M3'' NEON throughput ---------------------------------------- */

/* Apply the NEON kernel once to each of `count` consecutive edge buffers. */
static void run_neon_batch(uint8_t *edges, const int *Es, const int *Is,
                           const int *Hs, size_t count)
{
    for (size_t i = 0; i < count; i++)
        ff_vp9_loop_filter_h_4_8_neon(edges + i * EDGE_BYTES + EDGE_CENTER_COL,
                                      EDGE_STRIDE, Es[i], Is[i], Hs[i]);
}

/*
 * Measure sustained single-thread NEON throughput and print the M3'' report.
 *
 * seed        RNG seed; 0 selects a fixed built-in default.
 * n_edges     edges per batch; must be positive.
 * duration_s  minimum wall-clock time to keep running timed batches.
 *
 * One master batch is generated up front. Each timed pass restores the work
 * buffer from it (the filter modifies pixels in place) and then filters
 * every edge. The cost of those restores is estimated afterwards by timing
 * the same number of bare memcpy calls and is subtracted from the total.
 *
 * Exits the process on an invalid edge count or allocation failure; if no
 * usable measurement was obtained, says so on stderr and returns without
 * printing throughput figures.
 */
static void throughput_neon(uint64_t seed, int n_edges, double duration_s)
{
    if (n_edges <= 0 || (size_t)n_edges > SIZE_MAX / EDGE_BYTES) {
        fprintf(stderr, "invalid edge count: %d\n", n_edges);
        exit(2);
    }

    const size_t count = (size_t)n_edges;
    const size_t batch_bytes = count * EDGE_BYTES;

    xs_state = seed ? seed : 0xa57edfeed5170ULL;

    /* Pre-generate one master batch; reuse across iterations.
     * Each edge has its own private 64-byte buffer. */
    uint8_t *master = malloc(batch_bytes);
    uint8_t *work = malloc(batch_bytes);
    int *Es = malloc(count * sizeof *Es);
    int *Is = malloc(count * sizeof *Is);
    int *Hs = malloc(count * sizeof *Hs);

    if (!master || !work || !Es || !Is || !Hs) {
        fprintf(stderr, "alloc fail\n");
        exit(1);
    }

    for (size_t i = 0; i < count; i++) {
        gen_edge_pixels(master + i * EDGE_BYTES);
        gen_thresholds(&Es[i], &Is[i], &Hs[i]);
    }

    /* Warm-up. */
    memcpy(work, master, batch_bytes);
    run_neon_batch(work, Es, Is, Hs, count);

    /* Timed: keep running passes until duration elapses, count batches. */
    uint64_t batches = 0;
    const double t0 = now_seconds();
    const double t_end = t0 + duration_s;

    while (now_seconds() < t_end) {
        memcpy(work, master, batch_bytes);
        run_neon_batch(work, Es, Is, Hs, count);
        batches++;
    }
    const double elapsed = now_seconds() - t0;
    const uint64_t edges_done = batches * (uint64_t)count;

    if (batches == 0) {
        /* Nothing ran, so every derived figure would be 0/0 or x/0. */
        fprintf(stderr, "no timed batches completed in %.6f s; "
                        "increase --duration\n", duration_s);
        goto cleanup;
    }

    /* Setup-only timing for memcpy subtraction estimate.
     * NOTE(review): nothing reads `work` after these copies, so an
     * optimizing compiler may be able to drop them and make the setup time
     * read as ~0; confirm against the generated code for the build flags
     * actually used. */
    const double s0 = now_seconds();
    for (uint64_t it = 0; it < batches; it++)
        memcpy(work, master, batch_bytes);
    const double setup_seconds = now_seconds() - s0;

    const double kernel_seconds = elapsed - setup_seconds;
    if (!(kernel_seconds > 0.0)) {
        /* Timer noise exceeded the measured work; don't report garbage. */
        fprintf(stderr, "kernel time not measurable (elapsed %.6f s, "
                        "setup %.6f s); increase --duration\n",
                elapsed, setup_seconds);
        goto cleanup;
    }

    const double medges_s = (double)edges_done / kernel_seconds / 1e6;

    printf("M3'' NEON throughput:\n");
    printf(" edges/batch: %d\n", n_edges);
    printf(" batches done: %llu\n", (unsigned long long)batches);
    printf(" total edges: %llu\n", (unsigned long long)edges_done);
    printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds);
    printf(" elapsed (setup) =%.6f s\n", setup_seconds);
    printf(" throughput = %.3f Medge/s\n", medges_s);
    printf(" per-edge = %.1f ns\n",
           kernel_seconds / (double)edges_done * 1e9);
    /* Per-frame at 1080p VP9 worst-case ~64k edges: */
    printf(" equiv 1080p = %.1f FPS (~64530 edges/frame, worst case)\n",
           medges_s * 1e6 / 64530.0);

cleanup:
    free(master);
    free(work);
    free(Es);
    free(Is);
    free(Hs);
}

/* --- CLI --------------------------------------------------------- */

/* Parse a strictly positive decimal int; returns 0 on success, -1 if not. */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;

    errno = 0;
    const long v = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || v <= 0 || v > INT_MAX)
        return -1;
    *out = (int)v;
    return 0;
}

/* Parse a finite, strictly positive double; returns 0 on success, -1 if not. */
static int parse_positive_double(const char *text, double *out)
{
    char *end = NULL;

    errno = 0;
    const double v = strtod(text, &end);
    /* `v > 0.0` rejects NaN, `v <= DBL_MAX` rejects infinity. */
    if (end == text || *end != '\0' || errno == ERANGE ||
        !(v > 0.0 && v <= DBL_MAX))
        return -1;
    *out = v;
    return 0;
}

/* Parse an unsigned 64-bit seed (base auto-detected: 0x.., 0.., decimal). */
static int parse_seed(const char *text, uint64_t *out)
{
    char *end = NULL;

    errno = 0;
    const unsigned long long v = strtoull(text, &end, 0);
    if (end == text || *end != '\0' || errno == ERANGE)
        return -1;
    *out = (uint64_t)v;
    return 0;
}

/*
 * Entry point.
 *
 * Options:
 *   -e, --edges N        edges per throughput batch (default 65536)
 *   -d, --duration SEC   timed-run length in seconds (default 5.0)
 *   -s, --seed N         RNG seed; 0 selects the built-in defaults
 *   -C, --no-correctness skip the bit-exactness gate
 *
 * Exit status: 0 on success, 1 if the NEON kernel disagrees with the C
 * reference, 2 on a bad command line.
 */
int main(int argc, char **argv)
{
    int n_edges = 65536;    /* 64k edges × 64 B = 4 MiB per batch buffer */
    double duration = 5.0;
    uint64_t seed = 0;
    int do_correctness = 1;

    static const struct option opts[] = {
        {"edges",          required_argument, 0, 'e'},
        {"duration",       required_argument, 0, 'd'},
        {"seed",           required_argument, 0, 's'},
        {"no-correctness", no_argument,       0, 'C'},
        {0, 0, 0, 0}
    };

    int opt;
    while ((opt = getopt_long(argc, argv, "e:d:s:C", opts, 0)) != -1) {
        switch (opt) {
        case 'e':
            if (parse_positive_int(optarg, &n_edges) != 0) {
                fprintf(stderr, "invalid --edges value: %s\n", optarg);
                return 2;
            }
            break;
        case 'd':
            if (parse_positive_double(optarg, &duration) != 0) {
                fprintf(stderr, "invalid --duration value: %s\n", optarg);
                return 2;
            }
            break;
        case 's':
            if (parse_seed(optarg, &seed) != 0) {
                fprintf(stderr, "invalid --seed value: %s\n", optarg);
                return 2;
            }
            break;
        case 'C':
            do_correctness = 0;
            break;
        default:
            return 2;
        }
    }

    if (do_correctness) {
        printf("=== M1''_c: bit-exact correctness (%d random edges) ===\n",
               CORRECTNESS_EDGES);
        if (correctness_check(seed, CORRECTNESS_EDGES) != 0) {
            fprintf(stderr,
                    "REFUSING to measure throughput on a broken kernel.\n");
            return 1;
        }
        printf("\n");
    }

    printf("=== M3'': NEON throughput ===\n");
    throughput_neon(seed, n_edges, duration);
    return 0;
}