/* * Cycle 4 Phase 3 — NEON M3'''' baseline for VP9 8-tap inner LPF wd=8 * (horizontal direction, 8-pixel edge). * * Same harness shape as bench_neon_lpf.c (cycle 2); the only changes * are calling ff_vp9_loop_filter_h_8_8_neon + the wd=8 C reference. * * License: LGPL-2.1+ (links FFmpeg NEON snapshot). */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include extern void daedalus_vp9_loop_filter_h_8_8_ref( uint8_t *dst, ptrdiff_t stride, int E, int I, int H); extern void ff_vp9_loop_filter_h_8_8_neon( uint8_t *dst, ptrdiff_t stride, int E, int I, int H); static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } #define EDGE_W 8 #define EDGE_H 8 #define EDGE_STRIDE 8 #define EDGE_BYTES (EDGE_H * EDGE_STRIDE) static void gen_edge_pixels(uint8_t *buf) { int side_a = (int)(xs() % 200) + 20; int side_b = (int)(xs() % 200) + 20; int noise = (int)(xs() % 30); for (int r = 0; r < EDGE_H; r++) for (int c = 0; c < EDGE_W; c++) { int base = (c < 4) ? side_a : side_b; int n = ((int)(xs() % (2 * noise + 1))) - noise; int v = base + n; buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } } static void gen_thresholds(int *E, int *I, int *H) { *E = (int)(xs() % 81); *I = (int)(xs() % 41); *H = (int)(xs() % 11); } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } static int correctness_check(uint64_t seed, int n) { xs_state = seed ? seed : 0xa57edbeef5717ULL; int mis = 0; uint8_t a[EDGE_BYTES], b[EDGE_BYTES]; for (int i = 0; i < n; i++) { gen_edge_pixels(a); memcpy(b, a, EDGE_BYTES); int E, I, H; gen_thresholds(&E, &I, &H); daedalus_vp9_loop_filter_h_8_8_ref(a + 4, EDGE_STRIDE, E, I, H); ff_vp9_loop_filter_h_8_8_neon (b + 4, EDGE_STRIDE, E, I, H); if (memcmp(a, b, EDGE_BYTES) != 0) { if (mis < 3) fprintf(stderr, "MISMATCH edge %d E=%d I=%d H=%d\n", i, E, I, H); mis++; } } printf("M1''''_c correctness: %d / %d edges bit-exact (%.4f%%)\n", n - mis, n, 100.0 * (n - mis) / n); return mis; } static void throughput(uint64_t seed, int n_edges, double duration) { xs_state = seed ? seed : 0xa57edfeed5170ULL; uint8_t *master = malloc((size_t) n_edges * EDGE_BYTES); uint8_t *work = malloc((size_t) n_edges * EDGE_BYTES); int *Es = malloc(n_edges*sizeof(int)), *Is = malloc(n_edges*sizeof(int)), *Hs = malloc(n_edges*sizeof(int)); for (int i = 0; i < n_edges; i++) { gen_edge_pixels(master + (size_t)i * EDGE_BYTES); gen_thresholds(&Es[i], &Is[i], &Hs[i]); } memcpy(work, master, (size_t) n_edges * EDGE_BYTES); for (int i = 0; i < n_edges; i++) ff_vp9_loop_filter_h_8_8_neon(work + (size_t)i * EDGE_BYTES + 4, EDGE_STRIDE, Es[i], Is[i], Hs[i]); double t0 = now_seconds(), tend = t0 + duration; uint64_t done = 0; while (now_seconds() < tend) { memcpy(work, master, (size_t) n_edges * EDGE_BYTES); for (int i = 0; i < n_edges; i++) ff_vp9_loop_filter_h_8_8_neon(work + (size_t)i * EDGE_BYTES + 4, EDGE_STRIDE, Es[i], Is[i], Hs[i]); done += n_edges; } double el = now_seconds() - t0; int it = (int)(done / n_edges); double s0 = now_seconds(); for (int i = 0; i < it; i++) memcpy(work, master, (size_t) n_edges * EDGE_BYTES); double s1 = now_seconds(); double ks = el - (s1 - s0); double mes = done / ks / 1e6; printf("M3'''' NEON throughput:\n"); printf(" edges/batch: %d\n", n_edges); printf(" total edges: %llu\n", (unsigned long long) done); printf(" elapsed (kernel)=%.6f s\n", ks); printf(" throughput = %.3f Medge/s\n", mes); printf(" per-edge = %.1f ns\n", ks / done * 1e9); printf(" equiv 1080p = %.1f FPS (~64530 edges/frame, worst case)\n", mes * 1e6 / 64530.0); free(master); free(work); free(Es); free(Is); free(Hs); } int main(int argc, char **argv) { int n_edges = 65536; double duration = 5.0; uint64_t seed = 0; int do_corr = 1; static struct option opts[] = { {"edges", required_argument, 0, 'e'}, {"duration", required_argument, 0, 'd'}, {"seed", required_argument, 0, 's'}, {"no-correctness", no_argument, 0, 'C'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "e:d:s:C", opts, 0)) != -1;) { switch (c) { case 'e': n_edges = atoi(optarg); break; case 'd': duration = atof(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'C': do_corr = 0; break; default: return 2; } } if (do_corr) { printf("=== M1''''_c bit-exact (10000 random edges) ===\n"); if (correctness_check(seed, 10000) != 0) return 1; printf("\n"); } printf("=== M3'''' NEON throughput ===\n"); throughput(seed, n_edges, duration); return 0; }