/* * Cycle 3 Phase 3 — NEON M3''' baseline for VP9 8-tap regular * horizontal MC interpolation, 8×8 block. * * Reports: * M1'''_c (correctness): C-ref ↔ NEON bit-exact rate, N random * 8×8 blocks with random source pixels and * random subpel phase mx ∈ [0, 15] * M3''' (throughput): NEON sustained Mblock/s, single-thread, * time-based * * License: LGPL-2.1+ (statically links FFmpeg NEON snapshot). */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include extern void daedalus_vp9_put_regular_8h_ref( uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my); extern void ff_vp9_put_regular8_h_neon( uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my); /* RNG ------------------------------------------------------------ */ static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } /* Block layout: each block gets its own 8×16 source buffer + 8×8 dst. * - source buffer is 16 cols wide; the filter is called with * src = block_src + 3, so it reads cols [src+0-3..src+8+4] = * [0..14] of the 16-col buffer. col 15 is unused padding. * - dst is 8 cols × 8 rows. */ #define SRC_W 16 #define SRC_H 8 #define DST_W 8 #define DST_H 8 #define SRC_BYTES (SRC_H * SRC_W) /* 128 */ #define DST_BYTES (DST_H * DST_W) /* 64 */ static void gen_src(uint8_t *buf) { for (int i = 0; i < SRC_BYTES; i++) buf[i] = (uint8_t)(xs() & 0xff); } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } /* M1'''_c correctness gate -------------------------------------- */ static int correctness_check(uint64_t seed, int n_blocks) { xs_state = seed ? seed : 0xabcdef1234567890ULL; int mismatches = 0; uint8_t src[SRC_BYTES]; uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES]; int mx_hist[16] = {0}; for (int i = 0; i < n_blocks; i++) { gen_src(src); int mx = (int)(xs() & 15); mx_hist[mx]++; memset(dst_a, 0, DST_BYTES); memset(dst_b, 0, DST_BYTES); daedalus_vp9_put_regular_8h_ref(dst_a, DST_W, src + 3, SRC_W, DST_H, mx, 0); ff_vp9_put_regular8_h_neon (dst_b, DST_W, src + 3, SRC_W, DST_H, mx, 0); if (memcmp(dst_a, dst_b, DST_BYTES) != 0) { if (mismatches < 3) { fprintf(stderr, "MISMATCH block %d mx=%d:\n", i, mx); fprintf(stderr, " ref:"); for (int r = 0; r < 8; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_a[r*8+c]); } fprintf(stderr, "\n neon:"); for (int r = 0; r < 8; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", dst_b[r*8+c]); } fprintf(stderr, "\n"); } mismatches++; } } printf("M1'''_c correctness: %d / %d blocks bit-exact (%.4f%%)\n", n_blocks - mismatches, n_blocks, 100.0 * (n_blocks - mismatches) / n_blocks); /* mx histogram — confirms all 16 phases get exercised. */ int min_mx = mx_hist[0], max_mx = mx_hist[0]; for (int i = 1; i < 16; i++) { if (mx_hist[i] < min_mx) min_mx = mx_hist[i]; if (mx_hist[i] > max_mx) max_mx = mx_hist[i]; } printf(" mx phase coverage: min=%d max=%d (16 phases sampled)\n", min_mx, max_mx); return mismatches; } /* M3''' throughput ---------------------------------------------- */ static void throughput_neon(uint64_t seed, int n_blocks, double duration_s) { xs_state = seed ? seed : 0xdeadbeef12345678ULL; uint8_t *master_src = malloc((size_t) n_blocks * SRC_BYTES); uint8_t *work_src = malloc((size_t) n_blocks * SRC_BYTES); uint8_t *dsts = malloc((size_t) n_blocks * DST_BYTES); int *mxs = malloc(n_blocks * sizeof(int)); if (!master_src || !work_src || !dsts || !mxs) { fprintf(stderr, "alloc fail\n"); exit(1); } for (int i = 0; i < n_blocks; i++) { gen_src(master_src + (size_t)i * SRC_BYTES); mxs[i] = (int)(xs() & 15); } /* Warm. */ memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); for (int i = 0; i < n_blocks; i++) ff_vp9_put_regular8_h_neon(dsts + (size_t)i * DST_BYTES, DST_W, work_src + (size_t)i * SRC_BYTES + 3, SRC_W, DST_H, mxs[i], 0); double t0 = now_seconds(); double t_end = t0 + duration_s; uint64_t done = 0; while (now_seconds() < t_end) { memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); for (int i = 0; i < n_blocks; i++) ff_vp9_put_regular8_h_neon(dsts + (size_t)i * DST_BYTES, DST_W, work_src + (size_t)i * SRC_BYTES + 3, SRC_W, DST_H, mxs[i], 0); done += n_blocks; } double elapsed = now_seconds() - t0; /* setup-only subtraction */ int setup_iters = (int) (done / n_blocks); double s0 = now_seconds(); for (int it = 0; it < setup_iters; it++) memcpy(work_src, master_src, (size_t) n_blocks * SRC_BYTES); double s1 = now_seconds(); double kernel_seconds = elapsed - (s1 - s0); double mbps = done / kernel_seconds / 1e6; printf("M3''' NEON throughput:\n"); printf(" blocks/batch: %d\n", n_blocks); printf(" batches done: %d\n", setup_iters); printf(" total blocks: %llu\n", (unsigned long long) done); printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); printf(" elapsed (setup) =%.6f s\n", s1 - s0); printf(" throughput = %.3f Mblock/s\n", mbps); printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9); /* 1080p: 32400 blocks/frame */ printf(" equiv 1080p = %.1f FPS (32400 blocks/frame)\n", mbps * 1e6 / 32400.0); free(master_src); free(work_src); free(dsts); free(mxs); } int main(int argc, char **argv) { int n_blocks = 65536; double duration = 5.0; uint64_t seed = 0; int do_correctness = 1; static struct option opts[] = { {"blocks", required_argument, 0, 'b'}, {"duration", required_argument, 0, 'd'}, {"seed", required_argument, 0, 's'}, {"no-correctness", no_argument, 0, 'C'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) { switch (c) { case 'b': n_blocks = atoi(optarg); break; case 'd': duration = atof(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'C': do_correctness = 0; break; default: return 2; } } if (do_correctness) { printf("=== M1'''_c bit-exact (10000 random blocks) ===\n"); if (correctness_check(seed, 10000) != 0) { fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); return 1; } printf("\n"); } printf("=== M3''' NEON throughput ===\n"); throughput_neon(seed, n_blocks, duration); return 0; }