/* * Cycle 9 Phase 3 — NEON M3 baseline for H.264 luma qpel mc20 (8x8, * horizontal half-pel, 6-tap filter). * * M1 vs C ref + M3 throughput. License: BSD-2-Clause. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include extern void daedalus_put_h264_qpel8_mc20_ref( uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_put_h264_qpel8_mc20_neon( uint8_t *dst, const uint8_t *src, ptrdiff_t stride); #define TILE_STRIDE 16 #define TILE_ROWS 12 /* room for src[-2..+8] + dst[0..7] in one tile */ #define TILE_BYTES (TILE_ROWS * TILE_STRIDE) #define SRC_COL 3 /* src points at col SRC_COL of tile = leftmost output col */ #define DST_COL 3 /* dst also at col SRC_COL (overwrite in place); use separate tile for compare */ static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static void gen_tile(uint8_t *tile) { for (int i = 0; i < TILE_BYTES; i++) tile[i] = (uint8_t)(xs() & 0xff); } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } static int correctness_check(uint64_t seed, int n) { xs_state = seed ? seed : 0xc0de9264cULL; int mismatches = 0, prints = 0; /* Use a SRC tile (input) and two DST tiles (one for ref, one for NEON). */ uint8_t src_tile[TILE_BYTES]; uint8_t dst_a[TILE_BYTES], dst_b[TILE_BYTES]; for (int i = 0; i < n; i++) { gen_tile(src_tile); memset(dst_a, 0, sizeof(dst_a)); memset(dst_b, 0, sizeof(dst_b)); const uint8_t *src_ptr = src_tile + SRC_COL; uint8_t *dst_a_ptr = dst_a + DST_COL; uint8_t *dst_b_ptr = dst_b + DST_COL; daedalus_put_h264_qpel8_mc20_ref(dst_a_ptr, src_ptr, TILE_STRIDE); ff_put_h264_qpel8_mc20_neon(dst_b_ptr, src_ptr, TILE_STRIDE); int diff = 0; for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) if (dst_a[r*TILE_STRIDE + DST_COL + c] != dst_b[r*TILE_STRIDE + DST_COL + c]) diff++; if (diff) { if (prints < 3) { fprintf(stderr, "MISMATCH block %d (%d/64 pix diff):\n", i, diff); prints++; } mismatches++; } } printf("M1₉ correctness: %d / %d blocks bit-exact (%.4f%%)\n", n - mismatches, n, 100.0 * (n - mismatches) / n); return mismatches; } static void throughput_neon(uint64_t seed, int n_blocks, double duration_s) { xs_state = seed ? seed : 0xc0de9264cULL; uint8_t *src_master = malloc((size_t) n_blocks * TILE_BYTES); uint8_t *dst_master = malloc((size_t) n_blocks * TILE_BYTES); uint8_t *dst_work = malloc((size_t) n_blocks * TILE_BYTES); if (!src_master || !dst_master || !dst_work) { fprintf(stderr, "alloc fail\n"); exit(1); } for (int i = 0; i < n_blocks; i++) { for (int j = 0; j < TILE_BYTES; j++) { src_master[i*TILE_BYTES + j] = (uint8_t)(xs() & 0xff); dst_master[i*TILE_BYTES + j] = 0; } } memcpy(dst_work, dst_master, (size_t) n_blocks * TILE_BYTES); for (int i = 0; i < n_blocks; i++) ff_put_h264_qpel8_mc20_neon(dst_work + i*TILE_BYTES + DST_COL, src_master + i*TILE_BYTES + SRC_COL, TILE_STRIDE); double t0 = now_seconds(); double t_end = t0 + duration_s; uint64_t done = 0; while (now_seconds() < t_end) { memcpy(dst_work, dst_master, (size_t) n_blocks * TILE_BYTES); for (int i = 0; i < n_blocks; i++) ff_put_h264_qpel8_mc20_neon(dst_work + i*TILE_BYTES + DST_COL, src_master + i*TILE_BYTES + SRC_COL, TILE_STRIDE); done += n_blocks; } double elapsed = now_seconds() - t0; int iters = (int)(done / n_blocks); double s0 = now_seconds(); for (int i = 0; i < iters; i++) memcpy(dst_work, dst_master, (size_t) n_blocks * TILE_BYTES); double s1 = now_seconds(); double kernel_seconds = elapsed - (s1 - s0); double mbps = done / kernel_seconds / 1e6; printf("M3₉ NEON throughput:\n"); printf(" blocks/batch: %d\n", n_blocks); printf(" batches done: %d\n", iters); printf(" total blocks: %llu\n", (unsigned long long) done); printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); printf(" throughput = %.3f Mblock/s\n", mbps); printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9); /* 1080p H.264 luma MC: ~32400 blocks/frame × 30 fps ≈ 0.972 Mblock/s * for 8x8 blocks. For 16x16 (typical macroblock-mode MC) it's * ~0.243 Mblock/s. Use the conservative 8x8 estimate. */ printf(" H.264 1080p30 8x8 MC floor: %.2fx margin (0.972 Mblock/s req'd)\n", mbps / 0.972); free(src_master); free(dst_master); free(dst_work); } int main(int argc, char **argv) { int n_blocks = 65536; double duration = 5.0; uint64_t seed = 0; int do_correctness = 1; static struct option opts[] = { {"blocks", required_argument, 0, 'b'}, {"duration", required_argument, 0, 'd'}, {"seed", required_argument, 0, 's'}, {"no-correctness", no_argument, 0, 'C'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) { switch (c) { case 'b': n_blocks = atoi(optarg); break; case 'd': duration = atof(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'C': do_correctness = 0; break; default: return 2; } } if (do_correctness) { printf("=== M1₉ bit-exact (10000 random 8x8 blocks) ===\n"); int mis = correctness_check(seed, 10000); if (mis != 0) { fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n"); return 1; } printf("\n"); } printf("=== M3₉ NEON throughput ===\n"); throughput_neon(seed, n_blocks, duration); return 0; }