/* * Cycle 6 Phase 3 — NEON M3 baseline for H.264 IDCT 4x4 + add. * * Calls FFmpeg `ff_h264_idct_add_neon`. Reports M1 bit-exact vs * the standalone C reference, plus M3 throughput. * * License: BSD-2-Clause; links FFmpeg LGPL-2.1+ snapshot. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride); #define DST_STRIDE 16 /* arbitrary stride for the test surface */ #define DST_ROWS 4 #define DST_BYTES (DST_ROWS * DST_STRIDE) static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static void gen_block(int16_t b[16]) { /* Realistic H.264 residual: small coefficients, mostly zero, * a few non-zero in low-frequency positions. */ memset(b, 0, 16 * sizeof(int16_t)); int n_nonzero = 1 + (int)(xs() % 8); for (int i = 0; i < n_nonzero; i++) { int pos = (int)(xs() % 16); int16_t v = (int16_t)((int)(xs() % 1024) - 512); b[pos] = v; } } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } static int correctness_check(uint64_t seed, int n) { xs_state = seed ? seed : 0xc0de264cULL; int mismatches = 0; int prints = 0; int16_t block_a[16], block_b[16], block_saved[16]; uint8_t dst_a[DST_BYTES], dst_b[DST_BYTES], dst_initial[DST_BYTES]; for (int i = 0; i < n; i++) { gen_block(block_a); memcpy(block_b, block_a, sizeof(block_a)); memcpy(block_saved, block_a, sizeof(block_a)); /* Random initial dst (4×4 region at offset 0, row stride DST_STRIDE). */ for (int r = 0; r < 4; r++) for (int c = 0; c < 4; c++) dst_a[r * DST_STRIDE + c] = dst_b[r * DST_STRIDE + c] = (uint8_t)(xs() & 0xff); memcpy(dst_initial, dst_a, DST_BYTES); daedalus_h264_idct_add_ref(dst_a, block_a, DST_STRIDE); ff_h264_idct_add_neon(dst_b, block_b, DST_STRIDE); int diff = 0; for (int r = 0; r < 4; r++) for (int c = 0; c < 4; c++) if (dst_a[r*DST_STRIDE + c] != dst_b[r*DST_STRIDE + c]) diff++; if (diff) { if (prints < 3) { fprintf(stderr, "MISMATCH block %d (%d/16 pix diff):\n", i, diff); fprintf(stderr, " input block (row-major):"); for (int r = 0; r < 4; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 4; c++) fprintf(stderr, "%6d ", block_saved[r*4 + c]); } fprintf(stderr, "\n initial dst:"); for (int r = 0; r < 4; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_initial[r*DST_STRIDE + c]); } fprintf(stderr, "\n"); fprintf(stderr, " ref:"); for (int r = 0; r < 4; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_a[r*DST_STRIDE+c]); } fprintf(stderr, "\n neon:"); for (int r = 0; r < 4; r++) { fprintf(stderr, "\n r%d ", r); for (int c = 0; c < 4; c++) fprintf(stderr, "%3u ", dst_b[r*DST_STRIDE+c]); } fprintf(stderr, "\n"); prints++; } mismatches++; } } printf("M1₆ correctness: %d / %d blocks bit-exact (%.4f%%)\n", n - mismatches, n, 100.0 * (n - mismatches) / n); return mismatches; } static void throughput_neon(uint64_t seed, int n_blocks, double duration_s) { xs_state = seed ? seed : 0xc0de264cULL; int16_t *master_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t)); int16_t *work_blocks = malloc((size_t) n_blocks * 16 * sizeof(int16_t)); uint8_t *master_dst = malloc((size_t) n_blocks * 16); uint8_t *work_dst = malloc((size_t) n_blocks * 16); if (!master_blocks || !work_blocks || !master_dst || !work_dst) { fprintf(stderr, "alloc fail\n"); exit(1); } for (int i = 0; i < n_blocks; i++) { gen_block(master_blocks + i * 16); for (int j = 0; j < 16; j++) master_dst[i * 16 + j] = (uint8_t)(xs() & 0xff); } /* Warm-up. */ memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); memcpy(work_dst, master_dst, (size_t) n_blocks * 16); for (int i = 0; i < n_blocks; i++) ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4); double t0 = now_seconds(); double t_end = t0 + duration_s; uint64_t done = 0; while (now_seconds() < t_end) { memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); memcpy(work_dst, master_dst, (size_t) n_blocks * 16); for (int i = 0; i < n_blocks; i++) ff_h264_idct_add_neon(work_dst + i * 16, work_blocks + i * 16, 4); done += n_blocks; } double elapsed = now_seconds() - t0; /* Subtract setup cost. */ int iters = (int)(done / n_blocks); double s0 = now_seconds(); for (int i = 0; i < iters; i++) { memcpy(work_blocks, master_blocks, (size_t) n_blocks * 16 * sizeof(int16_t)); memcpy(work_dst, master_dst, (size_t) n_blocks * 16); } double s1 = now_seconds(); double kernel_seconds = elapsed - (s1 - s0); double mbps = done / kernel_seconds / 1e6; printf("M3₆ NEON throughput:\n"); printf(" blocks/batch: %d\n", n_blocks); printf(" batches done: %d\n", iters); printf(" total blocks: %llu\n", (unsigned long long) done); printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); printf(" throughput = %.3f Mblock/s\n", mbps); printf(" per-block = %.1f ns\n", kernel_seconds / done * 1e9); /* H.264 1080p 4×4 floor: ~5.85 Mblock/s worst-case, ~2 realistic. */ printf(" H.264 1080p30 worst-case floor: %.2fx margin (5.85 Mblock/s req'd)\n", mbps / 5.85); printf(" H.264 1080p30 realistic floor: %.2fx margin (2.0 Mblock/s req'd)\n", mbps / 2.0); free(master_blocks); free(work_blocks); free(master_dst); free(work_dst); } int main(int argc, char **argv) { int n_blocks = 65536; double duration = 5.0; uint64_t seed = 0; int do_correctness = 1; static struct option opts[] = { {"blocks", required_argument, 0, 'b'}, {"duration", required_argument, 0, 'd'}, {"seed", required_argument, 0, 's'}, {"no-correctness", no_argument, 0, 'C'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "b:d:s:C", opts, 0)) != -1;) { switch (c) { case 'b': n_blocks = atoi(optarg); break; case 'd': duration = atof(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'C': do_correctness = 0; break; default: return 2; } } if (do_correctness) { printf("=== M1₆ bit-exact (10000 random 4x4 blocks) ===\n"); int mis = correctness_check(seed, 10000); if (mis != 0) { fprintf(stderr, "M1 gate FAILED — refusing to measure throughput.\n"); return 1; } printf("\n"); } printf("=== M3₆ NEON throughput ===\n"); throughput_neon(seed, n_blocks, duration); return 0; }