/* * Cycle 3 Phase 6 — QPU bench for VP9 8-tap "regular" subpel filter, * horizontal, 8-wide output on V3D 7.1. * * Reports: * M1''' (correctness): QPU output vs C reference, N blocks across * all 16 mx phases * M2''' (throughput): QPU sustained Mblock/s * * Per k3_mc_phase4.md §5 (revised per phase5''' findings 4 + 6): * - src_off is the RAW block base (no +3 shift) * - assert(dst_stride_u8 >= 8 && src_stride_u8 >= 15) * * License: BSD-2-Clause. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include #include #include #include "v3d_runner.h" extern void daedalus_vp9_put_regular_8h_ref( uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my); /* Per-block layout: src buffer 8 rows × 16 cols = 128 bytes. The * C bench's src+3 convention: NEON/C ref is called with * `src = block_base + 3, src_stride = 16`. The shader's src_off * is the RAW block_base (no +3 shift), and the shader reads * s[0..14] from src_off + row*stride. Together this means: * shader's s[k] for k=0..14 = master_src[block_base + row*16 + k] * C ref's `src[x+k-3]` for x=0..7, k=0..7 with `src = block_base+3` * = master_src[block_base + row*16 + (x+k)] * = master_src[block_base + row*16 + (0..14)] * which is exactly what the shader reads. */ #define SRC_W 16 #define SRC_H 8 #define DST_W 8 #define DST_H 8 #define SRC_BYTES (SRC_H * SRC_W) #define DST_BYTES (DST_H * DST_W) static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static void gen_src(uint8_t *b) { for (int i = 0; i < SRC_BYTES; i++) b[i] = (uint8_t)(xs() & 0xff); } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } typedef struct { uint32_t n_blocks; uint32_t dst_stride_u8; uint32_t src_stride_u8; uint32_t _pad; } push_consts; int main(int argc, char **argv) { int n_blocks = 65536; int iters = 100; uint64_t seed = 0; int verify_only = 0; const char *spv_path = "v3d_mc_8h.spv"; static struct option opts[] = { {"blocks", required_argument, 0, 'b'}, {"iters", required_argument, 0, 'i'}, {"seed", required_argument, 0, 's'}, {"spv", required_argument, 0, 'S'}, {"verify-only", no_argument, 0, 'V'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) { switch (c) { case 'b': n_blocks = atoi(optarg); break; case 'i': iters = atoi(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'S': spv_path = optarg; break; case 'V': verify_only = 1; break; default: return 2; } } xs_state = seed ? seed : 0xabcdef1234567890ULL; v3d_runner *r = v3d_runner_create(); if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } printf("=== v3d MC 8h bench ===\n"); printf(" device: %s\n", v3d_runner_device_name(r)); printf(" n_blocks: %d iters: %d\n", n_blocks, iters); /* Buffers: meta + dst + src, all blocks contiguous. */ size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); size_t src_bytes = (size_t) n_blocks * SRC_BYTES; size_t dst_bytes = (size_t) n_blocks * DST_BYTES; v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0}; if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; if (v3d_runner_create_buffer(r, src_bytes, &buf_src)) return 1; uint8_t *master_src = malloc(src_bytes); uint8_t *expected = malloc(dst_bytes); int *mxs = malloc(n_blocks * sizeof(int)); if (!master_src || !expected || !mxs) { fprintf(stderr, "alloc\n"); return 1; } for (int i = 0; i < n_blocks; i++) { gen_src(master_src + (size_t)i * SRC_BYTES); mxs[i] = (int)(xs() & 15); } /* Build C-ref expected. C ref takes `src + 3, src_stride = SRC_W`. */ memset(expected, 0, dst_bytes); for (int i = 0; i < n_blocks; i++) { daedalus_vp9_put_regular_8h_ref( expected + (size_t)i * DST_BYTES, DST_W, master_src + (size_t)i * SRC_BYTES + 3, SRC_W, DST_H, mxs[i], 0); } /* Populate GPU buffers. Contracts (phase4 §5) enforced via asserts. */ uint32_t dst_stride_u8 = DST_W; uint32_t src_stride_u8 = SRC_W; assert(dst_stride_u8 >= 8 && "phase4 §5 contract 1"); assert(src_stride_u8 >= 15 && "phase4 §5 contract 2"); uint32_t *meta = (uint32_t *) buf_meta.mapped; for (int i = 0; i < n_blocks; i++) { /* src_off: RAW block base. NO +3 shift. (phase5''' finding 4) */ uint32_t src_off = (uint32_t)((size_t)i * SRC_BYTES); uint32_t dst_off = (uint32_t)((size_t)i * DST_BYTES); meta[4*i + 0] = dst_off; meta[4*i + 1] = src_off; meta[4*i + 2] = (uint32_t) mxs[i]; meta[4*i + 3] = 0; } memcpy(buf_src.mapped, master_src, src_bytes); memset(buf_dst.mapped, 0, dst_bytes); /* Pipeline. */ v3d_pipeline pipe = {0}; if (v3d_runner_create_pipeline(r, spv_path, /*n_ssbos=*/3, /*push_const_size=*/sizeof(push_consts), &pipe)) return 1; v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_src }; if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1; const uint32_t blocks_per_wg = 32; uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg); printf(" dispatch: %u WGs × 256 invocations = %u blocks (rounded up from %d)\n", group_count_x, group_count_x * blocks_per_wg, n_blocks); push_consts pc = { .n_blocks = (uint32_t) n_blocks, .dst_stride_u8 = dst_stride_u8, .src_stride_u8 = src_stride_u8, ._pad = 0, }; VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); vkCmdDispatch(cb, group_count_x, 1, 1); vkEndCommandBuffer(cb); /* --- M1''' bit-exact --- */ printf("\n=== M1''': QPU vs C reference bit-exact ===\n"); memset(buf_dst.mapped, 0, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; int mismatch_blocks = 0; int total_byte_diffs = 0; int prints = 0; for (int i = 0; i < n_blocks; i++) { const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES; const uint8_t *e = expected + (size_t)i * DST_BYTES; if (memcmp(q, e, DST_BYTES) != 0) { int diffs = 0; for (int j = 0; j < DST_BYTES; j++) if (q[j] != e[j]) diffs++; total_byte_diffs += diffs; if (prints < 3) { fprintf(stderr, "MISMATCH block %d mx=%d: %d/64 bytes differ\n", i, mxs[i], diffs); fprintf(stderr, " ref:"); for (int r0 = 0; r0 < 8; r0++) { fprintf(stderr, "\n r%d ", r0); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]); } fprintf(stderr, "\n qpu:"); for (int r0 = 0; r0 < 8; r0++) { fprintf(stderr, "\n r%d ", r0); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]); } fprintf(stderr, "\n"); prints++; } mismatch_blocks++; } } printf(" blocks bit-exact: %d / %d (%.4f%%)\n", n_blocks - mismatch_blocks, n_blocks, 100.0 * (n_blocks - mismatch_blocks) / n_blocks); printf(" total byte diffs: %d / %zu (%.4f%%)\n", total_byte_diffs, (size_t) n_blocks * DST_BYTES, 100.0 * total_byte_diffs / ((double) n_blocks * DST_BYTES)); if (mismatch_blocks > 0) { fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_src); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); return 1; } if (verify_only) { v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_src); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); return 0; } /* --- M2''' throughput --- */ printf("\n=== M2''': QPU throughput ===\n"); for (int i = 0; i < 10; i++) { memset(buf_dst.mapped, 0, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t0 = now_seconds(); for (int i = 0; i < iters; i++) { memset(buf_dst.mapped, 0, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t1 = now_seconds(); double s0 = now_seconds(); for (int i = 0; i < iters; i++) memset(buf_dst.mapped, 0, dst_bytes); double s1 = now_seconds(); double kernel_seconds = (t1 - t0) - (s1 - s0); double total_blocks = (double) n_blocks * iters; double mbps = total_blocks / kernel_seconds / 1e6; printf(" blocks/dispatch: %d\n", n_blocks); printf(" iters: %d\n", iters); printf(" total blocks: %.0f\n", total_blocks); printf(" elapsed (kernel)=%.6f s\n", kernel_seconds); printf(" elapsed (setup) =%.6f s\n", s1 - s0); printf(" M2''' throughput = %.3f Mblock/s\n", mbps); printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9); printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); double M3 = 20.997; /* from k3_mc_phase3.md */ double R = mbps / M3; printf("\n Cycle 3 NEON M3''' = %.3f Mblock/s\n", M3); printf(" R''' = M2'''/M3''' = %.3f\n", R); if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n"); else if (R >= 0.5) printf(" decision band = YELLOW: M4''' decides\n"); else if (R >= 0.1) printf(" decision band = ORANGE: M4''' may still rescue\n"); else printf(" decision band = RED: structural mismatch\n"); /* 30fps@1080p floor check (per project_30fps_floor_is_fine.md) */ double mblocks_per_1080p = 32400.0 * 30.0 / 1e6; printf("\n 30fps@1080p floor : %.3f Mblock/s (32400 blocks × 30 fps)\n", mblocks_per_1080p); printf(" isolation margin : %.1fx over 30fps floor\n", mbps / mblocks_per_1080p); v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_src); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); free(master_src); free(expected); free(mxs); return 0; }