/* * Cycle 2 Phase 6 — QPU bench for VP9 4-tap inner loop filter on V3D 7.1. * * Reports: * M1'' (correctness): bit-exact rate, QPU output vs C reference * M2'' (throughput): QPU sustained Medge/s over K dispatched batches * fm/hev pass rates (phase5'' finding 8 instrumentation) * * Asserts the two contracts from k2_deblock_phase4.md §4 * (phase5'' findings 2+4): m.x ≥ 4, dst_stride ≥ 4. * * License: BSD-2-Clause. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include #include #include #include "v3d_runner.h" extern void daedalus_vp9_loop_filter_h_4_8_ref( uint8_t *dst, ptrdiff_t stride, int E, int I, int H); /* --- RNG / generators (match bench_neon_lpf.c shape) ------------- */ static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } #define EDGE_STRIDE 8 #define EDGE_W 8 #define EDGE_H 8 #define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */ static void gen_edge_pixels(uint8_t *buf) { int side_a_base = (int)(xs() % 200) + 20; int side_b_base = (int)(xs() % 200) + 20; int noise_scale = (int)(xs() % 30); for (int r = 0; r < EDGE_H; r++) { for (int c = 0; c < EDGE_W; c++) { int base = (c < 4) ? side_a_base : side_b_base; int noise = ((int)(xs() % (2 * noise_scale + 1))) - noise_scale; int v = base + noise; buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } } } static void gen_thresholds(int *E, int *I, int *H) { *E = (int)(xs() % 81); *I = (int)(xs() % 41); *H = (int)(xs() % 11); } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } /* --- Push constants — match shader layout ------------------------ */ typedef struct { uint32_t n_edges; uint32_t dst_stride_u8; uint32_t _pad0; uint32_t _pad1; } push_consts; /* --- Pre-flight: fm/hev rate on the same RNG seed (informational) - */ static void estimate_pass_rates(uint64_t seed, int n_edges, double *fm_rate, double *hev_rate) { uint64_t saved = xs_state; xs_state = seed ? seed : 0xa57edbeef5717ULL; int fm_pass = 0, hev_pass = 0; uint8_t buf[EDGE_BYTES]; for (int i = 0; i < n_edges; i++) { gen_edge_pixels(buf); int E, I, H; gen_thresholds(&E, &I, &H); /* Mirror the C-ref fm/hev for just the first row of this * edge — gives a sample of what the QPU would see. (For a * more rigorous picture, count per-row, but per-edge is * fine for instrumentation.) */ uint8_t *d = buf + 4; /* col 4 */ int p3 = d[-4], p2 = d[-3], p1 = d[-2], p0 = d[-1]; int q0 = d[ 0], q1 = d[+1], q2 = d[+2], q3 = d[+3]; int aP3P2 = p3-p2; if (aP3P2 < 0) aP3P2 = -aP3P2; int aP2P1 = p2-p1; if (aP2P1 < 0) aP2P1 = -aP2P1; int aP1P0 = p1-p0; if (aP1P0 < 0) aP1P0 = -aP1P0; int aQ1Q0 = q1-q0; if (aQ1Q0 < 0) aQ1Q0 = -aQ1Q0; int aQ2Q1 = q2-q1; if (aQ2Q1 < 0) aQ2Q1 = -aQ2Q1; int aQ3Q2 = q3-q2; if (aQ3Q2 < 0) aQ3Q2 = -aQ3Q2; int aP0Q0 = p0-q0; if (aP0Q0 < 0) aP0Q0 = -aP0Q0; int aP1Q1 = p1-q1; if (aP1Q1 < 0) aP1Q1 = -aP1Q1; int fm = (aP3P2 <= I) && (aP2P1 <= I) && (aP1P0 <= I) && (aQ1Q0 <= I) && (aQ2Q1 <= I) && (aQ3Q2 <= I) && (aP0Q0 * 2 + (aP1Q1 >> 1) <= E); if (fm) { fm_pass++; if (aP1P0 > H || aQ1Q0 > H) hev_pass++; } } *fm_rate = (double) fm_pass / n_edges; *hev_rate = (double) hev_pass / n_edges; xs_state = saved; } /* --- Main ------------------------------------------------------- */ int main(int argc, char **argv) { int n_edges = 65536; int iters = 100; int verify_only = 0; uint64_t seed = 0; const char *spv_path = "v3d_lpf_h_4_8.spv"; static struct option opts[] = { {"edges", required_argument, 0, 'e'}, {"iters", required_argument, 0, 'i'}, {"seed", required_argument, 0, 's'}, {"spv", required_argument, 0, 'S'}, {"verify-only", no_argument, 0, 'V'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) { switch (c) { case 'e': n_edges = atoi(optarg); break; case 'i': iters = atoi(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'S': spv_path = optarg; break; case 'V': verify_only = 1; break; default: return 2; } } xs_state = seed ? seed : 0xa57edbeef5717ULL; /* --- Setup ---- */ v3d_runner *r = v3d_runner_create(); if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } printf("=== v3d LPF h_4_8 bench ===\n"); printf(" device: %s\n", v3d_runner_device_name(r)); printf(" n_edges: %d iters: %d seed: 0x%016llx\n", n_edges, iters, (unsigned long long) (seed ? seed : 0xa57edbeef5717ULL)); /* Per-edge layout in dst buffer: edge i occupies bytes * [i*64 .. i*64+63]. The "edge center" (column 4 of row 0) is at * byte offset i*64 + 4. Stride between rows of the same edge = 8. */ size_t dst_bytes = (size_t) n_edges * EDGE_BYTES; size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */ v3d_buffer buf_meta = {0}, buf_dst = {0}; if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; /* Master pixel set + thresholds — kept stable across iters. */ uint8_t *master_pred = malloc(dst_bytes); uint8_t *expected = malloc(dst_bytes); int *Es = malloc(n_edges * sizeof(int)); int *Is = malloc(n_edges * sizeof(int)); int *Hs = malloc(n_edges * sizeof(int)); if (!master_pred || !expected || !Es || !Is || !Hs) { fprintf(stderr, "alloc\n"); return 1; } for (int i = 0; i < n_edges; i++) { gen_edge_pixels(master_pred + (size_t)i * EDGE_BYTES); gen_thresholds(&Es[i], &Is[i], &Hs[i]); } /* Build C-ref expected output (separate copies, since the filter * mutates dst in place). */ memcpy(expected, master_pred, dst_bytes); for (int i = 0; i < n_edges; i++) { daedalus_vp9_loop_filter_h_4_8_ref( expected + (size_t)i * EDGE_BYTES + 4, /* col 4 of this edge */ EDGE_STRIDE, Es[i], Is[i], Hs[i]); } /* Populate GPU buffers. Asserts enforce phase4 §4 contracts. */ uint32_t *meta = (uint32_t *) buf_meta.mapped; uint32_t dst_stride_u8 = EDGE_STRIDE; assert(dst_stride_u8 >= 4 && "phase4 §4 contract 2 violated"); for (int i = 0; i < n_edges; i++) { uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4); assert(mx >= 4 && "phase4 §4 contract 1 violated"); meta[4*i + 0] = mx; meta[4*i + 1] = (uint32_t) Es[i]; meta[4*i + 2] = (uint32_t) Is[i]; meta[4*i + 3] = (uint32_t) Hs[i]; } memcpy(buf_dst.mapped, master_pred, dst_bytes); /* --- Pre-flight estimate of fm/hev pass rates --- */ double fm_rate, hev_rate; estimate_pass_rates(seed, 10000, &fm_rate, &hev_rate); printf(" fm pass rate: %.2f%% (10k-edge sample)\n", fm_rate * 100); printf(" hev pass rate: %.2f%% (of fm-passing)\n", hev_rate * 100); /* --- Pipeline --- */ v3d_pipeline pipe = {0}; if (v3d_runner_create_pipeline(r, spv_path, /*n_ssbos=*/2, /*push_const_size=*/sizeof(push_consts), &pipe)) return 1; v3d_buffer bind_bufs[2] = { buf_meta, buf_dst }; if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 2)) return 1; const uint32_t edges_per_wg = 32; uint32_t group_count_x = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg); printf(" dispatch: %u WGs × 256 invocations = %u edges (rounded up from %d)\n", group_count_x, group_count_x * edges_per_wg, n_edges); push_consts pc = { .n_edges = (uint32_t) n_edges, .dst_stride_u8 = dst_stride_u8, ._pad0 = 0, ._pad1 = 0, }; /* Record command buffer once. */ VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); if (cb == VK_NULL_HANDLE) return 1; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); vkCmdDispatch(cb, group_count_x, 1, 1); vkEndCommandBuffer(cb); /* --- M1'': bit-exact verification --- */ printf("\n=== M1'': QPU vs C-reference bit-exact ===\n"); memcpy(buf_dst.mapped, master_pred, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; int mismatch_edges = 0; int total_byte_diffs = 0; int prints = 0; for (int i = 0; i < n_edges; i++) { const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * EDGE_BYTES; const uint8_t *e = expected + (size_t)i * EDGE_BYTES; if (memcmp(q, e, EDGE_BYTES) != 0) { int diffs = 0; for (int j = 0; j < EDGE_BYTES; j++) if (q[j] != e[j]) diffs++; total_byte_diffs += diffs; if (prints < 3) { fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d): %d/64 bytes differ\n", i, Es[i], Is[i], Hs[i], diffs); fprintf(stderr, " ref:"); for (int r0 = 0; r0 < 8; r0++) { fprintf(stderr, "\n r%d ", r0); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]); } fprintf(stderr, "\n qpu:"); for (int r0 = 0; r0 < 8; r0++) { fprintf(stderr, "\n r%d ", r0); for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]); } fprintf(stderr, "\n"); prints++; } mismatch_edges++; } } printf(" edges bit-exact: %d / %d (%.4f%%)\n", n_edges - mismatch_edges, n_edges, 100.0 * (n_edges - mismatch_edges) / n_edges); printf(" total byte diffs: %d / %zu (%.4f%%)\n", total_byte_diffs, (size_t) n_edges * EDGE_BYTES, 100.0 * total_byte_diffs / ((double) n_edges * EDGE_BYTES)); if (mismatch_edges > 0) { fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); return 1; } if (verify_only) { v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); return 0; } /* --- M2'': throughput --- */ printf("\n=== M2'': QPU throughput ===\n"); for (int i = 0; i < 10; i++) { /* warm-up */ memcpy(buf_dst.mapped, master_pred, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t0 = now_seconds(); for (int i = 0; i < iters; i++) { memcpy(buf_dst.mapped, master_pred, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t1 = now_seconds(); double s0 = now_seconds(); for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_pred, dst_bytes); double s1 = now_seconds(); double kernel_seconds = (t1 - t0) - (s1 - s0); double total_edges = (double) n_edges * iters; double medges_s = total_edges / kernel_seconds / 1e6; printf(" edges/dispatch: %d\n", n_edges); printf(" iters: %d\n", iters); printf(" total edges: %.0f\n", total_edges); printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds); printf(" elapsed (setup) =%.6f s\n", s1 - s0); printf(" M2'' throughput = %.3f Medge/s\n", medges_s); printf(" per-edge = %.1f ns\n", kernel_seconds / total_edges * 1e9); printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); double M3pp = 48.285; /* from k2_deblock_phase3.md */ double Rpp = medges_s / M3pp; printf("\n Cycle 2 NEON M3'' = %.3f Medge/s\n", M3pp); printf(" R'' = M2''/M3'' = %.3f\n", Rpp); if (Rpp >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n"); else if (Rpp >= 0.5) printf(" decision band = YELLOW: M4'' decides\n"); else if (Rpp >= 0.1) printf(" decision band = ORANGE: M4'' may still rescue (cycle-1 calibration)\n"); else printf(" decision band = RED: structural mismatch\n"); v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); free(master_pred); free(expected); free(Es); free(Is); free(Hs); return 0; }