/* * Cycle 8 Phase 6+7 — QPU bench for H.264 luma deblock. * * Reports: * M1: 3-way bit-exact (QPU vs NEON vs C ref) per Phase 5 YELLOW-1. * M2: QPU sustained Medge/s. * * Bench contract enforcement (Phase 5 RED-2): m.x is positioned so * that m.x >= 4 * stride for every edge. * * License: BSD-2-Clause. */ #define _POSIX_C_SOURCE 200809L #include #include #include #include #include #include #include #include #include #include "v3d_runner.h" extern void daedalus_h264_v_loop_filter_luma_ref( uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void ff_h264_v_loop_filter_luma_neon( uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); #define TILE_STRIDE 16 #define TILE_ROWS 16 #define TILE_BYTES (TILE_ROWS * TILE_STRIDE) #define EDGE_ROW 4 #define EDGE_OFF (EDGE_ROW * TILE_STRIDE) /* byte offset into a tile to row 0 of bottom block */ static uint64_t xs_state; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static void gen_tile(uint8_t *tile) { int a = (int)(xs() % 200) + 20; int b = (int)(xs() % 200) + 20; int noise = (int)(xs() % 30) + 1; for (int r = 0; r < TILE_ROWS; r++) { for (int c = 0; c < TILE_STRIDE; c++) { int v; if (r >= EDGE_ROW - 4 && r < EDGE_ROW + 4) { int base = (r < EDGE_ROW) ? a : b; int n = ((int)(xs() % (2*noise + 1))) - noise; v = base + n; } else { v = (int)(xs() & 0xff); } tile[r * TILE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } } } static void gen_thresholds(int *alpha, int *beta, int8_t tc0[4]) { *alpha = (int)(xs() % 64) + 1; *beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); return ts.tv_sec + ts.tv_nsec * 1e-9; } typedef struct { uint32_t n_edges; uint32_t dst_stride_u8; uint32_t _pad0; uint32_t _pad1; } push_consts; int main(int argc, char **argv) { int n_edges = 16384; int iters = 200; int verify_only = 0; uint64_t seed = 0; const char *spv_path = "v3d_h264deblock.spv"; static struct option opts[] = { {"edges", required_argument, 0, 'e'}, {"iters", required_argument, 0, 'i'}, {"seed", required_argument, 0, 's'}, {"spv", required_argument, 0, 'S'}, {"verify-only", no_argument, 0, 'V'}, {0,0,0,0} }; for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) { switch (c) { case 'e': n_edges = atoi(optarg); break; case 'i': iters = atoi(optarg); break; case 's': seed = strtoull(optarg, 0, 0); break; case 'S': spv_path = optarg; break; case 'V': verify_only = 1; break; default: return 2; } } xs_state = seed ? seed : 0xdeb1ec500dULL; v3d_runner *r = v3d_runner_create(); if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; } printf("=== v3d H.264 deblock bench ===\n"); printf(" device: %s\n", v3d_runner_device_name(r)); printf(" n_edges: %d iters: %d seed: 0x%016llx\n", n_edges, iters, (unsigned long long) (seed ? seed : 0xdeb1ec500dULL)); size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); size_t dst_bytes = (size_t) n_edges * TILE_BYTES; v3d_buffer buf_meta = {0}, buf_dst = {0}; if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1; if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1; uint8_t *master = malloc(dst_bytes); uint8_t *expected_c = malloc(dst_bytes); uint8_t *expected_n = malloc(dst_bytes); int *alphas = malloc(n_edges*sizeof(int)); int *betas = malloc(n_edges*sizeof(int)); int8_t (*tc0s)[4] = malloc(n_edges * 4); if (!master || !expected_c || !expected_n || !alphas || !betas || !tc0s) { fprintf(stderr, "alloc fail\n"); return 1; } for (int i = 0; i < n_edges; i++) { gen_tile(master + (size_t)i * TILE_BYTES); gen_thresholds(&alphas[i], &betas[i], tc0s[i]); } /* C ref expected. */ memcpy(expected_c, master, dst_bytes); for (int i = 0; i < n_edges; i++) daedalus_h264_v_loop_filter_luma_ref( expected_c + (size_t)i * TILE_BYTES + EDGE_OFF, TILE_STRIDE, alphas[i], betas[i], tc0s[i]); /* NEON expected. */ memcpy(expected_n, master, dst_bytes); for (int i = 0; i < n_edges; i++) ff_h264_v_loop_filter_luma_neon( expected_n + (size_t)i * TILE_BYTES + EDGE_OFF, TILE_STRIDE, alphas[i], betas[i], tc0s[i]); /* Parity check C ref vs NEON. */ int cn_mis = 0; for (size_t b = 0; b < dst_bytes; b++) if (expected_c[b] != expected_n[b]) cn_mis++; printf(" C ref vs NEON parity: %d/%zu byte mismatches\n", cn_mis, dst_bytes); if (cn_mis > 0) { fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU.\n"); return 1; } /* Populate meta SSBO (Phase 5 RED-2: enforce m.x >= 4*stride). */ uint32_t *meta = (uint32_t *) buf_meta.mapped; uint32_t stride_u8 = TILE_STRIDE; for (int i = 0; i < n_edges; i++) { uint32_t mx = (uint32_t)((size_t)i * TILE_BYTES + EDGE_OFF); assert(mx >= 4 * stride_u8 && "Phase 5 RED-2 contract violated"); meta[4*i + 0] = mx; meta[4*i + 1] = ((uint32_t)alphas[i]) | (((uint32_t)betas[i]) << 8); /* Pack tc0[0..3] as 4 int8 in low 32 bits of m.z. */ meta[4*i + 2] = ((uint32_t)(uint8_t)tc0s[i][0]) | (((uint32_t)(uint8_t)tc0s[i][1]) << 8) | (((uint32_t)(uint8_t)tc0s[i][2]) << 16) | (((uint32_t)(uint8_t)tc0s[i][3]) << 24); meta[4*i + 3] = 0; } memcpy(buf_dst.mapped, master, dst_bytes); /* Pipeline. */ v3d_pipeline pipe = {0}; if (v3d_runner_create_pipeline(r, spv_path, /*n_ssbos=*/2, /*push_const_size=*/sizeof(push_consts), &pipe)) return 1; v3d_buffer binds[2] = { buf_meta, buf_dst }; if (v3d_runner_bind_buffers(r, &pipe, binds, 2)) return 1; const uint32_t edges_per_wg = 16; uint32_t wg_count = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg); printf(" dispatch: %u WGs × 256 invocations = %u edges\n", wg_count, wg_count * edges_per_wg); push_consts pc = { .n_edges = (uint32_t) n_edges, .dst_stride_u8 = stride_u8, }; VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r); if (cb == VK_NULL_HANDLE) return 1; VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; vkBeginCommandBuffer(cb, &cbbi); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline); vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.layout, 0, 1, &pipe.desc_set, 0, NULL); vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc); vkCmdDispatch(cb, wg_count, 1, 1); vkEndCommandBuffer(cb); /* M1 3-way. */ printf("\n=== M1₈: QPU vs C ref vs NEON ===\n"); memcpy(buf_dst.mapped, master, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; int qc_mis = 0, qn_mis = 0, prints = 0; for (int i = 0; i < n_edges; i++) { uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * TILE_BYTES; uint8_t *c = expected_c + (size_t)i * TILE_BYTES; uint8_t *n = expected_n + (size_t)i * TILE_BYTES; int qc = memcmp(q, c, TILE_BYTES); int qn = memcmp(q, n, TILE_BYTES); if (qc) qc_mis++; if (qn) qn_mis++; if ((qc || qn) && prints < 3) { fprintf(stderr, "MISMATCH edge %d alpha=%d beta=%d tc0=[%d,%d,%d,%d]\n", i, alphas[i], betas[i], tc0s[i][0], tc0s[i][1], tc0s[i][2], tc0s[i][3]); prints++; } } printf(" QPU vs C ref: %d/%d edges bit-exact (%.4f%%)\n", n_edges - qc_mis, n_edges, 100.0 * (n_edges - qc_mis) / n_edges); printf(" QPU vs NEON: %d/%d edges bit-exact (%.4f%%)\n", n_edges - qn_mis, n_edges, 100.0 * (n_edges - qn_mis) / n_edges); if (qc_mis || qn_mis) { fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n"); return 1; } if (verify_only) { v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); return 0; } /* M2 throughput. */ printf("\n=== M2₈: QPU throughput ===\n"); for (int i = 0; i < 5; i++) { memcpy(buf_dst.mapped, master, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t0 = now_seconds(); for (int i = 0; i < iters; i++) { memcpy(buf_dst.mapped, master, dst_bytes); if (v3d_runner_submit_wait(r, cb)) return 1; } double t1 = now_seconds(); double s0 = now_seconds(); for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master, dst_bytes); double s1 = now_seconds(); double kernel_seconds = (t1 - t0) - (s1 - s0); double total = (double) n_edges * iters; double medges = total / kernel_seconds / 1e6; printf(" edges/dispatch: %d\n", n_edges); printf(" iters: %d\n", iters); printf(" total edges: %.0f\n", total); printf(" elapsed (kern) = %.6f s\n", kernel_seconds); printf(" M2₈ throughput = %.3f Medge/s\n", medges); printf(" per-edge = %.1f ns\n", kernel_seconds / total * 1e9); printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6); double M3_8 = 91.947; double R8 = medges / M3_8; printf("\n Cycle 8 NEON M3₈ = %.3f Medge/s\n", M3_8); printf(" R₈ = M2₈/M3₈ = %.3f\n", R8); if (R8 >= 1.0) printf(" decision band = GREEN\n"); else if (R8 >= 0.5) printf(" decision band = YELLOW (M4 decides)\n"); else if (R8 >= 0.1) printf(" decision band = ORANGE (M4 may rescue)\n"); else printf(" decision band = RED (structural)\n"); /* H.264 1080p30 floor: 8 Medge/s worst, 3 realistic. */ printf(" H.264 1080p30 worst-case floor: %.2fx margin (8.0 Medge/s req'd)\n", medges / 8.0); v3d_runner_destroy_pipeline(r, &pipe); v3d_runner_destroy_buffer(r, &buf_dst); v3d_runner_destroy_buffer(r, &buf_meta); v3d_runner_destroy(r); free(master); free(expected_c); free(expected_n); free(alphas); free(betas); free(tc0s); return 0; }