36eca40ff2
Phase 4 plan + Phase 5 second-model review (PASS-WITH-REVISIONS:
2 YELLOW contract gaps applied) + Phase 6 v1 implementation +
Phase 7 verification including M4'' concurrent gate.
Phase 5'' review delivered cleanly — no RED bugs (cycle 1 lessons
applied successfully). 2 YELLOW findings baked into phase4 §4:
- stride >= 4 contract added alongside m.x >= 4 (finding 2)
- assert(...) in bench made a MUST not a suggestion (finding 4)
- V3D divergence-cost note: don't restructure to always-execute,
masked lanes consume clock anyway (finding 3, informational)
Phase 6 v1 first-light hit M1'' 100.0000% bit-exact on first run
(65536/65536 edges) — the cycle-1 v4 patterns (WG=256, 2-per-sg,
uint8_t SSBO, oob early-return discipline) baked in from start
worked as expected.
Performance:
M2'' = 19.645 Medge/s (50.9 ns/edge)
M3'' = 48.285 Medge/s (NEON baseline from phase3)
R'' = 0.41 (ORANGE band - doesn't auto-close per
cycle-1 calibration adjustment)
shaderdb: 160 inst, **4 threads**, 0 spills, 21 max-temps —
shader is already at the compiler ceiling. No v2/v3/v4 iteration
loop like cycle 1 because there's nothing more to extract from
the compiled shape. The 30x gap between theoretical instruction
throughput and measured wall-clock is divergence-tax + memory
latency, not compile quality.
M4'' concurrent matrix on hertz (8s windows):
NEON-1 LPF 41.131 Medge/s
NEON-4 LPF 33.726 Medge/s <- realistic CPU ceiling
(per-core 7-9; same
bandwidth-saturation as
cycle-1 F1)
QPU only 14.299 Medge/s
MIXED NEON-3 + QPU 36.049 Medge/s <- +6.9% over NEON-4
MIXED NEON-4 + QPU 31.892 Medge/s <- -5.4% oversubscribed
The "freed-core" pattern generalizes from IDCT to LPF: NEON-3+QPU
beats pure NEON-4 by ~7% in both cycles. Cycle-2 NEW finding:
**oversubscribed mode hurts for lighter kernels** (LPF -5.4% vs
cycle-1 IDCT +9.4%). Recommendation for higgs deployment hardens
to "always N-1 NEON cores + QPU, never N + QPU".
Phase 9 lessons (in phase7 §"Phase 9 lessons"):
1. Cycle-1 v4-pattern is the v1 starting point (saves 3 iterations)
2. Phase 5 review pays off every cycle
3. R isolation misleading on bandwidth-saturated hardware
4. Oversubscription tax depends on kernel weight
5. shaderdb 4-threads/0-spills = compute not the bottleneck
New artifacts:
- src/v3d_lpf_h_4_8.comp — GLSL kernel
- tests/bench_v3d_lpf.c — M1'' + M2'' harness with
contract asserts + fm/hev
pass-rate instrumentation
- tests/bench_concurrent_lpf.c — M4'' pthread bench
(mirrors bench_concurrent.c)
- docs/k2_deblock_phase{4,5,7}.md — plan + review + verification
Project verdict: continue. Cycle 3 candidates: MC interpolation
(multiply-heavy, stress V3D SMUL24), CDEF (AV1-only, different
neighborhood shape), or wd=8/wd=16 LPF variants. User to direct.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
355 lines
13 KiB
C
355 lines
13 KiB
C
/*
 * Cycle 2 Phase 6 — QPU bench for VP9 4-tap inner loop filter on V3D 7.1.
 *
 * Reports:
 *   M1'' (correctness): bit-exact rate, QPU output vs C reference
 *   M2'' (throughput):  QPU sustained Medge/s over K dispatched batches
 *   fm/hev pass rates (phase5'' finding 8 instrumentation)
 *
 * Asserts the two contracts from k2_deblock_phase4.md §4
 * (phase5'' findings 2+4): m.x ≥ 4, dst_stride ≥ 4.
 *
 * License: BSD-2-Clause.
 */
|
||
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>

#include "v3d_runner.h"
|
||
|
||
/*
 * C reference implementation of the loop filter, defined in another
 * translation unit.  This file calls it once per edge with dst pointing at
 * column 4 of the edge's first row, stride = EDGE_STRIDE and that edge's
 * E/I/H thresholds; the filter modifies dst in place.
 * NOTE(review): the exact pixel footprint is defined by the reference
 * implementation, not by this file — confirm there.
 */
void daedalus_vp9_loop_filter_h_4_8_ref(uint8_t *dst, ptrdiff_t stride,
                                        int E, int I, int H);
|
||
|
||
/* --- RNG / generators (match bench_neon_lpf.c shape) ------------- */
|
||
|
||
/* Generator state shared by every helper in this file; seeded by main(). */
static uint64_t xs_state;

/*
 * Advance the 64-bit xorshift generator (shift triple 13/7/17) by one step
 * and return the new state.  A zero state maps to zero, so callers must
 * seed xs_state with a non-zero value.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
||
|
||
/*
 * Geometry of one synthetic edge tile: EDGE_H rows of EDGE_W pixels, with
 * consecutive rows EDGE_STRIDE bytes apart.  Tiles are stored back to back,
 * EDGE_BYTES each.
 */
#define EDGE_STRIDE 8
#define EDGE_W      8
#define EDGE_H      8
#define EDGE_BYTES  (EDGE_H * EDGE_STRIDE)  /* 64 */
|
||
|
||
static void gen_edge_pixels(uint8_t *buf)
|
||
{
|
||
int side_a_base = (int)(xs() % 200) + 20;
|
||
int side_b_base = (int)(xs() % 200) + 20;
|
||
int noise_scale = (int)(xs() % 30);
|
||
for (int r = 0; r < EDGE_H; r++) {
|
||
for (int c = 0; c < EDGE_W; c++) {
|
||
int base = (c < 4) ? side_a_base : side_b_base;
|
||
int noise = ((int)(xs() % (2 * noise_scale + 1))) - noise_scale;
|
||
int v = base + noise;
|
||
buf[r * EDGE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
 * Draw one random threshold triple from xs(), in the order E, I, H:
 *   *E in 0..80, *I in 0..40, *H in 0..10.
 */
static void gen_thresholds(int *E, int *I, int *H)
{
    const uint64_t draw_e = xs();
    *E = (int)(draw_e % 81);

    const uint64_t draw_i = xs();
    *I = (int)(draw_i % 41);

    const uint64_t draw_h = xs();
    *H = (int)(draw_h % 11);
}
|
||
|
||
/* Current CLOCK_MONOTONIC_RAW reading, expressed in seconds as a double. */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double whole = now.tv_sec;
    const double frac  = now.tv_nsec * 1e-9;
    return whole + frac;
}
|
||
|
||
/* --- Push constants — match shader layout ------------------------ */
|
||
|
||
/*
 * Push-constant block passed to the compute shader via vkCmdPushConstants.
 * Four 32-bit words; the field order must stay in step with the shader.
 */
typedef struct push_consts {
    uint32_t n_edges;         /* number of edges in the dispatch */
    uint32_t dst_stride_u8;   /* byte distance between rows of an edge */
    uint32_t _pad0;           /* unused, written as 0 */
    uint32_t _pad1;           /* unused, written as 0 */
} push_consts;
|
||
|
||
/* --- Pre-flight: fm/hev rate on the same RNG seed (informational) - */
|
||
|
||
static void estimate_pass_rates(uint64_t seed, int n_edges,
|
||
double *fm_rate, double *hev_rate)
|
||
{
|
||
uint64_t saved = xs_state;
|
||
xs_state = seed ? seed : 0xa57edbeef5717ULL;
|
||
int fm_pass = 0, hev_pass = 0;
|
||
|
||
uint8_t buf[EDGE_BYTES];
|
||
for (int i = 0; i < n_edges; i++) {
|
||
gen_edge_pixels(buf);
|
||
int E, I, H;
|
||
gen_thresholds(&E, &I, &H);
|
||
|
||
/* Mirror the C-ref fm/hev for just the first row of this
|
||
* edge — gives a sample of what the QPU would see. (For a
|
||
* more rigorous picture, count per-row, but per-edge is
|
||
* fine for instrumentation.) */
|
||
uint8_t *d = buf + 4; /* col 4 */
|
||
int p3 = d[-4], p2 = d[-3], p1 = d[-2], p0 = d[-1];
|
||
int q0 = d[ 0], q1 = d[+1], q2 = d[+2], q3 = d[+3];
|
||
int aP3P2 = p3-p2; if (aP3P2 < 0) aP3P2 = -aP3P2;
|
||
int aP2P1 = p2-p1; if (aP2P1 < 0) aP2P1 = -aP2P1;
|
||
int aP1P0 = p1-p0; if (aP1P0 < 0) aP1P0 = -aP1P0;
|
||
int aQ1Q0 = q1-q0; if (aQ1Q0 < 0) aQ1Q0 = -aQ1Q0;
|
||
int aQ2Q1 = q2-q1; if (aQ2Q1 < 0) aQ2Q1 = -aQ2Q1;
|
||
int aQ3Q2 = q3-q2; if (aQ3Q2 < 0) aQ3Q2 = -aQ3Q2;
|
||
int aP0Q0 = p0-q0; if (aP0Q0 < 0) aP0Q0 = -aP0Q0;
|
||
int aP1Q1 = p1-q1; if (aP1Q1 < 0) aP1Q1 = -aP1Q1;
|
||
int fm = (aP3P2 <= I) && (aP2P1 <= I) && (aP1P0 <= I) &&
|
||
(aQ1Q0 <= I) && (aQ2Q1 <= I) && (aQ3Q2 <= I) &&
|
||
(aP0Q0 * 2 + (aP1Q1 >> 1) <= E);
|
||
if (fm) {
|
||
fm_pass++;
|
||
if (aP1P0 > H || aQ1Q0 > H) hev_pass++;
|
||
}
|
||
}
|
||
*fm_rate = (double) fm_pass / n_edges;
|
||
*hev_rate = (double) hev_pass / n_edges;
|
||
xs_state = saved;
|
||
}
|
||
|
||
/* --- Main ------------------------------------------------------- */
|
||
|
||
int main(int argc, char **argv)
|
||
{
|
||
int n_edges = 65536;
|
||
int iters = 100;
|
||
int verify_only = 0;
|
||
uint64_t seed = 0;
|
||
const char *spv_path = "v3d_lpf_h_4_8.spv";
|
||
|
||
static struct option opts[] = {
|
||
{"edges", required_argument, 0, 'e'},
|
||
{"iters", required_argument, 0, 'i'},
|
||
{"seed", required_argument, 0, 's'},
|
||
{"spv", required_argument, 0, 'S'},
|
||
{"verify-only", no_argument, 0, 'V'},
|
||
{0,0,0,0}
|
||
};
|
||
for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) {
|
||
switch (c) {
|
||
case 'e': n_edges = atoi(optarg); break;
|
||
case 'i': iters = atoi(optarg); break;
|
||
case 's': seed = strtoull(optarg, 0, 0); break;
|
||
case 'S': spv_path = optarg; break;
|
||
case 'V': verify_only = 1; break;
|
||
default: return 2;
|
||
}
|
||
}
|
||
|
||
xs_state = seed ? seed : 0xa57edbeef5717ULL;
|
||
|
||
/* --- Setup ---- */
|
||
v3d_runner *r = v3d_runner_create();
|
||
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
|
||
printf("=== v3d LPF h_4_8 bench ===\n");
|
||
printf(" device: %s\n", v3d_runner_device_name(r));
|
||
printf(" n_edges: %d iters: %d seed: 0x%016llx\n",
|
||
n_edges, iters, (unsigned long long) (seed ? seed : 0xa57edbeef5717ULL));
|
||
|
||
/* Per-edge layout in dst buffer: edge i occupies bytes
|
||
* [i*64 .. i*64+63]. The "edge center" (column 4 of row 0) is at
|
||
* byte offset i*64 + 4. Stride between rows of the same edge = 8. */
|
||
size_t dst_bytes = (size_t) n_edges * EDGE_BYTES;
|
||
size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */
|
||
|
||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
|
||
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
|
||
|
||
/* Master pixel set + thresholds — kept stable across iters. */
|
||
uint8_t *master_pred = malloc(dst_bytes);
|
||
uint8_t *expected = malloc(dst_bytes);
|
||
int *Es = malloc(n_edges * sizeof(int));
|
||
int *Is = malloc(n_edges * sizeof(int));
|
||
int *Hs = malloc(n_edges * sizeof(int));
|
||
if (!master_pred || !expected || !Es || !Is || !Hs) { fprintf(stderr, "alloc\n"); return 1; }
|
||
|
||
for (int i = 0; i < n_edges; i++) {
|
||
gen_edge_pixels(master_pred + (size_t)i * EDGE_BYTES);
|
||
gen_thresholds(&Es[i], &Is[i], &Hs[i]);
|
||
}
|
||
|
||
/* Build C-ref expected output (separate copies, since the filter
|
||
* mutates dst in place). */
|
||
memcpy(expected, master_pred, dst_bytes);
|
||
for (int i = 0; i < n_edges; i++) {
|
||
daedalus_vp9_loop_filter_h_4_8_ref(
|
||
expected + (size_t)i * EDGE_BYTES + 4, /* col 4 of this edge */
|
||
EDGE_STRIDE, Es[i], Is[i], Hs[i]);
|
||
}
|
||
|
||
/* Populate GPU buffers. Asserts enforce phase4 §4 contracts. */
|
||
uint32_t *meta = (uint32_t *) buf_meta.mapped;
|
||
uint32_t dst_stride_u8 = EDGE_STRIDE;
|
||
assert(dst_stride_u8 >= 4 && "phase4 §4 contract 2 violated");
|
||
for (int i = 0; i < n_edges; i++) {
|
||
uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4);
|
||
assert(mx >= 4 && "phase4 §4 contract 1 violated");
|
||
meta[4*i + 0] = mx;
|
||
meta[4*i + 1] = (uint32_t) Es[i];
|
||
meta[4*i + 2] = (uint32_t) Is[i];
|
||
meta[4*i + 3] = (uint32_t) Hs[i];
|
||
}
|
||
memcpy(buf_dst.mapped, master_pred, dst_bytes);
|
||
|
||
/* --- Pre-flight estimate of fm/hev pass rates --- */
|
||
double fm_rate, hev_rate;
|
||
estimate_pass_rates(seed, 10000, &fm_rate, &hev_rate);
|
||
printf(" fm pass rate: %.2f%% (10k-edge sample)\n", fm_rate * 100);
|
||
printf(" hev pass rate: %.2f%% (of fm-passing)\n", hev_rate * 100);
|
||
|
||
/* --- Pipeline --- */
|
||
v3d_pipeline pipe = {0};
|
||
if (v3d_runner_create_pipeline(r, spv_path,
|
||
/*n_ssbos=*/2,
|
||
/*push_const_size=*/sizeof(push_consts),
|
||
&pipe)) return 1;
|
||
v3d_buffer bind_bufs[2] = { buf_meta, buf_dst };
|
||
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 2)) return 1;
|
||
|
||
const uint32_t edges_per_wg = 32;
|
||
uint32_t group_count_x = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg);
|
||
printf(" dispatch: %u WGs × 256 invocations = %u edges (rounded up from %d)\n",
|
||
group_count_x, group_count_x * edges_per_wg, n_edges);
|
||
|
||
push_consts pc = {
|
||
.n_edges = (uint32_t) n_edges,
|
||
.dst_stride_u8 = dst_stride_u8,
|
||
._pad0 = 0, ._pad1 = 0,
|
||
};
|
||
|
||
/* Record command buffer once. */
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||
if (cb == VK_NULL_HANDLE) return 1;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, group_count_x, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
|
||
/* --- M1'': bit-exact verification --- */
|
||
printf("\n=== M1'': QPU vs C-reference bit-exact ===\n");
|
||
memcpy(buf_dst.mapped, master_pred, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
|
||
int mismatch_edges = 0;
|
||
int total_byte_diffs = 0;
|
||
int prints = 0;
|
||
for (int i = 0; i < n_edges; i++) {
|
||
const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * EDGE_BYTES;
|
||
const uint8_t *e = expected + (size_t)i * EDGE_BYTES;
|
||
if (memcmp(q, e, EDGE_BYTES) != 0) {
|
||
int diffs = 0;
|
||
for (int j = 0; j < EDGE_BYTES; j++) if (q[j] != e[j]) diffs++;
|
||
total_byte_diffs += diffs;
|
||
if (prints < 3) {
|
||
fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d): %d/64 bytes differ\n",
|
||
i, Es[i], Is[i], Hs[i], diffs);
|
||
fprintf(stderr, " ref:");
|
||
for (int r0 = 0; r0 < 8; r0++) {
|
||
fprintf(stderr, "\n r%d ", r0);
|
||
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]);
|
||
}
|
||
fprintf(stderr, "\n qpu:");
|
||
for (int r0 = 0; r0 < 8; r0++) {
|
||
fprintf(stderr, "\n r%d ", r0);
|
||
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]);
|
||
}
|
||
fprintf(stderr, "\n");
|
||
prints++;
|
||
}
|
||
mismatch_edges++;
|
||
}
|
||
}
|
||
printf(" edges bit-exact: %d / %d (%.4f%%)\n",
|
||
n_edges - mismatch_edges, n_edges,
|
||
100.0 * (n_edges - mismatch_edges) / n_edges);
|
||
printf(" total byte diffs: %d / %zu (%.4f%%)\n",
|
||
total_byte_diffs, (size_t) n_edges * EDGE_BYTES,
|
||
100.0 * total_byte_diffs / ((double) n_edges * EDGE_BYTES));
|
||
|
||
if (mismatch_edges > 0) {
|
||
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
return 1;
|
||
}
|
||
|
||
if (verify_only) {
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
return 0;
|
||
}
|
||
|
||
/* --- M2'': throughput --- */
|
||
printf("\n=== M2'': QPU throughput ===\n");
|
||
|
||
for (int i = 0; i < 10; i++) { /* warm-up */
|
||
memcpy(buf_dst.mapped, master_pred, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
}
|
||
|
||
double t0 = now_seconds();
|
||
for (int i = 0; i < iters; i++) {
|
||
memcpy(buf_dst.mapped, master_pred, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
}
|
||
double t1 = now_seconds();
|
||
|
||
double s0 = now_seconds();
|
||
for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_pred, dst_bytes);
|
||
double s1 = now_seconds();
|
||
|
||
double kernel_seconds = (t1 - t0) - (s1 - s0);
|
||
double total_edges = (double) n_edges * iters;
|
||
double medges_s = total_edges / kernel_seconds / 1e6;
|
||
|
||
printf(" edges/dispatch: %d\n", n_edges);
|
||
printf(" iters: %d\n", iters);
|
||
printf(" total edges: %.0f\n", total_edges);
|
||
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds);
|
||
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
|
||
printf(" M2'' throughput = %.3f Medge/s\n", medges_s);
|
||
printf(" per-edge = %.1f ns\n", kernel_seconds / total_edges * 1e9);
|
||
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
|
||
|
||
double M3pp = 48.285; /* from k2_deblock_phase3.md */
|
||
double Rpp = medges_s / M3pp;
|
||
printf("\n Cycle 2 NEON M3'' = %.3f Medge/s\n", M3pp);
|
||
printf(" R'' = M2''/M3'' = %.3f\n", Rpp);
|
||
if (Rpp >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
|
||
else if (Rpp >= 0.5) printf(" decision band = YELLOW: M4'' decides\n");
|
||
else if (Rpp >= 0.1) printf(" decision band = ORANGE: M4'' may still rescue (cycle-1 calibration)\n");
|
||
else printf(" decision band = RED: structural mismatch\n");
|
||
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
free(master_pred); free(expected); free(Es); free(Is); free(Hs);
|
||
return 0;
|
||
}
|