Files
daedalus-fourier/tests/bench_v3d_lpf.c
T
marfrit 36eca40ff2 Cycle 2 (LPF) closure: M1''=100%, R''=0.41, M4''=+6.9%, PASS
Phase 4 plan + Phase 5 second-model review (PASS-WITH-REVISIONS:
2 YELLOW contract gaps applied) + Phase 6 v1 implementation +
Phase 7 verification including M4'' concurrent gate.

Phase 5'' review delivered cleanly — no RED bugs (cycle 1 lessons
applied successfully). 2 YELLOW findings baked into phase4 §4:
  - stride >= 4 contract added alongside m.x >= 4 (finding 2)
  - assert(...) in bench made a MUST not a suggestion (finding 4)
  - V3D divergence-cost note: don't restructure to always-execute,
    masked lanes consume clock anyway (finding 3, informational)

Phase 6 v1 first-light hit M1'' 100.0000% bit-exact on first run
(65536/65536 edges) — the cycle-1 v4 patterns (WG=256, 2-per-sg,
uint8_t SSBO, oob early-return discipline) baked in from start
worked as expected.

Performance:

  M2'' = 19.645 Medge/s     (50.9 ns/edge)
  M3'' = 48.285 Medge/s     (NEON baseline from phase3)
  R''  = 0.41               (ORANGE band - doesn't auto-close per
                             cycle-1 calibration adjustment)

shaderdb: 160 inst, **4 threads**, 0 spills, 21 max-temps —
shader is already at the compiler ceiling. No v2/v3/v4 iteration
loop like cycle 1 because there's nothing more to extract from
the compiled shape. The 30x gap between theoretical instruction
throughput and measured wall-clock is divergence-tax + memory
latency, not compile quality.

M4'' concurrent matrix on hertz (8s windows):

  NEON-1 LPF          41.131 Medge/s
  NEON-4 LPF          33.726 Medge/s  <- realistic CPU ceiling
                                          (per-core 7-9; same
                                          bandwidth-saturation as
                                          cycle-1 F1)
  QPU only            14.299 Medge/s
  MIXED NEON-3 + QPU  36.049 Medge/s  <- +6.9% over NEON-4
  MIXED NEON-4 + QPU  31.892 Medge/s  <- -5.4% oversubscribed

The "freed-core" pattern generalizes from IDCT to LPF: NEON-3+QPU
beats pure NEON-4 by ~7% in both cycles. Cycle-2 NEW finding:
**oversubscribed mode hurts for lighter kernels** (LPF -5.4% vs
cycle-1 IDCT +9.4%). Recommendation for higgs deployment hardens
to "always N-1 NEON cores + QPU, never N + QPU".

Phase 9 lessons (in phase7 §"Phase 9 lessons"):
1. Cycle-1 v4-pattern is the v1 starting point (saves 3 iterations)
2. Phase 5 review pays off every cycle
3. R measured in isolation is misleading on bandwidth-saturated hardware
4. Oversubscription tax depends on kernel weight
5. shaderdb 4-threads/0-spills means compute is not the bottleneck

New artifacts:
- src/v3d_lpf_h_4_8.comp                — GLSL kernel
- tests/bench_v3d_lpf.c                 — M1'' + M2'' harness with
                                          contract asserts + fm/hev
                                          pass-rate instrumentation
- tests/bench_concurrent_lpf.c          — M4'' pthread bench
                                          (mirrors bench_concurrent.c)
- docs/k2_deblock_phase{4,5,7}.md       — plan + review + verification

Project verdict: continue. Cycle 3 candidates: MC interpolation
(multiply-heavy, stress V3D SMUL24), CDEF (AV1-only, different
neighborhood shape), or wd=8/wd=16 LPF variants. User to direct.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:39:26 +00:00

355 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 2 Phase 6 — QPU bench for VP9 4-tap inner loop filter on V3D 7.1.
*
* Reports:
* M1'' (correctness): bit-exact rate, QPU output vs C reference
* M2'' (throughput): QPU sustained Medge/s over K dispatched batches
* fm/hev pass rates (phase5'' finding 8 instrumentation)
*
* Asserts the two contracts from k2_deblock_phase4.md §4
* (phase5'' findings 2+4): m.x ≥ 4, dst_stride ≥ 4.
*
* License: BSD-2-Clause.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/* C reference implementation of the filter; declared here, defined in
 * another translation unit. This bench calls it once per edge with `dst`
 * pointing at column 4 of row 0 of that edge's tile and `stride` set to
 * EDGE_STRIDE; the filter modifies the pixels in place. E, I and H are the
 * per-edge threshold values produced by gen_thresholds(). */
extern void daedalus_vp9_loop_filter_h_4_8_ref(
uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
/* --- RNG / generators (match bench_neon_lpf.c shape) ------------- */
/* State word of the xorshift generator below. A zero state is a fixed
 * point (the generator would emit zeros forever), so seed it non-zero. */
static uint64_t xs_state;

/* Advance the generator one step using the 13 / 7 / 17 shift-xor triple,
 * store the result back into xs_state and return it. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);
    xs_state = s;
    return s;
}
/* Geometry of one synthetic edge tile. Each tile is stored as EDGE_H rows
 * of EDGE_W pixels, one byte per pixel. */
/* Byte distance between the starts of two consecutive rows of a tile. */
#define EDGE_STRIDE 8
/* Number of pixel columns generated per row. */
#define EDGE_W 8
/* Number of rows generated per tile. */
#define EDGE_H 8
/* Total bytes occupied by one tile; also the spacing between tiles in the
 * destination buffer. */
#define EDGE_BYTES (EDGE_H * EDGE_STRIDE) /* 64 */
/* Fill one tile (EDGE_H rows, EDGE_STRIDE bytes apart) with a synthetic
 * step edge: columns 0..3 sit around one random level, columns 4..7 around
 * another, and every pixel gets uniform noise in [-amp, +amp]. Results are
 * clamped to 0..255. Consumes three xs() draws up front, then one per pixel
 * in row-major order. */
static void gen_edge_pixels(uint8_t *buf)
{
    const int left_level = (int)(xs() % 200) + 20;
    const int right_level = (int)(xs() % 200) + 20;
    const int amp = (int)(xs() % 30);
    const int span = 2 * amp + 1;   /* number of distinct noise values */

    for (int row = 0; row < EDGE_H; row++) {
        uint8_t *line = buf + row * EDGE_STRIDE;

        for (int col = 0; col < EDGE_W; col++) {
            const int level = (col < 4) ? left_level : right_level;
            const int jitter = ((int)(xs() % span)) - amp;
            int px = level + jitter;

            if (px < 0)
                px = 0;
            else if (px > 255)
                px = 255;
            line[col] = (uint8_t) px;
        }
    }
}
/* Draw one random threshold triple from xs(), in the order E, I, H:
 * E in 0..80, I in 0..40, H in 0..10. */
static void gen_thresholds(int *E, int *I, int *H)
{
    const uint64_t e_draw = xs();
    const uint64_t i_draw = xs();
    const uint64_t h_draw = xs();

    *E = (int)(e_draw % 81);
    *I = (int)(i_draw % 41);
    *H = (int)(h_draw % 11);
}
/* Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus nanoseconds scaled by 1e-9). */
static double now_seconds(void)
{
    struct timespec stamp;
    double whole;
    double frac;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);
    whole = stamp.tv_sec;
    frac = stamp.tv_nsec * 1e-9;
    return whole + frac;
}
/* --- Push constants — match shader layout ------------------------ */
/* Push-constant block handed to the compute shader via vkCmdPushConstants;
 * sizeof(push_consts) is also passed as the pipeline's push-constant size.
 * Four 32-bit words, 16 bytes in total. */
typedef struct {
/* Number of edges to process; main() sets this to the --edges value. */
uint32_t n_edges;
/* Row stride of an edge tile in bytes; main() sets this to EDGE_STRIDE. */
uint32_t dst_stride_u8;
/* Padding words; main() writes zero to both. */
uint32_t _pad0;
uint32_t _pad1;
} push_consts;
/* --- Pre-flight: fm/hev rate on the same RNG seed (informational) - */
/*
 * Estimate how often the fm and hev tests pass on generated input
 * (informational instrumentation only).
 *
 * Re-seeds the shared generator with `seed` (or the built-in default when
 * seed is 0), generates `n_edges` tiles plus threshold triples in the same
 * order main() does, and evaluates the tests on row 0 of each tile only.
 * The caller's generator state is saved on entry and restored on exit.
 *
 *   *fm_rate  = fraction of sampled edges whose row 0 passes fm.
 *   *hev_rate = fraction of the fm-passing edges that also pass hev
 *               (this is what the caller prints as "of fm-passing").
 *
 * Either rate is reported as 0.0 when its denominator is zero.
 * NOTE(review): fm/hev presumably correspond to the VP9 filter-mask and
 * high-edge-variance tests of the reference filter; confirm against it.
 */
static void estimate_pass_rates(uint64_t seed, int n_edges,
                                double *fm_rate, double *hev_rate)
{
    const uint64_t saved = xs_state;
    int fm_pass = 0;
    int hev_pass = 0;
    uint8_t buf[EDGE_BYTES];

    xs_state = seed ? seed : 0xa57edbeef5717ULL;

    for (int i = 0; i < n_edges; i++) {
        int E, I, H;

        /* Same draw order as main(): pixels first, then thresholds. */
        gen_edge_pixels(buf);
        gen_thresholds(&E, &I, &H);

        /* Row 0 only: p3..p0 are columns 0..3, q0..q3 are columns 4..7. */
        const uint8_t *d = buf + 4;
        const int p3 = d[-4], p2 = d[-3], p1 = d[-2], p0 = d[-1];
        const int q0 = d[0], q1 = d[1], q2 = d[2], q3 = d[3];

        const int d_p1p0 = abs(p1 - p0);
        const int d_q1q0 = abs(q1 - q0);

        /* fm: every neighbouring step within I, and the cross-edge term
         * 2*|p0-q0| + |p1-q1|/2 within E. */
        const int fm = abs(p3 - p2) <= I && abs(p2 - p1) <= I &&
                       d_p1p0 <= I && d_q1q0 <= I &&
                       abs(q2 - q1) <= I && abs(q3 - q2) <= I &&
                       abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;

        if (fm) {
            fm_pass++;
            /* hev is only counted among edges that already passed fm. */
            if (d_p1p0 > H || d_q1q0 > H)
                hev_pass++;
        }
    }

    xs_state = saved;

    *fm_rate = (n_edges > 0) ? (double) fm_pass / n_edges : 0.0;
    /* Conditional rate: denominator is the fm-passing count, not n_edges. */
    *hev_rate = (fm_pass > 0) ? (double) hev_pass / fm_pass : 0.0;
}
/* --- Main ------------------------------------------------------- */
/*
 * Parse `text` as a base-10 integer in the range [1, max].
 * Returns 0 and stores the value in *out on success; returns -1 (leaving
 * *out untouched) on empty input, trailing garbage or an out-of-range value.
 * strtol saturates on overflow, so the range check also rejects overflow.
 */
static int parse_positive_int(const char *text, long max, int *out)
{
    char *end = NULL;
    long v = strtol(text, &end, 10);

    if (end == text || *end != '\0' || v <= 0 || v > max)
        return -1;
    *out = (int) v;
    return 0;
}

/*
 * Bench entry point.
 *
 * Options: --edges/-e N, --iters/-i N, --seed/-s S, --spv/-S PATH,
 * --verify-only/-V.
 *
 * Flow: generate N edge tiles + thresholds, compute the expected output
 * with the C reference, upload to two GPU buffers, record one command
 * buffer, run it once and compare byte-for-byte (M1''), then - unless
 * verification failed or --verify-only was given - time `iters`
 * re-submissions of the same command buffer (M2'').
 *
 * Exit status: 0 on success, 1 on any setup/verification/runtime failure,
 * 2 on a command-line error. Every exit path after runner creation goes
 * through the staged cleanup labels at the bottom.
 */
int main(int argc, char **argv)
{
    int n_edges = 65536;
    int iters = 100;
    int verify_only = 0;
    uint64_t seed = 0;
    const char *spv_path = "v3d_lpf_h_4_8.spv";
    /* Per-edge byte offsets are stored in uint32 meta words, so the whole
     * destination buffer must stay addressable with 32 bits. */
    const long max_edges = (long)(UINT32_MAX / EDGE_BYTES);
    static struct option opts[] = {
        {"edges", required_argument, 0, 'e'},
        {"iters", required_argument, 0, 'i'},
        {"seed", required_argument, 0, 's'},
        {"spv", required_argument, 0, 'S'},
        {"verify-only", no_argument, 0, 'V'},
        {0,0,0,0}
    };

    for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) {
        switch (c) {
        case 'e':
            if (parse_positive_int(optarg, max_edges, &n_edges)) {
                fprintf(stderr, "--edges must be an integer in [1, %ld]\n", max_edges);
                return 2;
            }
            break;
        case 'i':
            if (parse_positive_int(optarg, (long) INT32_MAX, &iters)) {
                fprintf(stderr, "--iters must be an integer in [1, %ld]\n", (long) INT32_MAX);
                return 2;
            }
            break;
        case 's': seed = strtoull(optarg, 0, 0); break;
        case 'S': spv_path = optarg; break;
        case 'V': verify_only = 1; break;
        default: return 2;
        }
    }

    /* Seed 0 selects the built-in default (a zero xorshift state is stuck). */
    const uint64_t effective_seed = seed ? seed : 0xa57edbeef5717ULL;
    xs_state = effective_seed;

    /* Everything the cleanup labels touch is declared before the first goto. */
    int rc = 1;
    uint8_t *master_pred = NULL;
    uint8_t *expected = NULL;
    int *Es = NULL;
    int *Is = NULL;
    int *Hs = NULL;
    v3d_buffer buf_meta = {0}, buf_dst = {0};
    v3d_pipeline pipe = {0};

    /* --- Setup ---- */
    v3d_runner *r = v3d_runner_create();
    if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
    printf("=== v3d LPF h_4_8 bench ===\n");
    printf(" device: %s\n", v3d_runner_device_name(r));
    printf(" n_edges: %d iters: %d seed: 0x%016llx\n",
           n_edges, iters, (unsigned long long) effective_seed);

    /* Per-edge layout in dst buffer: edge i occupies bytes
     * [i*64 .. i*64+63]. The "edge center" (column 4 of row 0) is at
     * byte offset i*64 + 4. Stride between rows of the same edge = 8. */
    const size_t dst_bytes = (size_t) n_edges * EDGE_BYTES;
    const size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t); /* uvec4 per edge */
    if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) goto out_runner;
    if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) goto out_meta;

    /* Master pixel set + thresholds - kept stable across iters. */
    master_pred = malloc(dst_bytes);
    expected = malloc(dst_bytes);
    Es = malloc((size_t) n_edges * sizeof *Es);
    Is = malloc((size_t) n_edges * sizeof *Is);
    Hs = malloc((size_t) n_edges * sizeof *Hs);
    if (!master_pred || !expected || !Es || !Is || !Hs) {
        fprintf(stderr, "alloc\n");
        goto out_dst;
    }
    for (int i = 0; i < n_edges; i++) {
        gen_edge_pixels(master_pred + (size_t)i * EDGE_BYTES);
        gen_thresholds(&Es[i], &Is[i], &Hs[i]);
    }

    /* Build C-ref expected output (separate copies, since the filter
     * mutates dst in place). */
    memcpy(expected, master_pred, dst_bytes);
    for (int i = 0; i < n_edges; i++) {
        daedalus_vp9_loop_filter_h_4_8_ref(
            expected + (size_t)i * EDGE_BYTES + 4, /* col 4 of this edge */
            EDGE_STRIDE, Es[i], Is[i], Hs[i]);
    }

    /* Populate GPU buffers. Asserts enforce phase4 §4 contracts. */
    uint32_t *meta = (uint32_t *) buf_meta.mapped;
    uint32_t dst_stride_u8 = EDGE_STRIDE;
    assert(dst_stride_u8 >= 4 && "phase4 §4 contract 2 violated");
    for (int i = 0; i < n_edges; i++) {
        /* Cannot wrap: n_edges was range-checked against max_edges. */
        uint32_t mx = (uint32_t)((size_t)i * EDGE_BYTES + 4);
        assert(mx >= 4 && "phase4 §4 contract 1 violated");
        meta[4*i + 0] = mx;
        meta[4*i + 1] = (uint32_t) Es[i];
        meta[4*i + 2] = (uint32_t) Is[i];
        meta[4*i + 3] = (uint32_t) Hs[i];
    }
    memcpy(buf_dst.mapped, master_pred, dst_bytes);

    /* --- Pre-flight estimate of fm/hev pass rates --- */
    double fm_rate, hev_rate;
    estimate_pass_rates(seed, 10000, &fm_rate, &hev_rate);
    printf(" fm pass rate: %.2f%% (10k-edge sample)\n", fm_rate * 100);
    printf(" hev pass rate: %.2f%% (of fm-passing)\n", hev_rate * 100);

    /* --- Pipeline --- */
    if (v3d_runner_create_pipeline(r, spv_path,
                                   /*n_ssbos=*/2,
                                   /*push_const_size=*/sizeof(push_consts),
                                   &pipe)) goto out_dst;
    v3d_buffer bind_bufs[2] = { buf_meta, buf_dst };
    if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 2)) goto out_pipe;

    const uint32_t edges_per_wg = 32;
    uint32_t group_count_x = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg);
    printf(" dispatch: %u WGs × 256 invocations = %u edges (rounded up from %d)\n",
           group_count_x, group_count_x * edges_per_wg, n_edges);
    push_consts pc = {
        .n_edges = (uint32_t) n_edges,
        .dst_stride_u8 = dst_stride_u8,
        ._pad0 = 0, ._pad1 = 0,
    };

    /* Record command buffer once; it is re-submitted for every run below. */
    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
    if (cb == VK_NULL_HANDLE) goto out_pipe;
    VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
    /* VK_SUCCESS is 0, so any non-zero result here is a failure. */
    if (vkBeginCommandBuffer(cb, &cbbi)) {
        fprintf(stderr, "vkBeginCommandBuffer failed\n");
        goto out_pipe;
    }
    vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
    vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
                            pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
    vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
                       0, sizeof(pc), &pc);
    vkCmdDispatch(cb, group_count_x, 1, 1);
    if (vkEndCommandBuffer(cb)) {
        fprintf(stderr, "vkEndCommandBuffer failed\n");
        goto out_pipe;
    }

    /* --- M1'': bit-exact verification --- */
    printf("\n=== M1'': QPU vs C-reference bit-exact ===\n");
    memcpy(buf_dst.mapped, master_pred, dst_bytes);
    if (v3d_runner_submit_wait(r, cb)) goto out_pipe;
    int mismatch_edges = 0;
    size_t total_byte_diffs = 0;   /* can exceed INT_MAX for large --edges */
    int prints = 0;
    for (int i = 0; i < n_edges; i++) {
        const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * EDGE_BYTES;
        const uint8_t *e = expected + (size_t)i * EDGE_BYTES;
        if (memcmp(q, e, EDGE_BYTES) != 0) {
            int diffs = 0;
            for (int j = 0; j < EDGE_BYTES; j++) if (q[j] != e[j]) diffs++;
            total_byte_diffs += (size_t) diffs;
            /* Dump only the first three failing tiles to keep logs readable. */
            if (prints < 3) {
                fprintf(stderr, "MISMATCH edge %d (E=%d I=%d H=%d): %d/64 bytes differ\n",
                        i, Es[i], Is[i], Hs[i], diffs);
                fprintf(stderr, " ref:");
                for (int r0 = 0; r0 < 8; r0++) {
                    fprintf(stderr, "\n r%d ", r0);
                    for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]);
                }
                fprintf(stderr, "\n qpu:");
                for (int r0 = 0; r0 < 8; r0++) {
                    fprintf(stderr, "\n r%d ", r0);
                    for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]);
                }
                fprintf(stderr, "\n");
                prints++;
            }
            mismatch_edges++;
        }
    }
    printf(" edges bit-exact: %d / %d (%.4f%%)\n",
           n_edges - mismatch_edges, n_edges,
           100.0 * (n_edges - mismatch_edges) / n_edges);
    printf(" total byte diffs: %zu / %zu (%.4f%%)\n",
           total_byte_diffs, (size_t) n_edges * EDGE_BYTES,
           100.0 * total_byte_diffs / ((double) n_edges * EDGE_BYTES));
    if (mismatch_edges > 0) {
        fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
        goto out_pipe;
    }
    if (verify_only) {
        rc = 0;
        goto out_pipe;
    }

    /* --- M2'': throughput --- */
    printf("\n=== M2'': QPU throughput ===\n");
    for (int i = 0; i < 10; i++) { /* warm-up */
        memcpy(buf_dst.mapped, master_pred, dst_bytes);
        if (v3d_runner_submit_wait(r, cb)) goto out_pipe;
    }
    double t0 = now_seconds();
    for (int i = 0; i < iters; i++) {
        memcpy(buf_dst.mapped, master_pred, dst_bytes);
        if (v3d_runner_submit_wait(r, cb)) goto out_pipe;
    }
    double t1 = now_seconds();
    /* Time the input refresh alone so it can be subtracted from the total. */
    double s0 = now_seconds();
    for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_pred, dst_bytes);
    double s1 = now_seconds();
    double kernel_seconds = (t1 - t0) - (s1 - s0);
    if (kernel_seconds <= 0.0) {
        fprintf(stderr, "timing window too short: kernel=%.6f s after subtracting setup=%.6f s\n",
                kernel_seconds, s1 - s0);
        goto out_pipe;
    }
    double total_edges = (double) n_edges * iters;
    double medges_s = total_edges / kernel_seconds / 1e6;
    printf(" edges/dispatch: %d\n", n_edges);
    printf(" iters: %d\n", iters);
    printf(" total edges: %.0f\n", total_edges);
    printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds);
    printf(" elapsed (setup) =%.6f s\n", s1 - s0);
    printf(" M2'' throughput = %.3f Medge/s\n", medges_s);
    printf(" per-edge = %.1f ns\n", kernel_seconds / total_edges * 1e9);
    printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
    double M3pp = 48.285; /* from k2_deblock_phase3.md */
    double Rpp = medges_s / M3pp;
    printf("\n Cycle 2 NEON M3'' = %.3f Medge/s\n", M3pp);
    printf(" R'' = M2''/M3'' = %.3f\n", Rpp);
    if (Rpp >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
    else if (Rpp >= 0.5) printf(" decision band = YELLOW: M4'' decides\n");
    else if (Rpp >= 0.1) printf(" decision band = ORANGE: M4'' may still rescue (cycle-1 calibration)\n");
    else printf(" decision band = RED: structural mismatch\n");

    rc = 0;

    /* Staged cleanup: each label releases what had been created by the
     * time the corresponding failure could occur, in reverse order. */
out_pipe:
    v3d_runner_destroy_pipeline(r, &pipe);
out_dst:
    v3d_runner_destroy_buffer(r, &buf_dst);
out_meta:
    v3d_runner_destroy_buffer(r, &buf_meta);
out_runner:
    v3d_runner_destroy(r);
    free(master_pred); free(expected); free(Es); free(Is); free(Hs);
    return rc;
}