Files
daedalus-fourier/tests/bench_v3d_idct.c
T
marfrit d66f22f333 Phase 6 (v1+v4 production) + Phase 7 closure: R = 0.92 ± 0.03 on hertz
First QPU IDCT8 kernel running and bit-exact on V3D 7.1 via Mesa
v3dv compute. Five iterations through a Phase 7→Phase 4' loopback;
production kernel is v4.

New files:
- src/v3d_runner.{c,h}  — reusable Vulkan compute plumbing (instance,
                          V3D device picker, HOST_VISIBLE|COHERENT
                          SSBOs with mmap, compute pipeline from .spv,
                          enables storageBuffer{8,16}BitAccess)
- src/v3d_idct8.comp    — VP9 8x8 DCT_DCT IDCT add, v4 production:
                          256 invocations/WG, 2 blocks/subgroup
                          (no idle lanes), uint8 dst SSBO (race-free
                          per phase5 finding 5), unrolled writes
                          (no chained ternary), oob-flag pattern
                          (barrier-safe per phase5 finding 7)
- tests/bench_v3d_idct.c — M1' bit-exact gate + M2 throughput vs C ref
- docs/phase7.md         — full iteration journey + decision verdict

CMakeLists.txt updated to build the new shader, library, and bench
when DAEDALUS_BUILD_VULKAN=ON.

Iteration record (1920x1088 luma, 32640 blocks/dispatch, N=3):

  ver  change                              R       ns/block
  v1   first-light                         0.230   533
  v2   kill ternary + 2-blocks-per-sg      0.474   258
  v3   per-pass scope oN                   0.481   254  (noise)
  v4   WG 64 -> 256 invocations            0.947   129
  v5   packed uint32 coeff reads           0.938   130  (noise, reverted)
  v4 final N=3                             0.918 +/- 0.033

Bit-exactness 100.0000% across all iterations (10000-block sample
on 128x128, 32640-block sample on 1080p) against both the C
reference (tests/vp9_idct8_ref.c) and the vendored FFmpeg NEON
ff_vp9_idct_idct_8x8_add_neon.

Key learning over the Phase 5 review's prediction model: the
chained ternary was NOT a spill killer on V3D 7.1 (shaderdb
showed 0:0 spills:fills even in v1). The actual lever was
workgroup-size-driven latency hiding — going from 64 to 256
invocations doubled throughput with the same compiled code
(270 inst, 2 threads, 21 max-temps, 0 spills) because the
v3dv scheduler had 4x more in-flight work to overlap TMU
latency.

Verdict per phase1.md decision rules: YELLOW band (0.5 <= R < 1.0)
by a wide margin, near GREEN boundary. Phase 1 YELLOW rule:
add M4 (concurrent CPU+QPU throughput) before honest-close or
continue. M4 is the next measurement, not more shader tuning —
at R = 0.92 with all 4 A76 cores still 100% free for other work,
the question is whether the system aggregate beats pure 4-core
NEON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:09:00 +00:00

335 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Phase 6 — first-light QPU bench for VP9 8×8 DCT_DCT IDCT add on V3D 7.1.
*
* Reports:
* M1' (correctness): bit-exact rate, QPU output vs C reference,
* across N synthetic blocks.
* M2 (throughput): QPU sustained Mblock/s over K dispatched frames.
*
* Compares against M3 (bench_neon_idct) to compute R = M2 / M3.
* Decision rules per docs/phase1.md §"Decision rules".
*
* License: BSD-2-Clause. Links statically against the LGPL-2.1+
* vp9_idct8_ref.c (a clean-room transcription from spec), so this
* binary distributes under BSD-2-Clause-or-later if separated; left
* as LGPL-2.1+ when linked together.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/*
 * C bit-exact reference from tests/vp9_idct8_ref.c.
 *
 * As used by main() in this file:
 *   dst    - top-left pixel of the 8x8 destination block inside the frame
 *   stride - distance between consecutive destination rows (main passes
 *            the tightly packed frame width, i.e. bytes)
 *   block  - the block's 64 int16 coefficients; main notes that the
 *            reference mutates (zeroes) this array, so it always hands
 *            over a scratch copy
 *   eob    - end-of-block value produced by gen_block()
 * NOTE(review): exact eob semantics live in vp9_idct8_ref.c, which is not
 * visible from this file - confirm there.
 */
extern void daedalus_vp9_idct_idct_8x8_add_ref(
uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* ---- RNG (matches bench_neon_idct.c shape for reproducibility) -- */
/* Generator state word; main() seeds it before the first draw. */
static uint64_t xs64_state;

/*
 * Advance the xorshift generator one step (shift triple 13 / 7 / 17),
 * store the new word back into xs64_state and return it.
 * A zero state is a fixed point: it keeps returning zero.
 */
static inline uint64_t xs64(void)
{
    uint64_t word = xs64_state;

    word ^= word << 13;
    word ^= word >> 7;
    word ^= word << 17;

    xs64_state = word;
    return word;
}
/*
 * Fill `block` with a sparse synthetic coefficient set and return its
 * end-of-block value.
 *
 * The block is cleared, then between 1 and 16 writes are made at random
 * positions (positions may repeat, so fewer distinct slots can end up
 * set). Each written value lies in [-4096, 4095]. The return value is
 * the highest written index plus one, and is never less than 1.
 *
 * Draw order from xs64() is: write count, then (position, value) per
 * write. Reordering the draws would change the generated sequence.
 */
static int gen_block(int16_t block[64])
{
    int last = 0;

    memset(block, 0, 64 * sizeof(block[0]));

    int remaining = (int)(xs64() % 16) + 1;
    while (remaining-- > 0) {
        const int idx = (int)(xs64() % 64);
        const int value = (int)(xs64() % 8192) - 4096;

        block[idx] = (int16_t)value;
        if (idx >= last) {
            last = idx + 1;
        }
    }

    return last > 0 ? last : 1;
}
/*
 * Read CLOCK_MONOTONIC_RAW and return it as seconds in a double
 * (whole seconds plus the nanosecond part scaled by 1e-9).
 * The clock_gettime() status is not inspected.
 */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = (double)stamp.tv_sec;
    const double frac = (double)stamp.tv_nsec * 1e-9;
    return whole + frac;
}
/* ---- Push-constant layout — must match src/v3d_idct8.comp ------- */
/*
 * Push-constant block handed to the compute shader via
 * vkCmdPushConstants(). Four 32-bit words; field order and sizes are an
 * ABI with the shader, so do not reorder or resize without changing both.
 */
typedef struct {
/* Total number of 8x8 blocks in the frame (main: blocks_per_row * rows_of_blocks). */
uint32_t n_blocks;
/* Frame width measured in 8x8 blocks. */
uint32_t blocks_per_row;
/* Destination row stride; main passes the tightly packed frame width in bytes. */
uint32_t dst_stride_u8;
/* Always written as 0 by main; presumably pads the block to 16 bytes - confirm against the shader. */
uint32_t _pad;
} push_consts;
/* ---- Main ------------------------------------------------------- */
int main(int argc, char **argv)
{
/* Default synthetic frame: 128×128 pixels = 16×16 blocks = 256
* blocks. Small enough for fast bring-up; large enough that the
* 4-blocks/WG geometry gets exercised (64 WGs). */
int blocks_per_row = 16;
int rows_of_blocks = 16;
int iters = 100;
uint64_t seed = 0;
const char *spv_path = "v3d_idct8.spv";
int verify_only = 0;
int max_mismatch_print = 4;
static struct option opts[] = {
{"width", required_argument, 0, 'w'},
{"height", required_argument, 0, 'h'},
{"iters", required_argument, 0, 'i'},
{"seed", required_argument, 0, 's'},
{"spv", required_argument, 0, 'S'},
{"verify-only", no_argument, 0, 'V'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "w:h:i:s:S:V", opts, 0)) != -1;) {
switch (c) {
case 'w': blocks_per_row = atoi(optarg) / 8; break;
case 'h': rows_of_blocks = atoi(optarg) / 8; break;
case 'i': iters = atoi(optarg); break;
case 's': seed = strtoull(optarg, 0, 0); break;
case 'S': spv_path = optarg; break;
case 'V': verify_only = 1; break;
default: return 2;
}
}
int dst_width = blocks_per_row * 8;
int dst_height = rows_of_blocks * 8;
int dst_stride = dst_width; /* tightly packed */
size_t n_blocks = (size_t)blocks_per_row * rows_of_blocks;
size_t dst_bytes = (size_t)dst_height * dst_stride;
printf("=== v3d IDCT8 first-light ===\n");
printf(" frame: %dx%d (%dx%d blocks, %zu blocks total)\n",
dst_width, dst_height, blocks_per_row, rows_of_blocks, n_blocks);
printf(" spv: %s\n", spv_path);
printf(" iters: %d (for throughput phase)\n", iters);
xs64_state = seed ? seed : 0xdeadbeefcafebabeULL;
/* ---- Init runner ---- */
v3d_runner *r = v3d_runner_create();
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
printf(" device: %s\n", v3d_runner_device_name(r));
/* ---- Buffers ---- */
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
if (v3d_runner_create_buffer(r, n_blocks * 64 * sizeof(int16_t), &buf_coeffs)) return 1;
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
if (v3d_runner_create_buffer(r, n_blocks * 2 * sizeof(uint32_t), &buf_meta)) return 1;
/* Fill master inputs — these stay constant across iterations. */
int16_t *master_coeffs = malloc(n_blocks * 64 * sizeof(int16_t));
uint8_t *master_pred = malloc(dst_bytes);
uint8_t *expected_dst = malloc(dst_bytes); /* C-reference output */
int *eobs = malloc(n_blocks * sizeof(int));
if (!master_coeffs || !master_pred || !expected_dst || !eobs) return 1;
for (size_t b = 0; b < n_blocks; b++)
eobs[b] = gen_block(master_coeffs + b * 64);
for (size_t i = 0; i < dst_bytes; i++)
master_pred[i] = (uint8_t)(xs64() & 0xff);
/* Build the expected (C-reference) output frame. The C ref
* mutates its input block (zeros it after column pass), so we
* work on copies. */
memcpy(expected_dst, master_pred, dst_bytes);
int16_t scratch[64];
for (size_t b = 0; b < n_blocks; b++) {
int bx = (int)(b % blocks_per_row);
int by = (int)(b / blocks_per_row);
memcpy(scratch, master_coeffs + b * 64, sizeof(scratch));
daedalus_vp9_idct_idct_8x8_add_ref(
expected_dst + by * 8 * dst_stride + bx * 8,
dst_stride, scratch, eobs[b]);
}
/* Populate GPU buffers. */
memcpy(buf_coeffs.mapped, master_coeffs, buf_coeffs.size);
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
uint32_t *meta = (uint32_t *) buf_meta.mapped;
for (size_t b = 0; b < n_blocks; b++) {
meta[2*b + 0] = (uint32_t)(b % blocks_per_row); /* block_x_8 */
meta[2*b + 1] = (uint32_t)(b / blocks_per_row); /* block_y_8 */
}
/* ---- Pipeline ---- */
v3d_pipeline pipe = {0};
if (v3d_runner_create_pipeline(r, spv_path,
/*n_ssbos=*/3,
/*push_const_size=*/sizeof(push_consts),
&pipe)) return 1;
v3d_buffer bind_bufs[3] = { buf_coeffs, buf_dst, buf_meta };
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
/* ---- Dispatch geometry ---- */
/* v4: 32 blocks per WG (2 per 16-lane subgroup × 16 subgroups).
* 4× v2's count — more in-flight work per WG for latency hiding. */
const uint32_t blocks_per_wg = 32;
uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1)
/ blocks_per_wg);
printf(" dispatch: %u WGs × 64 invocations = %u blocks (rounded up from %zu)\n",
group_count_x, group_count_x * blocks_per_wg, n_blocks);
push_consts pc = {
.n_blocks = (uint32_t)n_blocks,
.blocks_per_row = (uint32_t)blocks_per_row,
.dst_stride_u8 = (uint32_t)dst_stride,
._pad = 0,
};
/* Record once, reuse for every iteration. */
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
if (cb == VK_NULL_HANDLE) return 1;
VkCommandBufferBeginInfo cbbi = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
};
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pc), &pc);
vkCmdDispatch(cb, group_count_x, 1, 1);
vkEndCommandBuffer(cb);
/* ---- M1': bit-exact verification (first dispatch only) ---- */
printf("\n=== M1': QPU vs C-reference bit-exact ===\n");
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
if (v3d_runner_submit_wait(r, cb)) return 1;
int mismatch_blocks = 0;
int total_byte_diffs = 0;
for (size_t b = 0; b < n_blocks; b++) {
int bx = (int)(b % blocks_per_row);
int by = (int)(b / blocks_per_row);
const uint8_t *qpu_block = (uint8_t *)buf_dst.mapped
+ by * 8 * dst_stride + bx * 8;
const uint8_t *ref_block = expected_dst
+ by * 8 * dst_stride + bx * 8;
int block_diffs = 0;
for (int r0 = 0; r0 < 8; r0++)
for (int c = 0; c < 8; c++)
if (qpu_block[r0 * dst_stride + c]
!= ref_block[r0 * dst_stride + c]) {
block_diffs++;
total_byte_diffs++;
}
if (block_diffs > 0 && mismatch_blocks < max_mismatch_print) {
fprintf(stderr,
"MISMATCH block %zu @ (bx=%d by=%d) eob=%d: %d/64 bytes differ\n",
b, bx, by, eobs[b], block_diffs);
fprintf(stderr, " ref:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c = 0; c < 8; c++)
fprintf(stderr, "%3u ", ref_block[r0 * dst_stride + c]);
}
fprintf(stderr, "\n qpu:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c = 0; c < 8; c++)
fprintf(stderr, "%3u ", qpu_block[r0 * dst_stride + c]);
}
fprintf(stderr, "\n");
}
if (block_diffs > 0) mismatch_blocks++;
}
printf(" blocks bit-exact: %zu / %zu (%.4f%%)\n",
n_blocks - mismatch_blocks, n_blocks,
100.0 * (n_blocks - mismatch_blocks) / n_blocks);
printf(" total byte diffs: %d / %zu (%.4f%%)\n",
total_byte_diffs, n_blocks * 64,
100.0 * total_byte_diffs / (n_blocks * 64));
if (mismatch_blocks > 0) {
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_coeffs);
v3d_runner_destroy(r);
return 1;
}
if (verify_only) {
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_coeffs);
v3d_runner_destroy(r);
return 0;
}
/* ---- M2: throughput ---- */
printf("\n=== M2: QPU throughput ===\n");
/* Warm-up. */
for (int i = 0; i < 10; i++) {
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t0 = now_seconds();
for (int i = 0; i < iters; i++) {
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t1 = now_seconds();
/* Setup-only timing for memcpy subtraction. */
double s0 = now_seconds();
for (int i = 0; i < iters; i++) {
memcpy(buf_dst.mapped, master_pred, buf_dst.size);
}
double s1 = now_seconds();
double total_seconds = (t1 - t0) - (s1 - s0);
double total_blocks = (double) n_blocks * iters;
double mblocks_s = total_blocks / total_seconds / 1e6;
printf(" blocks/dispatch: %zu\n", n_blocks);
printf(" iters: %d\n", iters);
printf(" total blocks: %.0f\n", total_blocks);
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", total_seconds);
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
printf(" M2 throughput = %.3f Mblock/s\n", mblocks_s);
printf(" per-block = %.1f ns\n",
total_seconds / total_blocks * 1e9);
printf(" per-dispatch = %.1f us\n",
total_seconds / iters * 1e6);
/* R = M2 / M3 = M2 / 8.171 Mblock/s (Phase 3 baseline). */
double M3 = 8.171;
double R = mblocks_s / M3;
printf("\n Phase 3 NEON M3 = %.3f Mblock/s\n", M3);
printf(" R = M2 / M3 = %.3f\n", R);
if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
else if (R >= 0.5) printf(" decision band = YELLOW: concurrent-work hypothesis viable\n");
else if (R >= 0.1) printf(" decision band = ORANGE: material loss; honest close suggested\n");
else printf(" decision band = RED: structural mismatch\n");
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_coeffs);
v3d_runner_destroy(r);
free(master_coeffs); free(master_pred); free(expected_dst); free(eobs);
return 0;
}