Files
daedalus-fourier/tests/bench_v3d_cdef.c
T
marfrit 5223d3cb3f Cycle 5 closed: CDEF QPU R5=0.116 ORANGE, opportunistic helper
Phase 4 plan with 3 Phase-5 REDs applied inline:
  - meta layout: m.z=tmp_off, m.w=dir
  - sec_shift clamped to >=0 (NEON uqsub semantics)
  - directions table as const ivec2[14], not OR-packed

Phase 6 deliverable: v3d_cdef.comp (387 inst, 2 threads, no spills).
3-way M1 (QPU vs C ref vs NEON) PASS 4096/4096.

M2: 0.443 Mblock/s -> R5 = 0.116 ORANGE (predicted 0.02-0.05 RED).
M4 same-kernel: NEON-3+QPU 8.46 < NEON-4 alone ~10 (negative).
M4 mixed (NEON-3 MC + QPU CDEF): CPU 34.17 Mblock/s MC,
  QPU 0.42 Mblock/s CDEF helper. CPU side higher than the
  Issue 003 NEON-fallback proxy suggested - cross-substrate
  contention is gentler than same-side NEON contention.

Verdict: CDEF stays on CPU; QPU dispatch path exists for
opportunistic use. Deployment recipe table updated for all 5
cycles. Phase 9 lessons: linear extrapolation across cycles is
too pessimistic; CDEF is bandwidth-bound on NEON despite high
per-block ns; real-substrate-cross contention < NEON-proxy
contention.

- src/v3d_cdef.comp: cycle 5 QPU shader
- tests/bench_v3d_cdef.c: 3-way M1, M2 bench
- tests/bench_concurrent_mixed.c: K_CDEF on both sides
- tests/cdef_ref.c + bench_neon_cdef.c: sec_shift clamp +
  expanded damping range to exercise the edge case
- CMakeLists.txt: v3d_cdef.spv + bench_v3d_cdef wiring
- docs/k5_cdef_phase4.md updated with Phase 5 review applied
- docs/k5_cdef_phase7.md: closure doc with full verdict matrix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:52:46 +00:00

333 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 5 Phase 6 — QPU bench for AV1 CDEF primary+secondary 8x8
* luma filter on V3D 7.1.
*
* Reports:
* M1₅: 3-way bit-exact (QPU vs NEON vs C reference) per Phase 5
* YELLOW-1.
* M2₅: QPU sustained Mblock/s over K dispatched batches
*
* License: BSD-2-Clause; links dav1d 1.4.3 NEON snapshot.
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/*
 * C reference CDEF primary+secondary filter, defined in another
 * translation unit.
 *
 * As called from this file: dst is one 8x8 block with dst_stride = DST_W,
 * tmp points at the START of that block's TMP_W x TMP_H uint16_t tile
 * (no origin offset applied by the caller), and h is 8. The result is
 * written into dst in place and serves as one of the two expected outputs
 * the QPU result is compared against.
 */
extern void daedalus_cdef_filter_8x8_pri_sec_ref(
uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h);
/*
 * NEON CDEF 8-wide 8bpc filter (the file header states this links a
 * dav1d 1.4.3 NEON snapshot), defined outside this translation unit.
 *
 * As called from this file: unlike the C reference above, tmp is passed
 * already advanced by BLOCK_ORIGIN_U16 so that it points at the block's
 * top-left sample inside the padded tile; h is 8 and edges is 0. The
 * result is written into dst in place.
 */
extern void dav1d_cdef_filter8_8bpc_neon(
uint8_t *dst, ptrdiff_t dst_stride,
const uint16_t *tmp,
int pri_strength, int sec_strength,
int dir, int damping, int h, size_t edges);
/*
 * Per-block tile geometry.
 *
 * Each block owns a TMP_W x TMP_H tile of uint16_t input samples and a
 * DST_W x DST_H tile of uint8_t output samples. The 8x8 block itself
 * starts two rows down and two columns in from the tile's top-left
 * corner, which is what BLOCK_ORIGIN_U16 expresses as an element offset.
 */
#define TMP_W 16
#define TMP_H 12
#define TMP_INTS (TMP_W * TMP_H) /* 192 */
#define DST_W 8
#define DST_H 8
#define DST_BYTES (DST_H * DST_W) /* 64 */
#define BLOCK_ORIGIN_U16 (2 * TMP_W + 2) /* 34 */
/* State word of the file-local xorshift generator; seeded by main(). */
static uint64_t xs_state;

/*
 * Advance the generator by one 13/7/17 xorshift step and return the new
 * state. A state of zero maps to zero, so the caller must seed with a
 * non-zero value to get a useful sequence.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/*
 * Fill one TMP_INTS-element tile with pseudo-random samples.
 *
 * Every element receives the low 8 bits of a fresh xs() draw, so all
 * values lie in 0..255 even though the storage is 16 bits wide.
 */
static void gen_tmp(uint16_t *tmp)
{
    uint16_t *const stop = tmp + TMP_INTS;

    for (uint16_t *cell = tmp; cell != stop; ++cell)
        *cell = (uint16_t)(xs() & 0xff);
}
/*
 * Copy the 8x8 block that sits two rows down and two columns in from the
 * start of a TMP_W-wide tile into a tightly packed 8x8 byte block,
 * truncating each 16-bit sample to 8 bits.
 */
static void tmp_center_to_dst(uint8_t *dst, const uint16_t *tmp)
{
    const uint16_t *src_row = tmp + 2 * TMP_W + 2;

    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 8; col++)
            dst[col] = (uint8_t) src_row[col];
        src_row += TMP_W;
        dst += 8;
    }
}
/*
 * Draw one random set of filter parameters.
 *
 * Consumes exactly four xs() values, in the order pri, sec, dir, damping:
 *   pri     in 1..7
 *   sec     in 1..4
 *   dir     in 0..7
 *   damping in 1..6 (per the original author's note, this range is meant
 *           to include the negative-sec_shift cases)
 */
static void gen_filter_params(int *pri, int *sec, int *dir, int *damping)
{
    /* Draw order is part of the behaviour: it fixes the random stream. */
    const uint64_t draw_pri  = xs();
    const uint64_t draw_sec  = xs();
    const uint64_t draw_dir  = xs();
    const uint64_t draw_damp = xs();

    *pri     = 1 + (int)(draw_pri % 7);
    *sec     = 1 + (int)(draw_sec % 4);
    *dir     = (int)(draw_dir & 7);
    *damping = 1 + (int)(draw_damp % 6);
}
/*
 * Return a monotonic timestamp in seconds as a double.
 *
 * CLOCK_MONOTONIC_RAW is tried first; if that call fails, CLOCK_MONOTONIC
 * is used instead. The timespec is zero-initialised and the result of
 * clock_gettime() is checked, so a failing call can no longer cause an
 * indeterminate struct to be read. If both clocks fail, the error is
 * reported on stderr and 0.0 is returned.
 */
static double now_seconds(void)
{
    struct timespec ts = {0};

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0 &&
        clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        perror("clock_gettime");
        return 0.0;
    }
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}
/*
 * Push-constant payload for the CDEF compute dispatch.
 *
 * main() fills one instance and records it with vkCmdPushConstants at
 * offset 0 with size sizeof(push_consts) for the compute stage, so field
 * order and size are part of the contract with the shader.
 * NOTE(review): the shader source is not visible here; presumably its
 * push-constant block declares the same four 32-bit members — confirm.
 */
typedef struct {
uint32_t n_blocks;        /* number of blocks in the dispatch */
uint32_t tmp_stride_u16;  /* row stride of a tmp tile, in uint16_t elements (TMP_W) */
uint32_t dst_stride_u8;   /* row stride of a dst block, in bytes (DST_W) */
uint32_t _pad;            /* always written as 0 by main() */
} push_consts;
int main(int argc, char **argv)
{
int n_blocks = 16384;
int iters = 200;
int verify_only = 0;
uint64_t seed = 0;
const char *spv_path = "v3d_cdef.spv";
static struct option opts[] = {
{"blocks", required_argument, 0, 'b'},
{"iters", required_argument, 0, 'i'},
{"seed", required_argument, 0, 's'},
{"spv", required_argument, 0, 'S'},
{"verify-only", no_argument, 0, 'V'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) {
switch (c) {
case 'b': n_blocks = atoi(optarg); break;
case 'i': iters = atoi(optarg); break;
case 's': seed = strtoull(optarg, 0, 0); break;
case 'S': spv_path = optarg; break;
case 'V': verify_only = 1; break;
default: return 2;
}
}
xs_state = seed ? seed : 0xc0defacedcafebebULL;
v3d_runner *r = v3d_runner_create();
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
printf("=== v3d CDEF bench ===\n");
printf(" device: %s\n", v3d_runner_device_name(r));
printf(" n_blocks: %d iters: %d seed: 0x%016llx\n",
n_blocks, iters, (unsigned long long) (seed ? seed : 0xc0defacedcafebebULL));
size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t); /* uvec4 */
size_t dst_bytes = (size_t) n_blocks * DST_BYTES;
size_t tmp_bytes = (size_t) n_blocks * TMP_INTS * sizeof(uint16_t);
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_tmp = {0};
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
if (v3d_runner_create_buffer(r, tmp_bytes, &buf_tmp)) return 1;
uint8_t *master_dst = malloc(dst_bytes);
uint8_t *expected_c = malloc(dst_bytes);
uint8_t *expected_n = malloc(dst_bytes);
int *pris = malloc(n_blocks * sizeof(int));
int *secs = malloc(n_blocks * sizeof(int));
int *dirs = malloc(n_blocks * sizeof(int));
int *damps = malloc(n_blocks * sizeof(int));
if (!master_dst || !expected_c || !expected_n || !pris || !secs || !dirs || !damps) {
fprintf(stderr, "alloc fail\n"); return 1;
}
/* Generate tmp + params + initial dst (block center extracted). */
uint16_t *tmp_gpu = (uint16_t *) buf_tmp.mapped;
for (int i = 0; i < n_blocks; i++) {
uint16_t *tmp = tmp_gpu + (size_t)i * TMP_INTS;
gen_tmp(tmp);
tmp_center_to_dst(master_dst + (size_t)i * DST_BYTES, tmp);
gen_filter_params(&pris[i], &secs[i], &dirs[i], &damps[i]);
}
/* Compute C-ref and NEON expected outputs (serial, on master_dst). */
memcpy(expected_c, master_dst, dst_bytes);
memcpy(expected_n, master_dst, dst_bytes);
for (int i = 0; i < n_blocks; i++) {
daedalus_cdef_filter_8x8_pri_sec_ref(
expected_c + (size_t)i * DST_BYTES, DST_W,
tmp_gpu + (size_t)i * TMP_INTS,
pris[i], secs[i], dirs[i], damps[i], 8);
dav1d_cdef_filter8_8bpc_neon(
expected_n + (size_t)i * DST_BYTES, DST_W,
tmp_gpu + (size_t)i * TMP_INTS + BLOCK_ORIGIN_U16,
pris[i], secs[i], dirs[i], damps[i], 8, 0);
}
/* Confirm 2-way C vs NEON parity (defence in depth — Phase 3 already
* passed this for 10000 blocks, but n_blocks may be larger here). */
int cn_mis = 0;
for (int i = 0; i < n_blocks; i++) {
if (memcmp(expected_c + (size_t)i * DST_BYTES,
expected_n + (size_t)i * DST_BYTES, DST_BYTES) != 0) cn_mis++;
}
printf(" C ref vs NEON parity check: %d/%d mismatches\n", cn_mis, n_blocks);
if (cn_mis > 0) {
fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU even runs.\n");
return 1;
}
/* Populate meta SSBO (post Phase 5 RED-1 layout). */
uint32_t *meta = (uint32_t *) buf_meta.mapped;
uint32_t dst_stride_u8 = DST_W; /* 8 */
uint32_t tmp_stride_u16 = TMP_W; /* 16 */
for (int i = 0; i < n_blocks; i++) {
uint32_t pri = (uint32_t) pris[i];
uint32_t sec = (uint32_t) secs[i];
uint32_t damping = (uint32_t) damps[i];
meta[4*i + 0] = (uint32_t)((size_t)i * DST_BYTES);
meta[4*i + 1] = pri | (sec << 8) | (damping << 16);
meta[4*i + 2] = (uint32_t)((size_t)i * TMP_INTS + BLOCK_ORIGIN_U16);
meta[4*i + 3] = (uint32_t) dirs[i];
}
/* Pipeline (3 SSBOs). */
v3d_pipeline pipe = {0};
if (v3d_runner_create_pipeline(r, spv_path,
/*n_ssbos=*/3,
/*push_const_size=*/sizeof(push_consts),
&pipe)) return 1;
v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_tmp };
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
const uint32_t blocks_per_wg = 4;
uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg);
printf(" dispatch: %u WGs × 256 invocations = %u blocks\n",
group_count_x, group_count_x * blocks_per_wg);
push_consts pc = {
.n_blocks = (uint32_t) n_blocks,
.tmp_stride_u16 = tmp_stride_u16,
.dst_stride_u8 = dst_stride_u8,
._pad = 0,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
if (cb == VK_NULL_HANDLE) return 1;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pc), &pc);
vkCmdDispatch(cb, group_count_x, 1, 1);
vkEndCommandBuffer(cb);
/* --- M1: QPU vs C-ref vs NEON 3-way --- */
printf("\n=== M1₅: QPU vs C-ref vs NEON 3-way ===\n");
memcpy(buf_dst.mapped, master_dst, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
int qc_mismatches = 0, qn_mismatches = 0;
int prints = 0;
for (int i = 0; i < n_blocks; i++) {
const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES;
const uint8_t *c = expected_c + (size_t)i * DST_BYTES;
const uint8_t *n = expected_n + (size_t)i * DST_BYTES;
int qc = memcmp(q, c, DST_BYTES);
int qn = memcmp(q, n, DST_BYTES);
if (qc) qc_mismatches++;
if (qn) qn_mismatches++;
if ((qc || qn) && prints < 3) {
fprintf(stderr, "MISMATCH block %d (pri=%d sec=%d dir=%d damp=%d):\n",
i, pris[i], secs[i], dirs[i], damps[i]);
fprintf(stderr, " C ref:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", c[r0*8+c0]);
}
fprintf(stderr, "\n QPU:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c0 = 0; c0 < 8; c0++) fprintf(stderr, "%3u ", q[r0*8+c0]);
}
fprintf(stderr, "\n");
prints++;
}
}
printf(" QPU vs C ref: %d / %d blocks bit-exact (%.4f%%)\n",
n_blocks - qc_mismatches, n_blocks,
100.0 * (n_blocks - qc_mismatches) / n_blocks);
printf(" QPU vs NEON: %d / %d blocks bit-exact (%.4f%%)\n",
n_blocks - qn_mismatches, n_blocks,
100.0 * (n_blocks - qn_mismatches) / n_blocks);
if (qc_mismatches > 0 || qn_mismatches > 0) {
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
return 1;
}
if (verify_only) {
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_tmp);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return 0;
}
/* --- M2: throughput --- */
printf("\n=== M2₅: QPU throughput ===\n");
for (int i = 0; i < 5; i++) {
memcpy(buf_dst.mapped, master_dst, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t0 = now_seconds();
for (int i = 0; i < iters; i++) {
memcpy(buf_dst.mapped, master_dst, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t1 = now_seconds();
double s0 = now_seconds();
for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master_dst, dst_bytes);
double s1 = now_seconds();
double kernel_seconds = (t1 - t0) - (s1 - s0);
double total_blocks = (double) n_blocks * iters;
double mbps = total_blocks / kernel_seconds / 1e6;
printf(" blocks/dispatch: %d\n", n_blocks);
printf(" iters: %d\n", iters);
printf(" total blocks: %.0f\n", total_blocks);
printf(" elapsed (kernel)=%.6f s (setup-subtracted)\n", kernel_seconds);
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
printf(" M2₅ throughput = %.3f Mblock/s\n", mbps);
printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9);
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
double M3_5 = 3.809;
double R5 = mbps / M3_5;
printf("\n Cycle 5 NEON M3₅ = %.3f Mblock/s\n", M3_5);
printf(" R₅ = M2₅/M3₅ = %.3f\n", R5);
if (R5 >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
else if (R5 >= 0.5) printf(" decision band = YELLOW: M4 decides\n");
else if (R5 >= 0.1) printf(" decision band = ORANGE: M4 may still rescue\n");
else printf(" decision band = RED: structural mismatch (predicted)\n");
/* 30fps@1080p floor: 32400 blocks/frame × 30 fps = 0.972 Mblock/s */
double floor_rate = 0.972;
printf(" 30fps@1080p floor: %.2fx margin (isolation)\n", mbps / floor_rate);
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_tmp);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
free(master_dst); free(expected_c); free(expected_n);
free(pris); free(secs); free(dirs); free(damps);
return 0;
}