Files
marfrit 356e446a49 Cycle 3 (MC interpolation) closure: M1'''=100%, R'''=0.067 RED, M4=-19.5%
Third daedalus-fourier kernel — VP9 8-tap regular subpel filter,
horizontal direction, 8-wide output. Multiply-heavy by design to
stress V3D's no-DP4A deficit. Full cycle Phase 1-7 + M4'''.

Phase 5''' second-model review delivered cleanly — caught 1 RED
bug pre-implementation (src_off off-by-3 indexing convention) and
2 YELLOW gaps (assert MUST language, shaderdb filter-LUT gate).
Without the review, M1''' would have failed silently on first run
with cryptic "high-index source pixels wrong" symptoms.

Phase 6 v1 first-light: M1''' 100.0000% bit-exact (65536/65536
blocks across all 16 mx phases). Phase 5''' filter-LUT prediction
materialised exactly: 197 uniforms (gate was 144), 2 threads (down
from cycle-2's 4 due to register pressure).

Performance:

  M2''' = 1.413 Mblock/s     (707.9 ns/block)
  M3''' = 20.997 Mblock/s    (NEON baseline phase3)
  R'''  = 0.067              (RED band — structural mismatch)
  shaderdb: 488 inst, 2 threads, 197 uniforms, 25 max-temps, 0 spills

M4''' concurrent matrix (8s windows):

  NEON 1-core           14.479 Mblock/s
  NEON 4-core           15.248 Mblock/s   <- baseline (compute-bound,
                                              not bandwidth-saturated
                                              like cycles 1+2!)
  QPU only               1.380 Mblock/s
  MIXED NEON-3 + QPU    12.277 Mblock/s   <- -19.5% (FAIL gate)
  MIXED NEON-4 + QPU    12.158 Mblock/s   <- -20.3%

NEW cross-cycle finding (Phase 9 lesson 2): compute-bound CPU
workloads make the QPU-offload story collapse. Cycles 1+2 were
bandwidth-saturated (4-core scaling 0.56-0.82x of 1-core), so
freeing a CPU core via QPU offload added throughput. Cycle 3 MC
is compute-bound (4-core scaling 1.05x of 1-core — near-linear),
so there are no idle cycles to free. The QPU contribution (0.45 Mblock/s in
contention) doesn't compensate for losing 1 NEON core delivering
~3.8 Mblock/s.

But 30fps@1080p floor: PASS in every config (1.4x to 15.7x
isolation margin). Per project_30fps_floor_is_fine.md, user-facing
test never fails — daily YouTube playback works fine on any CPU/QPU
split.

DEPLOYMENT RECIPE for higgs (cycle 3 confirmed split):

  IDCT (k1)  -> QPU   (R=0.92, +7% mixed, frees CPU core)
  LPF  (k2)  -> QPU   (R=0.41, +7% mixed, frees CPU core)
  MC   (k3)  -> CPU   (R=0.067, -19.5% mixed — stays on CPU)
  Entropy    -> CPU   (structurally serial)

Mixed-substrate deployment, not "QPU does everything". Realistic for
higgs: entropy + MC on 2-3 ARM cores; IDCT + LPF dispatched to QPU
concurrently; 1-2 ARM cores left for vscode etc.

New artifacts:
- src/v3d_mc_8h.comp               — GLSL kernel
- tests/vp9_mc_ref.c               — standalone C ref (REGULAR filter
                                     embedded; clean transcription)
- tests/bench_neon_mc.c            — M1'''_c + M3''' bench
- tests/bench_v3d_mc.c             — M1''' + M2''' bench with contract
                                     asserts + 30fps margin display
- tests/bench_concurrent_mc.c      — M4''' pthread bench
- external/ffmpeg-snapshot/libavcodec/aarch64/vp9mc_neon.S    (vendored)
- external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c
                                     (hand-extracted; provides
                                      ff_vp9_subpel_filters symbol
                                      without dragging in full vp9dsp.c)
- docs/k3_mc_phase{1,2,3,4,5,7}.md — full cycle documentation

Memory updates: project_30fps_floor_is_fine.md (user's 30fps target
recalibration), MEMORY.md index updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:51:43 +00:00

304 lines
11 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 3 Phase 6 — QPU bench for VP9 8-tap "regular" subpel filter,
* horizontal, 8-wide output on V3D 7.1.
*
* Reports:
* M1''' (correctness): QPU output vs C reference, N blocks across
* all 16 mx phases
* M2''' (throughput): QPU sustained Mblock/s
*
* Per k3_mc_phase4.md §5 (revised per phase5''' findings 4 + 6):
* - src_off is the RAW block base (no +3 shift)
* - assert(dst_stride_u8 >= 8 && src_stride_u8 >= 15)
*
* License: BSD-2-Clause.
*/
#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/*
 * C reference implementation that the QPU output is compared against.
 * Only declared here; the definition lives in another translation unit.
 *
 * main() calls it once per block with:
 *   dst / dst_stride : the block's slot in the expected buffer, stride DST_W
 *   src / src_stride : block base + 3 in the host source copy, stride SRC_W
 *   h                : DST_H rows
 *   mx               : the block's subpel phase, drawn from 0..15
 *   my               : always 0
 * NOTE(review): `my` is presumably ignored by a horizontal-only filter —
 * confirm against the reference source.
 */
extern void daedalus_vp9_put_regular_8h_ref(
uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int my);
/*
 * Block geometry and the "+3" addressing convention.
 *
 * Every block owns a source tile of 8 rows × 16 columns (128 bytes) and a
 * destination tile of 8 × 8 (64 bytes); tiles are stored back to back.
 *
 * The two consumers address the same source bytes differently:
 *
 *   - The C/NEON reference is invoked with `src = block_base + 3` and
 *     `src_stride = 16`, and indexes `src[x + k - 3]` for x = 0..7,
 *     k = 0..7.  That resolves to
 *         master_src[block_base + row*16 + (x + k)]
 *     i.e. columns 0..14 of the row.
 *
 *   - The shader receives src_off = the RAW block_base (no +3 shift) and
 *     reads s[0..14] from src_off + row*stride, i.e.
 *         s[k] = master_src[block_base + row*16 + k],  k = 0..14.
 *
 * Both therefore touch exactly the same 15 bytes per row.
 */
enum {
    SRC_W     = 16,
    SRC_H     = 8,
    DST_W     = 8,
    DST_H     = 8,
    SRC_BYTES = SRC_H * SRC_W,
    DST_BYTES = DST_H * DST_W
};
/* State word of the generator below; main() seeds it before first use. */
static uint64_t xs_state;

/*
 * Advance the 64-bit xorshift generator (shift triple 13 / 7 / 17) by one
 * step, store the new state in xs_state and return it.
 * A state of zero maps to zero, so the generator must be seeded non-zero.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs_state = s;
    return s;
}
/*
 * Fill one source tile (SRC_BYTES bytes starting at b) with pseudo-random
 * bytes, taking the low 8 bits of one xs() draw per byte.
 */
static void gen_src(uint8_t *b)
{
    uint8_t *const end = b + SRC_BYTES;

    while (b != end) {
        *b++ = (uint8_t)(xs() & 0xff);
    }
}
/* Current CLOCK_MONOTONIC_RAW reading, expressed as fractional seconds. */
static double now_seconds(void)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC_RAW, &now);

    const double whole = now.tv_sec;
    const double frac  = now.tv_nsec * 1e-9;
    return whole + frac;
}
/*
 * Push-constant block for the compute pipeline.  main() fills one instance
 * and records it with vkCmdPushConstants; sizeof(push_consts) is also the
 * push-constant size given to v3d_runner_create_pipeline.
 * NOTE(review): field order and widths presumably have to mirror the
 * push_constant block declared in the shader — confirm against the .comp.
 */
typedef struct {
uint32_t n_blocks; /* number of blocks covered by the dispatch */
uint32_t dst_stride_u8; /* destination row stride in bytes; main() asserts >= 8 */
uint32_t src_stride_u8; /* source row stride in bytes; main() asserts >= 15 */
uint32_t _pad; /* always written as 0 */
} push_consts;
int main(int argc, char **argv)
{
int n_blocks = 65536;
int iters = 100;
uint64_t seed = 0;
int verify_only = 0;
const char *spv_path = "v3d_mc_8h.spv";
static struct option opts[] = {
{"blocks", required_argument, 0, 'b'},
{"iters", required_argument, 0, 'i'},
{"seed", required_argument, 0, 's'},
{"spv", required_argument, 0, 'S'},
{"verify-only", no_argument, 0, 'V'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "b:i:s:S:V", opts, 0)) != -1;) {
switch (c) {
case 'b': n_blocks = atoi(optarg); break;
case 'i': iters = atoi(optarg); break;
case 's': seed = strtoull(optarg, 0, 0); break;
case 'S': spv_path = optarg; break;
case 'V': verify_only = 1; break;
default: return 2;
}
}
xs_state = seed ? seed : 0xabcdef1234567890ULL;
v3d_runner *r = v3d_runner_create();
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
printf("=== v3d MC 8h bench ===\n");
printf(" device: %s\n", v3d_runner_device_name(r));
printf(" n_blocks: %d iters: %d\n", n_blocks, iters);
/* Buffers: meta + dst + src, all blocks contiguous. */
size_t meta_bytes = (size_t) n_blocks * 4 * sizeof(uint32_t);
size_t src_bytes = (size_t) n_blocks * SRC_BYTES;
size_t dst_bytes = (size_t) n_blocks * DST_BYTES;
v3d_buffer buf_meta = {0}, buf_dst = {0}, buf_src = {0};
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
if (v3d_runner_create_buffer(r, src_bytes, &buf_src)) return 1;
uint8_t *master_src = malloc(src_bytes);
uint8_t *expected = malloc(dst_bytes);
int *mxs = malloc(n_blocks * sizeof(int));
if (!master_src || !expected || !mxs) { fprintf(stderr, "alloc\n"); return 1; }
for (int i = 0; i < n_blocks; i++) {
gen_src(master_src + (size_t)i * SRC_BYTES);
mxs[i] = (int)(xs() & 15);
}
/* Build C-ref expected. C ref takes `src + 3, src_stride = SRC_W`. */
memset(expected, 0, dst_bytes);
for (int i = 0; i < n_blocks; i++) {
daedalus_vp9_put_regular_8h_ref(
expected + (size_t)i * DST_BYTES, DST_W,
master_src + (size_t)i * SRC_BYTES + 3, SRC_W,
DST_H, mxs[i], 0);
}
/* Populate GPU buffers. Contracts (phase4 §5) enforced via asserts. */
uint32_t dst_stride_u8 = DST_W;
uint32_t src_stride_u8 = SRC_W;
assert(dst_stride_u8 >= 8 && "phase4 §5 contract 1");
assert(src_stride_u8 >= 15 && "phase4 §5 contract 2");
uint32_t *meta = (uint32_t *) buf_meta.mapped;
for (int i = 0; i < n_blocks; i++) {
/* src_off: RAW block base. NO +3 shift. (phase5''' finding 4) */
uint32_t src_off = (uint32_t)((size_t)i * SRC_BYTES);
uint32_t dst_off = (uint32_t)((size_t)i * DST_BYTES);
meta[4*i + 0] = dst_off;
meta[4*i + 1] = src_off;
meta[4*i + 2] = (uint32_t) mxs[i];
meta[4*i + 3] = 0;
}
memcpy(buf_src.mapped, master_src, src_bytes);
memset(buf_dst.mapped, 0, dst_bytes);
/* Pipeline. */
v3d_pipeline pipe = {0};
if (v3d_runner_create_pipeline(r, spv_path,
/*n_ssbos=*/3,
/*push_const_size=*/sizeof(push_consts),
&pipe)) return 1;
v3d_buffer bind_bufs[3] = { buf_meta, buf_dst, buf_src };
if (v3d_runner_bind_buffers(r, &pipe, bind_bufs, 3)) return 1;
const uint32_t blocks_per_wg = 32;
uint32_t group_count_x = (uint32_t)((n_blocks + blocks_per_wg - 1) / blocks_per_wg);
printf(" dispatch: %u WGs × 256 invocations = %u blocks (rounded up from %d)\n",
group_count_x, group_count_x * blocks_per_wg, n_blocks);
push_consts pc = {
.n_blocks = (uint32_t) n_blocks,
.dst_stride_u8 = dst_stride_u8,
.src_stride_u8 = src_stride_u8,
._pad = 0,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pc), &pc);
vkCmdDispatch(cb, group_count_x, 1, 1);
vkEndCommandBuffer(cb);
/* --- M1''' bit-exact --- */
printf("\n=== M1''': QPU vs C reference bit-exact ===\n");
memset(buf_dst.mapped, 0, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
int mismatch_blocks = 0;
int total_byte_diffs = 0;
int prints = 0;
for (int i = 0; i < n_blocks; i++) {
const uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * DST_BYTES;
const uint8_t *e = expected + (size_t)i * DST_BYTES;
if (memcmp(q, e, DST_BYTES) != 0) {
int diffs = 0;
for (int j = 0; j < DST_BYTES; j++) if (q[j] != e[j]) diffs++;
total_byte_diffs += diffs;
if (prints < 3) {
fprintf(stderr, "MISMATCH block %d mx=%d: %d/64 bytes differ\n",
i, mxs[i], diffs);
fprintf(stderr, " ref:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", e[r0*8+c]);
}
fprintf(stderr, "\n qpu:");
for (int r0 = 0; r0 < 8; r0++) {
fprintf(stderr, "\n r%d ", r0);
for (int c = 0; c < 8; c++) fprintf(stderr, "%3u ", q[r0*8+c]);
}
fprintf(stderr, "\n");
prints++;
}
mismatch_blocks++;
}
}
printf(" blocks bit-exact: %d / %d (%.4f%%)\n",
n_blocks - mismatch_blocks, n_blocks,
100.0 * (n_blocks - mismatch_blocks) / n_blocks);
printf(" total byte diffs: %d / %zu (%.4f%%)\n",
total_byte_diffs, (size_t) n_blocks * DST_BYTES,
100.0 * total_byte_diffs / ((double) n_blocks * DST_BYTES));
if (mismatch_blocks > 0) {
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_src);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return 1;
}
if (verify_only) {
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_src);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return 0;
}
/* --- M2''' throughput --- */
printf("\n=== M2''': QPU throughput ===\n");
for (int i = 0; i < 10; i++) {
memset(buf_dst.mapped, 0, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t0 = now_seconds();
for (int i = 0; i < iters; i++) {
memset(buf_dst.mapped, 0, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t1 = now_seconds();
double s0 = now_seconds();
for (int i = 0; i < iters; i++) memset(buf_dst.mapped, 0, dst_bytes);
double s1 = now_seconds();
double kernel_seconds = (t1 - t0) - (s1 - s0);
double total_blocks = (double) n_blocks * iters;
double mbps = total_blocks / kernel_seconds / 1e6;
printf(" blocks/dispatch: %d\n", n_blocks);
printf(" iters: %d\n", iters);
printf(" total blocks: %.0f\n", total_blocks);
printf(" elapsed (kernel)=%.6f s\n", kernel_seconds);
printf(" elapsed (setup) =%.6f s\n", s1 - s0);
printf(" M2''' throughput = %.3f Mblock/s\n", mbps);
printf(" per-block = %.1f ns\n", kernel_seconds / total_blocks * 1e9);
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
double M3 = 20.997; /* from k3_mc_phase3.md */
double R = mbps / M3;
printf("\n Cycle 3 NEON M3''' = %.3f Mblock/s\n", M3);
printf(" R''' = M2'''/M3''' = %.3f\n", R);
if (R >= 1.0) printf(" decision band = GREEN: QPU beats NEON in isolation\n");
else if (R >= 0.5) printf(" decision band = YELLOW: M4''' decides\n");
else if (R >= 0.1) printf(" decision band = ORANGE: M4''' may still rescue\n");
else printf(" decision band = RED: structural mismatch\n");
/* 30fps@1080p floor check (per project_30fps_floor_is_fine.md) */
double mblocks_per_1080p = 32400.0 * 30.0 / 1e6;
printf("\n 30fps@1080p floor : %.3f Mblock/s (32400 blocks × 30 fps)\n",
mblocks_per_1080p);
printf(" isolation margin : %.1fx over 30fps floor\n",
mbps / mblocks_per_1080p);
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_src);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
free(master_src); free(expected); free(mxs);
return 0;
}