Files
daedalus-fourier/tests/bench_v3d_h264deblock.c
marfrit 373f63a910 Cycle 8 closed: H.264 deblock R8=0.061 RED, opportunistic helper
Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
  RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
  RED-2: bench-enforced m.x >= 4*stride contract

M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
    Lower than prediction; H.264 deblock has 4 early-return paths +
    2 conditional writes that hurt V3D branchy execution more than
    expected.

M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
  (neutral).

M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
  gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
  QPU contribution is essentially unchanged from isolation —
  the cross-substrate contention is gentle (consistent with
  Issue 003's V4 finding).

Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.

Cycles 1-8 deployment recipe complete:
  Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
  Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
  Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)

Phase 9 lessons added:
  - Branchy kernels underperform V3D vs straight-line ones
  - Mixed-kernel helper value scales with isolation M2, not
    same-kernel M4
  - R prediction needs branchiness weight, not just compute density

- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
  + h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)

Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:44:21 +00:00

307 lines
11 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Cycle 8 Phase 6+7 — QPU bench for H.264 luma deblock.
*
* Reports:
* M1: 3-way bit-exact (QPU vs NEON vs C ref) per Phase 5 YELLOW-1.
* M2: QPU sustained Medge/s.
*
* Bench contract enforcement (Phase 5 RED-2): m.x is positioned so
* that m.x >= 4 * stride for every edge.
*
* License: BSD-2-Clause.
*/
#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <getopt.h>
#include <vulkan/vulkan.h>
#include "v3d_runner.h"
/*
 * C reference implementation of the H.264 luma deblock filter, defined
 * outside this file. main() calls it once per tile with `pix` pointing
 * EDGE_OFF bytes into the tile and `stride` = TILE_STRIDE, and uses the
 * filtered tiles as one of the two expected results for the QPU kernel.
 */
extern void daedalus_h264_v_loop_filter_luma_ref(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/*
 * NEON implementation of the same filter, also defined outside this file
 * and called by main() with the same arguments as the C reference. main()
 * requires its output to match the C reference byte-for-byte before the
 * QPU is run.
 * NOTE(review): the ff_ prefix suggests this comes from FFmpeg's h264dsp -
 * confirm against the build files.
 */
extern void ff_h264_v_loop_filter_luma_neon(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
/*
 * Tile geometry. Every edge under test gets its own tile of
 * TILE_ROWS x TILE_STRIDE bytes; tiles are packed back to back in the
 * destination buffer.
 */
/* Bytes per tile row; also passed to the filters as the row stride. */
#define TILE_STRIDE 16
/* Rows per tile. */
#define TILE_ROWS 16
/* Total bytes occupied by one tile. */
#define TILE_BYTES (TILE_ROWS * TILE_STRIDE)
/* Tile row that the filtered edge sits on; rows above it belong to the other side of the edge. */
#define EDGE_ROW 4
/*
 * Byte offset of row EDGE_ROW within a tile. This is the `pix` pointer given
 * to the reference filters and the per-edge offset stored in the meta buffer.
 */
#define EDGE_OFF (EDGE_ROW * TILE_STRIDE) /* byte offset into a tile to row 0 of bottom block */
/* Generator state. main() seeds it with a non-zero value before any draw. */
static uint64_t xs_state;

/*
 * Advance the 64-bit xorshift generator (shift triple 13 / 7 / 17) by one
 * step. The freshly computed state is both stored in xs_state and returned.
 * A zero state is a fixed point: it keeps producing zero.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/*
 * Fill one TILE_ROWS x TILE_STRIDE tile with pseudo-random test data.
 *
 * The eight rows around EDGE_ROW (four on each side) hold two flat levels,
 * one for the rows before EDGE_ROW and one for the rows from EDGE_ROW on,
 * each perturbed per pixel by noise in [-amp, +amp]. All remaining rows are
 * uniform random bytes. Results are clamped to 0..255.
 *
 * tile: destination, at least TILE_BYTES bytes.
 *
 * Draw order from xs() is: level for the first side, level for the second
 * side, noise amplitude, then exactly one draw per pixel in row-major order.
 */
static void gen_tile(uint8_t *tile)
{
    const int level_before = (int)(xs() % 200) + 20;
    const int level_after = (int)(xs() % 200) + 20;
    const int amp = (int)(xs() % 30) + 1;
    uint8_t *out = tile;

    for (int row = 0; row < TILE_ROWS; row++) {
        const int near_edge = (row >= EDGE_ROW - 4) && (row < EDGE_ROW + 4);
        const int level = (row < EDGE_ROW) ? level_before : level_after;

        for (int col = 0; col < TILE_STRIDE; col++) {
            int px;

            if (near_edge) {
                const int jitter = (int)(xs() % (2 * amp + 1)) - amp;
                px = level + jitter;
            } else {
                px = (int)(xs() & 0xff);
            }

            if (px < 0)
                px = 0;
            else if (px > 255)
                px = 255;

            /* Rows are TILE_STRIDE wide, so a running pointer visits
             * tile[row * TILE_STRIDE + col] in order. */
            *out++ = (uint8_t)px;
        }
    }
}
/*
 * Draw a random set of filter parameters for one edge.
 *
 * alpha: receives a value in 1..64.
 * beta:  receives a value in 1..16.
 * tc0:   receives four values, each -1 (one draw in eight) or 0..6.
 *
 * Draw order from xs() is alpha, beta, then tc0[0] through tc0[3].
 */
static void gen_thresholds(int *alpha, int *beta, int8_t tc0[4])
{
    *alpha = 1 + (int)(xs() % 64);
    *beta = 1 + (int)(xs() % 16);

    int8_t *slot = tc0;
    int8_t *const end = tc0 + 4;

    while (slot != end) {
        const int draw = (int)(xs() % 8);

        if (draw == 0)
            *slot = (int8_t)-1;
        else
            *slot = (int8_t)(draw - 1);
        slot++;
    }
}
/*
 * Read CLOCK_MONOTONIC_RAW and return the reading in seconds as a double.
 * The result of clock_gettime() is not checked.
 */
static double now_seconds(void)
{
    struct timespec stamp;

    clock_gettime(CLOCK_MONOTONIC_RAW, &stamp);

    const double whole = stamp.tv_sec;
    const double frac = stamp.tv_nsec * 1e-9;

    return whole + frac;
}
/*
 * Push-constant block for the compute pipeline. main() fills one instance
 * and uploads sizeof(push_consts) bytes with vkCmdPushConstants; the same
 * size is given to v3d_runner_create_pipeline as the push-constant size.
 * NOTE(review): the field order presumably mirrors the push-constant block
 * declared in the shader - confirm against v3d_h264deblock.comp.
 */
typedef struct {
/* Number of edges in the dispatch (the bench's edge count). */
uint32_t n_edges;
/* Row stride of the destination buffer in bytes; main() sets it to TILE_STRIDE. */
uint32_t dst_stride_u8;
/* Not set explicitly by main(); zero through the designated initializer. */
uint32_t _pad0;
/* Not set explicitly by main(); zero through the designated initializer. */
uint32_t _pad1;
} push_consts;
int main(int argc, char **argv)
{
int n_edges = 16384;
int iters = 200;
int verify_only = 0;
uint64_t seed = 0;
const char *spv_path = "v3d_h264deblock.spv";
static struct option opts[] = {
{"edges", required_argument, 0, 'e'},
{"iters", required_argument, 0, 'i'},
{"seed", required_argument, 0, 's'},
{"spv", required_argument, 0, 'S'},
{"verify-only", no_argument, 0, 'V'},
{0,0,0,0}
};
for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) {
switch (c) {
case 'e': n_edges = atoi(optarg); break;
case 'i': iters = atoi(optarg); break;
case 's': seed = strtoull(optarg, 0, 0); break;
case 'S': spv_path = optarg; break;
case 'V': verify_only = 1; break;
default: return 2;
}
}
xs_state = seed ? seed : 0xdeb1ec500dULL;
v3d_runner *r = v3d_runner_create();
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
printf("=== v3d H.264 deblock bench ===\n");
printf(" device: %s\n", v3d_runner_device_name(r));
printf(" n_edges: %d iters: %d seed: 0x%016llx\n",
n_edges, iters, (unsigned long long) (seed ? seed : 0xdeb1ec500dULL));
size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t);
size_t dst_bytes = (size_t) n_edges * TILE_BYTES;
v3d_buffer buf_meta = {0}, buf_dst = {0};
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
uint8_t *master = malloc(dst_bytes);
uint8_t *expected_c = malloc(dst_bytes);
uint8_t *expected_n = malloc(dst_bytes);
int *alphas = malloc(n_edges*sizeof(int));
int *betas = malloc(n_edges*sizeof(int));
int8_t (*tc0s)[4] = malloc(n_edges * 4);
if (!master || !expected_c || !expected_n || !alphas || !betas || !tc0s) {
fprintf(stderr, "alloc fail\n"); return 1;
}
for (int i = 0; i < n_edges; i++) {
gen_tile(master + (size_t)i * TILE_BYTES);
gen_thresholds(&alphas[i], &betas[i], tc0s[i]);
}
/* C ref expected. */
memcpy(expected_c, master, dst_bytes);
for (int i = 0; i < n_edges; i++)
daedalus_h264_v_loop_filter_luma_ref(
expected_c + (size_t)i * TILE_BYTES + EDGE_OFF,
TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
/* NEON expected. */
memcpy(expected_n, master, dst_bytes);
for (int i = 0; i < n_edges; i++)
ff_h264_v_loop_filter_luma_neon(
expected_n + (size_t)i * TILE_BYTES + EDGE_OFF,
TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
/* Parity check C ref vs NEON. */
int cn_mis = 0;
for (size_t b = 0; b < dst_bytes; b++)
if (expected_c[b] != expected_n[b]) cn_mis++;
printf(" C ref vs NEON parity: %d/%zu byte mismatches\n", cn_mis, dst_bytes);
if (cn_mis > 0) {
fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU.\n");
return 1;
}
/* Populate meta SSBO (Phase 5 RED-2: enforce m.x >= 4*stride). */
uint32_t *meta = (uint32_t *) buf_meta.mapped;
uint32_t stride_u8 = TILE_STRIDE;
for (int i = 0; i < n_edges; i++) {
uint32_t mx = (uint32_t)((size_t)i * TILE_BYTES + EDGE_OFF);
assert(mx >= 4 * stride_u8 && "Phase 5 RED-2 contract violated");
meta[4*i + 0] = mx;
meta[4*i + 1] = ((uint32_t)alphas[i]) | (((uint32_t)betas[i]) << 8);
/* Pack tc0[0..3] as 4 int8 in low 32 bits of m.z. */
meta[4*i + 2] = ((uint32_t)(uint8_t)tc0s[i][0])
| (((uint32_t)(uint8_t)tc0s[i][1]) << 8)
| (((uint32_t)(uint8_t)tc0s[i][2]) << 16)
| (((uint32_t)(uint8_t)tc0s[i][3]) << 24);
meta[4*i + 3] = 0;
}
memcpy(buf_dst.mapped, master, dst_bytes);
/* Pipeline. */
v3d_pipeline pipe = {0};
if (v3d_runner_create_pipeline(r, spv_path, /*n_ssbos=*/2,
/*push_const_size=*/sizeof(push_consts),
&pipe)) return 1;
v3d_buffer binds[2] = { buf_meta, buf_dst };
if (v3d_runner_bind_buffers(r, &pipe, binds, 2)) return 1;
const uint32_t edges_per_wg = 16;
uint32_t wg_count = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg);
printf(" dispatch: %u WGs × 256 invocations = %u edges\n",
wg_count, wg_count * edges_per_wg);
push_consts pc = {
.n_edges = (uint32_t) n_edges,
.dst_stride_u8 = stride_u8,
};
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
if (cb == VK_NULL_HANDLE) return 1;
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
vkBeginCommandBuffer(cb, &cbbi);
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
0, sizeof(pc), &pc);
vkCmdDispatch(cb, wg_count, 1, 1);
vkEndCommandBuffer(cb);
/* M1 3-way. */
printf("\n=== M1₈: QPU vs C ref vs NEON ===\n");
memcpy(buf_dst.mapped, master, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
int qc_mis = 0, qn_mis = 0, prints = 0;
for (int i = 0; i < n_edges; i++) {
uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * TILE_BYTES;
uint8_t *c = expected_c + (size_t)i * TILE_BYTES;
uint8_t *n = expected_n + (size_t)i * TILE_BYTES;
int qc = memcmp(q, c, TILE_BYTES);
int qn = memcmp(q, n, TILE_BYTES);
if (qc) qc_mis++;
if (qn) qn_mis++;
if ((qc || qn) && prints < 3) {
fprintf(stderr, "MISMATCH edge %d alpha=%d beta=%d tc0=[%d,%d,%d,%d]\n",
i, alphas[i], betas[i],
tc0s[i][0], tc0s[i][1], tc0s[i][2], tc0s[i][3]);
prints++;
}
}
printf(" QPU vs C ref: %d/%d edges bit-exact (%.4f%%)\n",
n_edges - qc_mis, n_edges, 100.0 * (n_edges - qc_mis) / n_edges);
printf(" QPU vs NEON: %d/%d edges bit-exact (%.4f%%)\n",
n_edges - qn_mis, n_edges, 100.0 * (n_edges - qn_mis) / n_edges);
if (qc_mis || qn_mis) {
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
return 1;
}
if (verify_only) {
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
return 0;
}
/* M2 throughput. */
printf("\n=== M2₈: QPU throughput ===\n");
for (int i = 0; i < 5; i++) {
memcpy(buf_dst.mapped, master, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t0 = now_seconds();
for (int i = 0; i < iters; i++) {
memcpy(buf_dst.mapped, master, dst_bytes);
if (v3d_runner_submit_wait(r, cb)) return 1;
}
double t1 = now_seconds();
double s0 = now_seconds();
for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master, dst_bytes);
double s1 = now_seconds();
double kernel_seconds = (t1 - t0) - (s1 - s0);
double total = (double) n_edges * iters;
double medges = total / kernel_seconds / 1e6;
printf(" edges/dispatch: %d\n", n_edges);
printf(" iters: %d\n", iters);
printf(" total edges: %.0f\n", total);
printf(" elapsed (kern) = %.6f s\n", kernel_seconds);
printf(" M2₈ throughput = %.3f Medge/s\n", medges);
printf(" per-edge = %.1f ns\n", kernel_seconds / total * 1e9);
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
double M3_8 = 91.947;
double R8 = medges / M3_8;
printf("\n Cycle 8 NEON M3₈ = %.3f Medge/s\n", M3_8);
printf(" R₈ = M2₈/M3₈ = %.3f\n", R8);
if (R8 >= 1.0) printf(" decision band = GREEN\n");
else if (R8 >= 0.5) printf(" decision band = YELLOW (M4 decides)\n");
else if (R8 >= 0.1) printf(" decision band = ORANGE (M4 may rescue)\n");
else printf(" decision band = RED (structural)\n");
/* H.264 1080p30 floor: 8 Medge/s worst, 3 realistic. */
printf(" H.264 1080p30 worst-case floor: %.2fx margin (8.0 Medge/s req'd)\n", medges / 8.0);
v3d_runner_destroy_pipeline(r, &pipe);
v3d_runner_destroy_buffer(r, &buf_dst);
v3d_runner_destroy_buffer(r, &buf_meta);
v3d_runner_destroy(r);
free(master); free(expected_c); free(expected_n);
free(alphas); free(betas); free(tc0s);
return 0;
}