373f63a910
Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
RED-2: bench-enforced m.x >= 4*stride contract
M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
Lower than prediction; H.264 deblock has 4 early-return paths +
2 conditional writes that hurt V3D branchy execution more than
expected.
M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
(neutral).
M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
QPU contribution is essentially unchanged from isolation —
the cross-substrate contention is gentle (consistent with
Issue 003's V4 finding).
Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.
Cycles 1-8 deployment recipe complete:
Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)
Phase 9 lessons added:
- Branchy kernels underperform on V3D relative to straight-line ones
- Mixed-kernel helper value scales with isolation M2, not
same-kernel M4
- R prediction needs a branchiness weight, not just compute density
- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
+ h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)
Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
307 lines
11 KiB
C
307 lines
11 KiB
C
/*
 * Cycle 8 Phase 6+7 — QPU bench for H.264 luma deblock.
 *
 * Reports:
 *   M1: 3-way bit-exact (QPU vs NEON vs C ref) per Phase 5 YELLOW-1.
 *   M2: QPU sustained Medge/s.
 *
 * Bench contract enforcement (Phase 5 RED-2): m.x is positioned so
 * that m.x >= 4 * stride for every edge.
 *
 * License: BSD-2-Clause.
 */
|
||
#define _POSIX_C_SOURCE 200809L
|
||
#include <assert.h>
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <vulkan/vulkan.h>

#include "v3d_runner.h"
|
||
|
||
/*
 * Reference implementations the QPU result is compared against. Both are
 * defined outside this file and must be linked into the bench.
 *
 * This bench calls each with:
 *   pix    - pointer to the tile byte at EDGE_OFF (first row below the edge)
 *   stride - TILE_STRIDE
 *   alpha, beta - per-edge thresholds from gen_thresholds()
 *   tc0    - four int8 values per edge; -1 is generated as one of the values
 *
 * Both filter the tile in place; main() requires their outputs to match
 * byte-for-byte before the QPU is exercised.
 */

/* Plain C reference. */
extern void daedalus_h264_v_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4]);

/* NEON reference. */
extern void ff_h264_v_loop_filter_luma_neon(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t *tc0);
|
||
|
||
/*
 * Test tile geometry. Each edge under test owns one tile of
 * TILE_ROWS x TILE_STRIDE bytes; the horizontal edge being filtered sits
 * between rows EDGE_ROW-1 and EDGE_ROW.
 */
#define TILE_STRIDE 16                          /* bytes per tile row */
#define TILE_ROWS   16                          /* rows per tile */
#define TILE_BYTES  (TILE_ROWS * TILE_STRIDE)   /* bytes per tile */
#define EDGE_ROW    4                           /* first row below the edge */
#define EDGE_OFF    (EDGE_ROW * TILE_STRIDE)    /* byte offset within a tile of row EDGE_ROW (row 0 of the bottom block) */
|
||
|
||
/* State of the bench's pseudo-random generator; seeded in main(). */
static uint64_t xs_state;

/*
 * Advance the generator one step (xor-shift with shifts 13, 7, 17) and
 * return the new 64-bit state.
 *
 * A state of 0 maps to 0, so the generator must be seeded with a non-zero
 * value to produce anything useful.
 */
static inline uint64_t xs(void)
{
    uint64_t x = xs_state;

    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;

    xs_state = x;
    return x;
}
|
||
|
||
static void gen_tile(uint8_t *tile)
|
||
{
|
||
int a = (int)(xs() % 200) + 20;
|
||
int b = (int)(xs() % 200) + 20;
|
||
int noise = (int)(xs() % 30) + 1;
|
||
for (int r = 0; r < TILE_ROWS; r++) {
|
||
for (int c = 0; c < TILE_STRIDE; c++) {
|
||
int v;
|
||
if (r >= EDGE_ROW - 4 && r < EDGE_ROW + 4) {
|
||
int base = (r < EDGE_ROW) ? a : b;
|
||
int n = ((int)(xs() % (2*noise + 1))) - noise;
|
||
v = base + n;
|
||
} else {
|
||
v = (int)(xs() & 0xff);
|
||
}
|
||
tile[r * TILE_STRIDE + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
 * Draw one edge's filter parameters from xs().
 *
 *   *alpha  - set to a value in 1..64
 *   *beta   - set to a value in 1..16
 *   tc0[k]  - for each of the 4 entries: -1 with probability 1/8,
 *             otherwise a value in 0..6
 *
 * Draw order is alpha, beta, then tc0[0]..tc0[3] (six xs() calls total).
 */
static void gen_thresholds(int *alpha, int *beta, int8_t tc0[4])
{
    *alpha = (int)(xs() % 64) + 1;
    *beta  = (int)(xs() % 16) + 1;

    for (int seg = 0; seg < 4; seg++) {
        const int pick = (int)(xs() % 8);   /* 0..7 */

        if (pick == 0) {
            tc0[seg] = (int8_t) -1;
        } else {
            tc0[seg] = (int8_t)(pick - 1);
        }
    }
}
|
||
|
||
/*
 * Return a monotonic timestamp in seconds as a double.
 *
 * Reads CLOCK_MONOTONIC_RAW; if that call fails, falls back to
 * CLOCK_MONOTONIC. If both fail the result is 0.0 rather than a value
 * computed from an uninitialised timespec.
 */
static double now_seconds(void)
{
    struct timespec ts = {0};

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0 &&
        clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        /* Contents of ts after a failed call are not relied upon. */
        ts.tv_sec = 0;
        ts.tv_nsec = 0;
    }

    return (double) ts.tv_sec + (double) ts.tv_nsec * 1e-9;
}
|
||
|
||
/*
 * Push-constant block handed to the compute pipeline by main() via
 * vkCmdPushConstants(); its size is also passed to
 * v3d_runner_create_pipeline(). Four 32-bit words, 16 bytes total.
 */
typedef struct {
    uint32_t n_edges;        /* number of edges in the dispatch */
    uint32_t dst_stride_u8;  /* destination row stride; main() passes TILE_STRIDE */
    uint32_t _pad0;          /* never written by main(); left zero */
    uint32_t _pad1;          /* never written by main(); left zero */
} push_consts;
|
||
|
||
int main(int argc, char **argv)
|
||
{
|
||
int n_edges = 16384;
|
||
int iters = 200;
|
||
int verify_only = 0;
|
||
uint64_t seed = 0;
|
||
const char *spv_path = "v3d_h264deblock.spv";
|
||
|
||
static struct option opts[] = {
|
||
{"edges", required_argument, 0, 'e'},
|
||
{"iters", required_argument, 0, 'i'},
|
||
{"seed", required_argument, 0, 's'},
|
||
{"spv", required_argument, 0, 'S'},
|
||
{"verify-only", no_argument, 0, 'V'},
|
||
{0,0,0,0}
|
||
};
|
||
for (int c; (c = getopt_long(argc, argv, "e:i:s:S:V", opts, 0)) != -1;) {
|
||
switch (c) {
|
||
case 'e': n_edges = atoi(optarg); break;
|
||
case 'i': iters = atoi(optarg); break;
|
||
case 's': seed = strtoull(optarg, 0, 0); break;
|
||
case 'S': spv_path = optarg; break;
|
||
case 'V': verify_only = 1; break;
|
||
default: return 2;
|
||
}
|
||
}
|
||
|
||
xs_state = seed ? seed : 0xdeb1ec500dULL;
|
||
|
||
v3d_runner *r = v3d_runner_create();
|
||
if (!r) { fprintf(stderr, "v3d_runner_create failed\n"); return 1; }
|
||
printf("=== v3d H.264 deblock bench ===\n");
|
||
printf(" device: %s\n", v3d_runner_device_name(r));
|
||
printf(" n_edges: %d iters: %d seed: 0x%016llx\n",
|
||
n_edges, iters, (unsigned long long) (seed ? seed : 0xdeb1ec500dULL));
|
||
|
||
size_t meta_bytes = (size_t) n_edges * 4 * sizeof(uint32_t);
|
||
size_t dst_bytes = (size_t) n_edges * TILE_BYTES;
|
||
|
||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||
if (v3d_runner_create_buffer(r, meta_bytes, &buf_meta)) return 1;
|
||
if (v3d_runner_create_buffer(r, dst_bytes, &buf_dst)) return 1;
|
||
|
||
uint8_t *master = malloc(dst_bytes);
|
||
uint8_t *expected_c = malloc(dst_bytes);
|
||
uint8_t *expected_n = malloc(dst_bytes);
|
||
int *alphas = malloc(n_edges*sizeof(int));
|
||
int *betas = malloc(n_edges*sizeof(int));
|
||
int8_t (*tc0s)[4] = malloc(n_edges * 4);
|
||
if (!master || !expected_c || !expected_n || !alphas || !betas || !tc0s) {
|
||
fprintf(stderr, "alloc fail\n"); return 1;
|
||
}
|
||
|
||
for (int i = 0; i < n_edges; i++) {
|
||
gen_tile(master + (size_t)i * TILE_BYTES);
|
||
gen_thresholds(&alphas[i], &betas[i], tc0s[i]);
|
||
}
|
||
|
||
/* C ref expected. */
|
||
memcpy(expected_c, master, dst_bytes);
|
||
for (int i = 0; i < n_edges; i++)
|
||
daedalus_h264_v_loop_filter_luma_ref(
|
||
expected_c + (size_t)i * TILE_BYTES + EDGE_OFF,
|
||
TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
|
||
|
||
/* NEON expected. */
|
||
memcpy(expected_n, master, dst_bytes);
|
||
for (int i = 0; i < n_edges; i++)
|
||
ff_h264_v_loop_filter_luma_neon(
|
||
expected_n + (size_t)i * TILE_BYTES + EDGE_OFF,
|
||
TILE_STRIDE, alphas[i], betas[i], tc0s[i]);
|
||
|
||
/* Parity check C ref vs NEON. */
|
||
int cn_mis = 0;
|
||
for (size_t b = 0; b < dst_bytes; b++)
|
||
if (expected_c[b] != expected_n[b]) cn_mis++;
|
||
printf(" C ref vs NEON parity: %d/%zu byte mismatches\n", cn_mis, dst_bytes);
|
||
if (cn_mis > 0) {
|
||
fprintf(stderr, "ERROR: C ref disagrees with NEON before QPU.\n");
|
||
return 1;
|
||
}
|
||
|
||
/* Populate meta SSBO (Phase 5 RED-2: enforce m.x >= 4*stride). */
|
||
uint32_t *meta = (uint32_t *) buf_meta.mapped;
|
||
uint32_t stride_u8 = TILE_STRIDE;
|
||
for (int i = 0; i < n_edges; i++) {
|
||
uint32_t mx = (uint32_t)((size_t)i * TILE_BYTES + EDGE_OFF);
|
||
assert(mx >= 4 * stride_u8 && "Phase 5 RED-2 contract violated");
|
||
meta[4*i + 0] = mx;
|
||
meta[4*i + 1] = ((uint32_t)alphas[i]) | (((uint32_t)betas[i]) << 8);
|
||
/* Pack tc0[0..3] as 4 int8 in low 32 bits of m.z. */
|
||
meta[4*i + 2] = ((uint32_t)(uint8_t)tc0s[i][0])
|
||
| (((uint32_t)(uint8_t)tc0s[i][1]) << 8)
|
||
| (((uint32_t)(uint8_t)tc0s[i][2]) << 16)
|
||
| (((uint32_t)(uint8_t)tc0s[i][3]) << 24);
|
||
meta[4*i + 3] = 0;
|
||
}
|
||
memcpy(buf_dst.mapped, master, dst_bytes);
|
||
|
||
/* Pipeline. */
|
||
v3d_pipeline pipe = {0};
|
||
if (v3d_runner_create_pipeline(r, spv_path, /*n_ssbos=*/2,
|
||
/*push_const_size=*/sizeof(push_consts),
|
||
&pipe)) return 1;
|
||
v3d_buffer binds[2] = { buf_meta, buf_dst };
|
||
if (v3d_runner_bind_buffers(r, &pipe, binds, 2)) return 1;
|
||
|
||
const uint32_t edges_per_wg = 16;
|
||
uint32_t wg_count = (uint32_t)((n_edges + edges_per_wg - 1) / edges_per_wg);
|
||
printf(" dispatch: %u WGs × 256 invocations = %u edges\n",
|
||
wg_count, wg_count * edges_per_wg);
|
||
|
||
push_consts pc = {
|
||
.n_edges = (uint32_t) n_edges,
|
||
.dst_stride_u8 = stride_u8,
|
||
};
|
||
|
||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||
if (cb == VK_NULL_HANDLE) return 1;
|
||
VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
|
||
vkBeginCommandBuffer(cb, &cbbi);
|
||
vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_COMPUTE, pipe.pipeline);
|
||
vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE,
|
||
pipe.layout, 0, 1, &pipe.desc_set, 0, NULL);
|
||
vkCmdPushConstants(cb, pipe.layout, VK_SHADER_STAGE_COMPUTE_BIT,
|
||
0, sizeof(pc), &pc);
|
||
vkCmdDispatch(cb, wg_count, 1, 1);
|
||
vkEndCommandBuffer(cb);
|
||
|
||
/* M1 3-way. */
|
||
printf("\n=== M1₈: QPU vs C ref vs NEON ===\n");
|
||
memcpy(buf_dst.mapped, master, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
|
||
int qc_mis = 0, qn_mis = 0, prints = 0;
|
||
for (int i = 0; i < n_edges; i++) {
|
||
uint8_t *q = (uint8_t *) buf_dst.mapped + (size_t)i * TILE_BYTES;
|
||
uint8_t *c = expected_c + (size_t)i * TILE_BYTES;
|
||
uint8_t *n = expected_n + (size_t)i * TILE_BYTES;
|
||
int qc = memcmp(q, c, TILE_BYTES);
|
||
int qn = memcmp(q, n, TILE_BYTES);
|
||
if (qc) qc_mis++;
|
||
if (qn) qn_mis++;
|
||
if ((qc || qn) && prints < 3) {
|
||
fprintf(stderr, "MISMATCH edge %d alpha=%d beta=%d tc0=[%d,%d,%d,%d]\n",
|
||
i, alphas[i], betas[i],
|
||
tc0s[i][0], tc0s[i][1], tc0s[i][2], tc0s[i][3]);
|
||
prints++;
|
||
}
|
||
}
|
||
printf(" QPU vs C ref: %d/%d edges bit-exact (%.4f%%)\n",
|
||
n_edges - qc_mis, n_edges, 100.0 * (n_edges - qc_mis) / n_edges);
|
||
printf(" QPU vs NEON: %d/%d edges bit-exact (%.4f%%)\n",
|
||
n_edges - qn_mis, n_edges, 100.0 * (n_edges - qn_mis) / n_edges);
|
||
if (qc_mis || qn_mis) {
|
||
fprintf(stderr, "REFUSING to measure throughput on a broken kernel.\n");
|
||
return 1;
|
||
}
|
||
|
||
if (verify_only) {
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
return 0;
|
||
}
|
||
|
||
/* M2 throughput. */
|
||
printf("\n=== M2₈: QPU throughput ===\n");
|
||
for (int i = 0; i < 5; i++) {
|
||
memcpy(buf_dst.mapped, master, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
}
|
||
|
||
double t0 = now_seconds();
|
||
for (int i = 0; i < iters; i++) {
|
||
memcpy(buf_dst.mapped, master, dst_bytes);
|
||
if (v3d_runner_submit_wait(r, cb)) return 1;
|
||
}
|
||
double t1 = now_seconds();
|
||
|
||
double s0 = now_seconds();
|
||
for (int i = 0; i < iters; i++) memcpy(buf_dst.mapped, master, dst_bytes);
|
||
double s1 = now_seconds();
|
||
|
||
double kernel_seconds = (t1 - t0) - (s1 - s0);
|
||
double total = (double) n_edges * iters;
|
||
double medges = total / kernel_seconds / 1e6;
|
||
|
||
printf(" edges/dispatch: %d\n", n_edges);
|
||
printf(" iters: %d\n", iters);
|
||
printf(" total edges: %.0f\n", total);
|
||
printf(" elapsed (kern) = %.6f s\n", kernel_seconds);
|
||
printf(" M2₈ throughput = %.3f Medge/s\n", medges);
|
||
printf(" per-edge = %.1f ns\n", kernel_seconds / total * 1e9);
|
||
printf(" per-dispatch = %.1f us\n", kernel_seconds / iters * 1e6);
|
||
|
||
double M3_8 = 91.947;
|
||
double R8 = medges / M3_8;
|
||
printf("\n Cycle 8 NEON M3₈ = %.3f Medge/s\n", M3_8);
|
||
printf(" R₈ = M2₈/M3₈ = %.3f\n", R8);
|
||
if (R8 >= 1.0) printf(" decision band = GREEN\n");
|
||
else if (R8 >= 0.5) printf(" decision band = YELLOW (M4 decides)\n");
|
||
else if (R8 >= 0.1) printf(" decision band = ORANGE (M4 may rescue)\n");
|
||
else printf(" decision band = RED (structural)\n");
|
||
|
||
/* H.264 1080p30 floor: 8 Medge/s worst, 3 realistic. */
|
||
printf(" H.264 1080p30 worst-case floor: %.2fx margin (8.0 Medge/s req'd)\n", medges / 8.0);
|
||
|
||
v3d_runner_destroy_pipeline(r, &pipe);
|
||
v3d_runner_destroy_buffer(r, &buf_dst);
|
||
v3d_runner_destroy_buffer(r, &buf_meta);
|
||
v3d_runner_destroy(r);
|
||
free(master); free(expected_c); free(expected_n);
|
||
free(alphas); free(betas); free(tc0s);
|
||
return 0;
|
||
}
|