Cycle 8 closed: H.264 deblock R8=0.061 RED, opportunistic helper

Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
  RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
  RED-2: bench-enforced m.x >= 4*stride contract

M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
    Lower than prediction; H.264 deblock has 4 early-return paths +
    2 conditional writes that hurt V3D branchy execution more than
    expected.

M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
  (neutral).

M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
  gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
  QPU contribution is essentially unchanged from isolation —
  the cross-substrate contention is gentle (consistent with
  Issue 003's V4 finding).

Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.

Cycles 1-8 deployment recipe complete:
  Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
  Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
  Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)

Phase 9 lessons added:
  - Branchy kernels underperform on V3D compared with straight-line ones
  - Mixed-kernel helper value scales with isolation M2, not
    same-kernel M4
  - R prediction needs branchiness weight, not just compute density

- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
  + h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)

Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 14:44:21 +00:00
parent f2ba08e1cf
commit 373f63a910
5 changed files with 695 additions and 4 deletions
+60 -2
View File
@@ -68,7 +68,10 @@ static double now_s(void) {
/* --- Kernel selectors --- */
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT };
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT, K_H264DEBLOCK };
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
static const char *kernel_name(enum kernel k) {
switch (k) {
@@ -77,11 +80,12 @@ static const char *kernel_name(enum kernel k) {
case K_LPF8: return "lpf8";
case K_CDEF: return "cdef";
case K_IDCT: return "idct";
case K_H264DEBLOCK: return "h264deblock";
}
return "?";
}
static const char *kernel_unit(enum kernel k) {
return (k == K_LPF4 || k == K_LPF8) ? "Medge/s" : "Mblock/s";
return (k == K_LPF4 || k == K_LPF8 || k == K_H264DEBLOCK) ? "Medge/s" : "Mblock/s";
}
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
@@ -201,6 +205,32 @@ static void *neon_worker(void *p) {
case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
case K_IDCT: neon_run_idct(&seed, &done); break;
case K_CDEF: neon_run_cdef(&seed, &done); break;
case K_H264DEBLOCK: {
    /*
     * NEON H.264 luma deblock worker.
     *
     * Each unit is one 16-row x 16-col tile (256 bytes, stride 16) and the
     * filter is applied at byte offset 4*stride inside the tile
     * (EDGE_OFF = 4*16). All inputs are generated once up front so the hot
     * loop only pays for the memcpy that restores the tiles and the filter
     * calls themselves.
     */
    const size_t tile_bytes = 256;      /* 16 x 16 uint8 pixels */
    const ptrdiff_t stride = 16;        /* bytes per tile row */
    const size_t edge_off = 4 * 16;     /* 4 rows into the tile */
    int n = NEON_BATCH;

    uint8_t *master = malloc((size_t) n * tile_bytes);
    uint8_t *work = malloc((size_t) n * tile_bytes);
    int *alphas = malloc((size_t) n * sizeof *alphas);
    int *betas = malloc((size_t) n * sizeof *betas);
    int8_t (*tc0s)[4] = malloc((size_t) n * sizeof *tc0s);

    /* Bail out cleanly instead of dereferencing NULL; free(NULL) is a no-op. */
    if (!master || !work || !alphas || !betas || !tc0s) {
        fprintf(stderr, "h264deblock NEON worker: out of memory\n");
        free(master); free(work); free(alphas); free(betas); free(tc0s);
        break;
    }

    /*
     * Per-unit random inputs. The draw order (256 pixels, alpha, beta,
     * 4 x tc0) is kept exactly as before so a given seed still produces
     * the same data.
     */
    for (int i = 0; i < n; i++) {
        uint8_t *tile = master + (size_t) i * tile_bytes;
        for (size_t j = 0; j < tile_bytes; j++) {
            tile[j] = (uint8_t)(xs_step(&seed) & 0xff);
        }
        alphas[i] = (int)(xs_step(&seed) % 64) + 1;   /* 1..64 */
        betas[i]  = (int)(xs_step(&seed) % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* One draw in eight yields -1, the rest 0..6. In FFmpeg's
             * h264dsp convention a negative tc0 marks a segment that is
             * left unfiltered. */
            int r = (int)(xs_step(&seed) % 8);
            tc0s[i][s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    while (!g_stop) {
        /* The filter writes in place, so restore pristine pixels first. */
        memcpy(work, master, (size_t) n * tile_bytes);
        for (int i = 0; i < n; i++) {
            ff_h264_v_loop_filter_luma_neon(work + (size_t) i * tile_bytes + edge_off,
                                            stride, alphas[i], betas[i], tc0s[i]);
        }
        done += n;
    }

    free(master); free(work); free(alphas); free(betas); free(tc0s);
    break;
}
default: fprintf(stderr, "bad NEON kernel\n"); break;
}
a->elapsed_s = now_s() - t0;
@@ -334,6 +364,13 @@ static void *qpu_real_worker(void *p)
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 1;
break;
/* H.264 deblock on the QPU: selects the shader and sizes the buffers.
 * The kernel works in place on the destination buffer, so no separate
 * source SSBO is requested (has_src = 0). */
case K_H264DEBLOCK:
spv = "v3d_h264deblock.spv";
bpw = 16; /* 16 edges/WG */
dst_bytes = (size_t) n_units * 256; /* 16x16 tile */
/* Four uint32 meta words per unit. */
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
has_src = 0;
break;
default:
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
v3d_runner_destroy(r);
@@ -392,10 +429,28 @@ static void *qpu_real_worker(void *p)
}
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
} else if (a->kernel == K_H264DEBLOCK) {
/* Build one 4-word meta record per unit, then fill the destination buffer
 * with random pixels. Value ranges match the NEON worker: alpha 1..64,
 * beta 1..16, each tc0 either -1 (one draw in eight) or 0..6. */
for (int i = 0; i < n_units; i++) {
uint32_t alpha = (uint32_t)(xs_step(&seed) % 64) + 1;
uint32_t beta  = (uint32_t)(xs_step(&seed) % 16) + 1;
/* Pack the four signed tc0 bytes into one word, segment 0 in the low byte.
 * Going through uint8_t keeps -1 as 0xff without sign-extending into
 * neighbouring bytes. */
uint32_t tc0p = 0;
for (int s = 0; s < 4; s++) {
int rr = (int)(xs_step(&seed) % 8);
int8_t v = (int8_t)(rr == 0 ? -1 : (rr - 1));
tc0p |= ((uint32_t)(uint8_t)v) << (s * 8);
}
/* word 0: byte offset of this unit's edge in the dst buffer
 *         (tile base i*256 plus 4 rows of 16 bytes). */
meta[4*i+0] = (uint32_t)((size_t)i * 256 + 4 * 16); /* EDGE_OFF = 4*stride */
/* word 1: alpha in bits 0-7, beta from bit 8 upward. */
meta[4*i+1] = alpha | (beta << 8);
/* word 2: packed tc0 bytes; word 3: written as zero. */
meta[4*i+2] = tc0p;
meta[4*i+3] = 0;
}
/* Random pixel data for every tile. */
for (size_t i = 0; i < dst_bytes; i++)
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
}
v3d_pipeline pipe = {0};
int n_ssbos = has_src ? 3 : 2;
/* K_H264DEBLOCK reuses pc_lpf layout (n + dst_stride_u8 + 2 pads). */
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
(a->kernel == K_IDCT) ? sizeof(pc_idct) :
(a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
@@ -417,6 +472,8 @@ static void *qpu_real_worker(void *p)
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
} else if (a->kernel == K_CDEF) {
pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
} else if (a->kernel == K_H264DEBLOCK) {
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 16 };
}
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
@@ -472,6 +529,7 @@ static enum kernel parse_kernel(const char *s) {
if (!strcmp(s, "lpf8")) return K_LPF8;
if (!strcmp(s, "cdef")) return K_CDEF;
if (!strcmp(s, "idct")) return K_IDCT;
if (!strcmp(s, "h264deblock")) return K_H264DEBLOCK;
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
}