Cycle 8 closed: H.264 deblock R8=0.061 RED, opportunistic helper
Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads,
no spills). Phase 5 REDs applied:
RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write
RED-2: bench-enforced m.x >= 4*stride contract
M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON).
M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14).
Lower than predicted: H.264 deblock has 4 early-return paths +
2 conditional writes, and this branchiness hurts V3D execution
more than expected.
M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15
(neutral).
M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock
gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s.
QPU contribution is essentially unchanged from isolation —
the cross-substrate contention is gentle (consistent with
Issue 003's V4 finding).
Verdict: H.264 deblock = opportunistic QPU helper. Same recipe
slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core
deblock capacity, available when CPU is busy with other work.
Cycles 1-8 deployment recipe complete:
Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound)
Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON)
Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock)
Phase 9 lessons added:
- Branchy kernels underperform on V3D relative to straight-line ones
- Mixed-kernel helper value scales with isolation M2, not
same-kernel M4
- R prediction needs a branchiness weight, not just compute density
Files changed:
- src/v3d_h264deblock.comp (132 inst QPU shader)
- tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification)
- tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK
- CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock
+ h264dsp linked into bench_concurrent_mixed
- docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe)
Next: Phase 8 — V4L2 wrapper / deployment infra. Public API
already exposes recipe-default substrate per kernel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -68,7 +68,10 @@ static double now_s(void) {
|
||||
|
||||
/* --- Kernel selectors --- */
|
||||
|
||||
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT };
|
||||
enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT, K_H264DEBLOCK };
|
||||
|
||||
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
|
||||
static const char *kernel_name(enum kernel k) {
|
||||
switch (k) {
|
||||
@@ -77,11 +80,12 @@ static const char *kernel_name(enum kernel k) {
|
||||
case K_LPF8: return "lpf8";
|
||||
case K_CDEF: return "cdef";
|
||||
case K_IDCT: return "idct";
|
||||
case K_H264DEBLOCK: return "h264deblock";
|
||||
}
|
||||
return "?";
|
||||
}
|
||||
static const char *kernel_unit(enum kernel k) {
|
||||
return (k == K_LPF4 || k == K_LPF8) ? "Medge/s" : "Mblock/s";
|
||||
return (k == K_LPF4 || k == K_LPF8 || k == K_H264DEBLOCK) ? "Medge/s" : "Mblock/s";
|
||||
}
|
||||
|
||||
/* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
|
||||
@@ -201,6 +205,32 @@ static void *neon_worker(void *p) {
|
||||
case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
|
||||
case K_IDCT: neon_run_idct(&seed, &done); break;
|
||||
case K_CDEF: neon_run_cdef(&seed, &done); break;
|
||||
case K_H264DEBLOCK: {
|
||||
/* H.264 deblock: 16-row × 16-col tile per edge, EDGE_OFF = 4*16. */
|
||||
int n = NEON_BATCH;
|
||||
uint8_t *master = malloc((size_t) n * 256);
|
||||
uint8_t *work = malloc((size_t) n * 256);
|
||||
int *alphas = malloc(n*sizeof(int)), *betas = malloc(n*sizeof(int));
|
||||
int8_t (*tc0s)[4] = malloc(n*4);
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (int j = 0; j < 256; j++) master[i*256+j] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
alphas[i] = (int)(xs_step(&seed) % 64) + 1;
|
||||
betas[i] = (int)(xs_step(&seed) % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs_step(&seed) % 8);
|
||||
tc0s[i][s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
while (!g_stop) {
|
||||
memcpy(work, master, (size_t) n * 256);
|
||||
for (int i = 0; i < n; i++)
|
||||
ff_h264_v_loop_filter_luma_neon(work + i*256 + 4*16, 16,
|
||||
alphas[i], betas[i], tc0s[i]);
|
||||
done += n;
|
||||
}
|
||||
free(master); free(work); free(alphas); free(betas); free(tc0s);
|
||||
break;
|
||||
}
|
||||
default: fprintf(stderr, "bad NEON kernel\n"); break;
|
||||
}
|
||||
a->elapsed_s = now_s() - t0;
|
||||
@@ -334,6 +364,13 @@ static void *qpu_real_worker(void *p)
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
has_src = 1;
|
||||
break;
|
||||
case K_H264DEBLOCK:
|
||||
spv = "v3d_h264deblock.spv";
|
||||
bpw = 16; /* 16 edges/WG */
|
||||
dst_bytes = (size_t) n_units * 256; /* 16x16 tile */
|
||||
meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
|
||||
has_src = 0;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
|
||||
v3d_runner_destroy(r);
|
||||
@@ -392,10 +429,28 @@ static void *qpu_real_worker(void *p)
|
||||
}
|
||||
for (size_t i = 0; i < dst_bytes; i++)
|
||||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
} else if (a->kernel == K_H264DEBLOCK) {
|
||||
for (int i = 0; i < n_units; i++) {
|
||||
uint32_t alpha = (uint32_t)(xs_step(&seed) % 64) + 1;
|
||||
uint32_t beta = (uint32_t)(xs_step(&seed) % 16) + 1;
|
||||
uint32_t tc0p = 0;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int rr = (int)(xs_step(&seed) % 8);
|
||||
int8_t v = (int8_t)(rr == 0 ? -1 : (rr - 1));
|
||||
tc0p |= ((uint32_t)(uint8_t)v) << (s * 8);
|
||||
}
|
||||
meta[4*i+0] = (uint32_t)((size_t)i * 256 + 4 * 16); /* EDGE_OFF = 4*stride */
|
||||
meta[4*i+1] = alpha | (beta << 8);
|
||||
meta[4*i+2] = tc0p;
|
||||
meta[4*i+3] = 0;
|
||||
}
|
||||
for (size_t i = 0; i < dst_bytes; i++)
|
||||
((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
|
||||
}
|
||||
|
||||
v3d_pipeline pipe = {0};
|
||||
int n_ssbos = has_src ? 3 : 2;
|
||||
/* K_H264DEBLOCK reuses pc_lpf layout (n + dst_stride_u8 + 2 pads). */
|
||||
size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
|
||||
(a->kernel == K_IDCT) ? sizeof(pc_idct) :
|
||||
(a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
|
||||
@@ -417,6 +472,8 @@ static void *qpu_real_worker(void *p)
|
||||
pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
|
||||
} else if (a->kernel == K_CDEF) {
|
||||
pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
|
||||
} else if (a->kernel == K_H264DEBLOCK) {
|
||||
pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 16 };
|
||||
}
|
||||
|
||||
VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
|
||||
@@ -472,6 +529,7 @@ static enum kernel parse_kernel(const char *s) {
|
||||
if (!strcmp(s, "lpf8")) return K_LPF8;
|
||||
if (!strcmp(s, "cdef")) return K_CDEF;
|
||||
if (!strcmp(s, "idct")) return K_IDCT;
|
||||
if (!strcmp(s, "h264deblock")) return K_H264DEBLOCK;
|
||||
fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user