Cycle 8 closed: H.264 deblock R8=0.061 RED, opportunistic helper

Phase 6 deliverable: v3d_h264deblock.comp (132 inst, 4 threads, no spills). Phase 5 REDs applied: RED-1: explicit clamp p1'/q1' to [0,255] before uint8 write; RED-2: bench-enforced m.x >= 4*stride contract. M1: 3-way 4096/4096 bit-exact (QPU vs C ref AND vs NEON). M2: 5.629 Medge/s isolation → R8 = 0.061 RED (predicted 0.09-0.14). Lower than prediction; H.264 deblock has 4 early-return paths + 2 conditional writes that hurt V3D branchy execution more than expected. M4 same-kernel: NEON-3+QPU 12.81 Medge/s ≈ pure-NEON-4 ~12-15 (neutral). M4 MIXED (real H.264 deployment shape): CPU=MC + QPU=h264deblock gives CPU MC 25.11 Mblock/s + QPU h264deblock 6.23 Medge/s. QPU contribution is essentially unchanged from isolation — the cross-substrate contention is gentle (consistent with Issue 003's V4 finding). Verdict: H.264 deblock = opportunistic QPU helper. Same recipe slot as cycle 5 CDEF. 6 Medge/s helper = 85% of single-NEON-core deblock capacity, available when CPU is busy with other work. Cycles 1-8 deployment recipe complete: Primary QPU: cycles 1+2+4 (VP9 IDCT/LPF, all bandwidth-bound); Primary CPU: cycles 3+6+7 (compute-heavy or trivially fast on NEON); Opportunistic helper: cycles 5+8 (CDEF, H.264 deblock). Phase 9 lessons added: - Branchy kernels underperform on V3D relative to straight-line ones - Mixed-kernel helper value scales with isolation M2, not same-kernel M4 - R prediction needs branchiness weight, not just compute density. Files changed: - src/v3d_h264deblock.comp (132 inst QPU shader) - tests/bench_v3d_h264deblock.c (3-way M1 + M2 + R classification) - tests/bench_concurrent_mixed.c extended with K_H264DEBLOCK - CMakeLists.txt: v3d_h264deblock.spv + bench_v3d_h264deblock + h264dsp linked into bench_concurrent_mixed - docs/k8_h264deblock_phase7.md (full closure with cycles 1-8 recipe). Next: Phase 8 — V4L2 wrapper / deployment infra. Public API already exposes recipe-default substrate per kernel. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 14:44:21 +00:00
parent f2ba08e1cf
commit 373f63a910
5 changed files with 695 additions and 4 deletions
@@ -68,7 +68,10 @@ static double now_s(void) {

 /* --- Kernel selectors --- */

-enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT };
+enum kernel { K_MC, K_LPF4, K_LPF8, K_CDEF, K_IDCT, K_H264DEBLOCK };
+
+extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
+                                             int alpha, int beta, int8_t *tc0);

 static const char *kernel_name(enum kernel k) {
    switch (k) {
@@ -77,11 +80,12 @@ static const char *kernel_name(enum kernel k) {
    case K_LPF8: return "lpf8";
    case K_CDEF: return "cdef";
    case K_IDCT: return "idct";
+    case K_H264DEBLOCK: return "h264deblock";
    }
    return "?";
 }
 static const char *kernel_unit(enum kernel k) {
-    return (k == K_LPF4 || k == K_LPF8) ? "Medge/s" : "Mblock/s";
+    return (k == K_LPF4 || k == K_LPF8 || k == K_H264DEBLOCK) ? "Medge/s" : "Mblock/s";
 }

 /* --- NEON worker (per-kernel inline; pre-generate inputs, hot-loop) --- */
@@ -201,6 +205,32 @@ static void *neon_worker(void *p) {
    case K_LPF8: neon_run_lpf(&seed, &done, 1); break;
    case K_IDCT: neon_run_idct(&seed, &done); break;
    case K_CDEF: neon_run_cdef(&seed, &done); break;
+    case K_H264DEBLOCK: {
+        /* H.264 deblock: 16-row × 16-col tile per edge, EDGE_OFF = 4*16. */
+        int n = NEON_BATCH;
+        uint8_t *master = malloc((size_t) n * 256);
+        uint8_t *work   = malloc((size_t) n * 256);
+        int *alphas = malloc(n*sizeof(int)), *betas = malloc(n*sizeof(int));
+        int8_t (*tc0s)[4] = malloc(n*4);
+        for (int i = 0; i < n; i++) {
+            for (int j = 0; j < 256; j++) master[i*256+j] = (uint8_t)(xs_step(&seed) & 0xff);
+            alphas[i] = (int)(xs_step(&seed) % 64) + 1;
+            betas[i]  = (int)(xs_step(&seed) % 16) + 1;
+            for (int s = 0; s < 4; s++) {
+                int r = (int)(xs_step(&seed) % 8);
+                tc0s[i][s] = (int8_t)(r == 0 ? -1 : (r - 1));
+            }
+        }
+        while (!g_stop) {
+            memcpy(work, master, (size_t) n * 256);
+            for (int i = 0; i < n; i++)
+                ff_h264_v_loop_filter_luma_neon(work + i*256 + 4*16, 16,
+                                                 alphas[i], betas[i], tc0s[i]);
+            done += n;
+        }
+        free(master); free(work); free(alphas); free(betas); free(tc0s);
+        break;
+    }
    default: fprintf(stderr, "bad NEON kernel\n"); break;
    }
    a->elapsed_s = now_s() - t0;
@@ -334,6 +364,13 @@ static void *qpu_real_worker(void *p)
        meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
        has_src = 1;
        break;
+    case K_H264DEBLOCK:
+        spv = "v3d_h264deblock.spv";
+        bpw = 16;                                                /* 16 edges/WG */
+        dst_bytes = (size_t) n_units * 256;                      /* 16x16 tile */
+        meta_bytes = (size_t) n_units * 4 * sizeof(uint32_t);
+        has_src = 0;
+        break;
    default:
        fprintf(stderr, "qpu_real_worker: unsupported kernel\n");
        v3d_runner_destroy(r);
@@ -392,10 +429,28 @@ static void *qpu_real_worker(void *p)
        }
        for (size_t i = 0; i < dst_bytes; i++)
            ((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
+    } else if (a->kernel == K_H264DEBLOCK) {
+        for (int i = 0; i < n_units; i++) {
+            uint32_t alpha = (uint32_t)(xs_step(&seed) % 64) + 1;
+            uint32_t beta  = (uint32_t)(xs_step(&seed) % 16) + 1;
+            uint32_t tc0p = 0;
+            for (int s = 0; s < 4; s++) {
+                int rr = (int)(xs_step(&seed) % 8);
+                int8_t v = (int8_t)(rr == 0 ? -1 : (rr - 1));
+                tc0p |= ((uint32_t)(uint8_t)v) << (s * 8);
+            }
+            meta[4*i+0] = (uint32_t)((size_t)i * 256 + 4 * 16);   /* EDGE_OFF = 4*stride */
+            meta[4*i+1] = alpha | (beta << 8);
+            meta[4*i+2] = tc0p;
+            meta[4*i+3] = 0;
+        }
+        for (size_t i = 0; i < dst_bytes; i++)
+            ((uint8_t *) buf_dst.mapped)[i] = (uint8_t)(xs_step(&seed) & 0xff);
    }

    v3d_pipeline pipe = {0};
    int n_ssbos = has_src ? 3 : 2;
+    /* K_H264DEBLOCK reuses pc_lpf layout (n + dst_stride_u8 + 2 pads). */
    size_t pc_size = (a->kernel == K_MC) ? sizeof(pc_mc) :
                     (a->kernel == K_IDCT) ? sizeof(pc_idct) :
                     (a->kernel == K_CDEF) ? sizeof(pc_cdef) : sizeof(pc_lpf);
@@ -417,6 +472,8 @@ static void *qpu_real_worker(void *p)
        pc.idct = (pc_idct){ .n_blocks = n_units, .blocks_per_row = 16, .dst_stride_u8 = 128 };
    } else if (a->kernel == K_CDEF) {
        pc.cdef = (pc_cdef){ .n_blocks = n_units, .tmp_stride_u16 = 16, .dst_stride_u8 = 8 };
+    } else if (a->kernel == K_H264DEBLOCK) {
+        pc.lpf = (pc_lpf){ .n = n_units, .dst_stride_u8 = 16 };
    }

    VkCommandBuffer cb = v3d_runner_alloc_cmdbuf(r);
@@ -472,6 +529,7 @@ static enum kernel parse_kernel(const char *s) {
    if (!strcmp(s, "lpf8")) return K_LPF8;
    if (!strcmp(s, "cdef")) return K_CDEF;
    if (!strcmp(s, "idct")) return K_IDCT;
+    if (!strcmp(s, "h264deblock")) return K_H264DEBLOCK;
    fprintf(stderr, "unknown kernel: %s\n", s); exit(2);
 }