Cycle 4 (LPF wd=8) closure: M1=100%, R=0.34, M4=+4.1%, PASS

Fourth daedalus-fourier kernel — VP9 8-tap inner loop filter wd=8 h_8_8 variant. Width extension of cycle 2's wd=4; completes VP9 inner-edge LPF coverage.

Full cycle Phase 1-7 + M4'''' in one combined go (cycle compressed since incremental from cycle 2).

Phase 5 review explicitly skipped (incremental ~30-line shader delta from cycle 2 + same geometry + cycle-2 RED-pattern checks still apply). Flagged in docs/k4_lpf8_phase4_7.md per dev_process.md "Skipping phases is a deliberate choice that should be flagged."

Phase 6 v1 first-light: M1'''' 100.0000% bit-exact (65536/65536) first try. Shaderdb shows 231 inst, 4 hardware threads, 0 spills, 27 max-temps, 48 uniforms — compiler at the latency-hiding ceiling.

Performance:
  M3'''' NEON (single-core)   52.382 Medge/s
  M2'''' QPU isolation        17.847 Medge/s
  R''''                       0.341 → ORANGE band
  30fps floor margin          9.2x (isolation), 20.3x (mixed)

M4'''' concurrent matrix:
  NEON 4-core                 37.823 Medge/s  <- baseline
  QPU only                    14.867 Medge/s
  MIXED NEON-3 + QPU          39.389 Medge/s  <- +4.1% PASS

Verdict: YELLOW-via-M4'''' PASS. Deploy wd=8 LPF on QPU alongside cycle 2 wd=4. Combined VP9 inner-edge LPF coverage now complete.

Cross-cycle LPF comparison:
  |             | wd=4 (k2) | wd=8 (k4) |
  | M3 NEON     | 48.3      | 52.4      |
  | M2 QPU iso  | 19.6      | 17.8      |
  | R iso       | 0.41      | 0.34      |
  | M4 delta    | +6.9%     | +4.1%     |
  | 30fps mixed | 7.2x      | 20.3x     |
  | Verdict     | GO QPU    | GO QPU    |

NEW finding (Phase 9 lesson): NEON gets faster per edge as filter width grows (20.7 → 19.1 ns, wd=4 → wd=8). The relative QPU loss grows with width. wd=16 would probably flip negative based on the trend line.

Deployment recipe with cycle 4:
  IDCT 8x8 (k1) -> QPU (R=0.92, +7% mixed)
  LPF wd=4 (k2) -> QPU (R=0.41, +7% mixed)
  LPF wd=8 (k4) -> QPU (R=0.34, +4% mixed)
  MC 8h (k3)    -> CPU (R=0.067, -19% mixed)
  Entropy       -> CPU (structural)

VP9 inner-edge LPF coverage complete. Project continues to higgs deployment plumbing or further kernels per user direction.
New artifacts:
  - src/v3d_lpf_h_8_8.comp — GLSL shader
  - tests/vp9_lpf8_ref.c — standalone C ref
  - tests/bench_neon_lpf8.c — M1+M3 bench
  - tests/bench_v3d_lpf8.c — M1+M2 bench
  - tests/bench_concurrent_lpf8.c — M4 pthread bench
  - docs/k4_lpf8_phase1_3.md + phase4_7.md — combined cycle docs

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 12:56:25 +00:00
parent 356e446a49
commit 85feba4087
8 changed files with 1106 additions and 1 deletions
@@ -0,0 +1,99 @@
+// daedalus-fourier cycle 4 — VP9 8-tap inner LPF, wd=8, h direction,
+// 8-pixel edge. V3D 7.1 via Mesa v3dv.
+//
+// Extension of cycle 2's wd=4 kernel: adds flat8in test + 6-write
+// flat-region path. Same lane/edge geometry (32 edges/WG, 8 lanes
+// per edge, no barrier, no shared mem).
+//
+// Contracts (per k4_lpf8_phase4_7.md):
+//   - meta[i].x: dst_off, must be ≥ 4: p3 is read at base-4 (the
+//                lowest write is base-3); same invariant as cycle 2
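+//   - **dst_stride_u8 ≥ 6** (cycle 4 update: flat8in path writes
+//     6 contiguous bytes per row at base-3..base+2)
+//     NOTE(review): each row also reads base-4..base+3 with no barrier,
+//     so at a stride of exactly 6 row r+1 reads (as p3) the byte row r
+//     writes at base+2; a stride ≥ 7 removes that overlap. Confirm
+//     against k4_lpf8_phase4_7.md.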
+//
+// License: BSD-2-Clause.
+
+#version 450
+#extension GL_EXT_shader_8bit_storage              : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+// 256 invocations per workgroup, laid out along x only.
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+// Per-edge metadata, read by main() as x = dst offset, y = E, z = I, w = H.
+layout(binding = 0) readonly buffer Meta { uvec4 meta[]; } u_meta;
+layout(binding = 1) buffer Dst { uint8_t dst[]; } u_dst;   // pixel bytes, read and written by main()
+// Push constants: the same values for every invocation of a dispatch.
+layout(push_constant) uniform PC {
+    uint n_edges;          /* invocations whose edge index is >= n_edges exit early */
+    uint blocks_per_row;   /* unused */
+    uint dst_stride_u8;    /* dst index step between consecutive rows of an edge */
+    uint _pad;             /* not read by main() */
+} pc;
+
+void main()
+{
+    /* Lane geometry: 8 consecutive invocations cover one edge (one pixel
+     * row each), so a 256-invocation workgroup handles 32 edges and
+     * gid >> 3 == workgroup * 32 + edge-within-workgroup. */
+    uint gid      = gl_GlobalInvocationID.x;
+    uint edge_idx = gid >> 3;
+    uint row      = gid & 7u;
+    if (edge_idx >= pc.n_edges) return;
+
+    /* Per-edge metadata: x locates q0 of row 0 in dst, y/z/w are E/I/H. */
+    uvec4 edge = u_meta.meta[edge_idx];
+    uint  base = edge.x + row * pc.dst_stride_u8;
+    int   E    = int(edge.y);
+    int   I    = int(edge.z);
+    int   H    = int(edge.w);
+
+    /* Eight pixels straddling the edge: p3..p0 below base, q0..q3 from base. */
+    int p3 = int(u_dst.dst[base - 4u]), p2 = int(u_dst.dst[base - 3u]);
+    int p1 = int(u_dst.dst[base - 2u]), p0 = int(u_dst.dst[base - 1u]);
+    int q0 = int(u_dst.dst[base + 0u]), q1 = int(u_dst.dst[base + 1u]);
+    int q2 = int(u_dst.dst[base + 2u]), q3 = int(u_dst.dst[base + 3u]);
+
+    /* Filter mask: every neighbouring step must stay within I and the
+     * cross-edge term within E; otherwise the row is left untouched. */
+    int d_p1p0 = abs(p1 - p0);
+    int d_q1q0 = abs(q1 - q0);
+    int step   = max(d_p1p0, d_q1q0);
+    step = max(step, max(abs(p3 - p2), abs(p2 - p1)));
+    step = max(step, max(abs(q2 - q1), abs(q3 - q2)));
+    int across = abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1);
+    if (step > I || across > E) return;
+
+    /* flat8in: p1..p3 within F of p0 and q1..q3 within F of q0, where
+     * F = 1 << (BIT_DEPTH - 8) = 1 for 8-bit pixels. */
+    const int F = 1;
+    int spread = max(d_p1p0, d_q1q0);
+    spread = max(spread, max(abs(p3 - p0), abs(p2 - p0)));
+    spread = max(spread, max(abs(q2 - q0), abs(q3 - q0)));
+
+    if (spread <= F) {
+        /* wd=8 inner-flat filter: 8 inputs, 6 outputs. Each output is a
+         * weighted sum (weights total 8) rounded with +4, then >> 3. */
+        u_dst.dst[base - 3u] = uint8_t((3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3);
+        u_dst.dst[base - 2u] = uint8_t((2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3);
+        u_dst.dst[base - 1u] = uint8_t((p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3);
+        u_dst.dst[base + 0u] = uint8_t((p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3);
+        u_dst.dst[base + 1u] = uint8_t((p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3);
+        u_dst.dst[base + 2u] = uint8_t((p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3);
+        return;
+    }
+
+    /* Non-flat path. hev is set when either inner step exceeds H; only
+     * then does the clamped p1 - q1 term feed the filter value f. */
+    bool hev = max(d_p1p0, d_q1q0) > H;
+    int  f   = hev ? clamp(p1 - q1, -128, 127) : 0;
+    f        = clamp(3 * (q0 - p0) + f, -128, 127);
+    int  f1  = min(f + 4, 127) >> 3;
+    int  f2  = min(f + 3, 127) >> 3;
+    u_dst.dst[base - 1u] = uint8_t(clamp(p0 + f2, 0, 255));
+    u_dst.dst[base + 0u] = uint8_t(clamp(q0 - f1, 0, 255));
+    if (hev) return;
+
+    /* Without hev, p1 and q1 are also moved by half of f1, rounded. */
+    int fp = (f1 + 1) >> 1;
+    u_dst.dst[base - 2u] = uint8_t(clamp(p1 + fp, 0, 255));
+    u_dst.dst[base + 1u] = uint8_t(clamp(q1 - fp, 0, 255));
+}