wip: mc02 v2 shared-tile

2026-05-25 21:28:04 +02:00
3 changed files with 77 additions and 126 deletions
@@ -1,7 +1,6 @@
 // daedalus-fourier — H.264 luma qpel mc02 (8x8, vertical half-pel), V3D 7.1.
 //
-// Sibling of cycle 9's v3d_h264_qpel_mc20.comp.  Same 6-tap filter,
-// transposed to vertical direction:
+// v2: cooperative-load shared-memory tile.
 //
 //   dst[r,c] = clip255(
 //       ( s[r-2,c]
@@ -14,9 +13,30 @@
 //       ) >> 5)
 //
 // src+src_off points at row 0 col 0 of the OUTPUT block; the filter
-// reads rows -2..+3 (2 rows of top context, 3 rows of bottom).
+// reads rows -2..+3 (2 rows of top context, 3 rows of bottom), total
+// 13 distinct source rows × 8 cols = 104 bytes per 8x8 output.
 //
-// Same WG layout as mc20: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
+// v1 had each of the 64 lanes do 6 SSBO loads → 384 loads/WG to cover
+// 104 unique bytes (3.7x redundant), and each lane's loads were stride-
+// spaced (one cache line per byte under V3D's TMU).  PR #36 bench
+// showed mc02 was the only qpel position where CPU NEON still beat
+// QPU (16.96 ns/op CPU vs 20.54 ns/op QPU; 1.21x CPU favoring).
+//
+// v2 splits the work into a coalesced load phase + a shared-memory
+// compute phase:
+//
+//   Phase 1: each of the 64 lanes cooperatively loads the 104-byte
+//   source tile into shared memory.  Lanes 0..63 load bytes at indices
+//   0..63 (covers source rows 0..7 of the 13-row tile); lanes 0..39
+//   second-load bytes 64..103 (rows 8..12).  Reads within a row are
+//   contiguous so the SIMD groups coalesce; total SSBO loads = 104,
+//   matching the unique-byte count.
+//
+//   Phase 2: all 64 lanes compute one output pixel each, reading 6
+//   bytes from shared.  Shared-memory access on V3D is local-store
+//   backed (no TMU round-trip).
+//
+// Same WG layout as v1: 64 lanes / 1 block-per-WG / 1 lane-per-pixel.
 //
 // License: BSD-2-Clause.

@@ -36,31 +56,52 @@ layout(push_constant) uniform PC {
    uint _pad0, _pad1;
 } pc;

+// 13 source rows × 8 cols.  int storage (4 bytes each) — wasteful vs
+// uint8_t but avoids 8-bit-shared interop concerns on glslang+v3dv;
+// 416 bytes shared/WG is well within any reasonable local-store budget.
+shared int s_tile[13 * 8];
+
 void main()
 {
    uint block_idx = gl_WorkGroupID.x;
    if (block_idx >= pc.n_blocks) return;

    uint lane = gl_LocalInvocationID.x;
-    uint r = lane >> 3;
-    uint c = lane & 7u;

    uint dst_off = u_meta.meta[block_idx].x;
    uint src_off = u_meta.meta[block_idx].y;
    uint stride  = pc.stride_u8;

-    // Read the 6 rows of vertical context at col (c) of THIS output row.
-    // src_off+r*stride+c is at the OUTPUT pixel position; the kernel
-    // samples r-2..r+3 along the column.  Unsigned-safe because the
-    // public API contract guarantees src_off >= 2*stride.
-    uint col_base = src_off + c;
+    // Source-tile base: src_off points at output-row-0 col-0, the tile
+    // starts 2 rows above.  Unsigned-safe because the public API
+    // contract guarantees src_off >= 2*stride.
+    uint tile_base = src_off - 2u * stride;

-    int s_m2 = int(u_src.src[col_base + (r - 2u) * stride]);
-    int s_m1 = int(u_src.src[col_base + (r - 1u) * stride]);
-    int s_0  = int(u_src.src[col_base +  r       * stride]);
-    int s_p1 = int(u_src.src[col_base + (r + 1u) * stride]);
-    int s_p2 = int(u_src.src[col_base + (r + 2u) * stride]);
-    int s_p3 = int(u_src.src[col_base + (r + 3u) * stride]);
+    // Phase 1: cooperative load — 64 lanes load 104 bytes.
+    {
+        uint sr = lane >> 3;        // 0..7
+        uint sc = lane & 7u;
+        s_tile[lane] = int(u_src.src[tile_base + sr * stride + sc]);
+    }
+    if (lane < 40u) {
+        uint idx = lane + 64u;      // 64..103
+        uint sr = idx >> 3;         // 8..12
+        uint sc = idx & 7u;
+        s_tile[idx] = int(u_src.src[tile_base + sr * stride + sc]);
+    }
+
+    barrier();
+
+    // Phase 2: each lane computes one output pixel from the shared tile.
+    uint r = lane >> 3;
+    uint c = lane & 7u;
+
+    int s_m2 = s_tile[(r + 0u) * 8u + c];
+    int s_m1 = s_tile[(r + 1u) * 8u + c];
+    int s_0  = s_tile[(r + 2u) * 8u + c];
+    int s_p1 = s_tile[(r + 3u) * 8u + c];
+    int s_p2 = s_tile[(r + 4u) * 8u + c];
+    int s_p3 = s_tile[(r + 5u) * 8u + c];

    int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;
    int p = clamp(v >> 5, 0, 255);
@@ -8,8 +8,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <unistd.h>
-#include <limits.h>

 #define CHK(call) do { VkResult r__ = (call); if (r__ != VK_SUCCESS) { \
    fprintf(stderr, "v3d_runner: vulkan error %d at %s:%d (%s)\n", \
@@ -370,68 +368,10 @@ void v3d_runner_destroy_buffer(v3d_runner *r, v3d_buffer *buf)

 /* ---- Pipelines -------------------------------------------------- */

-/* SPV lookup tries a small set of locations.  The caller passes a bare
- * filename (e.g. "v3d_h264_idct4.spv"); we try, in order:
- *
- *   1. cwd-relative           (legacy contract; works when run from build/)
- *   2. $DAEDALUS_SHADER_DIR   (env override for tests / packaged installs)
- *   3. <binary-dir>/<name>    (so the bench/test binary finds the SPV next
- *                              to itself regardless of cwd — this is the
- *                              fix for the silent-no-SPV regression that
- *                              made PR #36's bench numbers meaningless)
- *   4. /opt/fourier/share/daedalus-fourier/<name>  (Pi 5 install layout)
- *   5. /usr/share/daedalus-fourier/<name>          (system-wide install)
- *
- * Returns NULL only if every location fails, with a single perror naming
- * the bare filename so the user can grep for it. */
-static FILE *open_spv(const char *name)
-{
-    FILE *f = fopen(name, "rb");
-    if (f) return f;
-
-    const char *envdir = getenv("DAEDALUS_SHADER_DIR");
-    if (envdir && *envdir) {
-        char p[PATH_MAX];
-        snprintf(p, sizeof(p), "%s/%s", envdir, name);
-        f = fopen(p, "rb");
-        if (f) return f;
-    }
-
-    char exe[PATH_MAX];
-    ssize_t n = readlink("/proc/self/exe", exe, sizeof(exe) - 1);
-    if (n > 0) {
-        exe[n] = 0;
-        char *slash = strrchr(exe, '/');
-        if (slash) {
-            *slash = 0;
-            char p[PATH_MAX];
-            snprintf(p, sizeof(p), "%s/%s", exe, name);
-            f = fopen(p, "rb");
-            if (f) return f;
-        }
-    }
-
-    char p[PATH_MAX];
-    snprintf(p, sizeof(p), "/opt/fourier/share/daedalus-fourier/%s", name);
-    f = fopen(p, "rb");
-    if (f) return f;
-
-    snprintf(p, sizeof(p), "/usr/share/daedalus-fourier/%s", name);
-    f = fopen(p, "rb");
-    if (f) return f;
-
-    return NULL;
-}
-
 static uint32_t *read_spv(const char *path, size_t *out_size)
 {
-    FILE *f = open_spv(path);
-    if (!f) {
-        fprintf(stderr,
-                "daedalus: SPV not found via cwd / $DAEDALUS_SHADER_DIR / "
-                "binary-dir / /opt/fourier/share / /usr/share: %s\n", path);
-        return NULL;
-    }
+    FILE *f = fopen(path, "rb");
+    if (!f) { perror(path); return NULL; }
    fseek(f, 0, SEEK_END);
    long sz = ftell(f);
    fseek(f, 0, SEEK_SET);
@@ -44,28 +44,12 @@ static double now_ms(void) {
 /* Per-1080p-frame counts (8160 MBs at 1920x1088). */
 #define MBS_1080P  8160

-/* Standard benchmark loop.  fn() is called n times per iteration.
- *
- * fn() now returns the dispatch's int rc.  A single preflight call is
- * made before the hot loop; if rc != 0 (which on the QPU substrate
- * almost always means "SPV not found via any search path"), bench_ns
- * returns -1 and the caller must NOT report the kernel as measured.
- *
- * Without this, a missing SPV makes every dispatch fail fast at the
- * cost of one fprintf+open call (~1-5 µs), and the loop times that
- * cost as if it were real QPU work — producing absurdly-small ns/op
- * numbers that look like a QPU speedup.  This is exactly what made
- * PR #36's bench numbers a measurement artifact. */
-typedef int (*bench_fn)(void);
+/* Standard benchmark loop.  fn() is called n times per iteration. */
+typedef void (*bench_fn)(void);

 static double bench_ns(const char *name, int iters, int warmup,
                        int ops_per_iter, bench_fn fn)
 {
-    int rc = fn();
-    if (rc != 0) {
-        printf("  %-32s    DISPATCH FAILED rc=%d — kernel skipped\n", name, rc);
-        return -1;
-    }
    for (int i = 0; i < warmup; i++) fn();
    double t0 = now_ms();
    for (int i = 0; i < iters; i++) fn();
@@ -92,8 +76,8 @@ static int16_t              idct4_coeffs[1024 * 16];
 static daedalus_h264_block_meta idct4_meta[1024];
 static uint8_t              idct_dst[64 * 4 * 16 * 16];   /* 64 MB-rows × ... */

-static int bench_idct4(void) {
-    return daedalus_dispatch_h264_idct4(ctx, g_sub,
+static void bench_idct4(void) {
+    daedalus_dispatch_h264_idct4(ctx, g_sub,
                                  idct_dst, 64*16, idct4_coeffs, 1024, idct4_meta);
 }

@@ -101,8 +85,8 @@ static int bench_idct4(void) {
 static int16_t              idct8_coeffs[256 * 64];
 static daedalus_h264_block_meta idct8_meta[256];

-static int bench_idct8(void) {
-    return daedalus_dispatch_h264_idct8(ctx, g_sub,
+static void bench_idct8(void) {
+    daedalus_dispatch_h264_idct8(ctx, g_sub,
                                  idct_dst, 64*16, idct8_coeffs, 256, idct8_meta);
 }

@@ -110,13 +94,13 @@ static int bench_idct8(void) {
 static daedalus_h264_deblock_meta deblock_meta[256];
 static uint8_t deblock_dst[256 * 16 * 16];

-static int bench_deblock_v(void) {
-    return daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
+static void bench_deblock_v(void) {
+    daedalus_dispatch_h264_deblock_luma_v(ctx, g_sub,
                                           deblock_dst, 16, 256, deblock_meta);
 }

-static int bench_deblock_h(void) {
-    return daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
+static void bench_deblock_h(void) {
+    daedalus_dispatch_h264_deblock_luma_h(ctx, g_sub,
                                           deblock_dst, 16, 256, deblock_meta);
 }

@@ -125,16 +109,16 @@ static uint8_t qpel_src[256 * 16 * 16];
 static uint8_t qpel_dst[256 * 16 * 16];
 static daedalus_h264_qpel_meta qpel_meta[256];

-static int bench_qpel_mc20(void) {
-    return daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
+static void bench_qpel_mc20(void) {
+    daedalus_dispatch_h264_qpel_mc20(ctx, g_sub,
                                      qpel_dst, qpel_src, 16, 256, qpel_meta);
 }
-static int bench_qpel_mc02(void) {
-    return daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
+static void bench_qpel_mc02(void) {
+    daedalus_dispatch_h264_qpel_mc02(ctx, g_sub,
                                      qpel_dst, qpel_src, 16, 256, qpel_meta);
 }
-static int bench_qpel_mc22(void) {
-    return daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
+static void bench_qpel_mc22(void) {
+    daedalus_dispatch_h264_qpel_mc22(ctx, g_sub,
                                      qpel_dst, qpel_src, 16, 256, qpel_meta);
 }

@@ -213,25 +197,11 @@ int main(int argc, char **argv)
        rows[i].cpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);

    /* Pass 2: QPU compute (if available). */
-    int qpu_failures = 0;
    if (has_qpu) {
        g_sub = DAEDALUS_SUBSTRATE_QPU;
        printf("\n== QPU V3D7 compute ==\n");
-        for (int i = 0; i < N_ROWS; i++) {
+        for (int i = 0; i < N_ROWS; i++)
            rows[i].qpu_ns = bench_ns(rows[i].name, iters, warmup, rows[i].n_per_call, rows[i].fn);
-            if (rows[i].qpu_ns < 0) qpu_failures++;
-        }
-        if (qpu_failures) {
-            fprintf(stderr,
-                "\nbench_h264_primitives: %d of %d QPU dispatches failed.\n"
-                "  Almost always means SPV files were not found via any of:\n"
-                "    cwd  /  $DAEDALUS_SHADER_DIR  /  binary-dir  /\n"
-                "    /opt/fourier/share/daedalus-fourier  /  /usr/share/daedalus-fourier\n"
-                "  Set DAEDALUS_SHADER_DIR=<path> or run from a dir where the\n"
-                "  .spv files exist (e.g. the cmake build dir).\n",
-                qpu_failures, N_ROWS);
-            return 2;
-        }
    }

    /* Summary table — both substrates side by side. */