Phase 8: wire LPF wd=4 + wd=8 QPU through public API

Mirror the IDCT pattern (lazy pipeline + per-call SSBO alloc + dispatch + readback) for cycles 2 (LPF wd=4) and 4 (LPF wd=8). Important bug, caught empirically: the two LPF shaders disagree on push-constant slot order — wd=4 puts dst_stride_u8 at slot 1, while wd=8 puts it at slot 2 (with an unused blocks_per_row at slot 1). The initial single-struct attempt silently corrupted wd=8 output (1958/2048 = 95.6% bit-exact on test_api_lpf). Fixed by keeping separate lpf4_pc and lpf8_pc struct definitions. The dst-window calculation handles both kernels (same -4..+3 byte footprint per row). test_api_lpf exercises both kernels in CPU / QPU / AUTO modes against the C reference. All 6 mode/kernel combinations pass 2048/2048 bit-exact (32 edges × 8 rows × 8 bytes/row). Phase 8 status after this commit: 3 of 5 kernels are wired through the API for QPU dispatch (IDCT, LPF wd=4, LPF wd=8 — i.e., all 3 QPU-default kernels per recipe). Cycle 3 MC and cycle 5 CDEF still need wiring for opportunistic-override mode but aren't needed for the recipe-AUTO path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 13:57:25 +00:00
parent 1085c5699c
commit eb5cfb34c4
3 changed files with 262 additions and 5 deletions
@@ -0,0 +1,121 @@
+/*
+ * Phase 8 — VP9 LPF wd=4 + wd=8 through the public API.
+ *
+ * Exercises both kernels in CPU / QPU / AUTO modes against the
+ * C reference (tests/vp9_lpf_ref.c, vp9_lpf8_ref.c). Bit-exact
+ * gate per cycle 2 and 4 phase 7 docs.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../include/daedalus.h"
+
+extern void daedalus_vp9_loop_filter_h_4_8_ref(
+    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);   /* wd=4 C reference, defined outside this file; used below to build the expected output */
+extern void daedalus_vp9_loop_filter_h_8_8_ref(
+    uint8_t *dst, ptrdiff_t stride, int E, int I, int H);   /* wd=8 C reference, same signature and role as the wd=4 one */
+
+#define N_EDGES 32                          /* edges generated and passed to each dispatch call */
+#define EDGE_STRIDE 8                       /* byte pitch between rows of one edge tile; also the stride handed to the filters */
+#define EDGE_H 8                            /* rows per edge tile */
+#define EDGE_BYTES (EDGE_H * EDGE_STRIDE)   /* 64 */
+#define DST_BYTES (N_EDGES * EDGE_BYTES)    /* whole test buffer: 32 * 64 = 2048 bytes */
+
+static uint64_t xs_state = 0xa57edbeef5717ULL;   /* state of the xs() generator; constant seed, so the test data is the same on every run */
+/*
+ * Advance the file-local generator by one xorshift step (shifts 13, 7, 17)
+ * and return the freshly stored 64-bit state. The sequence depends only on
+ * the current value of xs_state.
+ */
+static inline uint64_t xs(void) {
+    uint64_t s = xs_state;
+    s ^= s << 13;
+    s ^= s >> 7;
+    s ^= s << 17;
+    xs_state = s;
+    return s;
+}
+
+/*
+ * Fill one edge tile (EDGE_H rows of 8 pixels, EDGE_STRIDE bytes apart)
+ * with synthetic data: columns 0-3 sit around one random level, columns
+ * 4-7 around another, so there is a step at the column-4 boundary.
+ *
+ * Levels are drawn from 20..219 and a per-tile noise amplitude from 0..29;
+ * every pixel gets an independent offset in [-amplitude, +amplitude] and
+ * is clamped to 0..255.
+ *
+ * buf: start of the tile; EDGE_BYTES bytes are addressed.
+ */
+static void gen_edge_pixels(uint8_t *buf)
+{
+    /* Draw order matters: it fixes the pseudo-random sequence. */
+    const int left_level  = 20 + (int)(xs() % 200);
+    const int right_level = 20 + (int)(xs() % 200);
+    const int amplitude   = (int)(xs() % 30);
+    const uint64_t span   = (uint64_t)(2 * amplitude + 1);
+
+    for (int row = 0; row < EDGE_H; row++) {
+        uint8_t *px = buf + row * EDGE_STRIDE;
+        for (int col = 0; col < 8; col++) {
+            int value = (col < 4 ? left_level : right_level)
+                      + (int)(xs() % span) - amplitude;
+            if (value < 0) {
+                value = 0;
+            } else if (value > 255) {
+                value = 255;
+            }
+            px[col] = (uint8_t)value;
+        }
+    }
+}
+
+/*
+ * Run one LPF kernel through the public dispatch API on a private copy of
+ * the input and compare the result byte-for-byte with the reference.
+ *
+ * wd_8        non-zero selects daedalus_dispatch_vp9_lpf8, zero selects
+ *             daedalus_dispatch_vp9_lpf4.
+ * force       substrate passed to the dispatch call. A QPU request is
+ *             skipped (and counted as a pass) when the context reports no
+ *             QPU.
+ * dst_initial DST_BYTES of unfiltered pixels (not modified).
+ * dst_ref     DST_BYTES of expected output.
+ * meta        N_EDGES per-edge descriptors handed to the dispatch call.
+ * label       mode name used in the report lines.
+ *
+ * Returns 0 on a bit-exact match or a skip; 1 if the context could not be
+ * created, the dispatch returned non-zero, or any byte differs. Every
+ * failure path names the mode and kernel on stderr so a failing
+ * combination can be identified from the log alone.
+ */
+static int run_lpf(int wd_8, daedalus_substrate force,
+                    const uint8_t *dst_initial,
+                    const uint8_t *dst_ref,
+                    const daedalus_lpf_meta *meta,
+                    const char *label)
+{
+    const int wd = wd_8 ? 8 : 4;
+    uint8_t dst[DST_BYTES];
+    int status = 1;
+    int diffs = 0;
+    int rc;
+
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) {
+        fprintf(stderr, "    [%s wd=%d] daedalus_ctx_create failed\n", label, wd);
+        return 1;
+    }
+
+    int has_qpu = daedalus_ctx_has_qpu(ctx);
+    if (force == DAEDALUS_SUBSTRATE_QPU && !has_qpu) {
+        printf("    [%s wd=%d] SKIP — QPU unavailable\n", label, wd);
+        status = 0;
+        goto out;
+    }
+
+    /* The dispatch filters in place, so work on a scratch copy. */
+    memcpy(dst, dst_initial, DST_BYTES);
+    rc = wd_8
+        ? daedalus_dispatch_vp9_lpf8(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta)
+        : daedalus_dispatch_vp9_lpf4(ctx, force, dst, EDGE_STRIDE, N_EDGES, meta);
+    if (rc) {
+        fprintf(stderr, "    [%s wd=%d] dispatch failed: rc=%d\n", label, wd, rc);
+        goto out;
+    }
+
+    for (int i = 0; i < DST_BYTES; i++) {
+        if (dst[i] != dst_ref[i]) {
+            diffs++;
+        }
+    }
+    printf("    [%s wd=%d] %d/%d bit-exact (%.4f%%)\n",
+           label, wd,
+           DST_BYTES - diffs, DST_BYTES, 100.0 * (DST_BYTES - diffs) / DST_BYTES);
+    status = diffs == 0 ? 0 : 1;
+
+out:
+    /* Single release point for the context on every path after creation. */
+    daedalus_ctx_destroy(ctx);
+    return status;
+}
+
+/*
+ * Generate one randomized batch of N_EDGES edges, compute the expected
+ * output with the C reference for the selected filter width, then check
+ * the public-API dispatch against it in CPU, QPU and AUTO modes.
+ *
+ * wd_8: non-zero exercises the wd=8 kernel, zero the wd=4 kernel.
+ * Returns the OR of the three run_lpf() results (0 means none failed).
+ */
+static int run_one_kernel(int wd_8)
+{
+    /* Substrates to force, in the order they are exercised. */
+    const struct {
+        daedalus_substrate force;
+        const char *label;
+    } modes[] = {
+        { DAEDALUS_SUBSTRATE_CPU,  "CPU"  },
+        { DAEDALUS_SUBSTRATE_QPU,  "QPU"  },
+        { DAEDALUS_SUBSTRATE_AUTO, "AUTO" },
+    };
+    void (*ref_filter)(uint8_t *, ptrdiff_t, int, int, int) =
+        wd_8 ? daedalus_vp9_loop_filter_h_8_8_ref
+             : daedalus_vp9_loop_filter_h_4_8_ref;
+
+    uint8_t pixels[DST_BYTES];
+    uint8_t expected[DST_BYTES];
+    daedalus_lpf_meta meta[N_EDGES];
+
+    /* Tile e occupies bytes [e*64 .. e*64+63]; the edge centre is at
+     * column 4 of row 0, i.e. byte offset e*64 + 4. */
+    for (int e = 0; e < N_EDGES; e++) {
+        daedalus_lpf_meta *m = &meta[e];
+
+        gen_edge_pixels(&pixels[e * EDGE_BYTES]);
+        m->dst_off = (uint32_t)(e * EDGE_BYTES + 4);
+        m->E = (int32_t)(xs() % 81);
+        m->I = (int32_t)(xs() % 41);
+        m->H = (int32_t)(xs() % 11);
+    }
+
+    /* Expected output: the reference filter applied in place to a copy. */
+    memcpy(expected, pixels, DST_BYTES);
+    for (int e = 0; e < N_EDGES; e++) {
+        const daedalus_lpf_meta *m = &meta[e];
+
+        ref_filter(expected + m->dst_off, EDGE_STRIDE, m->E, m->I, m->H);
+    }
+
+    int fail = 0;
+    for (size_t k = 0; k < sizeof modes / sizeof modes[0]; k++) {
+        fail |= run_lpf(wd_8, modes[k].force, pixels, expected, meta,
+                        modes[k].label);
+    }
+    return fail;
+}
+
+/*
+ * Test entry point: print the recipe's substrate choice for both LPF
+ * kernels, then run the wd=4 and wd=8 checks in that order.
+ *
+ * Exit status is 0 when neither kernel reported a failure, non-zero
+ * otherwise.
+ */
+int main(void)
+{
+    static const char *const section_title[2] = {
+        "\nLPF wd=4:\n",
+        "\nLPF wd=8:\n",
+    };
+    int fail = 0;
+
+    printf("=== Phase 8 API smoke: VP9 LPF wd=4 + wd=8 ===\n");
+
+    const int lpf4_recipe =
+        (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF4_INNER);
+    printf("  recipe for LPF4_INNER: %d (1=CPU, 2=QPU)\n", lpf4_recipe);
+
+    const int lpf8_recipe =
+        (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_LPF8_INNER);
+    printf("  recipe for LPF8_INNER: %d\n", lpf8_recipe);
+
+    /* Index 0 is the wd=4 kernel, index 1 the wd=8 kernel. */
+    for (int wd_8 = 0; wd_8 < 2; wd_8++) {
+        fputs(section_title[wd_8], stdout);
+        fail |= run_one_kernel(wd_8);
+    }
+    return fail;
+}