Path B pivot + Phase 0-3 closed with first baseline numbers

This is a from-scratch initial commit on a fresh .git. The original scaffold commit (7510b56) and the earlier session's working-tree docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted .git is preserved at .git-broken-2026-05-18/ (gitignored) for forensic inspection. Scope re-anchored from Path A (custom VPU firmware on VC7 scalar cores; blocked by BCM2712 silicon-RoT mask-ROM signature check) to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or direct DRM, on stock signed Pi 5 / CM5). See README.md and docs/phase0.md for the substrate audit that closed Path A. Phases closed: Phase 0 — substrate audit; Path A blocked, Path B open; codec-back-end-fits-QPU finding (docs/phase0.md) Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with publish-before-measure R = M2/M3 decision rules (docs/phase1.md) Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored under external/ffmpeg-snapshot/ (PROVENANCE.md pins commit f46e514 + per-file SHA-256s) (docs/phase2.md) Phase 3 — real baseline measurements on hertz (docs/phase3.md): M1 bit-exact 100.0000 % (10000/10000) M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block) M5a empty Vulkan submit 22.66 us M5b 1-WG noop dispatch 55.60 us M5 delta 32.95 us/dispatch => per-dispatch overhead is ~455x per-NEON-block cost; Phase 4 must batch at frame level or close to it. Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c, vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} + external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM). Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12, libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S assembles via the config.h shim. Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching constraint) -> Phase 5 second-model review -> Phase 6 implement. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 11:30:12 +00:00
commit dcbbc77038
22 changed files with 9030 additions and 0 deletions
@@ -0,0 +1,114 @@
+/*
+ * Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
+ * transform + add (8-bit pixels), transcribed from the spec
+ * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
+ * (vendored under external/ffmpeg-snapshot/ at commit f46e514).
+ *
+ * Provided as a self-contained translation unit so the harness
+ * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
+ * expansion. Cross-checked against the vendored reference at
+ * runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
+ *
+ * License: LGPL-2.1-or-later (matches the upstream reference).
+ *
+ * Spec source: VP9 specification §8.7 — Inverse transform process.
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+/* Q14 trig constants — VP9 spec table 8.7.1.4.
+ * COSPI_k_64 == round(cos(k * pi / 64) * 2^14); the sine form of each
+ * angle is listed alongside because the butterflies use them in
+ * cos/sin pairs. */
+enum {
+    COSPI_16_64 = 11585,  /* cos(pi/4)   = sin(pi/4)   */
+    COSPI_24_64 =  6270,  /* cos(3pi/8)  = sin(pi/8)   */
+    COSPI_8_64  = 15137,  /* cos(pi/8)   = sin(3pi/8)  */
+    COSPI_28_64 =  3196,  /* cos(7pi/16) = sin(pi/16)  */
+    COSPI_4_64  = 16069,  /* cos(pi/16)  = sin(7pi/16) */
+    COSPI_20_64 =  9102,  /* cos(5pi/16) = sin(3pi/16) */
+    COSPI_12_64 = 13623   /* cos(3pi/16) = sin(5pi/16) */
+};
+
+/* Q14 descale with round-half-up: add half an LSB (2^13), then shift
+ * right by 14. The addition is carried out in 64 bits so a widened
+ * product can be passed in directly; only the shifted result is
+ * narrowed back to int32_t. */
+static inline int32_t qround14(int64_t x)
+{
+    const int64_t half_lsb = 1 << 13;
+    const int64_t biased = x + half_lsb;
+
+    return (int32_t) (biased >> 14);
+}
+
+/* Saturate a signed value to the 8-bit pixel range [0, 255]. */
+static inline uint8_t clip_u8(int x)
+{
+    if (x < 0)
+        return 0;
+    if (x > 255)
+        return 255;
+    return (uint8_t) x;
+}
+
+/* 1-D 8-point inverse DCT over signed 32-bit values: reads in[0..7],
+ * writes out[0..7]. The arithmetic and its ordering follow idct8_1d in
+ * libavcodec/vp9dsp_template.c; only the strided input access has been
+ * replaced by plain indexing. Every product is widened to 64 bits
+ * before the Q14 round-shift. */
+static void idct8_1d(const int32_t in[8], int32_t out[8])
+{
+    /* Stage 1 — rotations. Even-indexed inputs produce e0..e3,
+     * odd-indexed inputs produce o4..o7. The in[0] +/- in[4] terms are
+     * formed in 32 bits first and widened afterwards. */
+    const int32_t sum04 = in[0] + in[4];
+    const int32_t dif04 = in[0] - in[4];
+
+    const int32_t e0 = qround14((int64_t)sum04 * COSPI_16_64);
+    const int32_t e1 = qround14((int64_t)dif04 * COSPI_16_64);
+    const int32_t e2 = qround14((int64_t)in[2] * COSPI_24_64
+                                - (int64_t)in[6] * COSPI_8_64);
+    const int32_t e3 = qround14((int64_t)in[2] * COSPI_8_64
+                                + (int64_t)in[6] * COSPI_24_64);
+
+    const int32_t o4 = qround14((int64_t)in[1] * COSPI_28_64
+                                - (int64_t)in[7] * COSPI_4_64);
+    const int32_t o5 = qround14((int64_t)in[5] * COSPI_12_64
+                                - (int64_t)in[3] * COSPI_20_64);
+    const int32_t o6 = qround14((int64_t)in[5] * COSPI_20_64
+                                + (int64_t)in[3] * COSPI_12_64);
+    const int32_t o7 = qround14((int64_t)in[1] * COSPI_4_64
+                                + (int64_t)in[7] * COSPI_28_64);
+
+    /* Stage 2 — add/sub butterflies. s[5] and s[6] are filled in by
+     * stage 3 from the two odd-half differences. */
+    int32_t s[8];
+    s[0] = e0 + e3;
+    s[1] = e1 + e2;
+    s[2] = e1 - e2;
+    s[3] = e0 - e3;
+    s[4] = o4 + o5;
+    s[7] = o7 + o6;
+
+    const int32_t d5 = o4 - o5;
+    const int32_t d6 = o7 - o6;
+
+    /* Stage 3 — one more cos(pi/4) rotation on the odd differences. */
+    s[5] = qround14((int64_t)(d6 - d5) * COSPI_16_64);
+    s[6] = qround14((int64_t)(d6 + d5) * COSPI_16_64);
+
+    /* Output butterfly: out[k] and out[7 - k] are the sum and the
+     * difference of the mirrored pair s[k], s[7 - k]. */
+    for (int k = 0; k < 4; k++) {
+        out[k]     = s[k] + s[7 - k];
+        out[7 - k] = s[k] - s[7 - k];
+    }
+}
+
+/* Public reference entry point: VP9 8x8 DCT_DCT inverse transform
+ * added onto 8-bit pixels. Signature matches
+ * ff_vp9_idct_idct_8x8_add_neon.
+ *
+ *   dst     top-left byte of the 8x8 destination; the residual is
+ *           added in place and each result is saturated to [0, 255]
+ *   stride  byte offset between vertically adjacent dst pixels
+ *   block   64 coefficients, addressed as block[r * 8 + i]
+ *   eob     eob == 1 selects the DC-only shortcut; any other value
+ *           runs the full two-pass transform
+ *
+ * Coefficients are cleared before returning (matches FFmpeg
+ * behaviour): only block[0] on the DC-only path, all 64 entries
+ * otherwise. */
+void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
+                                        int16_t *block, int eob)
+{
+    if (eob == 1) {
+        /* DC-only shortcut: scale the DC coefficient by cos(pi/4) in
+         * Q14 twice (once per 1-D pass), then add the same
+         * (dc + 16) >> 5 offset to all 64 pixels. */
+        const int32_t pass1 = qround14((int64_t)block[0] * COSPI_16_64);
+        const int32_t dc    = qround14(pass1 * (int64_t) COSPI_16_64);
+        const int32_t delta = (dc + 16) >> 5;
+
+        block[0] = 0;
+        for (int y = 0; y < 8; y++) {
+            uint8_t *line = dst + y * stride;
+            for (int x = 0; x < 8; x++)
+                line[x] = clip_u8(line[x] + delta);
+        }
+        return;
+    }
+
+    int32_t tmp[64];
+    int32_t vec[8];
+    int32_t res[8];
+
+    /* First pass: gather column i of block, transform it, and store
+     * the 8 results contiguously at tmp + i*8. That is the same
+     * transposed layout FFmpeg's idct_idct_8x8_add_c produces, so the
+     * second pass can again gather with a stride of 8. */
+    for (int i = 0; i < 8; i++) {
+        for (int k = 0; k < 8; k++)
+            vec[k] = block[k * 8 + i];
+        idct8_1d(vec, tmp + i * 8);
+    }
+    memset(block, 0, 64 * sizeof(*block));
+
+    /* Second pass: gather column i of tmp, transform it, and add the
+     * rounded (>> 5) result down column i of dst (FFmpeg's
+     * `dst[j*stride] = ...; dst++` pattern). */
+    for (int i = 0; i < 8; i++) {
+        for (int k = 0; k < 8; k++)
+            vec[k] = tmp[k * 8 + i];
+        idct8_1d(vec, res);
+        for (int k = 0; k < 8; k++) {
+            uint8_t *px = dst + k * stride + i;
+            *px = clip_u8(*px + ((res[k] + 16) >> 5));
+        }
+    }
+}