Path B pivot + Phase 0-3 closed with first baseline numbers

This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.

Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.

Phases closed:
  Phase 0 — substrate audit; Path A blocked, Path B open;
            codec-back-end-fits-QPU finding (docs/phase0.md)
  Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
            publish-before-measure R = M2/M3 decision rules
            (docs/phase1.md)
  Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
            under external/ffmpeg-snapshot/ (PROVENANCE.md pins
            commit f46e514 + per-file SHA-256s) (docs/phase2.md)
  Phase 3 — real baseline measurements on hertz (docs/phase3.md):
              M1 bit-exact            100.0000 % (10000/10000)
              M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
              M5a empty Vulkan submit 22.66 us
              M5b 1-WG noop dispatch  55.60 us
              M5 delta                32.95 us/dispatch
            => one 1-WG dispatch (M5b, 55.60 us) costs ~455x one NEON
               IDCT8 block (M3, 122.4 ns); Phase 4 must batch at frame
               level or close to it.

Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.

Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 11:30:12 +00:00
commit dcbbc77038
22 changed files with 9030 additions and 0 deletions
+114
View File
@@ -0,0 +1,114 @@
/*
* Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
* transform + add (8-bit pixels), transcribed from the spec
* structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
* (vendored under external/ffmpeg-snapshot/ at commit f46e514).
*
* Provided as a self-contained translation unit so the harness
* doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
* expansion. Cross-checked against the vendored reference at
* runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
*
* License: LGPL-2.1-or-later (matches the upstream reference).
*
* Spec source: VP9 specification §8.7 — Inverse transform process.
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
/* Q14 trig constants — VP9 spec table 8.7.1.4.
 * Each value is the named cosine/sine scaled by 2^14 and rounded to the
 * nearest integer. Kept as plain int constants so they promote naturally
 * in the 64-bit products formed by the transform code. */
enum {
    COSPI_4_64  = 16069, /* sin(7pi/16) * 2^14 */
    COSPI_8_64  = 15137, /* sin(3pi/8)  * 2^14 */
    COSPI_12_64 = 13623, /* sin(5pi/16) * 2^14 */
    COSPI_16_64 = 11585, /* cos(pi/4)   * 2^14 */
    COSPI_20_64 = 9102,  /* cos(5pi/16) * 2^14 */
    COSPI_24_64 = 6270,  /* cos(3pi/8)  * 2^14 */
    COSPI_28_64 = 3196   /* cos(7pi/16) * 2^14 */
};
/* Q14 round-shift: (x + (1<<13)) >> 14, with overflow-safe widening. */
static inline int32_t qround14(int64_t x)
{
return (int32_t) ((x + (1 << 13)) >> 14);
}
/* Saturate a signed integer to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0)
        return 0;
    if (x > 255)
        return 255;
    return (uint8_t)x;
}
/* One-dimensional 8-point inverse DCT on int32 samples.
 *
 * Reads in[0..7] and writes out[0..7]; every input is consumed before the
 * first output is stored. The arithmetic follows idct8_1d in
 * libavcodec/vp9dsp_template.c, with the strided access replaced by plain
 * indexing. Each rotation product is formed in 64 bits and rounded back
 * with qround14(); the butterfly additions stay in 32 bits. */
static void idct8_1d(const int32_t in[8], int32_t out[8])
{
    /* Even half, inputs 0/4/2/6. The 0/4 sum and difference are taken in
     * 32 bits first and only then widened for the multiply. */
    const int32_t sum04  = in[0] + in[4];
    const int32_t diff04 = in[0] - in[4];
    const int32_t e0 = qround14((int64_t)sum04 * COSPI_16_64);
    const int32_t e1 = qround14((int64_t)diff04 * COSPI_16_64);
    const int32_t e2 = qround14((int64_t)in[2] * COSPI_24_64
                                - (int64_t)in[6] * COSPI_8_64);
    const int32_t e3 = qround14((int64_t)in[2] * COSPI_8_64
                                + (int64_t)in[6] * COSPI_24_64);

    /* Odd half, inputs 1/7 and 5/3: two plane rotations. */
    const int32_t o4 = qround14((int64_t)in[1] * COSPI_28_64
                                - (int64_t)in[7] * COSPI_4_64);
    const int32_t o5 = qround14((int64_t)in[5] * COSPI_12_64
                                - (int64_t)in[3] * COSPI_20_64);
    const int32_t o6 = qround14((int64_t)in[5] * COSPI_20_64
                                + (int64_t)in[3] * COSPI_12_64);
    const int32_t o7 = qround14((int64_t)in[1] * COSPI_4_64
                                + (int64_t)in[7] * COSPI_28_64);

    /* Second odd stage: the two differences go through one more
     * cos(pi/4) rotation, the two sums pass straight through. */
    const int32_t lo_sum  = o4 + o5;
    const int32_t lo_diff = o4 - o5;
    const int32_t hi_sum  = o7 + o6;
    const int32_t hi_diff = o7 - o6;
    const int32_t mid_lo = qround14((int64_t)(hi_diff - lo_diff) * COSPI_16_64);
    const int32_t mid_hi = qround14((int64_t)(hi_diff + lo_diff) * COSPI_16_64);

    /* Final butterfly: out[k] = even[k] + odd[k], out[7-k] = even[k] - odd[k]. */
    const int32_t even[4] = { e0 + e3, e1 + e2, e1 - e2, e0 - e3 };
    const int32_t odd[4]  = { hi_sum, mid_hi, mid_lo, lo_sum };

    for (int k = 0; k < 4; k++) {
        out[k]     = even[k] + odd[k];
        out[7 - k] = even[k] - odd[k];
    }
}
/* Public reference entry point: VP9 8x8 DCT_DCT inverse transform, with
 * the residual added into an 8-bit destination block.
 *
 * Signature matches ff_vp9_idct_idct_8x8_add_neon.
 *
 *   dst    - top-left pixel of the 8x8 destination area; updated in place
 *            with saturation to [0, 255].
 *   stride - distance in bytes between vertically adjacent pixels of dst.
 *   block  - 64 coefficients, indexed as block[r * 8 + c]. Cleared by this
 *            call: only block[0] when eob == 1, all 64 entries otherwise
 *            (matches FFmpeg behaviour).
 *   eob    - 1 selects the DC-only shortcut; any other value runs the
 *            full two-pass transform.
 */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
                                        int16_t *block, int eob)
{
    int32_t stage[64];
    int32_t vec_in[8];
    int32_t vec_out[8];

    /* DC-only shortcut: scale the DC coefficient by cos(pi/4) twice (one
     * Q14 rounding per pass), then add the same (+16) >> 5 residual to
     * every pixel. */
    if (eob == 1) {
        const int64_t pass1 = (int64_t)block[0] * COSPI_16_64;
        const int64_t pass2 = (int64_t)qround14(pass1) * COSPI_16_64;
        const int32_t bias  = (qround14(pass2) + 16) >> 5;

        block[0] = 0;
        for (int y = 0; y < 8; y++) {
            uint8_t *row = dst + y * stride;

            for (int x = 0; x < 8; x++)
                row[x] = clip_u8(row[x] + bias);
        }
        return;
    }

    /* First pass: transform column k of block and store the result as
     * row k of stage. This is FFmpeg's idct_idct_8x8_add_c layout, where
     * `tmp + i*8` is the output base of the first pass, so the transpose
     * is implicit and the second pass reads columns of stage. */
    for (int k = 0; k < 8; k++) {
        for (int j = 0; j < 8; j++)
            vec_in[j] = block[j * 8 + k];
        idct8_1d(vec_in, stage + k * 8);
    }

    memset(block, 0, 64 * sizeof(*block));

    /* Second pass: transform column k of stage, round with (+16) >> 5 and
     * accumulate into column k of dst (FFmpeg's
     * `dst[j*stride] = out[j]; dst++` pattern). */
    for (int k = 0; k < 8; k++) {
        uint8_t *column = dst + k;

        for (int j = 0; j < 8; j++)
            vec_in[j] = stage[j * 8 + k];
        idct8_1d(vec_in, vec_out);

        for (int j = 0; j < 8; j++) {
            const int32_t residual = (vec_out[j] + 16) >> 5;

            column[j * stride] = clip_u8(column[j * stride] + residual);
        }
    }
}