Path B pivot + Phase 0-3 closed with first baseline numbers
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.
Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.
Phases closed:
Phase 0 — substrate audit; Path A blocked, Path B open;
codec-back-end-fits-QPU finding (docs/phase0.md)
Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
publish-before-measure R = M2/M3 decision rules
(docs/phase1.md)
Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
under external/ffmpeg-snapshot/ (PROVENANCE.md pins
commit f46e514 + per-file SHA-256s) (docs/phase2.md)
Phase 3 — real baseline measurements on hertz (docs/phase3.md):
M1 bit-exact 100.0000 % (10000/10000)
M3 NEON IDCT8 single 8.171 Mblock/s (122.4 ns/block)
M5a empty Vulkan submit 22.66 us
M5b 1-WG noop dispatch 55.60 us
M5 delta 32.95 us/dispatch
=> per-dispatch overhead is ~455x per-NEON-block cost;
Phase 4 must batch at frame level or close to it.
Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.
Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,114 @@
|
||||
/*
 * Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
 * transform + add (8-bit pixels), transcribed from the spec
 * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
 * (vendored under external/ffmpeg-snapshot/ at commit f46e514).
 *
 * Provided as a self-contained translation unit so the harness
 * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
 * expansion. Cross-checked against the vendored reference at
 * runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
 *
 * License: LGPL-2.1-or-later (matches the upstream reference).
 *
 * Spec source: VP9 specification §8.7 — Inverse transform process.
 */
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Q14 trig constants — VP9 spec table 8.7.1.4.
 *
 * Naming: COSPI_k_64 == round(cos(k * pi / 64) * 2^14).  The sin() forms
 * in the per-line comments are the same numbers via sin(x) = cos(pi/2 - x);
 * they are written that way because that is the role the constant plays
 * in each rotation pair of the 8-point butterfly. */
#define COSPI_16_64 11585 /* cos(pi/4)   * 2^14 */
#define COSPI_24_64  6270 /* cos(3pi/8)  * 2^14 */
#define COSPI_8_64  15137 /* sin(3pi/8)  * 2^14 */
#define COSPI_28_64  3196 /* cos(7pi/16) * 2^14 */
#define COSPI_4_64  16069 /* sin(7pi/16) * 2^14 */
#define COSPI_20_64  9102 /* cos(5pi/16) * 2^14 */
#define COSPI_12_64 13623 /* sin(5pi/16) * 2^14 */
|
||||
|
||||
/* Q14 round-shift: (x + (1 << 13)) >> 14.
 *
 * x is taken as int64_t so a caller can hand in a full 32x32-bit product
 * (or the sum/difference of two such products) without it overflowing
 * before the shift; the shifted value is then narrowed to int32_t.
 *
 * Right-shifting a negative signed value is implementation-defined in C;
 * this helper assumes an arithmetic (sign-propagating) shift. */
static inline int32_t qround14(int64_t x)
{
    const int64_t half = (int64_t)1 << 13;

    return (int32_t)((x + half) >> 14);
}
|
||||
|
||||
/* Saturate a signed value to the 8-bit pixel range: anything below 0
 * becomes 0, anything above 255 becomes 255, everything else passes
 * through unchanged. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0) {
        return 0;
    }
    if (x > 255) {
        return 255;
    }
    return (uint8_t)x;
}
|
||||
|
||||
/* 1-D 8-point inverse DCT, signed int32 throughout. Matches
|
||||
* idct8_1d in libavcodec/vp9dsp_template.c (with the stride
|
||||
* collapsed to indexed access; identical arithmetic). */
|
||||
static void idct8_1d(const int32_t in[8], int32_t out[8])
|
||||
{
|
||||
int32_t t0a = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64);
|
||||
int32_t t1a = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64);
|
||||
int32_t t2a = qround14((int64_t)in[2] * COSPI_24_64 - (int64_t)in[6] * COSPI_8_64);
|
||||
int32_t t3a = qround14((int64_t)in[2] * COSPI_8_64 + (int64_t)in[6] * COSPI_24_64);
|
||||
int32_t t4a = qround14((int64_t)in[1] * COSPI_28_64 - (int64_t)in[7] * COSPI_4_64);
|
||||
int32_t t5a = qround14((int64_t)in[5] * COSPI_12_64 - (int64_t)in[3] * COSPI_20_64);
|
||||
int32_t t6a = qround14((int64_t)in[5] * COSPI_20_64 + (int64_t)in[3] * COSPI_12_64);
|
||||
int32_t t7a = qround14((int64_t)in[1] * COSPI_4_64 + (int64_t)in[7] * COSPI_28_64);
|
||||
|
||||
int32_t t0 = t0a + t3a, t1 = t1a + t2a;
|
||||
int32_t t2 = t1a - t2a, t3 = t0a - t3a;
|
||||
int32_t t4 = t4a + t5a;
|
||||
int32_t t5p = t4a - t5a;
|
||||
int32_t t7 = t7a + t6a;
|
||||
int32_t t6p = t7a - t6a;
|
||||
|
||||
int32_t t5 = qround14((int64_t)(t6p - t5p) * COSPI_16_64);
|
||||
int32_t t6 = qround14((int64_t)(t6p + t5p) * COSPI_16_64);
|
||||
|
||||
out[0] = t0 + t7; out[1] = t1 + t6;
|
||||
out[2] = t2 + t5; out[3] = t3 + t4;
|
||||
out[4] = t3 - t4; out[5] = t2 - t5;
|
||||
out[6] = t1 - t6; out[7] = t0 - t7;
|
||||
}
|
||||
|
||||
/* Public reference entry point. Signature matches
|
||||
* ff_vp9_idct_idct_8x8_add_neon. After the call, *block is
|
||||
* zeroed (matches FFmpeg behaviour). */
|
||||
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
|
||||
int16_t *block, int eob)
|
||||
{
|
||||
int32_t tmp[64];
|
||||
int32_t out[8];
|
||||
int32_t col[8];
|
||||
|
||||
/* DC-only fast path: (((coef * 11585) Q14) * 11585) Q14, then
|
||||
* broadcast (+16) >> 5 added to every pixel. */
|
||||
if (eob == 1) {
|
||||
int32_t dc = qround14(qround14((int64_t)block[0] * COSPI_16_64)
|
||||
* (int64_t) COSPI_16_64);
|
||||
block[0] = 0;
|
||||
int32_t add = (dc + 16) >> 5;
|
||||
for (int r = 0; r < 8; r++)
|
||||
for (int c = 0; c < 8; c++)
|
||||
dst[r * stride + c] = clip_u8(dst[r * stride + c] + add);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 8 column passes, transposed write: IDCT of block column i lands
|
||||
* in row i of tmp. This matches FFmpeg's idct_idct_8x8_add_c which
|
||||
* uses `tmp + i*8` as the column-pass output base — the transpose
|
||||
* is implicit in the offset pattern, making the row pass below
|
||||
* read columns of tmp and write columns of dst. */
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int r = 0; r < 8; r++) col[r] = block[r * 8 + i];
|
||||
idct8_1d(col, out);
|
||||
for (int r = 0; r < 8; r++) tmp[i * 8 + r] = out[r];
|
||||
}
|
||||
memset(block, 0, 64 * sizeof(*block));
|
||||
|
||||
/* 8 row passes: column i of tmp -> column i of dst (matches
|
||||
* FFmpeg's `dst[j*stride] = out[j]; dst++` pattern). */
|
||||
for (int i = 0; i < 8; i++) {
|
||||
for (int r = 0; r < 8; r++) col[r] = tmp[r * 8 + i];
|
||||
idct8_1d(col, out);
|
||||
for (int r = 0; r < 8; r++)
|
||||
dst[r * stride + i] = clip_u8(dst[r * stride + i]
|
||||
+ ((out[r] + 16) >> 5));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user