Files
daedalus-fourier/tests/vp9_idct8_ref.c
T
marfrit dcbbc77038 Path B pivot + Phase 0-3 closed with first baseline numbers
This is a from-scratch initial commit on a fresh .git. The original
scaffold commit (7510b56) and the earlier session's working-tree
docs were lost in a 2026-05-18 10:25 working-tree wipe; the corrupted
.git is preserved at .git-broken-2026-05-18/ (gitignored) for
forensic inspection.

Scope re-anchored from Path A (custom VPU firmware on VC7 scalar
cores; blocked by BCM2712 silicon-RoT mask-ROM signature check)
to Path B (QPU compute kernels via Mesa v3d / Vulkan compute or
direct DRM, on stock signed Pi 5 / CM5). See README.md and
docs/phase0.md for the substrate audit that closed Path A.

Phases closed:
  Phase 0 — substrate audit; Path A blocked, Path B open;
            codec-back-end-fits-QPU finding (docs/phase0.md)
  Phase 1 — first kernel locked (VP9 / AV1 8x8 inverse DCT) with
            publish-before-measure R = M2/M3 decision rules
            (docs/phase1.md)
  Phase 2 — reference impls mapped; FFmpeg n7.1.3 source vendored
            under external/ffmpeg-snapshot/ (PROVENANCE.md pins
            commit f46e514 + per-file SHA-256s) (docs/phase2.md)
  Phase 3 — real baseline measurements on hertz (docs/phase3.md):
              M1 bit-exact            100.0000 % (10000/10000)
              M3 NEON IDCT8 single    8.171 Mblock/s (122.4 ns/block)
              M5a empty Vulkan submit 22.66 us
              M5b 1-WG noop dispatch  55.60 us
              M5 delta                32.95 us/dispatch
            => per-dispatch cost (M5b, 55.60 us) is ~455x the per-NEON-block
               cost (M3, 122.4 ns); Phase 4 must batch at frame level or
               close to it.

Build harness in place: CMakeLists.txt + tests/{bench_neon_idct.c,
vp9_idct8_ref.c, bench_vulkan_dispatch.c, shaders/noop.comp} +
external/ffmpeg-snapshot/config.h shim (7 defines + EXTERN_ASM).
Builds clean on Debian Trixie aarch64 with cmake 3.31, ninja 1.12,
libvulkan-dev 1.4.309, glslang-tools 15.1.0. Vendored FFmpeg .S
assembles via the config.h shim.

Next: Phase 4 (plan first QPU IDCT kernel under the M5 batching
constraint) -> Phase 5 second-model review -> Phase 6 implement.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 11:30:12 +00:00

115 lines
4.5 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* Standalone bit-exact C reference for VP9 8×8 DCT_DCT inverse
* transform + add (8-bit pixels), transcribed from the spec
* structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
* (vendored under external/ffmpeg-snapshot/ at commit f46e514).
*
* Provided as a self-contained translation unit so the harness
* doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
* expansion. Cross-checked against the vendored reference at
* runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
*
* License: LGPL-2.1-or-later (matches the upstream reference).
*
* Spec source: VP9 specification §8.7 — Inverse transform process.
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
/* Q14 fixed-point trig constants (VP9 spec table 8.7.1.4).
 * COSPI_k_64 is cos(k * pi / 64) scaled by 2^14 and rounded. */
enum {
    COSPI_4_64  = 16069, /* cos(pi/16)  == sin(7pi/16) */
    COSPI_8_64  = 15137, /* cos(pi/8)   == sin(3pi/8)  */
    COSPI_12_64 = 13623, /* cos(3pi/16) == sin(5pi/16) */
    COSPI_16_64 = 11585, /* cos(pi/4)                  */
    COSPI_20_64 = 9102,  /* cos(5pi/16)                */
    COSPI_24_64 = 6270,  /* cos(3pi/8)                 */
    COSPI_28_64 = 3196   /* cos(7pi/16)                */
};
/* Q14 descale with rounding: add half an LSB (2^13), then shift
 * right by 14. The argument is 64-bit so callers can pass full
 * products without overflow; the result is narrowed to int32_t. */
static inline int32_t qround14(int64_t x)
{
    const int64_t half = (int64_t)1 << 13;
    const int64_t biased = x + half;

    return (int32_t)(biased >> 14);
}
/* Saturate a signed value to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0) {
        return 0;
    }
    if (x > 255) {
        return 255;
    }
    return (uint8_t)x;
}
/* 8-point 1-D inverse DCT on signed 32-bit values.
 *
 * Same arithmetic as idct8_1d in libavcodec/vp9dsp_template.c, with
 * the strided access collapsed to plain indexing. Every product is
 * formed in 64 bits and descaled through qround14().
 *
 * in  - eight input coefficients (all read before out[] is written)
 * out - eight output samples
 */
static void idct8_1d(const int32_t in[8], int32_t out[8])
{
    int32_t even[4];
    int32_t odd[4];

    /* Even half: rotate the (0,4) and (2,6) input pairs, then
     * butterfly the rotated values. */
    {
        const int32_t sum04 = qround14((int64_t)(in[0] + in[4]) * COSPI_16_64);
        const int32_t dif04 = qround14((int64_t)(in[0] - in[4]) * COSPI_16_64);
        const int32_t rot26_lo = qround14((int64_t)in[2] * COSPI_24_64
                                          - (int64_t)in[6] * COSPI_8_64);
        const int32_t rot26_hi = qround14((int64_t)in[2] * COSPI_8_64
                                          + (int64_t)in[6] * COSPI_24_64);

        even[0] = sum04 + rot26_hi;
        even[1] = dif04 + rot26_lo;
        even[2] = dif04 - rot26_lo;
        even[3] = sum04 - rot26_hi;
    }

    /* Odd half: rotate the (1,7) and (5,3) input pairs, butterfly,
     * then apply one more cos(pi/4) rotation to the two middle terms.
     * Stored so that odd[k] is the partner of even[k]. */
    {
        const int32_t rot17_lo = qround14((int64_t)in[1] * COSPI_28_64
                                          - (int64_t)in[7] * COSPI_4_64);
        const int32_t rot53_lo = qround14((int64_t)in[5] * COSPI_12_64
                                          - (int64_t)in[3] * COSPI_20_64);
        const int32_t rot53_hi = qround14((int64_t)in[5] * COSPI_20_64
                                          + (int64_t)in[3] * COSPI_12_64);
        const int32_t rot17_hi = qround14((int64_t)in[1] * COSPI_4_64
                                          + (int64_t)in[7] * COSPI_28_64);
        const int32_t mid_lo = rot17_lo - rot53_lo;
        const int32_t mid_hi = rot17_hi - rot53_hi;

        odd[0] = rot17_hi + rot53_hi;
        odd[1] = qround14((int64_t)(mid_hi + mid_lo) * COSPI_16_64);
        odd[2] = qround14((int64_t)(mid_hi - mid_lo) * COSPI_16_64);
        odd[3] = rot17_lo + rot53_lo;
    }

    /* Final butterfly: outputs k and 7-k are the sum and difference
     * of the matching even/odd terms. */
    for (int k = 0; k < 4; k++) {
        out[k]     = even[k] + odd[k];
        out[7 - k] = even[k] - odd[k];
    }
}
/* Reference VP9 8x8 DCT_DCT inverse transform + add for 8-bit pixels.
 * The signature matches ff_vp9_idct_idct_8x8_add_neon.
 *
 * dst    - top-left pixel of the 8x8 destination; the residual is
 *          added in place and each result is clamped to [0, 255]
 * stride - distance between vertically adjacent pixels, in bytes
 * block  - 64 coefficients; coefficient (r, i) is read from
 *          block[r * 8 + i]. Zeroed on return: all 64 entries on the
 *          full path, only block[0] on the DC-only path (matches
 *          FFmpeg behaviour).
 * eob    - a value of exactly 1 selects the DC-only shortcut; any
 *          other value runs the full two-pass transform
 *
 * NOTE(review): intermediates between the passes are held as int32_t
 * here; confirm against the vendored reference's intermediate width
 * if coefficients outside the conforming range must also match.
 */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
                                        int16_t *block, int eob)
{
    int32_t inter[64];
    int32_t vec_in[8];
    int32_t vec_out[8];

    /* DC-only shortcut: scale the single coefficient by cos(pi/4) in
     * Q14 twice (once per pass), then add the rounded (+16) >> 5
     * residual to every pixel of the block. */
    if (eob == 1) {
        const int32_t once = qround14((int64_t)block[0] * COSPI_16_64);
        const int32_t dc = qround14((int64_t)once * COSPI_16_64);
        const int32_t delta = (dc + 16) >> 5;

        block[0] = 0;
        for (int y = 0; y < 8; y++) {
            uint8_t *row = dst + y * stride;

            for (int x = 0; x < 8; x++) {
                row[x] = clip_u8(row[x] + delta);
            }
        }
        return;
    }

    /* First pass: transform column i of block and store the result
     * as row i of inter. This is the same layout FFmpeg's
     * idct_idct_8x8_add_c produces by using `tmp + i*8` as the output
     * base, so the transpose is implicit. */
    for (int i = 0; i < 8; i++) {
        for (int k = 0; k < 8; k++) {
            vec_in[k] = block[k * 8 + i];
        }
        idct8_1d(vec_in, inter + i * 8);
    }

    /* The coefficients are consumed; clear them for the caller. */
    memset(block, 0, 64 * sizeof block[0]);

    /* Second pass: transform column i of inter and add the rounded
     * result down column i of dst (FFmpeg's
     * `dst[j*stride] = out[j]; dst++` pattern). */
    for (int i = 0; i < 8; i++) {
        for (int k = 0; k < 8; k++) {
            vec_in[k] = inter[k * 8 + i];
        }
        idct8_1d(vec_in, vec_out);
        for (int y = 0; y < 8; y++) {
            uint8_t *px = &dst[y * stride + i];
            const int32_t residual = (vec_out[y] + 16) >> 5;

            *px = clip_u8(*px + residual);
        }
    }
}