/*
 * Standalone bit-exact C reference for VP9 8x8 DCT_DCT inverse
 * transform + add (8-bit pixels), transcribed from the spec
 * structure as represented in FFmpeg's libavcodec/vp9dsp_template.c
 * (vendored under external/ffmpeg-snapshot/ at commit f46e514).
 *
 * Provided as a self-contained translation unit so the harness
 * doesn't need to wrestle FFmpeg's BIT_DEPTH-templated macro
 * expansion. Cross-checked against the vendored reference at
 * runtime (see bench_neon_idct.c::cross_check_vs_ffmpeg_c()).
 *
 * License: LGPL-2.1-or-later (matches the upstream reference).
 *
 * Spec source: VP9 specification §8.7 — Inverse transform process.
 */

#include <stddef.h>  /* ptrdiff_t */
#include <stdint.h>  /* int16_t, int32_t, int64_t, uint8_t */
#include <string.h>  /* memset */

/* Q14 trig constants — VP9 spec table 8.7.1.4. */
#define COSPI_16_64 11585 /* cos(pi/4)   * 2^14 */
#define COSPI_24_64  6270 /* cos(3pi/8)  * 2^14 */
#define COSPI_8_64  15137 /* sin(3pi/8)  * 2^14 */
#define COSPI_28_64  3196 /* cos(7pi/16) * 2^14 */
#define COSPI_4_64  16069 /* sin(7pi/16) * 2^14 */
#define COSPI_20_64  9102 /* cos(5pi/16) * 2^14 */
#define COSPI_12_64 13623 /* sin(5pi/16) * 2^14 */

/*
 * Q14 round-shift: (x + (1 << 13)) >> 14.
 *
 * The caller passes an already-widened 64-bit product so the rounding
 * add cannot overflow. The right shift of a negative value relies on
 * the compiler implementing it as an arithmetic shift (floor), which
 * is what the upstream reference assumes as well.
 */
static inline int32_t qround14(int64_t x)
{
    return (int32_t)((x + (1 << 13)) >> 14);
}

/* Saturate an integer to the 8-bit pixel range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x < 0) {
        return 0;
    }
    if (x > 255) {
        return 255;
    }
    return (uint8_t)x;
}

/*
 * 1-D 8-point inverse DCT, signed int32 throughout. Matches
 * idct8_1d in libavcodec/vp9dsp_template.c (with the stride
 * collapsed to indexed access; identical arithmetic).
 *
 * in  : 8 input coefficients.
 * out : 8 output samples; must not alias `in`.
 *
 * Sums and differences that feed a multiplication are widened to
 * 64 bits *before* the add/subtract so the whole multiply-accumulate
 * is carried out in int64_t, as qround14() expects.
 */
static void idct8_1d(const int32_t in[8], int32_t out[8])
{
    /* Even half: inputs 0, 4, 2, 6. */
    int32_t t0a = qround14(((int64_t)in[0] + in[4]) * COSPI_16_64);
    int32_t t1a = qround14(((int64_t)in[0] - in[4]) * COSPI_16_64);
    int32_t t2a = qround14((int64_t)in[2] * COSPI_24_64 -
                           (int64_t)in[6] * COSPI_8_64);
    int32_t t3a = qround14((int64_t)in[2] * COSPI_8_64 +
                           (int64_t)in[6] * COSPI_24_64);

    /* Odd half: inputs 1, 7, 5, 3. */
    int32_t t4a = qround14((int64_t)in[1] * COSPI_28_64 -
                           (int64_t)in[7] * COSPI_4_64);
    int32_t t5a = qround14((int64_t)in[5] * COSPI_12_64 -
                           (int64_t)in[3] * COSPI_20_64);
    int32_t t6a = qround14((int64_t)in[5] * COSPI_20_64 +
                           (int64_t)in[3] * COSPI_12_64);
    int32_t t7a = qround14((int64_t)in[1] * COSPI_4_64 +
                           (int64_t)in[7] * COSPI_28_64);

    /* First butterfly stage. */
    int32_t t0 = t0a + t3a;
    int32_t t1 = t1a + t2a;
    int32_t t2 = t1a - t2a;
    int32_t t3 = t0a - t3a;
    int32_t t4 = t4a + t5a;
    int32_t t5p = t4a - t5a;
    int32_t t7 = t7a + t6a;
    int32_t t6p = t7a - t6a;

    /* Rotate the middle odd pair by pi/4. */
    int32_t t5 = qround14(((int64_t)t6p - t5p) * COSPI_16_64);
    int32_t t6 = qround14(((int64_t)t6p + t5p) * COSPI_16_64);

    /* Final butterfly: outputs are mirrored around the centre. */
    out[0] = t0 + t7;
    out[1] = t1 + t6;
    out[2] = t2 + t5;
    out[3] = t3 + t4;
    out[4] = t3 - t4;
    out[5] = t2 - t5;
    out[6] = t1 - t6;
    out[7] = t0 - t7;
}

/*
 * Public reference entry point. Signature matches
 * ff_vp9_idct_idct_8x8_add_neon.
 *
 * dst    : top-left pixel of the 8x8 destination block; the
 *          reconstructed residual is added in place and saturated
 *          to [0, 255].
 * stride : distance in bytes between vertically adjacent pixels.
 * block  : 64 coefficients, read as block[r * 8 + i].
 * eob    : when equal to 1 the DC-only fast path is taken.
 *
 * Coefficient clearing (matches FFmpeg behaviour): the DC-only path
 * clears block[0] only; the full path clears all 64 entries.
 */
void daedalus_vp9_idct_idct_8x8_add_ref(uint8_t *dst, ptrdiff_t stride,
                                        int16_t *block, int eob)
{
    int32_t tmp[64];
    int32_t out[8];
    int32_t col[8];

    /* DC-only fast path: (((coef * 11585) Q14) * 11585) Q14, then
     * broadcast (+16) >> 5 added to every pixel. */
    if (eob == 1) {
        int32_t dc = qround14(
            qround14((int64_t)block[0] * COSPI_16_64) * (int64_t)COSPI_16_64);
        int32_t add = (dc + 16) >> 5;

        block[0] = 0;
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 8; c++) {
                dst[r * stride + c] = clip_u8(dst[r * stride + c] + add);
            }
        }
        return;
    }

    /* 8 column passes, transposed write: IDCT of block column i lands
     * in row i of tmp. This matches FFmpeg's idct_idct_8x8_add_c which
     * uses `tmp + i*8` as the column-pass output base — the transpose
     * is implicit in the offset pattern, making the row pass below
     * read columns of tmp and write columns of dst. */
    for (int i = 0; i < 8; i++) {
        for (int r = 0; r < 8; r++) {
            col[r] = block[r * 8 + i];
        }
        idct8_1d(col, out);
        for (int r = 0; r < 8; r++) {
            tmp[i * 8 + r] = out[r];
        }
    }

    /* All coefficients have been consumed; leave the block cleared. */
    memset(block, 0, 64 * sizeof(*block));

    /* 8 row passes: column i of tmp -> column i of dst (matches
     * FFmpeg's `dst[j*stride] = out[j]; dst++` pattern). The final
     * (+16) >> 5 removes the transform's fixed-point scaling. */
    for (int i = 0; i < 8; i++) {
        for (int r = 0; r < 8; r++) {
            col[r] = tmp[r * 8 + i];
        }
        idct8_1d(col, out);
        for (int r = 0; r < 8; r++) {
            dst[r * stride + i] =
                clip_u8(dst[r * stride + i] + ((out[r] + 16) >> 5));
        }
    }
}