db2205d0e3
M1: 10000/10000 bit-exact first try (column-major-block lesson from cycle 6 carried over cleanly). M3: 151.2 Mblock/s per core. Per-block 6.6 ns. 155x the 1080p30 floor (0.972 Mblock/s req'd). Phase-1 prediction of R7 = 0.5-0.9 YELLOW/GREEN was WRONG. H.264 IDCT 8x8 is dramatically lighter than VP9 IDCT 8x8 (18.5x faster NEON): VP9 IDCT 8x8: 122 ns/block (Q14 trig + COSPI multiplies) H.264 IDCT 8x8: 6.6 ns/block (pure integer butterfly + shifts) Phase 4 deferred via the cycle 6 lightweight-kernel rationale: NEON per-block << QPU dispatch floor; offload doesn't help. Phase 9 lesson updated: H.264 transforms (both 4x4 and 8x8) are NEON-trivial. Skip ALL H.264 transform cycles for QPU. Target compute-heavy H.264 kernels only (deblock = cycle 8 next; MC likely RED). Cycle 7 = 2nd consecutive "predicted GREEN, measured CPU-only" result. Forces a sharper view of which kernels QPU can actually help with: deblock and possibly some VP9 cases. - tests/h264_idct8_ref.c (column-major C ref) - tests/bench_neon_h264idct8.c (M1 + M3 bench) - CMakeLists.txt: cycle 7 bench wiring - docs/k7_h264idct8_phase3_and_4.md (closure) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
93 lines
2.7 KiB
C
93 lines
2.7 KiB
C
/*
 * Standalone bit-exact C reference for H.264 8x8 inverse integer
 * transform + add. Algorithm per H.264 spec §8.5.13.2 (8x8 IT).
 *
 * Mirrors FFmpeg `ff_h264_idct8_add_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S
 * line 267. Block is COLUMN-MAJOR (per cycle 6 Phase 9 lesson):
 *   block[c*8 + r] = coefficient at (row=r, col=c).
 *
 * Signature:
 *   void(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 *
 * Zeroes block after transform (per FFmpeg convention).
 *
 * License: LGPL-2.1-or-later.
 */
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
|
|
/* Saturate v to the unsigned 8-bit range: values below 0 become 0,
 * values above 255 become 255, everything else is returned unchanged. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
|
|
|
|
/*
 * 1D 8-element H.264 inverse-transform butterfly (H.264 §8.5.13.2).
 *
 * d : eight input coefficients d[0..7] (read only).
 * g : eight output values g[0..7].
 *
 * The transform runs in three stages (d -> e -> f -> g). All of d is
 * consumed in the first stage before anything is written to g.
 * No rounding offset and no final down-scaling is applied here; the
 * caller is responsible for both.
 *
 * The ">> 1" and ">> 2" terms are applied to values that may be negative,
 * so the result relies on the compiler implementing signed right shift as
 * an arithmetic shift (implementation-defined in C).
 */
static inline void h264_idct8_butterfly(const int d[8], int g[8])
{
    int e[8], f[8];

    /* Stage 1, even half: built from d[0], d[2], d[4], d[6]. */
    e[0] = d[0] + d[4];
    e[2] = d[0] - d[4];
    e[4] = (d[2] >> 1) - d[6];
    e[6] = d[2] + (d[6] >> 1);

    /* Stage 1, odd half: built from d[1], d[3], d[5], d[7]. */
    e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);
    e[3] =  d[1] + d[7] - d[3] - (d[3] >> 1);
    e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
    e[7] =  d[3] + d[5] + d[1] + (d[1] >> 1);

    /* Stage 2, even half. */
    f[0] = e[0] + e[6];
    f[2] = e[2] + e[4];
    f[4] = e[2] - e[4];
    f[6] = e[0] - e[6];

    /* Stage 2, odd half (quarter-weighted cross terms). */
    f[1] = e[1] + (e[7] >> 2);
    f[3] = e[3] + (e[5] >> 2);
    f[5] = (e[3] >> 2) - e[5];
    f[7] = e[7] - (e[1] >> 2);

    /* Stage 3: each even/odd pair yields outputs k (sum) and 7-k (difference). */
    g[0] = f[0] + f[7];
    g[7] = f[0] - f[7];
    g[1] = f[2] + f[5];
    g[6] = f[2] - f[5];
    g[2] = f[4] + f[3];
    g[5] = f[4] - f[3];
    g[3] = f[6] + f[1];
    g[4] = f[6] - f[1];
}
|
|
|
|
/*
 * H.264 8x8 inverse integer transform, added into an 8x8 pixel area.
 *
 * dst    : top-left pixel of the 8x8 destination area; the pixel at
 *          (row r, col c) is dst[r * stride + c]. Updated in place.
 * block  : 64 coefficients in COLUMN-MAJOR order, i.e. block[c*8 + r] is
 *          the coefficient at (row r, col c). Zeroed before returning.
 * stride : distance, in bytes, between the starts of consecutive dst rows.
 *
 * Steps:
 *   1. Run h264_idct8_butterfly() across each row of coefficients.
 *   2. Run it down each column of the intermediate result.
 *   3. Scale each result with (x + 32) >> 6, add it to the existing
 *      pixel and saturate to [0, 255] with clip_u8().
 *   4. Clear the coefficient block.
 */
void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    int tmp[8][8];   /* row-major result of the row pass: tmp[row][col] */
    int d[8];
    int g[8];

    /* Row pass first. For row r the 1D input is d[c] = block[c*8 + r],
     * because the coefficient block is stored column-major. */
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 8; c++) {
            d[c] = block[c * 8 + r];
        }
        h264_idct8_butterfly(d, g);
        for (int c = 0; c < 8; c++) {
            tmp[r][c] = g[c];
        }
    }

    /* Column pass next, on the row-major intermediate. Each column's
     * outputs are final, so they are rounded and merged into dst here. */
    for (int c = 0; c < 8; c++) {
        for (int r = 0; r < 8; r++) {
            d[r] = tmp[r][c];
        }
        h264_idct8_butterfly(d, g);
        for (int r = 0; r < 8; r++) {
            /* Round (+32) >> 6, add to dst, clip to u8. */
            int rounded = (g[r] + 32) >> 6;
            ptrdiff_t off = r * stride + c;
            dst[off] = (uint8_t) clip_u8(dst[off] + rounded);
        }
    }

    /* FFmpeg convention: zero the block after transform. */
    memset(block, 0, 64 * sizeof(int16_t));
}
|