/*
 * Standalone bit-exact C reference for H.264 4x4 inverse integer
 * transform + add. Algorithm per H.264 spec §8.5.12.1 (4x4 IT for
 * blocks coded with TransformBypassFlag = 0).
 *
 * Mirrors FFmpeg `ff_h264_idct_add_neon` in
 *   external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S
 * (n7.1.3 pin). Destructively zeroes `block` to match upstream
 * convention (post-call block must be zero for the H.264 conformance
 * residual loop).
 *
 * Signature mirrors the NEON convention:
 *   void(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream the algorithm
 * was transcribed from). Spec is H.264 ITU-T Rec H.264 / ISO/IEC
 * 14496-10.
 */
#include <stddef.h>  /* ptrdiff_t */
#include <stdint.h>  /* uint8_t, int16_t */
#include <string.h>  /* memset */

/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/*
 * One 4-point 1D inverse-transform butterfly.
 *
 * d[0..3] are the inputs, e/f/g/h the intermediates, h_c[0..3] the
 * outputs:
 *   e = d0 + d2          f = d0 - d2
 *   g = (d1 >> 1) - d3   h = d1 + (d3 >> 1)
 *   h_c = { e + h, f + g, f - g, e - h }
 *
 * The halving uses `>>` on signed ints, so negative odd values round
 * toward minus infinity on compilers with arithmetic right shift (the
 * result is implementation-defined in ISO C).
 */
static inline void h264_idct4_butterfly(const int d[4], int h_c[4])
{
    const int e = d[0] + d[2];
    const int f = d[0] - d[2];
    const int g = (d[1] >> 1) - d[3];
    const int h = d[1] + (d[3] >> 1);

    h_c[0] = e + h;
    h_c[1] = f + g;
    h_c[2] = f - g;
    h_c[3] = e - h;
}

/*
 * 4x4 inverse transform of `block`, rounded, added to `dst`, clipped to
 * 8 bits.
 *
 * dst    - top-left pixel of the 4x4 destination; row-major, i.e. the
 *          pixel at (row r, col c) is dst[r * stride + c]. Only those
 *          16 bytes are read and written.
 * block  - 16 coefficients, read COLUMN-MAJOR: block[c * 4 + r] is the
 *          coefficient at row r, column c. All 16 entries are set to
 *          zero before returning.
 * stride - distance in bytes between consecutive rows of dst.
 *
 * Neither pointer is checked for NULL.
 *
 * Pass order: the first pass runs the butterfly over
 * block[r], block[4 + r], block[8 + r], block[12 + r] for each r (the
 * four columns of one row, given the column-major layout), the second
 * pass runs it down each column of the intermediate. JM and the FFmpeg
 * C reference both do row-first then column-pass.
 * NOTE(review): the upstream NEON register/lane mapping is not visible
 * from this file; confirm the correspondence against the pinned
 * assembly.
 *
 * Intermediates are held in `int`; nothing is truncated to 16 bits
 * between the passes.
 */
void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    int tmp[4][4];      /* after the row pass, row-major */
    int col_out[4][4];  /* after the column pass, row-major */

    /* Row pass FIRST. Read block as column-major (block[c*4 + r]). */
    for (int r = 0; r < 4; r++) {
        const int d[4] = {
            block[0 * 4 + r],
            block[1 * 4 + r],
            block[2 * 4 + r],
            block[3 * 4 + r],
        };
        int h_c[4];

        h264_idct4_butterfly(d, h_c);
        for (int c = 0; c < 4; c++) {
            tmp[r][c] = h_c[c];
        }
    }

    /* Column pass NEXT (on row-major tmp). */
    for (int c = 0; c < 4; c++) {
        const int d[4] = { tmp[0][c], tmp[1][c], tmp[2][c], tmp[3][c] };
        int h_c[4];

        h264_idct4_butterfly(d, h_c);
        for (int r = 0; r < 4; r++) {
            col_out[r][c] = h_c[r];
        }
    }

    /* Round (+32) >> 6, add to dst, clip to u8. */
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 4; c++) {
            const int rounded = (col_out[r][c] + 32) >> 6;

            dst[r * stride + c] =
                (uint8_t) clip_u8(dst[r * stride + c] + rounded);
        }
    }

    /* FFmpeg convention: zero the block after the transform. */
    memset(block, 0, 16 * sizeof(int16_t));
}