/*
 * Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22
 * (2D half-pel, "put" variant). Cascade of horizontal 6-tap then
 * vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage
 * clip/round), final +512 >> 10 to scale back.
 *
 * Per H.264 §8.4.2.2.1, "j" position:
 *
 *   tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1]
 *              - 5*s[r,c+2] + s[r,c+3]            (16-bit signed)
 *
 *   dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
 *                       + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
 *                       + 512) >> 10)
 *
 * The tmp[] array spans rows r-2 .. r+3 around each output row, so
 * we need 13 intermediate rows (rows -2..+10 of the SOURCE
 * neighbourhood) for 8 output rows. Caller's src must have 2 rows
 * of top context + 3 rows of bottom context AND 2 cols of left +
 * 3 cols of right context (FFmpeg's edge-emulated buffer provides
 * this at the frame boundary; same contract as mc20).
 *
 * Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon).
 *
 * Signature:
 *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 *
 * Same single-stride convention as mc20/mc02.
 *
 * License: LGPL-2.1-or-later.
 */

#include <stddef.h>
#include <stdint.h>

enum {
    MC22_SIZE     = 8,              /* output block is 8x8 pixels            */
    MC22_TOP_CTX  = 2,              /* context rows/cols before the block    */
    MC22_TMP_ROWS = MC22_SIZE + 5,  /* 2 rows above + 8 + 3 rows below = 13  */
    MC22_ROUND    = 512,            /* rounding term for the final >> 10     */
    MC22_SHIFT    = 10              /* two cascaded filters, each gain 32    */
};

/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* H.264 6-tap half-pel kernel (1, -5, 20, 20, -5, 1) over six consecutive samples. */
static inline int mc22_lowpass6(int a, int b, int c, int d, int e, int f)
{
    return a - 5 * b + 20 * c + 20 * d - 5 * e + f;
}

/*
 * Compute the 8x8 "j" (centre half-pel) prediction block.
 *
 * dst    - receives 8 rows of 8 bytes; row r starts at dst + r * stride.
 * src    - points at the sample aligned with dst[0]; the function reads
 *          rows -2..+10 and columns -2..+10 relative to it.
 * stride - byte distance between consecutive rows, used for BOTH src
 *          and dst.
 */
void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    /*
     * Horizontal pass output. tmp[i] holds the filtered source row
     * (i - 2), so tmp[0..12] corresponds to source rows [-2..+10].
     * With 8-bit inputs the kernel result lies in [-2550, 10710],
     * which always fits in int16_t.
     */
    int16_t tmp[MC22_TMP_ROWS][MC22_SIZE];

    for (int i = 0; i < MC22_TMP_ROWS; i++) {
        const uint8_t *row = src + (i - MC22_TOP_CTX) * stride;

        for (int x = 0; x < MC22_SIZE; x++) {
            tmp[i][x] = (int16_t) mc22_lowpass6(row[x - 2], row[x - 1], row[x],
                                                row[x + 1], row[x + 2], row[x + 3]);
        }
    }

    /*
     * Vertical pass. Output row y needs intermediate rows y-2..y+3 in
     * output coordinates, i.e. tmp[y..y+5] after the +2 row offset.
     */
    for (int y = 0; y < MC22_SIZE; y++) {
        for (int x = 0; x < MC22_SIZE; x++) {
            int acc = mc22_lowpass6(tmp[y][x],     tmp[y + 1][x], tmp[y + 2][x],
                                    tmp[y + 3][x], tmp[y + 4][x], tmp[y + 5][x])
                    + MC22_ROUND;

            /*
             * Right-shifting a negative int is implementation-defined in
             * C, and every negative sum clips to 0 anyway, so clamp
             * before shifting instead of relying on an arithmetic shift.
             */
            dst[y * stride + x] =
                (uint8_t) (acc < 0 ? 0 : clip_u8(acc >> MC22_SHIFT));
        }
    }
}