/*
 * Standalone bit-exact C references for the 8 diagonal H.264 luma
 * qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33).
 * Each is the rounded average of two half-pel intermediates per
 * H.264 §8.4.2.2.1 (sample letters as in Figure 8-4 / Table 8-12),
 * decomposed to match the FFmpeg .S reference structure (see comments
 * in mc{11,12,21,...}_neon in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S).
 *
 * Naming is mcXY: X is the horizontal and Y the vertical quarter-pel
 * offset. Position decompositions (verified against the .S), listed
 * as (sample letter, horizontal fraction, vertical fraction):
 *   mc11 (e, x=¼ y=¼): avg(mc20[r,c],   mc02[r,c])
 *   mc12 (i, x=¼ y=½): avg(mc22[r,c],   mc02[r,c])
 *   mc13 (p, x=¼ y=¾): avg(mc20[r+1,c], mc02[r,c])
 *   mc21 (f, x=½ y=¼): avg(mc22[r,c],   mc20[r,c])
 *   mc23 (q, x=½ y=¾): avg(mc22[r,c],   mc20[r+1,c])
 *   mc31 (g, x=¾ y=¼): avg(mc20[r,c],   mc02[r,c+1])
 *   mc32 (k, x=¾ y=½): avg(mc22[r,c],   mc02[r,c+1])
 *   mc33 (r, x=¾ y=¾): avg(mc20[r+1,c], mc02[r,c+1])
 *
 * (The mc20[r,c] notation means "the mc20-style horizontal half-pel
 * result at source-relative integer position (r, c)"; analogously
 * for mc02 (vertical half-pel) and mc22 (2D half-pel).)
 *
 * Single-stride convention: dst and src are addressed with the same
 * stride. Same edge-context contract as the simpler variants, except
 * that the cells "[r+1,c]" / "[r,c+1]" demand one extra row/col of
 * source context beyond what mc20/mc02 alone would need. Taken over
 * all eight functions, the samples read lie in rows -2..+10 and
 * columns -2..+10 relative to src.
 *
 * License: LGPL-2.1-or-later.
 */

#include <stddef.h>
#include <stdint.h>

/* Clamp an int to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/*
 * Scale an already-biased filter accumulator down by `shift` bits and
 * clamp to 8 bits. Negative accumulators are clamped to 0 *before* the
 * shift: right-shifting a negative signed value is implementation-
 * defined in C, and any negative floor(acc / 2^shift) clips to 0
 * anyway, so the result is unchanged on arithmetic-shift targets.
 */
static inline uint8_t round_clip_u8(int acc, int shift)
{
    if (acc < 0) {
        return 0;
    }
    return (uint8_t) clip_u8(acc >> shift);
}

/*
 * Unscaled H.264 6-tap kernel (1, -5, 20, 20, -5, 1) applied to the
 * samples p[-2*step] .. p[3*step]. step == 1 filters along a row,
 * step == stride filters along a column.
 */
static inline int tap6(const uint8_t *p, ptrdiff_t step)
{
    return       (int) p[-2 * step]
          -  5 * (int) p[-step]
          + 20 * (int) p[0]
          + 20 * (int) p[step]
          -  5 * (int) p[2 * step]
          +      (int) p[3 * step];
}

/*
 * Single-cell helpers — same arithmetic as the dedicated mc20/mc02
 * refs but computed point-by-point so the diagonal refs can mix them
 * cheaply. Each returns a u8 (already clipped).
 */

/* Horizontal half-pel at (r, c): reads columns c-2..c+3 of row r. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    return round_clip_u8(tap6(s + r * stride + c, 1) + 16, 5);
}

/* Vertical half-pel at (r, c): reads rows r-2..r+3 of column c. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    return round_clip_u8(tap6(s + r * stride + c, stride) + 16, 5);
}

/*
 * hpel_hv — 2D half-pel at (r, c) per the H.264 §8.4.2.2.1 "j"
 * cascade. First forms six unrounded horizontal 6-tap sums, one for
 * each of rows r-2..r+3 (each over columns c-2..c+3 of that row), then
 * runs the same 6-tap kernel vertically over those six intermediates
 * and applies the final (+512) >> 10 scale. Same as the mc22 ref, just
 * expressed point-by-point.
 */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    int t[6]; /* horizontal sums for rows r-2..r+3, column c */

    for (int i = 0; i < 6; i++) {
        t[i] = tap6(s + (r - 2 + i) * stride + c, 1);
    }

    int v = t[0] - 5 * t[1] + 20 * t[2] + 20 * t[3] - 5 * t[4] + t[5] + 512;
    return round_clip_u8(v, 10);
}

/*
 * Rounded average (a + b + 1) >> 1. Both inputs are 8-bit, so the
 * result is at most 255 and needs no further clip.
 */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    return (uint8_t) ((a + b + 1) >> 1);
}

/*
 * Emit daedalus_put_h264_qpel8_<NAME>_ref(dst, src, stride): for every
 * cell (r, c) of an 8x8 block, write avg2(A_EXPR, B_EXPR) to
 * dst[r*stride + c]. A_EXPR / B_EXPR are evaluated once per cell and
 * may refer to src, stride, r and c.
 */
#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR)                              \
void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,                \
                                              const uint8_t *src,          \
                                              ptrdiff_t stride)            \
{                                                                          \
    for (int r = 0; r < 8; r++) {                                          \
        for (int c = 0; c < 8; c++) {                                      \
            uint8_t a = (A_EXPR);                                          \
            uint8_t b = (B_EXPR);                                          \
            dst[r * stride + c] = avg2(a, b);                              \
        }                                                                  \
    }                                                                      \
}

DEFINE_DIAG_REF(mc11, hpel_h(src, r, c, stride),     hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc12, hpel_hv(src, r, c, stride),    hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc13, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc21, hpel_hv(src, r, c, stride),    hpel_h(src, r, c, stride))
DEFINE_DIAG_REF(mc23, hpel_hv(src, r, c, stride),    hpel_h(src, r + 1, c, stride))
DEFINE_DIAG_REF(mc31, hpel_h(src, r, c, stride),     hpel_v(src, r, c + 1, stride))
DEFINE_DIAG_REF(mc32, hpel_hv(src, r, c, stride),    hpel_v(src, r, c + 1, stride))
DEFINE_DIAG_REF(mc33, hpel_h(src, r + 1, c, stride), hpel_v(src, r, c + 1, stride))

#undef DEFINE_DIAG_REF