daedalus-fourier/tests/h264_qpel8_mc20_ref.c

/*
 * Standalone bit-exact C reference for H.264 luma qpel 8×8 mc20
 * (horizontal half-pel, "put" variant). 6-tap filter:
 *
 *   dst[r,c] = clip255( (s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
 *                       + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
 *                       + 16) >> 5 )
 *
 * Mirrors FFmpeg `ff_put_h264_qpel8_mc20_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * line 595, which tail-calls put_h264_qpel8_h_lowpass_neon).
 *
 * Signature:
 *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 *
 * Both dst and src use the SAME stride. src points at the
 * leftmost output column (col 0); output column c reads src
 * cols c-2..c+3, so each row touches src cols -2..+10.
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Saturate a signed intermediate value to the range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}

/*
 * Horizontal half-pel interpolation of one 8x8 block ("put": dst is
 * overwritten, not averaged).
 *
 *   dst    - receives 8 rows of 8 bytes; row r starts at dst + r * stride.
 *   src    - points at row 0 / column 0 of the input; for every row the
 *            bytes at columns -2..+10 are read.
 *   stride - byte distance between consecutive rows, shared by dst and src.
 *
 * Each output is the 6-tap sum (1, -5, 20, 20, -5, 1) over input columns
 * c-2..c+3, plus a rounding bias of 16, shifted right by 5 and clamped
 * to 0..255.
 */
void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const ptrdiff_t row_off = row * stride;
        const uint8_t *in = src + row_off;
        uint8_t *out = dst + row_off;

        for (int col = 0; col < 8; col++) {
            /* taps[0..5] correspond to input columns col-2 .. col+3. */
            const uint8_t *taps = in + (col - 2);

            /* The kernel is symmetric, so pair up taps with equal weight. */
            const int outer = (int) taps[0] + (int) taps[5];
            const int inner = (int) taps[1] + (int) taps[4];
            const int center = (int) taps[2] + (int) taps[3];

            const int acc = outer - 5 * inner + 20 * center + 16;

            /*
             * acc can be negative; the shift then relies on the usual
             * arithmetic right shift (implementation-defined in ISO C),
             * and the clamp turns any negative result into 0.
             */
            out[col] = (uint8_t) clip_u8(acc >> 5);
        }
    }
}