/*
 * Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02
 * (vertical half-pel, "put" variant).  Mirror of mc20 with rows
 * and columns transposed.  6-tap filter applied vertically:
 *
 *   dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
 *                       + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
 *                       + 16) >> 5 )
 *
 * Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * line 678, which tail-calls put_h264_qpel8_v_lowpass_neon).
 *
 * Signature:
 *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 *
 * Both dst and src use the SAME stride.  src points at row 0 col 0
 * of the output block; for output row r the filter reads source rows
 * r-2..r+3, so the whole 8x8 block touches source rows -2..10
 * (2 rows of top context, 3 rows of bottom context).  Caller must
 * guarantee the source buffer has those rows available (FFmpeg's
 * edge-emulated buffer handles this at the frame boundary; matches
 * the contract documented for mc20).
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Clamp v to the unsigned 8-bit range: values below 0 become 0, values above 255 become 255. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/*
 * Vertical half-pel luma interpolation for one 8x8 block ("put" variant:
 * every output byte is overwritten with the filtered value).
 *
 *   dst    - output block; 8 rows of 8 bytes, consecutive rows `stride`
 *            bytes apart.
 *   src    - source pixel co-located with dst row 0 / col 0.  Rows -2..10
 *            (in units of `stride`) and columns 0..7 are read.
 *   stride - byte distance between consecutive rows, shared by src and dst.
 *
 * Each output is clip_u8((a - 5*b + 20*c + 16) >> 5), where a, b and c are
 * the sums of the outer, middle and inner pairs of the six vertical taps.
 *
 * NOTE: the rounded sum can be negative when it is shifted.  C leaves `>>`
 * on a negative int implementation-defined; with an arithmetic shift the
 * value stays negative and is then clamped to 0 by clip_u8.
 */
void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    enum { BLOCK_SIZE = 8 };

    for (int row = 0; row < BLOCK_SIZE; row++) {
        /* Row bases are invariant across the column loop. */
        const uint8_t *center_row = src + row * stride;
        uint8_t *out_row = dst + row * stride;

        for (int col = 0; col < BLOCK_SIZE; col++) {
            const uint8_t *p = center_row + col;

            /* The 6-tap kernel (1, -5, 20, 20, -5, 1) is symmetric, so
             * taps with equal weights are summed before being scaled. */
            int outer_pair  = p[-2 * stride] + p[3 * stride];
            int middle_pair = p[-1 * stride] + p[2 * stride];
            int inner_pair  = p[0] + p[stride];

            int rounded_sum = outer_pair - 5 * middle_pair + 20 * inner_pair + 16;

            out_row[col] = (uint8_t) clip_u8(rounded_sum >> 5);
        }
    }
}