/*
 * Standalone bit-exact C references for the four single-axis quarter-pel
 * luma positions (H.264 §8.4.2.2.1, "put" variants).  Each is a half-pel
 * lowpass clipped to u8, followed by an "l2" step: a rounded average of
 * that half-pel value with an integer-position source pixel.
 *
 *   mc10 ("a" pos, ¼ horiz): a = clip255(mc20(s)); dst = (a + s[r,c]   + 1) >> 1
 *   mc30 ("c" pos, ¾ horiz): a = clip255(mc20(s)); dst = (a + s[r,c+1] + 1) >> 1
 *   mc01 ("d" pos, ¼ vert ): a = clip255(mc02(s)); dst = (a + s[r,  c] + 1) >> 1
 *   mc03 ("n" pos, ¾ vert ): a = clip255(mc02(s)); dst = (a + s[r+1,c] + 1) >> 1
 *
 * Mirror FFmpeg's `ff_put_h264_qpel8_mc{10,30,01,03}_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * lines 587, 603, 611, 729 — each tail-calls the corresponding
 * lowpass_l2 helper).
 *
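 * Same single-stride convention as mc20/mc02 — dst and src share the
 * same stride; the src pointer passed in (there is no separate offset
 * argument here) points at row 0 col 0 of the output block.  The 6-tap
 * filter reads 2 samples before and 3 samples after each position along
 * the filtered axis, so that edge context must already be in-buffer.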
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/*
 * H.264 6-tap half-pel lowpass (1, -5, 20, 20, -5, 1) with +16 rounding,
 * division by 32 and clipping to u8.
 *
 * p points at the left/upper of the two centre samples (the first
 * weight-20 tap); step is the distance between consecutive taps (1 for a
 * horizontal filter, the row stride for a vertical one).  Samples
 * p[-2*step] .. p[3*step] are read, so all six must be addressable.
 */
static inline uint8_t hpel_6tap(const uint8_t *p, ptrdiff_t step)
{
    /* Worst-case range is [-2534, 10726], so plain int cannot overflow. */
    int sum = (p[-2 * step] + p[3 * step])
            - 5 * (p[-step] + p[2 * step])
            + 20 * (p[0] + p[step])
            + 16;

    /*
     * Right-shifting a negative signed value is implementation-defined in
     * C.  Every negative sum clips to 0 anyway, so settle that case before
     * the shift and keep the reference bit-exact on any conforming
     * compiler.
     */
    if (sum < 0)
        return 0;
    return (uint8_t) clip_u8(sum >> 5);
}

/*
 * Compute one horizontal half-pel pixel at (r, c) — same as mc20.
 * Reads s[r*stride + c-2] .. s[r*stride + c+3].
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    return hpel_6tap(s + (r * stride + c), 1);
}

/*
 * Compute one vertical half-pel pixel at (r, c) — same as mc02.
 * Reads s[(r-2)*stride + c] .. s[(r+3)*stride + c].
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    return hpel_6tap(s + (r * stride + c), stride);
}

/*
 * mc10 ("a" position, 1/4 horizontal), 8x8 "put".
 *
 * Each output pixel is the rounded average of the horizontal half-pel
 * value at (row, col) and the integer sample at (row, col).  dst and src
 * use the same stride.  Because of the 6-tap filter, columns -2 .. 10 of
 * every source row are read.
 */
void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_h(src, row, col, stride);
            int full = in[col];              /* left integer neighbour */

            out[col] = (uint8_t) ((half + full + 1) >> 1);
        }
    }
}

/*
 * mc30 ("c" position, 3/4 horizontal), 8x8 "put".
 *
 * Each output pixel is the rounded average of the horizontal half-pel
 * value at (row, col) and the integer sample one column to the right,
 * (row, col + 1).  dst and src use the same stride.  Because of the 6-tap
 * filter, columns -2 .. 10 of every source row are read.
 */
void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_h(src, row, col, stride);
            int full = in[col + 1];          /* right integer neighbour */

            out[col] = (uint8_t) ((half + full + 1) >> 1);
        }
    }
}

/*
 * mc01 ("d" position, 1/4 vertical), 8x8 "put".
 *
 * Each output pixel is the rounded average of the vertical half-pel value
 * at (row, col) and the integer sample at (row, col).  dst and src use
 * the same stride.  Because of the 6-tap filter, source rows -2 .. 10 are
 * read.
 */
void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *upper = src + row * stride;
        uint8_t       *out   = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_v(src, row, col, stride);
            int full = upper[col];           /* upper integer neighbour */

            out[col] = (uint8_t) ((half + full + 1) >> 1);
        }
    }
}

/*
 * mc03 ("n" position, 3/4 vertical), 8x8 "put".
 *
 * Each output pixel is the rounded average of the vertical half-pel value
 * at (row, col) and the integer sample one row below, (row + 1, col).
 * dst and src use the same stride.  Because of the 6-tap filter, source
 * rows -2 .. 10 are read.
 */
void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *lower = src + (row + 1) * stride;
        uint8_t       *out   = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_v(src, row, col, stride);
            int full = lower[col];           /* lower integer neighbour */

            out[col] = (uint8_t) ((half + full + 1) >> 1);
        }
    }
}