/*
 * Standalone bit-exact C reference for VP9 8-tap "regular" subpel
 * filter, horizontal direction, 8-pixel-wide output. Transcribed
 * from FFmpeg's libavcodec/vp9dsp_template.c FILTER_8TAP macro
 * (vendored at external/ffmpeg-snapshot/). 8-bit pixels only.
 *
 * Filter coefficients embedded inline (REGULAR filter only, all 16
 * subpel phases). Same values as ff_vp9_subpel_filters[1][mx] in
 * external/ffmpeg-snapshot/libavcodec/vp9_subpel_filters_table.c.
 *
 * License: LGPL-2.1-or-later.
 *
 * Spec source: VP9 specification §8.5.1 — subpel motion compensation.
 */
#include <stdint.h>
#include <stddef.h>

/*
 * 8-tap filter kernels, one row per 1/16 subpel phase (row index = phase).
 *
 * Every row sums to 128, so a filtered sum is normalised with
 * (sum + 64) >> 7. Row 0 is the pass-through kernel (single tap of 128),
 * row 8 is the symmetric half-sample kernel, and row k is row (16 - k)
 * with its taps reversed.
 */
static const int16_t vp9_8tap_regular_filters[16][8] = {
    {  0,  0,   0, 128,   0,   0,  0,  0 },   /* phase 0: identity */
    {  0,  1,  -5, 126,   8,  -3,  1,  0 },
    { -1,  3, -10, 122,  18,  -6,  2,  0 },
    { -1,  4, -13, 118,  27,  -9,  3, -1 },
    { -1,  4, -16, 112,  37, -11,  4, -1 },
    { -1,  5, -18, 105,  48, -14,  4, -1 },
    { -1,  5, -19,  97,  58, -16,  5, -1 },
    { -1,  6, -19,  88,  68, -18,  5, -1 },
    { -1,  6, -19,  78,  78, -19,  6, -1 },   /* phase 8: half-sample, symmetric */
    { -1,  5, -18,  68,  88, -19,  6, -1 },
    { -1,  5, -16,  58,  97, -19,  5, -1 },
    { -1,  4, -14,  48, 105, -18,  5, -1 },
    { -1,  4, -11,  37, 112, -16,  4, -1 },
    { -1,  3,  -9,  27, 118, -13,  4, -1 },
    {  0,  2,  -6,  18, 122, -10,  3, -1 },
    {  0,  1,  -3,   8, 126,  -5,  1,  0 },
};

/* Saturate a signed integer to the unsigned 8-bit range [0, 255]. */
static inline uint8_t clip_u8(int x)
{
    if (x > 255) {
        return 255;
    }
    if (x < 0) {
        return 0;
    }
    return (uint8_t) x;
}

/*
 * Horizontal 8-tap "put" (non-averaging) filter producing an 8-pixel-wide,
 * h-row output block. The output width is fixed at 8.
 *
 *   dst, dst_stride  destination; bytes dst[r*dst_stride + 0..7] are
 *                    written for r = 0..h-1.
 *   src, src_stride  source; src points at the pixel aligned with output
 *                    row 0, column 0. Readable memory must cover
 *                    src[r*src_stride + (-3..+11)] for r = 0..h-1.
 *   h                number of rows to process (no rows if h <= 0).
 *   mx               horizontal subpel phase; only the low 4 bits are used.
 *   my               vertical subpel phase; unused by this horizontal filter.
 *
 * Each output pixel is clip_u8((sum of 8 taps * pixels + 64) >> 7).
 * The tap sum can be negative; right-shifting a negative int is
 * implementation-defined in C, and this code relies on it being an
 * arithmetic shift.
 *
 * Matches ff_vp9_put_regular8_h_neon byte-for-byte on 8-bit input.
 */
void daedalus_vp9_put_regular_8h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                     const uint8_t *src, ptrdiff_t src_stride,
                                     int h, int mx, int my)
{
    enum { OUT_WIDTH = 8, NUM_TAPS = 8, ROUND_BIAS = 64, NORM_SHIFT = 7 };

    const int16_t *taps = vp9_8tap_regular_filters[mx & 15];
    const uint8_t *in_row = src;
    uint8_t *out_row = dst;

    (void) my;   /* y phase has no effect on a horizontal-only filter */

    for (int rows_left = h; rows_left > 0; rows_left--) {
        for (int col = 0; col < OUT_WIDTH; col++) {
            /* Tap k is applied to the source pixel at column col - 3 + k. */
            const uint8_t *window = in_row + (col - 3);
            int acc = ROUND_BIAS;   /* seed with the rounding term */

            for (int k = 0; k < NUM_TAPS; k++) {
                acc += taps[k] * (int) window[k];
            }
            out_row[col] = clip_u8(acc >> NORM_SHIFT);
        }
        out_row += dst_stride;
        in_row += src_stride;
    }
}