/*
 * Standalone bit-exact C references for the avg_ qpel anchors —
 * the biprediction "average against existing dst" form of mc20,
 * mc02, mc22.  Used in B-slices where two qpel-interpolated samples
 * (one from list0, one from list1) are averaged per H.264 §8.4.2.3.
 *
 * Each kernel computes the same half-pel formula as the put_ form,
 * then averages with dst[r,c] via L2 ((dst + put_val + 1) >> 1).
 * The dst buffer carries the list0 prediction on entry; the avg_
 * call adds the list1 contribution.
 *
 * Mirrors FFmpeg's `ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * (same `\type=avg` expansion as the put_ functions).
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Saturate a signed intermediate to the 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
/* Mean of two 8-bit samples with ties rounded up: (a + b + 1) >> 1.
 * The sum is at most 511, so the result always fits in 8 bits. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    const unsigned sum = (unsigned) a + (unsigned) b + 1u;
    return (uint8_t) (sum / 2u);
}

/* Same per-cell helpers as the diag/quarter-axis refs.  Duplicated
 * here (rather than extern'd) so this TU compiles standalone. */
/*
 * Horizontal half-pel sample for cell (r, c).
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns
 * c-2 .. c+3 of row r, adds the rounding term 16, shifts right by 5
 * and saturates to [0, 255].  The caller must make two samples to the
 * left and three to the right of column c readable.
 *
 * A negative sum is shifted before it is clipped; this assumes `>>` on
 * a negative int is an arithmetic shift (implementation-defined in C).
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    /* Centre the tap window on sample (r, c); taps are p[-2] .. p[3]. */
    const uint8_t *p = s + (r * stride + c);

    /* The kernel is symmetric, so fold matching taps together. */
    const int outer = (int) p[-2] + (int) p[3];
    const int inner = (int) p[-1] + (int) p[2];
    const int centre = (int) p[0] + (int) p[1];

    const int acc = outer - 5 * inner + 20 * centre + 16;
    return (uint8_t) clip_u8(acc >> 5);
}
/*
 * Vertical half-pel sample for cell (r, c).
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2 .. r+3
 * of column c, adds the rounding term 16, shifts right by 5 and
 * saturates to [0, 255].  The caller must make two rows above and
 * three rows below row r readable.
 *
 * A negative sum is shifted before it is clipped; this assumes `>>` on
 * a negative int is an arithmetic shift (implementation-defined in C).
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };

    int acc = 16; /* rounding term for the final >> 5 */
    for (int k = 0; k < 6; k++) {
        /* Tap k reads row r - 2 + k. */
        acc += taps[k] * (int) s[(r - 2 + k) * stride + c];
    }
    return (uint8_t) clip_u8(acc >> 5);
}

/*
 * avg_ mc20 reference for an 8x8 block: horizontal half-pel prediction
 * averaged into dst.
 *
 * For each of the 64 cells, computes hpel_h() from src and replaces
 * dst[r*stride + c] with the round-up average of its previous value
 * and that prediction.  dst and src share the same stride.
 */
void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            const uint8_t pred = hpel_h(src, row, col, stride);
            out[col] = avg2(out[col], pred);
        }
    }
}

/*
 * avg_ mc02 reference for an 8x8 block: vertical half-pel prediction
 * averaged into dst.
 *
 * For each of the 64 cells, computes hpel_v() from src and replaces
 * dst[r*stride + c] with the round-up average of its previous value
 * and that prediction.  dst and src share the same stride.
 */
void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    /* Walk the 64 cells in raster order with a single flat index. */
    for (int cell = 0; cell < 8 * 8; cell++) {
        const int row = cell / 8;
        const int col = cell % 8;
        uint8_t *px = &dst[row * stride + col];

        const uint8_t pred = hpel_v(src, row, col, stride);
        *px = avg2(*px, pred);
    }
}

/*
 * avg_ mc22 reference for an 8x8 block: centre (horizontal + vertical)
 * half-pel prediction averaged into dst.
 *
 * Pass 1 runs the horizontal 6-tap kernel (1, -5, 20, 20, -5, 1)
 * without rounding or shifting over source rows -2 .. 10, keeping the
 * results as 16-bit intermediates.  Pass 2 runs the same kernel
 * vertically over those intermediates, adds 512, shifts right by 10,
 * saturates to [0, 255] and stores the round-up average of that value
 * and the existing dst sample.
 *
 * All src reads finish before the first dst write.  src must be
 * readable from row -2 to row 10 and from column -2 to column 10.
 */
void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    enum {
        BLK = 8,                /* output block is BLK x BLK        */
        TAPS = 6,               /* filter length                    */
        ROWS = BLK + TAPS - 1   /* 13 intermediate rows are needed  */
    };
    int16_t hfilt[ROWS][BLK];

    /* Pass 1: unrounded horizontal filter.  With 8-bit inputs the sum
     * lies in [-2550, 10710], so it fits in int16_t. */
    for (int j = 0; j < ROWS; j++) {
        const uint8_t *line = src + (j - 2) * stride;

        for (int x = 0; x < BLK; x++) {
            const uint8_t *p = line + x;
            const int outer = (int) p[-2] + (int) p[3];
            const int inner = (int) p[-1] + (int) p[2];
            const int centre = (int) p[0] + (int) p[1];

            hfilt[j][x] = (int16_t) (outer - 5 * inner + 20 * centre);
        }
    }

    /* Pass 2: vertical filter over the intermediates, then average
     * into dst.  Output row y uses intermediate rows y .. y+5. */
    for (int y = 0; y < BLK; y++) {
        uint8_t *out = dst + y * stride;

        for (int x = 0; x < BLK; x++) {
            const int outer = hfilt[y][x] + hfilt[y + 5][x];
            const int inner = hfilt[y + 1][x] + hfilt[y + 4][x];
            const int centre = hfilt[y + 2][x] + hfilt[y + 3][x];
            const int acc = outer - 5 * inner + 20 * centre + 512;

            /* Shift before clip: assumes arithmetic >> on negative ints. */
            const uint8_t pred = (uint8_t) clip_u8(acc >> 10);
            out[x] = avg2(out[x], pred);
        }
    }
}