/*
 * Standalone bit-exact C references for the 8 diagonal H.264 luma
 * qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33).
 * Each is the rounded average of two half-pel intermediates per
 * H.264 §8.4.2.2.1 / Table 8-4, decomposed to match the FFmpeg .S
 * reference structure (see comments in mc{11,12,21,...}_neon in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S).
 *
 * Position decompositions (verified against the .S):
 *   mc11 (e, ¼¼): avg(mc20[r,c],   mc02[r,c])
 *   mc12 (i, ¼½): avg(mc22[r,c],   mc02[r,c])
 *   mc13 (p, ¼¾): avg(mc20[r+1,c], mc02[r,c])
 *   mc21 (f, ½¼): avg(mc22[r,c],   mc20[r,c])
 *   mc23 (q, ½¾): avg(mc22[r,c],   mc20[r+1,c])
 *   mc31 (g, ¾¼): avg(mc20[r,c],   mc02[r,c+1])
 *   mc32 (k, ¾½): avg(mc22[r,c],   mc02[r,c+1])
 *   mc33 (r, ¾¾): avg(mc20[r+1,c], mc02[r,c+1])
 *
 * (The mc20[r,c] notation means "the mc20-style horizontal half-pel
 * result at source-relative integer position (r, c)"; analogously
 * for mc02 and mc22.)
 *
 * Single-stride convention; same edge-context contract as the simpler
 * variants (the cells "[r+1,c]" etc. demand one extra row/col of
 * source context beyond what mc20/mc02 alone would need).
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Clamp v to the unsigned 8-bit range 0..255; the result is returned as
 * an int so callers narrow it explicitly. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/* Single-cell helpers — same arithmetic as the dedicated mc20/mc02
 * refs but computed point-by-point so the diagonal refs can mix them
 * cheaply.  Each returns a u8 (already clipped). */
/*
 * Horizontal half-pel sample at (r, c).
 *
 * Applies the 6-tap filter (1, -5, 20, 20, -5, 1) to the six bytes
 * s[r*stride + c-2] .. s[r*stride + c+3], adds the rounding term 16,
 * shifts right by 5 and clips to 0..255.
 *
 * NOTE: the sum can be negative; >> on a negative int is
 * implementation-defined in C, and this relies on the usual arithmetic
 * shift so that such sums clip to 0.
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *p = s + (r*stride + c);

    /* The filter is symmetric, so pair up taps with equal weight. */
    int outer  = p[-2] + p[3];
    int middle = p[-1] + p[2];
    int centre = p[0]  + p[1];

    return (uint8_t) clip_u8((outer - 5 * middle + 20 * centre + 16) >> 5);
}
/*
 * Vertical half-pel sample at (r, c).
 *
 * Applies the 6-tap filter (1, -5, 20, 20, -5, 1) down column c over
 * rows r-2 .. r+3, adds the rounding term 16, shifts right by 5 and
 * clips to 0..255.
 *
 * NOTE: the sum can be negative; >> on a negative int is
 * implementation-defined in C, and this relies on the usual arithmetic
 * shift so that such sums clip to 0.
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *p = s + (r*stride + c);
    int sum = 16;

    /* The filter is symmetric, so pair up taps with equal weight. */
    sum += p[-2*stride] + p[3*stride];
    sum -= 5 * (p[-stride] + p[2*stride]);
    sum += 20 * (p[0] + p[stride]);

    return (uint8_t) clip_u8(sum >> 5);
}

/*
 * hpel_hv: centre (horizontal and vertical) half-pel sample at (r, c).
 *
 * For each of the six rows r-2 .. r+3 the unrounded horizontal 6-tap sum
 * over columns c-2 .. c+3 is formed at full int precision.  Those six
 * row sums are then combined with the same taps vertically, rounded with
 * +512, shifted right by 10 and clipped to 0..255.  No rounding or
 * clipping happens between the two passes.
 *
 * NOTE: the accumulator can be negative; >> on a negative int is
 * implementation-defined in C, and this relies on the usual arithmetic
 * shift so that such values clip to 0.
 */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int tap[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 512;   /* rounding term for the final >> 10 */

    for (int j = 0; j < 6; j++) {
        /* row points at column c of source row r-2+j */
        const uint8_t *row = s + ((r - 2 + j)*stride + c);
        int hsum = 0;

        for (int k = 0; k < 6; k++) {
            hsum += tap[k] * row[k - 2];
        }
        acc += tap[j] * hsum;
    }
    return (uint8_t) clip_u8(acc >> 10);
}

/* Rounded-up mean of two 8-bit samples, (a + b + 1) >> 1.  The sum is
 * formed in int, and the mean of two values <= 255 is itself <= 255, so
 * the narrowing cast never discards bits and no clip is required. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    int sum = a + b + 1;
    return (uint8_t) (sum >> 1);
}

/*
 * DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR) emits
 *
 *     void daedalus_put_h264_qpel8_<NAME>_ref(uint8_t *dst,
 *                                             const uint8_t *src,
 *                                             ptrdiff_t stride);
 *
 * which fills an 8x8 block of dst with avg2(A_EXPR, B_EXPR).  Both
 * expressions are evaluated once per output sample and may refer to
 * src, stride and the loop variables r (row, 0..7) and c (column, 0..7).
 * dst and src are addressed with the same stride.
 */
#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR)                                  \
void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,                    \
    const uint8_t *src, ptrdiff_t stride)                                      \
{                                                                              \
    for (int r = 0; r < 8; r++) {                                              \
        uint8_t *dst_row = dst + r*stride;                                     \
        for (int c = 0; c < 8; c++)                                            \
            dst_row[c] = avg2((A_EXPR), (B_EXPR));                             \
    }                                                                          \
}

/*
 * The eight diagonal positions.  Inside each expansion r and c are the
 * output row and column; the two arguments are the half-pel terms that
 * are averaged.  "r+1" takes the horizontal half-pel one row below the
 * output position, "c+1" the vertical half-pel one column to its right.
 */

/* Horizontal half-pel averaged with vertical half-pel. */
DEFINE_DIAG_REF(mc11,
                hpel_h(src, r, c, stride),
                hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc13,
                hpel_h(src, r+1, c, stride),
                hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc31,
                hpel_h(src, r, c, stride),
                hpel_v(src, r, c+1, stride))
DEFINE_DIAG_REF(mc33,
                hpel_h(src, r+1, c, stride),
                hpel_v(src, r, c+1, stride))

/* Centre (hv) half-pel averaged with vertical half-pel. */
DEFINE_DIAG_REF(mc12,
                hpel_hv(src, r, c, stride),
                hpel_v(src, r, c, stride))
DEFINE_DIAG_REF(mc32,
                hpel_hv(src, r, c, stride),
                hpel_v(src, r, c+1, stride))

/* Centre (hv) half-pel averaged with horizontal half-pel. */
DEFINE_DIAG_REF(mc21,
                hpel_hv(src, r, c, stride),
                hpel_h(src, r, c, stride))
DEFINE_DIAG_REF(mc23,
                hpel_hv(src, r, c, stride),
                hpel_h(src, r+1, c, stride))

/* The generator is only needed for the definitions above. */
#undef DEFINE_DIAG_REF