/*
 * Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22
 * (2D half-pel, "put" variant).  Cascade of horizontal 6-tap then
 * vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage
 * clip/round), final +512 >> 10 to scale back.
 *
 * Per H.264 §8.4.2.2.1, "j" position:
 *
 *   tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1]
 *              - 5*s[r,c+2] + s[r,c+3]               (16-bit signed)
 *
 *   dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
 *                       + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
 *                       + 512) >> 10)
 *
 * The tmp[] array spans rows r-2 .. r+3 around each output row, so
 * we need 13 intermediate rows (rows -2..+10 of the SOURCE
 * neighbourhood) for 8 output rows.  Caller's src must have 2 rows
 * of top context + 3 rows of bottom context AND 2 cols of left +
 * 3 cols of right context (FFmpeg's edge-emulated buffer provides
 * this at the frame boundary; same contract as mc20).
 *
 * Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon).
 *
 * Signature:
 *   void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 *
 * Same single-stride convention as mc20/mc02.
 *
 * License: LGPL-2.1-or-later.
 */
#include <stdint.h>
#include <stddef.h>

/* Saturate an int to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/*
 * "Put" reference for the H.264 luma 8x8 block at the 2D half-pel
 * position: a horizontal 6-tap pass (1, -5, 20, 20, -5, 1) into a signed
 * 16-bit scratch array, followed by a vertical pass with the same taps,
 * then (+512) >> 10 and a clamp to 8 bits.
 *
 *   dst    - receives 8 rows of 8 bytes, consecutive rows `stride` apart.
 *   src    - top-left sample of the block; rows -2..+10 and columns
 *            -2..+10 relative to it are read, using the same `stride`.
 *   stride - row pitch shared by src and dst.
 */
void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    enum {
        BLOCK      = 8,          /* output width and height           */
        ROWS_ABOVE = 2,          /* context rows read above the block */
        MID_ROWS   = BLOCK + 5   /* 2 above + 8 + 3 below             */
    };

    /* mid[m] holds the horizontally filtered source row (m - ROWS_ABOVE). */
    int16_t mid[MID_ROWS][BLOCK];

    /* Pass 1: horizontal filter, no rounding or clipping.  For 8-bit
     * input the sum lies in [-2550, 10710], so int16_t holds it exactly. */
    for (int m = 0; m < MID_ROWS; m++) {
        const uint8_t *row = src + (m - ROWS_ABOVE) * stride;
        for (int x = 0; x < BLOCK; x++) {
            const uint8_t *p = row + x;
            int outer  = p[-2] + p[3];
            int inner  = p[-1] + p[2];
            int centre = p[0] + p[1];
            mid[m][x] = (int16_t) (outer - 5 * inner + 20 * centre);
        }
    }

    /* Pass 2: vertical filter over six consecutive scratch rows.  Output
     * row y is centred between mid[y + 2] and mid[y + 3], i.e. source
     * rows y and y + 1. */
    for (int y = 0; y < BLOCK; y++) {
        uint8_t *out = dst + y * stride;
        for (int x = 0; x < BLOCK; x++) {
            int outer  = mid[y][x] + mid[y + 5][x];
            int inner  = mid[y + 1][x] + mid[y + 4][x];
            int centre = mid[y + 2][x] + mid[y + 3][x];
            int acc    = outer - 5 * inner + 20 * centre + 512;
            /* acc may be negative here; the shift is applied to the signed
             * value before clamping, so this relies on >> being an
             * arithmetic shift (implementation-defined in C). */
            out[x] = (uint8_t) clip_u8(acc >> 10);
        }
    }
}