/*
 * Standalone bit-exact C reference for H.264 luma "vertical"
 * loop filter (v_loop_filter_luma): applies filter VERTICALLY
 * across a HORIZONTAL edge. The edge spans the 16-column
 * macroblock width, between rows -1 and 0.
 *
 * Mirrors FFmpeg `ff_h264_v_loop_filter_luma_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
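 * line 111. Operates on an 8-row × 16-col region:
 *   pix[r*stride + c] for r in -4..+3, c in 0..15
 * with pix pointing to row 0, col 0 of the bottom block.
 * Only rows -3..+2 (p2..q2) influence the result, and only
 * rows -2..+1 (p1, p0, q0, q1) are ever written.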
 *
 * 16 columns divided into 4 segments of 4 cols; each segment
 * has its own tc0 strength (tc0[0..3]).
 *
 * Note: FFmpeg's "v_loop_filter" naming uses the FILTER
 * DIRECTION (vertical = across the edge from above), not the
 * edge orientation (horizontal). H.264 spec calls this the
 * "horizontal edge" filter.
 *
 * Signature:
 *   void(uint8_t *pix, ptrdiff_t stride,
 *        int alpha, int beta, int8_t tc0[4]);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream).
 */
#include <stdint.h>
#include <stddef.h>

/* Saturate v to the unsigned 8-bit pixel range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
/* Clamp v to [lo, hi]. The lower bound is tested first, so if a
 * caller passes lo > hi any v below lo still yields lo. */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo) {
        return lo;
    }
    if (v > hi) {
        return hi;
    }
    return v;
}
/* Absolute value of x (callers only pass small pixel differences). */
static inline int abs_i(int x)
{
    if (x >= 0) {
        return x;
    }
    return -x;
}

/* Apply the normal-strength (bS < 4) luma deblocking filter to one
 * COLUMN crossing the horizontal edge.
 *
 *   pix     points at q0, the first pixel below the edge.
 *   stride  distance between vertically adjacent pixels.
 *   alpha   edge threshold: filtering requires |p0 - q0| < alpha.
 *   beta    gradient threshold for |p1 - p0| and |q1 - q0|, and for
 *           the |p2 - p0| / |q2 - q0| side conditions.
 *   tc0_s   the segment's tc0 value (already known >= 0).
 *
 * p0..p2 are the pixels above the edge (pix[-stride..-3*stride]),
 * q0..q2 those below (pix[0..+2*stride]). p3/q3 take no part in the
 * bS < 4 filter and are deliberately not loaded, so this function
 * touches rows -3..+2 only.
 *
 * When the edge passes all alpha/beta pre-conditions, writes back to
 * pix[-2*stride], pix[-1*stride], pix[0], pix[+stride]
 * (= p1, p0, q0, q1); otherwise the column is left unmodified.
 */
static void h264_deblock_luma_col(uint8_t *pix, ptrdiff_t stride,
                                   int alpha, int beta, int tc0_s)
{
    const int p2 = pix[-3*stride], p1 = pix[-2*stride], p0 = pix[-1*stride];
    const int q0 = pix[ 0*stride], q1 = pix[ 1*stride], q2 = pix[ 2*stride];

    /* Edge pre-conditions: bail out unless every one of them holds. */
    if (abs_i(p0 - q0) >= alpha) {
        return;
    }
    if (abs_i(p1 - p0) >= beta) {
        return;
    }
    if (abs_i(q1 - q0) >= beta) {
        return;
    }

    /* Side conditions: decide whether p1 / q1 are filtered as well. */
    const int ap_lt_beta = (abs_i(p2 - p0) < beta);
    const int aq_lt_beta = (abs_i(q2 - q0) < beta);

    /* Combined filter strength: each satisfied side condition widens
     * the clipping range of the p0/q0 correction by one. */
    const int tc = tc0_s + ap_lt_beta + aq_lt_beta;

    /* p0 / q0 update. The shifted operand may be negative; this relies
     * on >> being an arithmetic shift for negative ints
     * (implementation-defined in ISO C). */
    const int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    const int p0p = clip_u8(p0 + delta);
    const int q0p = clip_u8(q0 - delta);

    /* Rounded average of the two edge pixels, shared by p1 and q1. */
    const int avg_p0q0 = (p0 + q0 + 1) >> 1;

    /* p1 update (only if ap<beta).
     * ((p2 + avg) >> 1) - p1 equals (p2 + avg - 2*p1) >> 1 but only
     * ever shifts a non-negative value. The result lies between p1
     * and (p2 + avg) >> 1, both in 0..255, so no extra clip is needed. */
    int p1p = p1;
    if (ap_lt_beta) {
        p1p = p1 + clip3(((p2 + avg_p0q0) >> 1) - p1, -tc0_s, tc0_s);
    }

    /* q1 update (only if aq<beta); mirror image of the p1 update. */
    int q1p = q1;
    if (aq_lt_beta) {
        q1p = q1 + clip3(((q2 + avg_p0q0) >> 1) - q1, -tc0_s, tc0_s);
    }

    pix[-2*stride] = (uint8_t) p1p;
    pix[-1*stride] = (uint8_t) p0p;
    pix[ 0*stride] = (uint8_t) q0p;
    pix[ 1*stride] = (uint8_t) q1p;
}

/* Reference entry point: filter the horizontal luma edge that lies
 * between rows -1 and 0 of `pix`, across 16 columns.
 *
 *   pix     points at row 0, column 0 (first row below the edge).
 *   stride  distance between vertically adjacent pixels.
 *   alpha   edge threshold handed to the per-column filter.
 *   beta    gradient threshold handed to the per-column filter.
 *   tc0     one strength per 4-column segment; a negative entry
 *           disables filtering for that segment.
 */
void daedalus_h264_v_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    /* Outer precondition: a zero alpha or a zero beta disables the
     * whole edge. */
    if (alpha == 0 || beta == 0) {
        return;
    }

    /* Likewise nothing to do when every segment is disabled
     * (h264_loop_filter_start macro check). */
    int have_live_segment = 0;
    for (int seg = 0; seg < 4; seg++) {
        if (tc0[seg] >= 0) {
            have_live_segment = 1;
            break;
        }
    }
    if (!have_live_segment) {
        return;
    }

    /* Walk the 16 columns as 4 segments of 4; each segment uses its
     * own tc0 entry, read once before its columns are filtered. */
    uint8_t *seg_base = pix;
    for (int seg = 0; seg < 4; seg++, seg_base += 4) {
        const int seg_tc0 = tc0[seg];
        if (seg_tc0 >= 0) {
            for (int k = 0; k < 4; k++) {
                h264_deblock_luma_col(seg_base + k, stride, alpha, beta, seg_tc0);
            }
        }
    }
}