/*
 * Standalone bit-exact C reference for the H.264 luma "vertical"
 * loop filter (v_loop_filter_luma): applies the filter VERTICALLY
 * across a HORIZONTAL edge. The edge spans the 16-column
 * macroblock width, between rows -1 and 0.
 *
 * Mirrors FFmpeg `ff_h264_v_loop_filter_luma_neon` in
 *   external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
 * line 111. The filter is defined over an 8-row x 16-col region:
 *   pix[r*stride + c]   for r in -4..+3, c in 0..15
 * with pix pointing to row 0, col 0 of the bottom block. This
 * (bS < 4) implementation only reads rows -3..+2 and only writes
 * rows -2..+1.
 *
 * 16 columns divided into 4 segments of 4 cols; each segment
 * has its own tc0 strength (tc0[0..3]).
 *
 * Note: FFmpeg's "v_loop_filter" naming uses the FILTER
 * DIRECTION (vertical = across the edge from above), not the
 * edge orientation (horizontal). H.264 spec calls this the
 * "horizontal edge" filter.
 *
 * Signature:
 *   void(uint8_t *pix, ptrdiff_t stride,
 *        int alpha, int beta, int8_t tc0[4]);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream).
 */

#include <stddef.h>
#include <stdint.h>

/* Clamp v to the 8-bit pixel range [0, 255]. */
static inline int clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Clamp v to [lo, hi]; callers pass lo <= hi. */
static inline int clip3(int v, int lo, int hi)
{
    return v < lo ? lo : v > hi ? hi : v;
}

/* Absolute value of x (operands here are small pixel differences). */
static inline int abs_i(int x)
{
    return x < 0 ? -x : x;
}

/*
 * Apply the bS < 4 luma deblock to one COLUMN at the horizontal edge.
 *
 * pix     points at q0 (row 0) of the column.
 * stride  is the distance in bytes between vertically adjacent pixels.
 * alpha   edge threshold on |p0 - q0|.
 * beta    threshold on the intra-side differences.
 * tc0_s   the segment's tc0 value; the caller guarantees it is >= 0.
 *
 * p0..p2 are the pixels above the edge (pix[-stride..-3*stride]),
 * q0..q2 the pixels below (pix[0..+2*stride]). p3/q3 are not needed
 * by the bS < 4 path and are therefore not read.
 *
 * When the edge pre-conditions hold, writes back pix[-2*stride],
 * pix[-stride], pix[0] and pix[+stride] (= p1, p0, q0, q1);
 * otherwise the column is left untouched.
 */
static void h264_deblock_luma_col(uint8_t *pix, ptrdiff_t stride,
                                  int alpha, int beta, int tc0_s)
{
    const int p2 = pix[-3 * stride];
    const int p1 = pix[-2 * stride];
    const int p0 = pix[-1 * stride];
    const int q0 = pix[ 0 * stride];
    const int q1 = pix[ 1 * stride];
    const int q2 = pix[ 2 * stride];

    /* Edge pre-conditions: all three must hold or nothing is filtered. */
    if (abs_i(p0 - q0) >= alpha) {
        return;
    }
    if (abs_i(p1 - p0) >= beta) {
        return;
    }
    if (abs_i(q1 - q0) >= beta) {
        return;
    }

    /* Side conditions: is each side smooth enough to also touch p1 / q1? */
    const int ap = abs_i(p2 - p0);
    const int aq = abs_i(q2 - q0);
    const int ap_lt_beta = (ap < beta);
    const int aq_lt_beta = (aq < beta);

    /* Combined filter strength: tc0 plus one for each smooth side. */
    const int tc = tc0_s + ap_lt_beta + aq_lt_beta;

    /*
     * p0 / q0 update. The numerator may be negative; like the rest of this
     * reference it relies on `>>` being an arithmetic shift for negative
     * ints (implementation-defined in ISO C, arithmetic on mainstream
     * compilers).
     */
    const int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    const int p0p = clip_u8(p0 + delta);
    const int q0p = clip_u8(q0 - delta);

    /* Rounded average of the two edge pixels, shared by both updates below. */
    const int avg = (p0 + q0 + 1) >> 1;

    /* p1 update (only if ap < beta). With tc0_s == 0 the clip yields 0. */
    int p1p = p1;
    if (ap_lt_beta) {
        const int delta_p1 = clip3((p2 + avg - 2 * p1) >> 1, -tc0_s, tc0_s);
        p1p = p1 + delta_p1;
    }

    /* q1 update (only if aq < beta). With tc0_s == 0 the clip yields 0. */
    int q1p = q1;
    if (aq_lt_beta) {
        const int delta_q1 = clip3((q2 + avg - 2 * q1) >> 1, -tc0_s, tc0_s);
        q1p = q1 + delta_q1;
    }

    pix[-2 * stride] = (uint8_t) p1p;
    pix[-1 * stride] = (uint8_t) p0p;
    pix[ 0 * stride] = (uint8_t) q0p;
    pix[ 1 * stride] = (uint8_t) q1p;
}

/*
 * Deblock the horizontal luma edge between rows -1 and 0 of a 16-column
 * macroblock (bS < 4 path).
 *
 * pix     points at row 0, column 0 of the block below the edge.
 * stride  distance in bytes between rows.
 * alpha   edge threshold; 0 disables filtering.
 * beta    side threshold; 0 disables filtering.
 * tc0     four per-segment strengths, one per group of 4 columns;
 *         a negative entry skips that segment.
 */
void daedalus_h264_v_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    /* H.264 deblock "outer" precondition: alpha == 0 OR beta == 0
     * skips filtering. Also if ALL tc0[*] are negative, skip
     * (h264_loop_filter_start macro check). */
    if (alpha == 0 || beta == 0) {
        return;
    }
    if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) {
        return;
    }

    /* 16 columns divided into 4 segments of 4 columns each. */
    for (int s = 0; s < 4; s++) {
        const int tc0_s = tc0[s];
        if (tc0_s < 0) {
            continue;   /* bS = 0 segment: skip */
        }
        for (int c = 0; c < 4; c++) {
            const int col = s * 4 + c;
            h264_deblock_luma_col(pix + col, stride, alpha, beta, tc0_s);
        }
    }
}