436a5c4f74
M1: 10000/10000 bit-exact (after orientation fix: ff_h264_v_loop_ filter is "vertical filtering of horizontal edges", not "vertical edge"; 16 columns process the edge horizontally with 8 rows of vertical context). M3: 91.947 Medge/s per core. Per-edge 10.9 ns. 11x worst-case 1080p30 floor, 30x realistic floor. Filter triggers on 25 % of edges (random alpha/beta/tc0 covers both gating paths). Cycle 8 Phase 9 lesson: H.264/FFmpeg "v_loop_filter" naming uses filter DIRECTION (vertical) not edge orientation. Edge is horizontal; filter operates vertically across it. Distinct from cycle 6's column-major-block lesson but related discovery pattern. Encoded for future cycles. R8 prediction revised: 0.09-0.14 ORANGE (down from Phase 1's 0.3-0.8 estimate). H.264 deblock is 2x faster on NEON than VP9 LPF wd=4 (cycle 2) but H.264 deblock has more per-edge branches that hurt QPU more. Worth building anyway: - ORANGE in cycle 1's "M4 may rescue" band - Mixed-kernel deployment helper value (Issue 003) matters more than isolation R - 25%-trigger rate gives 4x effective contribution multiplier on QPU side - tests/h264_deblock_ref.c (column-walking C ref per row segment) - tests/bench_neon_h264deblock.c (M1 + M3 bench) - CMakeLists.txt: cycle 8 NEON bench wiring + h264dsp_neon.S - docs/k8_h264deblock_phase3.md (closure) Next: Phase 4 plan QPU shader, Phase 5 Sonnet review. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
109 lines
3.7 KiB
C
109 lines
3.7 KiB
C
/*
|
||
* Standalone bit-exact C reference for H.264 luma "vertical"
|
||
* loop filter (v_loop_filter_luma): applies filter VERTICALLY
|
||
* across a HORIZONTAL edge. The edge spans the 16-column
|
||
* macroblock width, between rows -1 and 0.
|
||
*
|
||
* Mirrors FFmpeg `ff_h264_v_loop_filter_luma_neon` in
|
||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S
|
||
* line 111. Operates on an 8-row × 16-col region:
|
||
* pix[r*stride + c] for r in -4..+3, c in 0..15
|
||
* With pix pointing to row 0, col 0 of the bottom block.
|
||
*
|
||
* 16 columns divided into 4 segments of 4 cols; each segment
|
||
* has its own tc0 strength (tc0[0..3]).
|
||
*
|
||
* Note: FFmpeg's "v_loop_filter" naming uses the FILTER
|
||
* DIRECTION (vertical = across the edge from above), not the
|
||
* edge orientation (horizontal). H.264 spec calls this the
|
||
* "horizontal edge" filter.
|
||
*
|
||
* Signature:
|
||
* void(uint8_t *pix, ptrdiff_t stride,
|
||
* int alpha, int beta, int8_t tc0[4]);
|
||
*
|
||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||
*/
|
||
#include <stdint.h>
|
||
#include <stddef.h>
|
||
|
||
/* Saturate v to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
|
||
/*
 * Clamp v to the closed interval [lo, hi].
 *
 * The lower bound is tested first, so if a caller passes lo > hi the
 * result is lo whenever v < lo, otherwise hi or v as usual. Callers in
 * this file always pass lo = -t, hi = t with t >= 0.
 */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo) {
        return lo;
    }
    if (v > hi) {
        return hi;
    }
    return v;
}
|
||
/* Absolute value of x (x must not be INT_MIN, whose negation overflows). */
static inline int abs_i(int x)
{
    if (x < 0) {
        return -x;
    }
    return x;
}
|
||
|
||
/*
 * Apply the bS < 4 luma deblocking filter to ONE column crossing the
 * horizontal edge.
 *
 * pix points at q0, the first sample below the edge in this column.
 * Samples are addressed in steps of stride:
 *   p2 = pix[-3*stride]   p1 = pix[-2*stride]   p0 = pix[-1*stride]
 *   q0 = pix[ 0       ]   q1 = pix[ 1*stride]   q2 = pix[ 2*stride]
 * Only these six samples are read. p3 / q3 play no part in this path, so
 * the rows at -4*stride and +3*stride are deliberately never accessed.
 *
 * alpha  threshold for |p0 - q0|
 * beta   threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * tc0_s  clipping strength of the segment this column belongs to; the
 *        caller must pass a value >= 0 (a negative value would invert the
 *        clip3 bounds)
 *
 * If any of the three edge thresholds is not met, the column is left
 * untouched. Otherwise pix[-2*stride], pix[-1*stride], pix[0] and
 * pix[+stride] (= p1, p0, q0, q1) are written back; p1 / q1 keep their
 * original value unless the matching side condition holds.
 */
static void h264_deblock_luma_col(uint8_t *pix, ptrdiff_t stride,
                                  int alpha, int beta, int tc0_s)
{
    const int p2 = pix[-3 * stride];
    const int p1 = pix[-2 * stride];
    const int p0 = pix[-1 * stride];
    const int q0 = pix[0];
    const int q1 = pix[1 * stride];
    const int q2 = pix[2 * stride];

    /* Edge pre-conditions: all three must hold or nothing is filtered. */
    if (abs_i(p0 - q0) >= alpha) {
        return;
    }
    if (abs_i(p1 - p0) >= beta) {
        return;
    }
    if (abs_i(q1 - q0) >= beta) {
        return;
    }

    /* Side conditions: decide whether p1 / q1 are also modified. */
    const int ap_lt_beta = abs_i(p2 - p0) < beta;
    const int aq_lt_beta = abs_i(q2 - q0) < beta;

    /* Each satisfied side condition widens the p0/q0 clip range by one. */
    const int tc = tc0_s + ap_lt_beta + aq_lt_beta;

    /*
     * The shifted operands below can be negative; the expected results
     * rely on >> being an arithmetic (sign-propagating) shift, which is
     * implementation-defined in C.
     */
    const int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    const int p0p = clip_u8(p0 + delta);
    const int q0p = clip_u8(q0 - delta);

    /* Rounded average of the two edge samples, shared by p1 and q1. */
    const int avg = (p0 + q0 + 1) >> 1;

    /* p1 update (only if |p2 - p0| < beta); clipped with tc0_s, not tc. */
    int p1p = p1;
    if (ap_lt_beta) {
        p1p += clip3((p2 + avg - 2 * p1) >> 1, -tc0_s, tc0_s);
    }

    /* q1 update (only if |q2 - q0| < beta); clipped with tc0_s, not tc. */
    int q1p = q1;
    if (aq_lt_beta) {
        q1p += clip3((q2 + avg - 2 * q1) >> 1, -tc0_s, tc0_s);
    }

    pix[-2 * stride] = (uint8_t)p1p;
    pix[-1 * stride] = (uint8_t)p0p;
    pix[0]           = (uint8_t)q0p;
    pix[1 * stride]  = (uint8_t)q1p;
}
|
||
|
||
/*
 * Reference luma loop filter for one 16-column horizontal edge.
 *
 * pix     points at row 0, column 0 of the block below the edge; columns
 *         pix[0..15] are processed
 * stride  distance in bytes between vertically adjacent samples
 * alpha   edge threshold; 0 disables filtering entirely
 * beta    edge threshold; 0 disables filtering entirely
 * tc0     four clipping strengths, one per group of 4 columns
 *         (tc0[s] applies to columns 4*s .. 4*s+3); a negative entry
 *         means that group is skipped
 *
 * The buffer is modified in place by h264_deblock_luma_col.
 */
void daedalus_h264_v_loop_filter_luma_ref(
    uint8_t *pix, ptrdiff_t stride,
    int alpha, int beta, int8_t tc0[4])
{
    /* Outer precondition: alpha == 0 or beta == 0 skips filtering. */
    if (alpha == 0 || beta == 0) {
        return;
    }

    /* Nothing to do when every segment is disabled (all tc0[*] < 0);
     * corresponds to the h264_loop_filter_start macro check. */
    if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) {
        return;
    }

    /* 16 columns divided into 4 segments of 4 columns each. */
    for (int s = 0; s < 4; s++) {
        const int tc0_s = tc0[s];

        if (tc0_s < 0) {
            continue; /* bS = 0 segment -> skip */
        }

        uint8_t *seg = pix + s * 4;
        for (int c = 0; c < 4; c++) {
            h264_deblock_luma_col(seg + c, stride, alpha, beta, tc0_s);
        }
    }
}
|