Files
daedalus-fourier/tests/h264_qpel8_quarter_axis_ref.c
T
claude-noether e01f7bc7c6 h264: qpel single-axis quarter-pel — mc10/mc30/mc01/mc03 (CPU/NEON)
Closes the 4 single-axis quarter-pel positions in one PR.  Each is
a half-pel lowpass clipped to u8 followed by L2 rounded-average
with an integer-aligned source pixel per H.264 §8.4.2.2.1:

  mc10  ¼-H ("a" pos): clip255(mc20(s)) avg src[r,c]
  mc30  ¾-H ("c" pos): clip255(mc20(s)) avg src[r,c+1]
  mc01  ¼-V ("d" pos): clip255(mc02(s)) avg src[r,c]
  mc03  ¾-V ("n" pos): clip255(mc02(s)) avg src[r+1,c]

The mc10/mc30 pair and mc01/mc03 pair only differ in WHICH integer
source pixel they average with — the half-pel computation is the
same.  Putting them in one PR is justified by that uniformity.

Scope:
  - 4 new kernel enums: MC10=19, MC30=20, MC01=21, MC03=22 → CPU.
  - 4 NEON externs for the vendored ff_put_h264_qpel8_mc{10,30,01,03}_neon.
  - 4 CPU dispatch wrappers via DEFINE_QPEL_CPU_DISPATCH macro
    (collapses ~50 LOC of repetition).
  - 4 public dispatch fns via DEFINE_QPEL_DISPATCH macro.
  - 4 recipe wrappers via DEFINE_QPEL_RECIPE macro.
  - tests/h264_qpel8_quarter_axis_ref.c covers all four via shared
    hpel_h() / hpel_v() inlines + per-mode L2 average.
  - Test refactor: generic run_quarter_axis_qpel() harness exercises
    all 4 positions through a single helper (~50 LOC for 4 tests vs
    ~200 if each was hand-rolled).

Verified on hertz:

  $ ./build/test_api_h264 | tail -8
    H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc10: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc30: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc01: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel mc03: 2048/2048 bytes bit-exact (100.0000%)

  All 4 new positions bit-exact PASS first try.

Coverage matrix update:
  put_  mc00 mc10 mc20 mc30
  mc01     —    ✓    —    ✓
  mc11     —    —    ✓    —     ← this row
  mc21     —    —    —    —
  mc31     —    —    —    —
  mc02     —    —    ✓    —     ← mc02 + mc22 anchor
  mc03     —    —    ✓    —

After this PR: 7 of 16 single-axis + diagonal positions done.
Remaining 9 are integer-pel mc00 plus the 8 off-axis quarter-pel
combinations (mc11/mc12/mc13/mc21/mc23/mc31/mc32/mc33); each of those 8
combines a 2D lowpass intermediate with L2 averaging against a
1D-lowpass output.
Next PR scope.

Why no QPU shaders: same R-band logic as the prior CPU additions.
At ~10 ns per 8x8 NEON block, all 16 qpel positions together
would land in ~1.3 ms/frame at 1080p worst case — comfortably
inside the 33 ms budget.  QPU shader for mc20 already exists
(cycle 9 / v3d_h264_qpel_mc20.spv); the other 15 follow once a
clear perf reason emerges.
2026-05-25 01:29:52 +02:00

83 lines
3.2 KiB
C

/*
* Standalone bit-exact C references for the four single-axis quarter-
* pel luma qpel positions (H.264 §8.4.2.2.1, "put" variants). Each
* is a half-pel lowpass clipped to u8 followed by an L2 rounded-average
* with an integer-position source pixel.
*
* mc10 ("a" pos, ¼ horiz): a = clip255(mc20(s)); dst = (a + s[r,c] + 1) >> 1
* mc30 ("c" pos, ¾ horiz): a = clip255(mc20(s)); dst = (a + s[r,c+1] + 1) >> 1
* mc01 ("d" pos, ¼ vert ): a = clip255(mc02(s)); dst = (a + s[r, c] + 1) >> 1
* mc03 ("n" pos, ¾ vert ): a = clip255(mc02(s)); dst = (a + s[r+1,c] + 1) >> 1
*
* Mirrors FFmpeg's `ff_put_h264_qpel8_mc{10,30,01,03}_neon` (see
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S,
* lines 587, 603, 611 and 729; each tail-calls the corresponding
* lowpass_l2 helper).
*
* Same single-stride convention as mc20/mc02 — dst and src share the
* same stride; src + src_off points at row 0 col 0 of the output
* block, with appropriate edge context already in-buffer.
*
* License: LGPL-2.1-or-later.
*/
#include <stdint.h>
#include <stddef.h>
/* Clamp v to the unsigned 8-bit range: below 0 -> 0, above 255 -> 255. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
/*
 * Compute one horizontal half-pel pixel at (r, c) -- same as mc20.
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
 * row r, adds the rounding term 16, divides by 32 and clamps to 0..255.
 *
 * s       base pointer; s[r*stride + c] is the sample at (r, c)
 * r, c    row / column of the left-hand integer sample of the half-pel pair
 * stride  distance in bytes between vertically adjacent samples
 *
 * The caller must guarantee that columns c-2 .. c+3 of row r are readable.
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    /* Centre the taps on (r, c) so they read as offsets -2 .. +3. */
    const uint8_t *p = s + (r * stride + c);
    int sum = (int) p[-2] - 5 * (int) p[-1]
            + 20 * (int) p[0] + 20 * (int) p[1]
            - 5 * (int) p[2] + (int) p[3]
            + 16;

    /*
     * sum may be negative (down to 16 - 2*5*255).  Right-shifting a negative
     * int is implementation-defined, so clamp to 0 first; on arithmetic-shift
     * targets this yields exactly the value the shift-then-clamp form gives.
     */
    return (uint8_t) clip_u8(sum < 0 ? 0 : sum >> 5);
}
/*
 * Compute one vertical half-pel pixel at (r, c) -- same as mc02.
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2 .. r+3 of
 * column c, adds the rounding term 16, divides by 32 and clamps to 0..255.
 *
 * s       base pointer; s[r*stride + c] is the sample at (r, c)
 * r, c    row / column of the upper integer sample of the half-pel pair
 * stride  distance in bytes between vertically adjacent samples
 *
 * The caller must guarantee that rows r-2 .. r+3 of column c are readable.
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    /* Centre the taps on (r, c) so they read as row offsets -2 .. +3. */
    const uint8_t *p = s + (r * stride + c);
    int sum = (int) p[-2 * stride] - 5 * (int) p[-stride]
            + 20 * (int) p[0] + 20 * (int) p[stride]
            - 5 * (int) p[2 * stride] + (int) p[3 * stride]
            + 16;

    /*
     * sum may be negative (down to 16 - 2*5*255).  Right-shifting a negative
     * int is implementation-defined, so clamp to 0 first; on arithmetic-shift
     * targets this yields exactly the value the shift-then-clamp form gives.
     */
    return (uint8_t) clip_u8(sum < 0 ? 0 : sum >> 5);
}
/*
 * mc10 reference: for each pixel of the 8x8 block, take the horizontal
 * half-pel value at (row, col) and store its rounded average with the
 * integer sample at the same (row, col).  dst and src use the same stride.
 */
void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *in = src + row * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_h(src, row, col, stride);
            int whole = in[col];

            out[col] = (uint8_t) ((half + whole + 1) >> 1);
        }
    }
}
/*
 * mc30 reference: for each pixel of the 8x8 block, take the horizontal
 * half-pel value at (row, col) and store its rounded average with the
 * integer sample one column to the right, (row, col + 1).  dst and src use
 * the same stride.
 */
void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *in = src + row * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_h(src, row, col, stride);
            int right = in[col + 1];

            out[col] = (uint8_t) ((half + right + 1) >> 1);
        }
    }
}
/*
 * mc01 reference: for each pixel of the 8x8 block, take the vertical
 * half-pel value at (row, col) and store its rounded average with the
 * integer sample at the same (row, col).  dst and src use the same stride.
 */
void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *in = src + row * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_v(src, row, col, stride);
            int whole = in[col];

            out[col] = (uint8_t) ((half + whole + 1) >> 1);
        }
    }
}
/*
 * mc03 reference: for each pixel of the 8x8 block, take the vertical
 * half-pel value at (row, col) and store its rounded average with the
 * integer sample one row below, (row + 1, col).  dst and src use the same
 * stride.
 */
void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *below = src + (row + 1) * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            int half = hpel_v(src, row, col, stride);
            int lower = below[col];

            out[col] = (uint8_t) ((half + lower + 1) >> 1);
        }
    }
}