h264: qpel avg anchors (avg_mc20/02/22, biprediction support)

Begins the avg_ qpel buildout for B-slice biprediction.  Each avg_
form computes the same half-pel formula as its put_ sibling, then
L2-averages the result with the existing dst contents — the caller
pre-loads dst with the list0 prediction; the avg_ call adds list1
per H.264 §8.4.2.3.1.

Scope (3 anchors, sets the pattern for the remaining 13 avg_
variants):
  - 3 new kernel enums (AVG_MC20=31, AVG_MC02=32, AVG_MC22=33) → CPU.
  - 3 NEON externs for the vendored ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon.
  - 3 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro
    (the macro is type-agnostic so it didn't need changes for avg_).
  - 3 public dispatches via DEFINE_QPEL_DISPATCH macro.
  - 3 recipe wrappers via DEFINE_QPEL_RECIPE macro.
  - tests/h264_qpel8_avg_anchors_ref.c — per-cell helpers + L2 avg.
  - Test harness: run_avg_qpel() seeds dst with random content so
    the L2 averaging is actually exercised (not just put_-style
    overwrite that would silently pass).

Verified on hertz:

  $ ./build/test_api_h264 | tail -3
    H.264 qpel avg_mc20: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc02: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc22: 2048/2048 bytes bit-exact (100.0000%)

  All 3 anchors bit-exact PASS first try.

Why anchors only in this PR: the avg_ pattern is uniform across all
16 positions (each is just "put_ result + L2 with dst").  Landing
the anchors first confirms the macro pattern works for both put_
and avg_; the remaining 13 (avg_mc10/30/01/03 + avg_mc11..33) follow
the same template in a follow-up PR.

State of the qpel matrix after this PR:
  put_ : 15 of 16 positions ✓ (mc00 is integer copy, no wrapper)
  avg_ :  3 of 16 positions ✓ (mc20, mc02, mc22 anchors)
        13 follow-up positions
This commit is contained in:
2026-05-25 08:35:25 +02:00
parent 76e3076670
commit 1113953f97
5 changed files with 182 additions and 0 deletions
+18
View File
@@ -152,6 +152,9 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¼ */
case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾½ */
case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¾ */
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU;
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU;
}
return DAEDALUS_SUBSTRATE_CPU;
}
@@ -212,6 +215,9 @@ extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdif
extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* -------------------- CPU dispatch implementations -------------- */
@@ -493,6 +499,12 @@ DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)
/* avg_ biprediction variants — same dispatch shape as put_, just
* different NEON entry that L2-averages with the existing dst. */
DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon)
DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon)
#undef DEFINE_QPEL_CPU_DISPATCH
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
@@ -1521,6 +1533,9 @@ DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
#undef DEFINE_QPEL_DISPATCH
@@ -1680,5 +1695,8 @@ DEFINE_QPEL_RECIPE(mc23)
DEFINE_QPEL_RECIPE(mc31)
DEFINE_QPEL_RECIPE(mc32)
DEFINE_QPEL_RECIPE(mc33)
DEFINE_QPEL_RECIPE(avg_mc20)
DEFINE_QPEL_RECIPE(avg_mc02)
DEFINE_QPEL_RECIPE(avg_mc22)
#undef DEFINE_QPEL_RECIPE