h264: qpel avg anchors (avg_mc20/02/22, biprediction support)

Begins the avg_ qpel buildout for B-slice biprediction. Each avg_ form computes the same half-pel formula as its put_ sibling, then L2-averages the result with the existing dst contents, i.e. (dst + put_val + 1) >> 1 — the caller pre-loads dst with the list0 prediction; the avg_ call adds list1 per H.264 §8.4.2.3.1.

Scope (3 anchors, sets the pattern for the remaining 13 avg_ variants):
- 3 new kernel enums (AVG_MC20=31, AVG_MC02=32, AVG_MC22=33) → CPU.
- 3 NEON externs for the vendored ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon.
- 3 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro (the macro is type-agnostic so it didn't need changes for avg_).
- 3 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 3 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- tests/h264_qpel8_avg_anchors_ref.c — per-cell helpers + L2 avg.
- Test harness: run_avg_qpel() seeds dst with random content so the L2 averaging is actually exercised (not just put_-style overwrite that would silently pass).

Verified on hertz:
$ ./build/test_api_h264 | tail -3
H.264 qpel avg_mc20: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc02: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc22: 2048/2048 bytes bit-exact (100.0000%)
All 3 anchors bit-exact PASS first try.

Why anchors only in this PR: the avg_ pattern is uniform across all 16 positions (each is just "put_ result + L2 with dst"). Landing the anchors first confirms the macro pattern works for both put_ and avg_; the remaining 13 (avg_mc10/30/01/03 + avg_mc11..33) follow the same template in a follow-up PR.

State of the qpel matrix after this PR:
put_ : 15 of 16 positions ✓ (mc00 is integer copy, no wrapper)
avg_ : 3 of 16 positions ✓ (mc20, mc02, mc22 anchors)
       13 follow-up positions
2026-05-25 08:35:25 +02:00
parent 76e3076670
commit 1113953f97
5 changed files with 182 additions and 0 deletions
@@ -0,0 +1,79 @@
+/*
+ * Standalone bit-exact C references for the avg_ qpel anchors —
+ * the biprediction "average against existing dst" form of mc20,
+ * mc02, mc22.  Used in B-slices where two qpel-interpolated samples
+ * (one from list0, one from list1) are averaged per H.264 §8.4.2.3.1.
+ *
+ * Each kernel computes the same half-pel formula as the put_ form,
+ * then averages with dst[r,c] via L2 ((dst + put_val + 1) >> 1).
+ * The dst buffer carries the list0 prediction on entry; the avg_
+ * call adds the list1 contribution.
+ *
+ * Mirrors FFmpeg's `ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon` in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
+ * (same `\type=avg` expansion as the put_ functions).
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+/* Saturate v to the 8-bit sample range: anything below 0 becomes 0,
+ * anything above 255 becomes 255, other values pass through unchanged. */
+static inline int clip_u8(int v)
+{
+    if (v < 0)
+        return 0;
+    if (v > 255)
+        return 255;
+    return v;
+}
+/* Rounding average of two 8-bit samples: (a + b + 1) >> 1, so an odd
+ * sum rounds up. */
+static inline uint8_t avg2(uint8_t a, uint8_t b)
+{
+    int sum = a + b + 1;   /* at most 511: evaluated in int, cannot wrap */
+    return (uint8_t)(sum >> 1);
+}
+
+/* Same per-cell helpers as the diag/quarter-axis refs.  Duplicated
+ * here (rather than extern'd) so this TU compiles standalone. */
+/*
+ * Horizontal half-pel sample at row r, column c of the plane starting at s.
+ *
+ * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to the six samples in
+ * columns c-2 .. c+3 of row r, adds the rounding term 16, shifts right
+ * by 5 and clips to 0..255.  The caller must make all six samples
+ * readable.
+ *
+ * The weighted sum can be negative before the shift; the result then
+ * relies on >> of a negative int being an arithmetic shift, which the
+ * C standard leaves implementation-defined.
+ */
+static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
+    const uint8_t *p = s + (r*stride + c - 2);   /* leftmost tap */
+    int acc = 16;                                /* rounding term for >> 5 */
+
+    for (int k = 0; k < 6; k++)
+        acc += taps[k] * (int) p[k];
+
+    return (uint8_t) clip_u8(acc >> 5);
+}
+/*
+ * Vertical half-pel sample at row r, column c of the plane starting at s.
+ *
+ * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to the six samples in
+ * rows r-2 .. r+3 of column c, adds the rounding term 16, shifts right
+ * by 5 and clips to 0..255.  The caller must make all six samples
+ * readable.
+ *
+ * The weighted sum can be negative before the shift; the result then
+ * relies on >> of a negative int being an arithmetic shift, which the
+ * C standard leaves implementation-defined.
+ */
+static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
+    const uint8_t *p = s + ((r-2)*stride + c);   /* topmost tap */
+    int acc = 16;                                /* rounding term for >> 5 */
+
+    for (int k = 0; k < 6; k++)
+        acc += taps[k] * (int) p[k*stride];
+
+    return (uint8_t) clip_u8(acc >> 5);
+}
+
+/*
+ * avg_ reference for the horizontal half-pel position (mc20), 8x8 block.
+ *
+ * Each of the 64 dst samples is replaced by the rounding average of its
+ * current value and the horizontal half-pel prediction taken from src.
+ * dst and src are walked with the same stride; src must be readable for
+ * rows 0..7 and columns -2..10 relative to the src pointer.
+ */
+void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int row = 0; row < 8; row++) {
+        uint8_t *out = dst + row*stride;
+        for (int col = 0; col < 8; col++) {
+            uint8_t pred = hpel_h(src, row, col, stride);
+            out[col] = avg2(out[col], pred);
+        }
+    }
+}
+
+/*
+ * avg_ reference for the vertical half-pel position (mc02), 8x8 block.
+ *
+ * Each of the 64 dst samples is replaced by the rounding average of its
+ * current value and the vertical half-pel prediction taken from src.
+ * dst and src are walked with the same stride; src must be readable for
+ * rows -2..10 and columns 0..7 relative to the src pointer.
+ */
+void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int row = 0; row < 8; row++) {
+        uint8_t *out = dst + row*stride;
+        for (int col = 0; col < 8; col++) {
+            uint8_t pred = hpel_v(src, row, col, stride);
+            out[col] = avg2(out[col], pred);
+        }
+    }
+}
+
+/*
+ * avg_ reference for the centre half-pel position (mc22), 8x8 block.
+ *
+ * Two passes with the 6-tap kernel (1, -5, 20, 20, -5, 1):
+ *   1. horizontal, unrounded and unclipped, over src rows -2..10,
+ *      kept as int16 intermediates (the sum spans -2550..10710);
+ *   2. vertical over those intermediates, plus 512, shifted right by 10
+ *      and clipped to 0..255.
+ * The clipped value is then rounding-averaged into dst.
+ *
+ * All src reads finish before the first dst write.  src must be readable
+ * for rows -2..10 and columns -2..10 relative to the src pointer; dst and
+ * src are walked with the same stride.  As in the single-axis helpers,
+ * a negative sum relies on >> being an arithmetic shift.
+ */
+void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    enum { TAPS = 6, BLK = 8, ROWS = BLK + TAPS - 1 };   /* ROWS == 13 */
+    static const int taps[TAPS] = { 1, -5, 20, 20, -5, 1 };
+    int16_t inter[ROWS][BLK];
+
+    /* Pass 1: horizontal filter; inter[i] corresponds to src row i-2. */
+    for (int i = 0; i < ROWS; i++) {
+        const uint8_t *line = src + (i - 2) * stride;
+        for (int x = 0; x < BLK; x++) {
+            int acc = 0;
+            for (int k = 0; k < TAPS; k++)
+                acc += taps[k] * (int) line[x + k - 2];
+            inter[i][x] = (int16_t) acc;
+        }
+    }
+
+    /* Pass 2: vertical filter, round, clip, then average into dst. */
+    for (int y = 0; y < BLK; y++) {
+        uint8_t *out = dst + y*stride;
+        for (int x = 0; x < BLK; x++) {
+            int acc = 512;                       /* rounding term for >> 10 */
+            for (int k = 0; k < TAPS; k++)
+                acc += taps[k] * inter[y + k][x];
+            uint8_t pred = (uint8_t) clip_u8(acc >> 10);
+            out[x] = avg2(out[x], pred);
+        }
+    }
+}