h264: qpel avg anchors (avg_mc20/02/22, biprediction support)

Begins the avg_ qpel buildout for B-slice biprediction. Each avg_ form computes the same half-pel formula as its put_ sibling, then L2-averages the result with the existing dst contents — the caller pre-loads dst with the list0 prediction; the avg_ call adds list1 per H.264 §8.4.2.3.1.

Scope (3 anchors, sets the pattern for the remaining 13 avg_ variants):
- 3 new kernel enums (AVG_MC20=31, AVG_MC02=32, AVG_MC22=33) → CPU.
- 3 NEON externs for the vendored ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon.
- 3 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro (the macro is type-agnostic so it didn't need changes for avg_).
- 3 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 3 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- tests/h264_qpel8_avg_anchors_ref.c — per-cell helpers + L2 avg.
- Test harness: run_avg_qpel() seeds dst with random content so the L2 averaging is actually exercised (not just put_-style overwrite that would silently pass).

Verified on hertz:

    $ ./build/test_api_h264 | tail -3
    H.264 qpel avg_mc20: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc02: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc22: 2048/2048 bytes bit-exact (100.0000%)

All 3 anchors bit-exact PASS first try.

Why anchors only in this PR: the avg_ pattern is uniform across all 16 positions (each is just "put_ result + L2 with dst"). Landing the anchors first confirms the macro pattern works for both put_ and avg_; the remaining 13 (avg_mc10/30/01/03 + avg_mc11..33) follow the same template in a follow-up PR.

State of the qpel matrix after this PR:

    put_ : 15 of 16 positions ✓ (mc00 is integer copy, no wrapper)
    avg_ :  3 of 16 positions ✓ (mc20, mc02, mc22 anchors)
           13 follow-up positions
2026-05-25 08:35:25 +02:00
parent 76e3076670
commit 1113953f97
5 changed files with 182 additions and 0 deletions
@@ -52,6 +52,9 @@ extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, p
 extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t stride);

@@ -583,6 +586,62 @@ static int test_qpel_diag_all(void)
    return fail;
 }

+/* Avg-form harness: checks one avg_ qpel dispatch against its reference.
+ *
+ * src is filled with random bytes; dst and dst_ref are seeded with the
+ * SAME second stream of random bytes (standing in for the "list0
+ * prediction" already present in dst).  The reference is then applied
+ * tile by tile to dst_ref and the dispatch once over all N tiles to dst.
+ * Because the destination starts non-zero and identical on both sides,
+ * a dispatch that merely overwrote dst (put_-style) would diverge from
+ * the avg_ reference and be reported as a mismatch.
+ *
+ * The comparison covers every byte of both buffers, so stray writes
+ * outside the targeted region are counted as well.
+ *
+ *   name     - label used in the report line and in the error message
+ *   ref      - reference kernel, called once per tile
+ *   dispatch - recipe dispatch under test, called once for all tiles
+ *
+ * Returns 0 when all bytes match; 1 on any mismatch, when the context
+ * cannot be created, or when the dispatch returns a non-zero rc.  The
+ * context is destroyed on every path after it has been created. */
+static int run_avg_qpel(const char *name,
+                         qpel_ref_fn ref, qpel_dispatch_fn dispatch)
+{
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_ROW = 3, SRC_COL = 3 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_qpel_meta meta[N];
+    int result = 1;   /* pessimistic default; cleared only on a clean compare */
+
+    /* Two random buffers: src for the qpel input, dst seeded with
+     * different random content as the "list0 prediction" — both
+     * dst and dst_ref get the SAME seed so the avg compare is fair. */
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+    for (int i = 0; i < TOTAL; i++) {
+        uint8_t v = (uint8_t)(xs() & 0xff);
+        dst[i] = dst_ref[i] = v;
+    }
+
+    /* One entry per tile; source and destination use the same offset
+     * (row SRC_ROW, column SRC_COL inside the tile). */
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+    }
+
+    for (int i = 0; i < N; i++)
+        ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
+
+    int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
+    if (rc) {
+        /* Fall through to the shared cleanup below instead of returning
+         * here, which used to leak ctx. */
+        fprintf(stderr, "%s dispatch rc=%d\n", name, rc);
+    } else {
+        int diff = 0;
+        for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+        printf("  H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
+               name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+        result = diff == 0 ? 0 : 1;
+    }
+
+    daedalus_ctx_destroy(ctx);
+    return result;
+}
+
+/* Runs the avg_ harness for the three anchor positions (mc20, mc02,
+ * mc22), always executing all three even if an earlier one fails.
+ * Returns the bitwise OR of the individual results, so the value is
+ * non-zero as soon as any anchor reports a failure. */
+static int test_qpel_avg_anchors(void)
+{
+    const int rc_mc20 = run_avg_qpel(
+        "avg_mc20",
+        daedalus_avg_h264_qpel8_mc20_ref,
+        daedalus_recipe_dispatch_h264_qpel_avg_mc20);
+    const int rc_mc02 = run_avg_qpel(
+        "avg_mc02",
+        daedalus_avg_h264_qpel8_mc02_ref,
+        daedalus_recipe_dispatch_h264_qpel_avg_mc02);
+    const int rc_mc22 = run_avg_qpel(
+        "avg_mc22",
+        daedalus_avg_h264_qpel8_mc22_ref,
+        daedalus_recipe_dispatch_h264_qpel_avg_mc22);
+
+    return rc_mc20 | rc_mc02 | rc_mc22;
+}
+
 int main(void)
 {
    printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -617,5 +676,6 @@ int main(void)
    fail |= test_qpel_mc22();
    fail |= test_qpel_quarter_axis_all();
    fail |= test_qpel_diag_all();
+    fail |= test_qpel_avg_anchors();
    return fail;
 }