From 1113953f97db281b144b0162739d8b03899b886c Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 08:35:25 +0200 Subject: [PATCH] h264: qpel avg anchors (avg_mc20/02/22, biprediction support) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Begins the avg_ qpel buildout for B-slice biprediction. Each avg_ form computes the same half-pel formula as its put_ sibling, then L2-averages the result with the existing dst contents — the caller pre-loads dst with the list0 prediction; the avg_ call adds list1 per H.264 §8.4.2.3.1. Scope (3 anchors, sets the pattern for the remaining 13 avg_ variants): - 3 new kernel enums (AVG_MC20=31, AVG_MC02=32, AVG_MC22=33) → CPU. - 3 NEON externs for the vendored ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon. - 3 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro (the macro is type-agnostic so it didn't need changes for avg_). - 3 public dispatches via DEFINE_QPEL_DISPATCH macro. - 3 recipe wrappers via DEFINE_QPEL_RECIPE macro. - tests/h264_qpel8_avg_anchors_ref.c — per-cell helpers + L2 avg. - Test harness: run_avg_qpel() seeds dst with random content so the L2 averaging is actually exercised (not just put_-style overwrite that would silently pass). Verified on hertz: $ ./build/test_api_h264 | tail -3 H.264 qpel avg_mc20: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel avg_mc02: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel avg_mc22: 2048/2048 bytes bit-exact (100.0000%) All 3 anchors bit-exact PASS first try. Why anchors only in this PR: the avg_ pattern is uniform across all 16 positions (each is just "put_ result + L2 with dst"). Landing the anchors first confirms the macro pattern works for both put_ and avg_; the remaining 13 (avg_mc10/30/01/03 + avg_mc11..33) follow the same template in a follow-up PR. 
State of the qpel matrix after this PR: put_ : 15 of 16 positions ✓ (mc00 is integer copy, no wrapper) avg_ : 3 of 16 positions ✓ (mc20, mc02, mc22 anchors) 13 follow-up positions --- CMakeLists.txt | 1 + include/daedalus.h | 24 +++++++++ src/daedalus_core.c | 18 +++++++ tests/h264_qpel8_avg_anchors_ref.c | 79 ++++++++++++++++++++++++++++++ tests/test_api_h264.c | 60 +++++++++++++++++++++++ 5 files changed, 182 insertions(+) create mode 100644 tests/h264_qpel8_avg_anchors_ref.c diff --git a/CMakeLists.txt b/CMakeLists.txt index f8e1059..5efabe8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -527,6 +527,7 @@ add_executable(test_api_h264 tests/h264_qpel8_mc22_ref.c tests/h264_qpel8_quarter_axis_ref.c tests/h264_qpel8_diag_ref.c + tests/h264_qpel8_avg_anchors_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_compile_options(test_api_h264 PRIVATE -O2) diff --git a/include/daedalus.h b/include/daedalus.h index 09f0193..b7bef71 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -511,6 +511,27 @@ DECLARE_QPEL_DIAG(mc33) #undef DECLARE_QPEL_DIAG +/* H.264 luma qpel avg_ biprediction anchors — 3 half-pel positions + * (the put_ result is L2-averaged into the existing dst buffer per + * H.264 §8.4.2.3.1). Caller is responsible for pre-loading dst with + * the list0 prediction; the avg_ call adds list1. + * + * Same single-stride convention as put_; CPU NEON only for now. 
+ */ +#define DECLARE_QPEL_AVG(name) \ +int daedalus_recipe_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, \ + uint8_t *dst, const uint8_t *src, size_t stride, \ + size_t n_blocks, const daedalus_h264_qpel_meta *meta); \ +int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate sub, \ + uint8_t *dst, const uint8_t *src, size_t stride, \ + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + +DECLARE_QPEL_AVG(avg_mc20) +DECLARE_QPEL_AVG(avg_mc02) +DECLARE_QPEL_AVG(avg_mc22) + +#undef DECLARE_QPEL_AVG + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ @@ -545,6 +566,9 @@ typedef enum { DAEDALUS_KERNEL_H264_QPEL_MC31 = 28, DAEDALUS_KERNEL_H264_QPEL_MC32 = 29, DAEDALUS_KERNEL_H264_QPEL_MC33 = 30, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC20 = 31, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC02 = 32, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC22 = 33, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index bf17585..1ee54bb 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -152,6 +152,9 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¼ */ case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾½ */ case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¾ */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU; } return DAEDALUS_SUBSTRATE_CPU; } @@ -212,6 +215,9 @@ extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdif extern void 
ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); /* -------------------- CPU dispatch implementations -------------- */ @@ -493,6 +499,12 @@ DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon) DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon) DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon) +/* avg_ biprediction variants — same dispatch shape as put_, just + * different NEON entry that L2-averages with the existing dst. */ +DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon) + #undef DEFINE_QPEL_CPU_DISPATCH /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */ @@ -1521,6 +1533,9 @@ DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23) DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31) DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32) DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33) +DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20) +DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02) +DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22) #undef DEFINE_QPEL_DISPATCH @@ -1680,5 +1695,8 @@ DEFINE_QPEL_RECIPE(mc23) DEFINE_QPEL_RECIPE(mc31) DEFINE_QPEL_RECIPE(mc32) DEFINE_QPEL_RECIPE(mc33) +DEFINE_QPEL_RECIPE(avg_mc20) +DEFINE_QPEL_RECIPE(avg_mc02) +DEFINE_QPEL_RECIPE(avg_mc22) #undef DEFINE_QPEL_RECIPE diff --git 
a/tests/h264_qpel8_avg_anchors_ref.c b/tests/h264_qpel8_avg_anchors_ref.c new file mode 100644 index 0000000..8298346 --- /dev/null +++ b/tests/h264_qpel8_avg_anchors_ref.c @@ -0,0 +1,79 @@ +/* + * Standalone bit-exact C references for the avg_ qpel anchors — + * the biprediction "average against existing dst" form of mc20, + * mc02, mc22. Used in B-slices where two qpel-interpolated samples + * (one from list0, one from list1) are averaged per H.264 §8.4.2.3.1. + * + * Each kernel computes the same half-pel formula as the put_ form, + * then averages with dst[r,c] via L2 ((dst + put_val + 1) >> 1). + * The dst buffer carries the list0 prediction on entry; the avg_ + * call adds the list1 contribution. + * + * Mirrors FFmpeg's `ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon` in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S + * (same `\type=avg` expansion as the put_ functions). + * + * License: LGPL-2.1-or-later. + */ +#include <stddef.h> +#include <stdint.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } +static inline uint8_t avg2(uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); } + +/* Same per-cell helpers as the diag/quarter-axis refs. Duplicated + * here (rather than extern'd) so this TU compiles standalone. 
*/ +static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride) +{ /* Horizontal half-pel sample at (r, c): 6-tap weights 1, -5, 20, 20, -5, 1 over columns c-2 through c+3 of row r, plus 16, shifted right by 5 and clipped to 0..255. */ + int v = (int) s[r*stride + c-2] - 5 * (int) s[r*stride + c-1] + + 20 * (int) s[r*stride + c] + 20 * (int) s[r*stride + c+1] + - 5 * (int) s[r*stride + c+2] + (int) s[r*stride + c+3] + + 16; + return (uint8_t) clip_u8(v >> 5); +} +static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride) +{ /* Vertical half-pel sample at (r, c): same 6-tap weights over rows r-2 through r+3 of column c, plus 16, shifted right by 5 and clipped to 0..255. */ + int v = (int) s[(r-2)*stride + c] - 5 * (int) s[(r-1)*stride + c] + + 20 * (int) s[r*stride + c] + 20 * (int) s[(r+1)*stride + c] + - 5 * (int) s[(r+2)*stride + c] + (int) s[(r+3)*stride + c] + + 16; + return (uint8_t) clip_u8(v >> 5); +} + +void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +{ /* For each cell of the 8x8 block, replaces dst with avg2(dst, hpel_h(src)); dst and src are indexed with the same stride. */ + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) + dst[r*stride + c] = avg2(dst[r*stride + c], hpel_h(src, r, c, stride)); +} + +void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +{ /* For each cell of the 8x8 block, replaces dst with avg2(dst, hpel_v(src)); dst and src are indexed with the same stride. */ + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) + dst[r*stride + c] = avg2(dst[r*stride + c], hpel_v(src, r, c, stride)); +} + +void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) +{ + /* Per-cell mc22: same 13-row int16 tmp[] computation as the + * put_ reference, then L2 with dst. The horizontal pass stores unrounded 6-tap sums for source rows -2 through 10; the vertical pass adds 512 and shifts right by 10 before clipping. 
*/ + int16_t tmp[13][8]; + for (int rr = 0; rr < 13; rr++) { + int src_row = rr - 2; /* tmp row rr holds source row rr-2 */ + const uint8_t *s = src + src_row * stride; + for (int c = 0; c < 8; c++) { + int v = (int) s[c-2] - 5 * (int) s[c-1] + + 20 * (int) s[c] + 20 * (int) s[c+1] + - 5 * (int) s[c+2] + (int) s[c+3]; + tmp[rr][c] = (int16_t) v; + } + } + for (int r = 0; r < 8; r++) + for (int c = 0; c < 8; c++) { + int v = tmp[r+0][c] - 5*tmp[r+1][c] + 20*tmp[r+2][c] + + 20*tmp[r+3][c] - 5*tmp[r+4][c] + tmp[r+5][c] + 512; + uint8_t p = (uint8_t) clip_u8(v >> 10); + dst[r*stride + c] = avg2(dst[r*stride + c], p); + } +} diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c index c67c7e6..4f303b6 100644 --- a/tests/test_api_h264.c +++ b/tests/test_api_h264.c @@ -52,6 +52,9 @@ extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, p extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -583,6 +586,62 @@ static int test_qpel_diag_all(void) return fail; } +/* Avg-form harness: pre-loads dst + dst_ref with the same random + * content so we can verify the L2 averaging is happening (not just + * put_-style overwrite). If the dispatch incorrectly overwrote + * dst, the bit-exact compare would still catch the mismatch against + * the avg_ reference. 
*/ +static int run_avg_qpel(const char *name, + qpel_ref_fn ref, qpel_dispatch_fn dispatch) +{ /* Returns 0 when every byte of dst matches dst_ref; 1 on ctx creation failure, dispatch failure, or any mismatch. */ + enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, + TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, + SRC_ROW = 3, SRC_COL = 3 }; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; + daedalus_h264_qpel_meta meta[N]; + + /* Two random buffers: src for the qpel input, dst seeded with + * different random content as the "list0 prediction" — both + * dst and dst_ref get the SAME seed so the avg compare is fair. */ + for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); + for (int i = 0; i < TOTAL; i++) { + uint8_t v = (uint8_t)(xs() & 0xff); + dst[i] = dst_ref[i] = v; + } + + for (int i = 0; i < N; i++) { + meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); + meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); + } + + for (int i = 0; i < N; i++) + ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); + + int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta); + if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); daedalus_ctx_destroy(ctx); /* release ctx on the failure path too */ return 1; } + int diff = 0; + for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n", + name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 
0 : 1; +} + +static int test_qpel_avg_anchors(void) +{ + int fail = 0; + fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref, + daedalus_recipe_dispatch_h264_qpel_avg_mc20); + fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref, + daedalus_recipe_dispatch_h264_qpel_avg_mc02); + fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref, + daedalus_recipe_dispatch_h264_qpel_avg_mc22); + return fail; +} + int main(void) { printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); @@ -617,5 +676,6 @@ int main(void) fail |= test_qpel_mc22(); fail |= test_qpel_quarter_axis_all(); fail |= test_qpel_diag_all(); + fail |= test_qpel_avg_anchors(); return fail; } -- 2.47.3