diff --git a/CMakeLists.txt b/CMakeLists.txt index 5efabe8..6c53e92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -528,6 +528,7 @@ add_executable(test_api_h264 tests/h264_qpel8_quarter_axis_ref.c tests/h264_qpel8_diag_ref.c tests/h264_qpel8_avg_anchors_ref.c + tests/h264_qpel8_avg_rest_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_compile_options(test_api_h264 PRIVATE -O2) diff --git a/include/daedalus.h b/include/daedalus.h index b7bef71..a224628 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -529,6 +529,18 @@ int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate s DECLARE_QPEL_AVG(avg_mc20) DECLARE_QPEL_AVG(avg_mc02) DECLARE_QPEL_AVG(avg_mc22) +DECLARE_QPEL_AVG(avg_mc10) +DECLARE_QPEL_AVG(avg_mc30) +DECLARE_QPEL_AVG(avg_mc01) +DECLARE_QPEL_AVG(avg_mc03) +DECLARE_QPEL_AVG(avg_mc11) +DECLARE_QPEL_AVG(avg_mc12) +DECLARE_QPEL_AVG(avg_mc13) +DECLARE_QPEL_AVG(avg_mc21) +DECLARE_QPEL_AVG(avg_mc23) +DECLARE_QPEL_AVG(avg_mc31) +DECLARE_QPEL_AVG(avg_mc32) +DECLARE_QPEL_AVG(avg_mc33) #undef DECLARE_QPEL_AVG @@ -569,6 +581,18 @@ typedef enum { DAEDALUS_KERNEL_H264_QPEL_AVG_MC20 = 31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02 = 32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22 = 33, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC10 = 34, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC30 = 35, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC01 = 36, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC03 = 37, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC11 = 38, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC12 = 39, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC13 = 40, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC21 = 41, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC23 = 42, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC31 = 43, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC32 = 44, + DAEDALUS_KERNEL_H264_QPEL_AVG_MC33 = 45, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 1ee54bb..fe16614 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -155,6 
+155,18 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */ case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU; case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 avg */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonals avg */ + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU; } return DAEDALUS_SUBSTRATE_CPU; } @@ -218,6 +230,18 @@ extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdif extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, 
const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); /* -------------------- CPU dispatch implementations -------------- */ @@ -504,6 +528,18 @@ DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon) DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon) DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon) DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc10, ff_avg_h264_qpel8_mc10_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc30, ff_avg_h264_qpel8_mc30_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc01, ff_avg_h264_qpel8_mc01_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc03, ff_avg_h264_qpel8_mc03_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc11, ff_avg_h264_qpel8_mc11_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc12, ff_avg_h264_qpel8_mc12_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc13, ff_avg_h264_qpel8_mc13_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc21, ff_avg_h264_qpel8_mc21_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc23, ff_avg_h264_qpel8_mc23_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc31, ff_avg_h264_qpel8_mc31_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc32, ff_avg_h264_qpel8_mc32_neon) +DEFINE_QPEL_CPU_DISPATCH(avg_mc33, ff_avg_h264_qpel8_mc33_neon) #undef DEFINE_QPEL_CPU_DISPATCH @@ -1536,6 +1572,18 @@ DEFINE_QPEL_DISPATCH(mc33, 
DAEDALUS_KERNEL_H264_QPEL_MC33) DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20) DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02) DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22) +DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10) +DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30) +DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01) +DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03) +DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11) +DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12) +DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13) +DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21) +DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23) +DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31) +DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32) +DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33) #undef DEFINE_QPEL_DISPATCH @@ -1698,5 +1746,17 @@ DEFINE_QPEL_RECIPE(mc33) DEFINE_QPEL_RECIPE(avg_mc20) DEFINE_QPEL_RECIPE(avg_mc02) DEFINE_QPEL_RECIPE(avg_mc22) +DEFINE_QPEL_RECIPE(avg_mc10) +DEFINE_QPEL_RECIPE(avg_mc30) +DEFINE_QPEL_RECIPE(avg_mc01) +DEFINE_QPEL_RECIPE(avg_mc03) +DEFINE_QPEL_RECIPE(avg_mc11) +DEFINE_QPEL_RECIPE(avg_mc12) +DEFINE_QPEL_RECIPE(avg_mc13) +DEFINE_QPEL_RECIPE(avg_mc21) +DEFINE_QPEL_RECIPE(avg_mc23) +DEFINE_QPEL_RECIPE(avg_mc31) +DEFINE_QPEL_RECIPE(avg_mc32) +DEFINE_QPEL_RECIPE(avg_mc33) #undef DEFINE_QPEL_RECIPE diff --git a/tests/h264_qpel8_avg_rest_ref.c b/tests/h264_qpel8_avg_rest_ref.c new file mode 100644 index 0000000..390e684 --- /dev/null +++ b/tests/h264_qpel8_avg_rest_ref.c @@ -0,0 +1,97 @@ +/* + * Standalone bit-exact C references for the 12 remaining avg_ + * biprediction qpel positions (B-slice list0 + list1 averaging): + * 4 quarter-axis: avg_mc{10,30,01,03} + * 8 diagonals : 
avg_mc{11,12,13,21,23,31,32,33}
+ *
+ * Each is the put_ formula (per H.264 §8.4.2.2.1 / Table 8-4) with
+ * a final L2 average against the existing dst contents per §8.4.2.3.1.
+ * Caller pre-loads dst with the list0 prediction; the avg_ call
+ * folds in list1.
+ *
+ * Mirror FFmpeg's `ff_avg_h264_qpel8_mc{XY}_neon` (in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
+ * — same `\type=avg` expansion as the put_ functions).
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } /* saturate v to 0..255 */
+static inline uint8_t avg2(uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); } /* mean of a and b, halves rounded up */
+/* 6-tap half-pel filters, taps (1,-5,20,20,-5,1); s is indexed as s[row*stride + col]. */
+static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride) /* taps along row r, columns c-2..c+3 */
+{
+    int v = (int) s[r*stride + c-2] - 5 * (int) s[r*stride + c-1]
+            + 20 * (int) s[r*stride + c] + 20 * (int) s[r*stride + c+1]
+            - 5 * (int) s[r*stride + c+2] + (int) s[r*stride + c+3]
+            + 16; /* rounding term for the >> 5 below */
+    return (uint8_t) clip_u8(v >> 5);
+}
+static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride) /* taps along column c, rows r-2..r+3 */
+{
+    int v = (int) s[(r-2)*stride + c] - 5 * (int) s[(r-1)*stride + c]
+            + 20 * (int) s[r*stride + c] + 20 * (int) s[(r+1)*stride + c]
+            - 5 * (int) s[(r+2)*stride + c] + (int) s[(r+3)*stride + c]
+            + 16; /* rounding term for the >> 5 below */
+    return (uint8_t) clip_u8(v >> 5);
+}
+static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride) /* row taps on rows r-2..r+3, then column taps over those sums */
+{
+    int t[6];
+    for (int i = 0; i < 6; i++) {
+        int rr = r - 2 + i;
+        t[i] = (int) s[rr*stride + c-2] - 5 * (int) s[rr*stride + c-1]
+               + 20 * (int) s[rr*stride + c] + 20 * (int) s[rr*stride + c+1]
+               - 5 * (int) s[rr*stride + c+2] + (int) s[rr*stride + c+3]; /* intermediate kept unrounded and unclipped */
+    }
+    int v = t[0] - 5*t[1] + 20*t[2] + 20*t[3] - 5*t[4] + t[5] + 512; /* single rounding for the combined >> 10 */
+    return (uint8_t) clip_u8(v >> 10);
+}
+
+/* Quarter-axis variants: half-pel + L2 with integer source, then
+ * L2 again with dst.
*/
+#define DEFINE_AVG_QUARTER(NAME, A_EXPR, INT_EXPR) \
+void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst, \
+        const uint8_t *src, ptrdiff_t stride) \
+{ \
+    for (int r = 0; r < 8; r++) \
+        for (int c = 0; c < 8; c++) { \
+            uint8_t a = (A_EXPR); /* half-pel sample for (r, c) */ \
+            uint8_t p = (uint8_t)((a + (INT_EXPR) + 1) >> 1); /* quarter-pel value: rounded mean with the integer sample */ \
+            dst[r*stride + c] = avg2(dst[r*stride + c], p); /* blend with what dst already holds */ \
+        } \
+}
+
+DEFINE_AVG_QUARTER(mc10, hpel_h(src, r, c, stride), src[r*stride + c ]) /* hpel_h paired with src(r, c) */
+DEFINE_AVG_QUARTER(mc30, hpel_h(src, r, c, stride), src[r*stride + c + 1]) /* hpel_h paired with src(r, c+1) */
+DEFINE_AVG_QUARTER(mc01, hpel_v(src, r, c, stride), src[(r )*stride + c]) /* hpel_v paired with src(r, c) */
+DEFINE_AVG_QUARTER(mc03, hpel_v(src, r, c, stride), src[(r + 1)*stride + c]) /* hpel_v paired with src(r+1, c) */
+
+#undef DEFINE_AVG_QUARTER
+
+/* Diagonal variants: avg of two half-pels, then L2 with dst. A_EXPR and B_EXPR are evaluated per pixel with src, r, c and stride in scope. */
+#define DEFINE_AVG_DIAG(NAME, A_EXPR, B_EXPR) \
+void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst, \
+        const uint8_t *src, ptrdiff_t stride) \
+{ \
+    for (int r = 0; r < 8; r++) \
+        for (int c = 0; c < 8; c++) { \
+            uint8_t a = (A_EXPR); \
+            uint8_t b = (B_EXPR); \
+            uint8_t p = avg2(a, b); /* diagonal quarter-pel value */ \
+            dst[r*stride + c] = avg2(dst[r*stride + c], p); /* blend with what dst already holds */ \
+        } \
+}
+
+DEFINE_AVG_DIAG(mc11, hpel_h(src, r, c, stride), hpel_v(src, r, c, stride))
+DEFINE_AVG_DIAG(mc12, hpel_hv(src, r, c, stride), hpel_v(src, r, c, stride))
+DEFINE_AVG_DIAG(mc13, hpel_h(src, r+1, c, stride), hpel_v(src, r, c, stride)) /* r+1: hpel_h taken from the next row */
+DEFINE_AVG_DIAG(mc21, hpel_hv(src, r, c, stride), hpel_h(src, r, c, stride))
+DEFINE_AVG_DIAG(mc23, hpel_hv(src, r, c, stride), hpel_h(src, r+1, c, stride)) /* r+1: hpel_h taken from the next row */
+DEFINE_AVG_DIAG(mc31, hpel_h(src, r, c, stride), hpel_v(src, r, c+1, stride)) /* c+1: hpel_v taken from the next column */
+DEFINE_AVG_DIAG(mc32, hpel_hv(src, r, c, stride), hpel_v(src, r, c+1, stride)) /* c+1: hpel_v taken from the next column */
+DEFINE_AVG_DIAG(mc33, hpel_h(src, r+1, c, stride), hpel_v(src, r, c+1, stride)) /* next row for hpel_h, next column for hpel_v */
+
+#undef DEFINE_AVG_DIAG
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index 4f303b6..edcadaa 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -55,6 +55,18 @@ extern void
daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, p
 extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+extern void daedalus_avg_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
@@ -642,6 +654,23 @@ static int test_qpel_avg_anchors(void)
     return fail;
 }
+static int test_qpel_avg_rest(void)
+{
+    int fail = 0; /* OR-accumulates the result of every run_avg_qpel() call below */
+    /* Ref fns are named daedalus_avg_h264_qpel8_mcXX_ref (no
+     * second "avg_"); dispatch fns are named ..._avg_mcXX. Macro
+     * builds both from the bare mcXX name. */
+#define RUN(MC) fail |= run_avg_qpel("avg_" #MC, \
+        daedalus_avg_h264_qpel8_ ## MC ## _ref, \
+        daedalus_recipe_dispatch_h264_qpel_avg_ ## MC)
+    RUN(mc10); RUN(mc30); RUN(mc01); RUN(mc03); /* quarter-axis positions */
+    RUN(mc11); RUN(mc12); RUN(mc13); /* diagonal positions */
+    RUN(mc21); RUN(mc23);
+    RUN(mc31); RUN(mc32); RUN(mc33);
+#undef RUN
+    return fail;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -677,5 +706,6 @@ int main(void)
     fail |= test_qpel_quarter_axis_all();
     fail |= test_qpel_diag_all();
     fail |= test_qpel_avg_anchors();
+    fail |= test_qpel_avg_rest();
     return fail;
 }