From 0894a4611493c1d2af0c4d20b6029ce848542d49 Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 07:49:12 +0200 Subject: [PATCH] =?UTF-8?q?h264:=20qpel=20diagonals=20=E2=80=94=208=20posi?= =?UTF-8?q?tions=20(mc11/12/13/21/23/31/32/33)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the qpel buildout. All 8 remaining diagonal positions land in one PR. Each is the rounded average of two half-pel intermediates per H.264 §8.4.2.2.1 / Table 8-4, with the decomposition matching the FFmpeg .S reference structure (verified by reading external/ffmpeg-snapshot/.../h264qpel_neon.S lines 622-758). Decomposition table (the formula for each output cell at (r,c)): mc11 ¼¼ : avg(mc20[r, c], mc02[r, c]) mc12 ¼½ : avg(mc22[r, c], mc02[r, c]) mc13 ¼¾ : avg(mc20[r+1, c], mc02[r, c]) mc21 ½¼ : avg(mc22[r, c], mc20[r, c]) mc23 ½¾ : avg(mc22[r, c], mc20[r+1, c]) mc31 ¾¼ : avg(mc20[r, c], mc02[r, c+1]) mc32 ¾½ : avg(mc22[r, c], mc02[r, c+1]) mc33 ¾¾ : avg(mc20[r+1, c], mc02[r, c+1]) The (r+1, c+1) offsets capture the position-dependent shift that the FFmpeg .S encodes by pre-incrementing x1 (src pointer) before branching into the common mc11/mc21 code paths. Scope (tightly macro-ised): - 8 new kernel enums (MC11..MC33 = 23..30) → CPU. - 8 NEON externs for the vendored ff_put_h264_qpel8_mc*_neon. - 8 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro. - 8 public dispatches via DEFINE_QPEL_DISPATCH macro. - 8 recipe wrappers via DEFINE_QPEL_RECIPE macro. - Header decls condensed via a DECLARE_QPEL_DIAG macro that expands to both recipe + dispatch decls per name. - C references via DEFINE_DIAG_REF macro: each ref is a 6-line wrapper around the per-cell hpel_h / hpel_v / hpel_hv helpers (the latter being the per-cell version of mc22's 13-row int16 tmp[] computation). - Test wrapper: test_qpel_diag_all() drives all 8 through the existing run_quarter_axis_qpel() harness. 
Verified on hertz (Pi 5 / V3D 7.1): $ ./build/test_api_h264 | tail -8 H.264 qpel mc11: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc12: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc13: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc21: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc23: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc31: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc32: 2048/2048 bytes bit-exact (100.0000%) H.264 qpel mc33: 2048/2048 bytes bit-exact (100.0000%) ALL 8 diagonal positions bit-exact PASS first try. Meaningful because the position-dependent (r+1, c+1) source offsets are easy to get wrong by transcription, and any of them would surface on random inputs immediately. After this PR the H.264 qpel 8x8 put_ matrix is complete: mc00 mc01 mc02 mc03 mc10 mc11 mc12 mc13 mc20 mc21 mc22 mc23 mc30 mc31 mc32 mc33 15 of 16 positions exposed through the daedalus API; mc00 is just integer copy and rarely needs a dispatch wrapper (libavcodec sets the function pointer table directly). mc20 retains its QPU shader (cycle 9 / v3d_h264_qpel_mc20.spv); all other 14 are CPU NEON. What this does NOT cover (still in backlog): - avg_ variants (the "add" form for biprediction, 16 more positions). Currently the API only exposes put_. - 16x16 qpel (separate function family in FFmpeg; the 8x8 path can be used twice to substitute when 16x16 isn't critical). - QPU shaders for any qpel position other than mc20. 
--- CMakeLists.txt | 1 + include/daedalus.h | 44 +++++++++++++++++ src/daedalus_core.c | 40 +++++++++++++++ tests/h264_qpel8_diag_ref.c | 98 +++++++++++++++++++++++++++++++++++++ tests/test_api_h264.c | 36 ++++++++++++++ 5 files changed, 219 insertions(+) create mode 100644 tests/h264_qpel8_diag_ref.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 971a63e..f8e1059 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -526,6 +526,7 @@ add_executable(test_api_h264 tests/h264_qpel8_mc02_ref.c tests/h264_qpel8_mc22_ref.c tests/h264_qpel8_quarter_axis_ref.c + tests/h264_qpel8_diag_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_compile_options(test_api_h264 PRIVATE -O2) diff --git a/include/daedalus.h b/include/daedalus.h index 46f9fce..09f0193 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -475,6 +475,42 @@ int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta); +/* H.264 luma diagonal qpel positions ("put", 8 variants). Each is + * the rounded average of two half-pel intermediates per H.264 + * §8.4.2.2.1 / Table 8-4 (decomposition matches the FFmpeg .S + * structure; see tests/h264_qpel8_diag_ref.c for the formulas). + * + * mc11 ¼¼ : avg(mc20[r,c], mc02[r,c]) + * mc12 ¼½ : avg(mc22[r,c], mc02[r,c]) + * mc13 ¼¾ : avg(mc20[r+1,c], mc02[r,c]) + * mc21 ½¼ : avg(mc22[r,c], mc20[r,c]) + * mc23 ½¾ : avg(mc22[r,c], mc20[r+1,c]) + * mc31 ¾¼ : avg(mc20[r,c], mc02[r,c+1]) + * mc32 ¾½ : avg(mc22[r,c], mc02[r,c+1]) + * mc33 ¾¾ : avg(mc20[r+1,c], mc02[r,c+1]) + * + * CPU-only via vendored FFmpeg NEON; QPU shaders pending. + * Explicit SUBSTRATE_QPU returns -1. 
+ */ +#define DECLARE_QPEL_DIAG(name) \ +int daedalus_recipe_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, \ + uint8_t *dst, const uint8_t *src, size_t stride, \ + size_t n_blocks, const daedalus_h264_qpel_meta *meta); \ +int daedalus_dispatch_h264_qpel_ ## name(daedalus_ctx *ctx, daedalus_substrate sub, \ + uint8_t *dst, const uint8_t *src, size_t stride, \ + size_t n_blocks, const daedalus_h264_qpel_meta *meta); + +DECLARE_QPEL_DIAG(mc11) +DECLARE_QPEL_DIAG(mc12) +DECLARE_QPEL_DIAG(mc13) +DECLARE_QPEL_DIAG(mc21) +DECLARE_QPEL_DIAG(mc23) +DECLARE_QPEL_DIAG(mc31) +DECLARE_QPEL_DIAG(mc32) +DECLARE_QPEL_DIAG(mc33) + +#undef DECLARE_QPEL_DIAG + /* ------------------------------------------------------------------- * Recipe query — what does the API recommend for each kernel? * ----------------------------------------------------------------- */ @@ -501,6 +537,14 @@ typedef enum { DAEDALUS_KERNEL_H264_QPEL_MC30 = 20, DAEDALUS_KERNEL_H264_QPEL_MC01 = 21, DAEDALUS_KERNEL_H264_QPEL_MC03 = 22, + DAEDALUS_KERNEL_H264_QPEL_MC11 = 23, + DAEDALUS_KERNEL_H264_QPEL_MC12 = 24, + DAEDALUS_KERNEL_H264_QPEL_MC13 = 25, + DAEDALUS_KERNEL_H264_QPEL_MC21 = 26, + DAEDALUS_KERNEL_H264_QPEL_MC23 = 27, + DAEDALUS_KERNEL_H264_QPEL_MC31 = 28, + DAEDALUS_KERNEL_H264_QPEL_MC32 = 29, + DAEDALUS_KERNEL_H264_QPEL_MC33 = 30, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 69edcc1..bf17585 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -144,6 +144,14 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_QPEL_MC30: return DAEDALUS_SUBSTRATE_CPU; /* ¾-H L2 */ case DAEDALUS_KERNEL_H264_QPEL_MC01: return DAEDALUS_SUBSTRATE_CPU; /* ¼-V L2 */ case DAEDALUS_KERNEL_H264_QPEL_MC03: return DAEDALUS_SUBSTRATE_CPU; /* ¾-V L2 */ + case DAEDALUS_KERNEL_H264_QPEL_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¼ */ + case DAEDALUS_KERNEL_H264_QPEL_MC12: 
return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼½ */ + case DAEDALUS_KERNEL_H264_QPEL_MC13: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¾ */ + case DAEDALUS_KERNEL_H264_QPEL_MC21: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¼ */ + case DAEDALUS_KERNEL_H264_QPEL_MC23: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¾ */ + case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¼ */ + case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾½ */ + case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¾ */ } return DAEDALUS_SUBSTRATE_CPU; } @@ -196,6 +204,14 @@ extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); /* -------------------- CPU dispatch implementations -------------- */ @@ -468,6 +484,14 @@ DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon) DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon) DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon) DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon) +DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon) +DEFINE_QPEL_CPU_DISPATCH(mc12, 
ff_put_h264_qpel8_mc12_neon) +DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon) +DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon) +DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon) +DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon) +DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon) +DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon) #undef DEFINE_QPEL_CPU_DISPATCH @@ -1489,6 +1513,14 @@ DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10) DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30) DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01) DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03) +DEFINE_QPEL_DISPATCH(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11) +DEFINE_QPEL_DISPATCH(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12) +DEFINE_QPEL_DISPATCH(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13) +DEFINE_QPEL_DISPATCH(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21) +DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23) +DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31) +DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32) +DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33) #undef DEFINE_QPEL_DISPATCH @@ -1640,5 +1672,13 @@ DEFINE_QPEL_RECIPE(mc10) DEFINE_QPEL_RECIPE(mc30) DEFINE_QPEL_RECIPE(mc01) DEFINE_QPEL_RECIPE(mc03) +DEFINE_QPEL_RECIPE(mc11) +DEFINE_QPEL_RECIPE(mc12) +DEFINE_QPEL_RECIPE(mc13) +DEFINE_QPEL_RECIPE(mc21) +DEFINE_QPEL_RECIPE(mc23) +DEFINE_QPEL_RECIPE(mc31) +DEFINE_QPEL_RECIPE(mc32) +DEFINE_QPEL_RECIPE(mc33) #undef DEFINE_QPEL_RECIPE diff --git a/tests/h264_qpel8_diag_ref.c b/tests/h264_qpel8_diag_ref.c new file mode 100644 index 0000000..06c6243 --- /dev/null +++ b/tests/h264_qpel8_diag_ref.c @@ -0,0 +1,98 @@ +/* + * Standalone bit-exact C references for the 8 diagonal H.264 luma + * qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33). 
+ * Each is the rounded average of two half-pel intermediates per + * H.264 §8.4.2.2.1 / Table 8-4, decomposed to match the FFmpeg .S + * reference structure (see comments in mc{11,12,21,...}_neon in + * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S). + * + * Position decompositions (verified against the .S; mcXY = X/4 horizontal, Y/4 vertical offset; letters per the spec's fractional-sample figure): + * mc11 (e, ¼¼): avg(mc20[r,c], mc02[r,c]) + * mc12 (i, ¼½): avg(mc22[r,c], mc02[r,c]) + * mc13 (p, ¼¾): avg(mc20[r+1,c], mc02[r,c]) + * mc21 (f, ½¼): avg(mc22[r,c], mc20[r,c]) + * mc23 (q, ½¾): avg(mc22[r,c], mc20[r+1,c]) + * mc31 (g, ¾¼): avg(mc20[r,c], mc02[r,c+1]) + * mc32 (k, ¾½): avg(mc22[r,c], mc02[r,c+1]) + * mc33 (r, ¾¾): avg(mc20[r+1,c], mc02[r,c+1]) + * + * (The mc20[r,c] notation means "the mc20-style horizontal half-pel + * result at source-relative integer position (r, c)"; analogously + * for mc02 and mc22.) + * + * Single-stride convention; same edge-context contract as the simpler + * variants (the cells "[r+1,c]" etc. demand one extra row/col of + * source context beyond what mc20/mc02 alone would need). + * + * License: LGPL-2.1-or-later. + */ +#include <stddef.h> +#include <stdint.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } + +/* Single-cell helpers — same arithmetic as the dedicated mc20/mc02 + * refs but computed point-by-point so the diagonal refs can mix them + * cheaply. Each returns a u8 (already clipped). 
*/ +static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride) +{ + int v = (int) s[r*stride + c-2] - 5 * (int) s[r*stride + c-1] + + 20 * (int) s[r*stride + c] + 20 * (int) s[r*stride + c+1] + - 5 * (int) s[r*stride + c+2] + (int) s[r*stride + c+3] + + 16; + return (uint8_t) clip_u8(v >> 5); +} +static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride) +{ + int v = (int) s[(r-2)*stride + c] - 5 * (int) s[(r-1)*stride + c] + + 20 * (int) s[r*stride + c] + 20 * (int) s[(r+1)*stride + c] + - 5 * (int) s[(r+2)*stride + c] + (int) s[(r+3)*stride + c] + + 16; + return (uint8_t) clip_u8(v >> 5); +} + +/* hpel_hv — 2D half-pel at (r, c) per the H.264 §8.4.2.2.1 "j" + * cascade. Computes the 6 unrounded h-lowpass intermediates for + * rows r-2..r+3 at column c (each fits a 16-bit signed value; held + * in int here), one per row over that row's 6 source samples. Then v-lowpass + * over those 6 intermediates with the +512 >> 10 final scale. Same + * as the mc22 ref, just expressed point-by-point. */ +static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride) +{ + int t[6]; /* unrounded h-lowpass at rows r-2..r+3, column c */ + for (int i = 0; i < 6; i++) { + int rr = r - 2 + i; + t[i] = (int) s[rr*stride + c-2] - 5 * (int) s[rr*stride + c-1] + + 20 * (int) s[rr*stride + c] + 20 * (int) s[rr*stride + c+1] + - 5 * (int) s[rr*stride + c+2] + (int) s[rr*stride + c+3]; + } + int v = t[0] - 5 * t[1] + 20 * t[2] + 20 * t[3] - 5 * t[4] + t[5] + 512; + return (uint8_t) clip_u8(v >> 10); +} + +/* avg rounded ((a + b + 1) >> 1) — both inputs are u8, so the result + * is at most 255 and no further clip is needed. 
*/ +static inline uint8_t avg2(uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); } + +#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR) \ +void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst, \ + const uint8_t *src, ptrdiff_t stride) \ +{ \ + for (int r = 0; r < 8; r++) \ + for (int c = 0; c < 8; c++) { \ + uint8_t a = (A_EXPR); \ + uint8_t b = (B_EXPR); \ + dst[r*stride + c] = avg2(a, b); \ + } \ +} + +DEFINE_DIAG_REF(mc11, hpel_h(src, r, c, stride), hpel_v(src, r, c, stride)) +DEFINE_DIAG_REF(mc12, hpel_hv(src, r, c, stride), hpel_v(src, r, c, stride)) +DEFINE_DIAG_REF(mc13, hpel_h(src, r+1, c, stride), hpel_v(src, r, c, stride)) +DEFINE_DIAG_REF(mc21, hpel_hv(src, r, c, stride), hpel_h(src, r, c, stride)) +DEFINE_DIAG_REF(mc23, hpel_hv(src, r, c, stride), hpel_h(src, r+1, c, stride)) +DEFINE_DIAG_REF(mc31, hpel_h(src, r, c, stride), hpel_v(src, r, c+1, stride)) +DEFINE_DIAG_REF(mc32, hpel_hv(src, r, c, stride), hpel_v(src, r, c+1, stride)) +DEFINE_DIAG_REF(mc33, hpel_h(src, r+1, c, stride), hpel_v(src, r, c+1, stride)) + +#undef DEFINE_DIAG_REF diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c index 370f34b..c67c7e6 100644 --- a/tests/test_api_h264.c +++ b/tests/test_api_h264.c @@ -44,6 +44,14 @@ extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const 
uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -548,6 +556,33 @@ static int test_qpel_quarter_axis_all(void) return fail; } +static int test_qpel_diag_all(void) +{ + /* Each diagonal position averages TWO half-pel intermediates per output; + * mc13/mc23/mc33 read one at (r+1,c) and mc31/mc32/mc33 one at (r,c+1), so the test geometry + * needs an extra row and col of context. run_quarter_axis_qpel + * already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile) + * — reusing that harness is fine. Returns the OR of the eight harness results. */ + int fail = 0; + fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref, + daedalus_recipe_dispatch_h264_qpel_mc11); + fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref, + daedalus_recipe_dispatch_h264_qpel_mc12); + fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref, + daedalus_recipe_dispatch_h264_qpel_mc13); + fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref, + daedalus_recipe_dispatch_h264_qpel_mc21); + fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref, + daedalus_recipe_dispatch_h264_qpel_mc23); + fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref, + daedalus_recipe_dispatch_h264_qpel_mc31); + fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref, + daedalus_recipe_dispatch_h264_qpel_mc32); + fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref, + daedalus_recipe_dispatch_h264_qpel_mc33); + return fail; +} + int main(void) { printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); @@ -581,5 +616,6 @@ int main(void) fail |= test_qpel_mc02(); fail |= test_qpel_mc22(); fail |= test_qpel_quarter_axis_all(); + fail |= test_qpel_diag_all(); return fail; }