diff --git a/CMakeLists.txt b/CMakeLists.txt
index 38ced99..971a63e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -525,6 +525,7 @@ add_executable(test_api_h264
     tests/h264_qpel8_mc20_ref.c
     tests/h264_qpel8_mc02_ref.c
     tests/h264_qpel8_mc22_ref.c
+    tests/h264_qpel8_quarter_axis_ref.c
 )
 target_link_libraries(test_api_h264 PRIVATE daedalus_core)
 target_compile_options(test_api_h264 PRIVATE -O2)
diff --git a/include/daedalus.h b/include/daedalus.h
index d827d3e..46f9fce 100644
--- a/include/daedalus.h
+++ b/include/daedalus.h
@@ -436,6 +436,45 @@ int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
     uint8_t *dst, const uint8_t *src, size_t stride,
     size_t n_blocks, const daedalus_h264_qpel_meta *meta);
 
+/* H.264 luma single-axis quarter-pel qpel positions ("put"):
+ *   mc10  ¼-H  ("a" position):  clip255(mc20(s)) avg src[r,c]
+ *   mc30  ¾-H  ("c" position):  clip255(mc20(s)) avg src[r,c+1]
+ *   mc01  ¼-V  ("d" position):  clip255(mc02(s)) avg src[r,c]
+ *   mc03  ¾-V  ("n" position):  clip255(mc02(s)) avg src[r+1,c]
+ *
+ * Each is a half-pel lowpass clipped to u8, then rounded-averaged
+ * ((a + b + 1) >> 1) with an integer-aligned source pixel.  Same edge
+ * context contract as mc20/mc02.  CPU-only for now (no QPU shaders):
+ * explicit SUBSTRATE_QPU returns -1 if the ctx has a QPU, else uses CPU.
+ */
+int daedalus_recipe_dispatch_h264_qpel_mc10(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+int daedalus_dispatch_h264_qpel_mc10(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_recipe_dispatch_h264_qpel_mc30(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+int daedalus_dispatch_h264_qpel_mc30(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_recipe_dispatch_h264_qpel_mc01(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+int daedalus_dispatch_h264_qpel_mc01(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+int daedalus_recipe_dispatch_h264_qpel_mc03(daedalus_ctx *ctx,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub,
+    uint8_t *dst, const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
 /* -------------------------------------------------------------------
  * Recipe query — what does the API recommend for each kernel?
  * ----------------------------------------------------------------- */
@@ -458,6 +497,10 @@ typedef enum {
     DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
     DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
     DAEDALUS_KERNEL_H264_QPEL_MC22 = 18,
+    DAEDALUS_KERNEL_H264_QPEL_MC10 = 19,
+    DAEDALUS_KERNEL_H264_QPEL_MC30 = 20,
+    DAEDALUS_KERNEL_H264_QPEL_MC01 = 21,
+    DAEDALUS_KERNEL_H264_QPEL_MC03 = 22,
 } daedalus_kernel;
 
 daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
diff --git a/src/daedalus_core.c b/src/daedalus_core.c
index 45ecff5..69edcc1 100644
--- a/src/daedalus_core.c
+++ b/src/daedalus_core.c
@@ -140,6 +140,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
     case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
     case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
     case DAEDALUS_KERNEL_H264_QPEL_MC22: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc22 shader pending (hv lowpass) */
+    case DAEDALUS_KERNEL_H264_QPEL_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC30: return DAEDALUS_SUBSTRATE_CPU; /* ¾-H L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC01: return DAEDALUS_SUBSTRATE_CPU; /* ¼-V L2 */
+    case DAEDALUS_KERNEL_H264_QPEL_MC03: return DAEDALUS_SUBSTRATE_CPU; /* ¾-V L2 */
     }
     return DAEDALUS_SUBSTRATE_CPU;
 }
@@ -184,6 +188,14 @@ extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
     ptrdiff_t stride);
 extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
     ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
 
 /* -------------------- CPU dispatch implementations -------------- */
 
@@ -437,6 +449,31 @@ static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
     return 0;
 }
 
+/* The four single-axis quarter-pel CPU dispatches are uniform; the
+ * macro collapses ~50 LOC of repetition.  Each generated function
+ * calls neon_fn once per block, at dst + meta[i].dst_off and
+ * src + meta[i].src_off, passing stride as ptrdiff_t, and returns 0.
+ * ctx is unused on the CPU path. */
+#define DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn)                          \
+static int dispatch_h264_qpel_ ## suffix ## _cpu(daedalus_ctx *ctx,        \
+    uint8_t *dst, const uint8_t *src, size_t stride,                       \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
+{                                                                          \
+    (void) ctx;                                                            \
+    for (size_t i = 0; i < n_blocks; i++) {                                \
+        neon_fn(dst + meta[i].dst_off, src + meta[i].src_off,              \
+                (ptrdiff_t) stride);                                       \
+    }                                                                      \
+    return 0;                                                              \
+}
+
+DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
+DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
+
+#undef DEFINE_QPEL_CPU_DISPATCH
+
 /* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
 
 typedef struct {
@@ -1436,6 +1473,35 @@ int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
     return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
 }
 
+/* Public substrate-selecting entry points for the four single-axis
+ * quarter-pel kernels.  AUTO resolves through
+ * daedalus_recipe_substrate_for(kernel).  A QPU request on a ctx for
+ * which daedalus_ctx_has_qpu() is false falls back to the CPU path;
+ * a QPU request that survives that check returns -1 because no QPU
+ * shader exists for these kernels.  Otherwise the CPU dispatch result
+ * is returned. */
+#define DEFINE_QPEL_DISPATCH(suffix, kernel)                               \
+int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx,              \
+    daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
+{                                                                          \
+    daedalus_substrate eff = sub;                                          \
+    if (eff == DAEDALUS_SUBSTRATE_AUTO)                                    \
+        eff = daedalus_recipe_substrate_for(kernel);                       \
+    if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))       \
+        eff = DAEDALUS_SUBSTRATE_CPU;                                      \
+    if (eff == DAEDALUS_SUBSTRATE_QPU) return -1;                          \
+    return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride,    \
+                                                 n_blocks, meta);          \
+}
+
+DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
+DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
+DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
+DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
+
+#undef DEFINE_QPEL_DISPATCH
+
 /* -------------------- Recipe convenience wrappers --------------- */
 
 int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
@@ -1570,3 +1636,21 @@ int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
     return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO,
         dst, src, stride, n_blocks, meta);
 }
+
+/* Recipe wrappers for the quarter-pel kernels: forward to the matching
+ * daedalus_dispatch_h264_qpel_* with DAEDALUS_SUBSTRATE_AUTO. */
+#define DEFINE_QPEL_RECIPE(suffix)                                         \
+int daedalus_recipe_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx,       \
+    uint8_t *dst, const uint8_t *src, size_t stride,                       \
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta)                  \
+{                                                                          \
+    return daedalus_dispatch_h264_qpel_ ## suffix(ctx, DAEDALUS_SUBSTRATE_AUTO,\
+        dst, src, stride, n_blocks, meta);                                 \
+}
+
+DEFINE_QPEL_RECIPE(mc10)
+DEFINE_QPEL_RECIPE(mc30)
+DEFINE_QPEL_RECIPE(mc01)
+DEFINE_QPEL_RECIPE(mc03)
+
+#undef DEFINE_QPEL_RECIPE
diff --git a/tests/h264_qpel8_quarter_axis_ref.c b/tests/h264_qpel8_quarter_axis_ref.c
new file mode 100644
index 0000000..e581006
--- /dev/null
+++ b/tests/h264_qpel8_quarter_axis_ref.c
@@ -0,0 +1,90 @@
+/*
+ * Standalone bit-exact C references for the four single-axis quarter-
+ * pel luma qpel positions (H.264 §8.4.2.2.1, "put" variants).  Each
+ * is a half-pel lowpass clipped to u8 followed by an L2 rounded-average
+ * with an integer-position source pixel.
+ *
+ *   mc10 ("a" pos, ¼ horiz): a = clip255(mc20(s)); dst = (a + s[r,c] + 1) >> 1
+ *   mc30 ("c" pos, ¾ horiz): a = clip255(mc20(s)); dst = (a + s[r,c+1] + 1) >> 1
+ *   mc01 ("d" pos, ¼ vert ): a = clip255(mc02(s)); dst = (a + s[r, c] + 1) >> 1
+ *   mc03 ("n" pos, ¾ vert ): a = clip255(mc02(s)); dst = (a + s[r+1,c] + 1) >> 1
+ *
+ * Mirror FFmpeg's `ff_put_h264_qpel8_mc{10,30,01,03}_neon` (in
+ * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
+ * lines 587, 603, 611, 729 — each tail-calls the corresponding
+ * lowpass_l2 helper).
+ *
+ * Same single-stride convention as mc20/mc02 — dst and src share the
+ * same stride; src + src_off points at row 0 col 0 of the output
+ * block, with appropriate edge context already in-buffer.
+ *
+ * License: LGPL-2.1-or-later.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Clamp v to the u8 range [0, 255]. */
+static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
+
+/* Compute one horizontal half-pel pixel at (r, c) — same as mc20.
+ * Negative sums are clamped before the shift so the result does not
+ * depend on implementation-defined signed right-shift behaviour. */
+static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    int v = (int) s[r*stride + c-2] - 5 * (int) s[r*stride + c-1]
+          + 20 * (int) s[r*stride + c] + 20 * (int) s[r*stride + c+1]
+          - 5 * (int) s[r*stride + c+2] + (int) s[r*stride + c+3]
+          + 16;
+    return (uint8_t) clip_u8(v < 0 ? 0 : v >> 5);
+}
+
+/* Compute one vertical half-pel pixel at (r, c) — same as mc02;
+ * negative sums are clamped before the shift, as in hpel_h. */
+static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
+{
+    int v = (int) s[(r-2)*stride + c] - 5 * (int) s[(r-1)*stride + c]
+          + 20 * (int) s[r*stride + c] + 20 * (int) s[(r+1)*stride + c]
+          - 5 * (int) s[(r+2)*stride + c] + (int) s[(r+3)*stride + c]
+          + 16;
+    return (uint8_t) clip_u8(v < 0 ? 0 : v >> 5);
+}
+
+/* mc10: 8x8 block, horizontal half-pel averaged with src[r, c]. */
+void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int r = 0; r < 8; r++)
+        for (int c = 0; c < 8; c++) {
+            uint8_t a = hpel_h(src, r, c, stride);
+            dst[r*stride + c] = (uint8_t) ((a + src[r*stride + c] + 1) >> 1);
+        }
+}
+
+/* mc30: 8x8 block, horizontal half-pel averaged with src[r, c+1]. */
+void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int r = 0; r < 8; r++)
+        for (int c = 0; c < 8; c++) {
+            uint8_t a = hpel_h(src, r, c, stride);
+            dst[r*stride + c] = (uint8_t) ((a + src[r*stride + c + 1] + 1) >> 1);
+        }
+}
+
+/* mc01: 8x8 block, vertical half-pel averaged with src[r, c]. */
+void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int r = 0; r < 8; r++)
+        for (int c = 0; c < 8; c++) {
+            uint8_t a = hpel_v(src, r, c, stride);
+            dst[r*stride + c] = (uint8_t) ((a + src[(r)*stride + c] + 1) >> 1);
+        }
+}
+
+/* mc03: 8x8 block, vertical half-pel averaged with src[r+1, c]. */
+void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
+{
+    for (int r = 0; r < 8; r++)
+        for (int c = 0; c < 8; c++) {
+            uint8_t a = hpel_v(src, r, c, stride);
+            dst[r*stride + c] = (uint8_t) ((a + src[(r + 1)*stride + c] + 1) >> 1);
+        }
+}
diff --git a/tests/test_api_h264.c b/tests/test_api_h264.c
index 275a556..370f34b 100644
--- a/tests/test_api_h264.c
+++ b/tests/test_api_h264.c
@@ -36,6 +36,14 @@ extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
     ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
     ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
+extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
+    ptrdiff_t stride);
 extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
     ptrdiff_t stride);
 
@@ -483,6 +491,67 @@ static int test_qpel_mc22(void)
     return diff == 0 ? 0 : 1;
 }
 
+/* Generic harness for the 4 single-axis quarter-pel positions; same
+ * tile geometry as mc22, which covers both lowpass windows (mc10/mc30
+ * read cols -2..+3, mc01/mc03 read rows -2..+3 around each pixel; the
+ * averaged integer pixel at +0 or +1 lies inside that window). */
+typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst,
+    const uint8_t *src, size_t stride,
+    size_t n_blocks, const daedalus_h264_qpel_meta *meta);
+
+static int run_quarter_axis_qpel(const char *name,
+    qpel_ref_fn ref, qpel_dispatch_fn dispatch)
+{
+    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
+           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
+           SRC_ROW = 3, SRC_COL = 3 };
+    daedalus_ctx *ctx = daedalus_ctx_create();
+    if (!ctx) return 1;
+
+    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
+    daedalus_h264_qpel_meta meta[N];
+
+    for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
+    memset(dst, 0, sizeof(dst));
+    memset(dst_ref, 0, sizeof(dst_ref));
+
+    for (int i = 0; i < N; i++) {
+        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
+    }
+
+    for (int i = 0; i < N; i++)
+        ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
+
+    int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
+    if (rc) {
+        fprintf(stderr, "%s dispatch rc=%d\n", name, rc);
+        daedalus_ctx_destroy(ctx);  /* do not leak ctx on the failure path */
+        return 1;
+    }
+    int diff = 0;
+    for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
+    printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
+           name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
+    daedalus_ctx_destroy(ctx);
+    return diff == 0 ? 0 : 1;
+}
+
+static int test_qpel_quarter_axis_all(void)
+{
+    int fail = 0;
+    fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref,
+                                  daedalus_recipe_dispatch_h264_qpel_mc10);
+    fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref,
+                                  daedalus_recipe_dispatch_h264_qpel_mc30);
+    fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref,
+                                  daedalus_recipe_dispatch_h264_qpel_mc01);
+    fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref,
+                                  daedalus_recipe_dispatch_h264_qpel_mc03);
+    return fail;
+}
+
 int main(void)
 {
     printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -515,5 +584,6 @@ int main(void)
     fail |= test_qpel_mc20();
     fail |= test_qpel_mc02();
     fail |= test_qpel_mc22();
+    fail |= test_qpel_quarter_axis_all();
     return fail;
 }