h264: qpel diagonals — 8 positions (mc11/12/13/21/23/31/32/33) #18

Merged
marfrit merged 1 commits from noether/h264-qpel-diagonals into main 2026-05-25 06:32:03 +00:00
5 changed files with 219 additions and 0 deletions
+1
View File
@@ -526,6 +526,7 @@ add_executable(test_api_h264
tests/h264_qpel8_mc02_ref.c
tests/h264_qpel8_mc22_ref.c
tests/h264_qpel8_quarter_axis_ref.c
tests/h264_qpel8_diag_ref.c
)
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
target_compile_options(test_api_h264 PRIVATE -O2)
+44
View File
@@ -475,6 +475,42 @@ int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
/* H.264 luma diagonal qpel positions ("put", 8 variants). Each is
* the rounded average of two half-pel intermediates per H.264
* §8.4.2.2.1 / Table 8-4 (decomposition matches the FFmpeg .S
* structure; see tests/h264_qpel8_diag_ref.c for the formulas).
*
* mc11 ¼¼ : avg(mc20[r,c], mc02[r,c])
* mc12 ¼½ : avg(mc22[r,c], mc02[r,c])
* mc13 ¼¾ : avg(mc20[r+1,c], mc02[r,c])
* mc21 ½¼ : avg(mc22[r,c], mc20[r,c])
* mc23 ½¾ : avg(mc22[r,c], mc20[r+1,c])
* mc31 ¾¼ : avg(mc20[r,c], mc02[r,c+1])
* mc32 ¾½ : avg(mc22[r,c], mc02[r,c+1])
* mc33 ¾¾ : avg(mc20[r+1,c], mc02[r,c+1])
*
* CPU-only via vendored FFmpeg NEON; QPU shaders pending.
* Explicit SUBSTRATE_QPU returns -1.
*/
/* Declares the two public entry points for one diagonal qpel position:
 * the recipe dispatch (no substrate argument) and the explicit dispatch
 * (caller supplies a daedalus_substrate). The terminating ';' is left
 * to the use site so each line below reads like a normal declaration. */
#define DECLARE_QPEL_DIAG(pos)                                          \
    int daedalus_recipe_dispatch_h264_qpel_ ## pos(                     \
            daedalus_ctx *ctx,                                          \
            uint8_t *dst, const uint8_t *src, size_t stride,            \
            size_t n_blocks, const daedalus_h264_qpel_meta *meta);      \
    int daedalus_dispatch_h264_qpel_ ## pos(                            \
            daedalus_ctx *ctx, daedalus_substrate sub,                  \
            uint8_t *dst, const uint8_t *src, size_t stride,            \
            size_t n_blocks, const daedalus_h264_qpel_meta *meta)

DECLARE_QPEL_DIAG(mc11);
DECLARE_QPEL_DIAG(mc12);
DECLARE_QPEL_DIAG(mc13);
DECLARE_QPEL_DIAG(mc21);
DECLARE_QPEL_DIAG(mc23);
DECLARE_QPEL_DIAG(mc31);
DECLARE_QPEL_DIAG(mc32);
DECLARE_QPEL_DIAG(mc33);

#undef DECLARE_QPEL_DIAG
/* -------------------------------------------------------------------
* Recipe query — what does the API recommend for each kernel?
* ----------------------------------------------------------------- */
@@ -501,6 +537,14 @@ typedef enum {
DAEDALUS_KERNEL_H264_QPEL_MC30 = 20,
DAEDALUS_KERNEL_H264_QPEL_MC01 = 21,
DAEDALUS_KERNEL_H264_QPEL_MC03 = 22,
DAEDALUS_KERNEL_H264_QPEL_MC11 = 23,
DAEDALUS_KERNEL_H264_QPEL_MC12 = 24,
DAEDALUS_KERNEL_H264_QPEL_MC13 = 25,
DAEDALUS_KERNEL_H264_QPEL_MC21 = 26,
DAEDALUS_KERNEL_H264_QPEL_MC23 = 27,
DAEDALUS_KERNEL_H264_QPEL_MC31 = 28,
DAEDALUS_KERNEL_H264_QPEL_MC32 = 29,
DAEDALUS_KERNEL_H264_QPEL_MC33 = 30,
} daedalus_kernel;
/* Recipe query: returns the substrate the recipe recommends for
 * kernel k. */
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
+40
View File
@@ -144,6 +144,14 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
case DAEDALUS_KERNEL_H264_QPEL_MC30: return DAEDALUS_SUBSTRATE_CPU; /* ¾-H L2 */
case DAEDALUS_KERNEL_H264_QPEL_MC01: return DAEDALUS_SUBSTRATE_CPU; /* ¼-V L2 */
case DAEDALUS_KERNEL_H264_QPEL_MC03: return DAEDALUS_SUBSTRATE_CPU; /* ¾-V L2 */
case DAEDALUS_KERNEL_H264_QPEL_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¼ */
case DAEDALUS_KERNEL_H264_QPEL_MC12: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼½ */
case DAEDALUS_KERNEL_H264_QPEL_MC13: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¾ */
case DAEDALUS_KERNEL_H264_QPEL_MC21: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¼ */
case DAEDALUS_KERNEL_H264_QPEL_MC23: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¾ */
case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¼ */
case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾½ */
case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¾ */
}
return DAEDALUS_SUBSTRATE_CPU;
}
@@ -196,6 +204,14 @@ extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
/* FFmpeg NEON "put" kernels for the 8 diagonal qpel positions
 * (8x8 block, single stride shared by dst and src). */
extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src,
                                        ptrdiff_t stride);
/* -------------------- CPU dispatch implementations -------------- */
@@ -468,6 +484,14 @@ DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
/* Diagonal positions (mc11..mc33): one CPU dispatch instantiation per
 * position, each paired with the FFmpeg NEON kernel of the same name. */
DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon)
DEFINE_QPEL_CPU_DISPATCH(mc12, ff_put_h264_qpel8_mc12_neon)
DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon)
DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon)
DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon)
DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)
#undef DEFINE_QPEL_CPU_DISPATCH
@@ -1489,6 +1513,14 @@ DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
/* Diagonal positions (mc11..mc33): one dispatch instantiation per
 * position, each paired with its daedalus_kernel enumerator. */
DEFINE_QPEL_DISPATCH(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11)
DEFINE_QPEL_DISPATCH(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12)
DEFINE_QPEL_DISPATCH(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13)
DEFINE_QPEL_DISPATCH(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21)
DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
#undef DEFINE_QPEL_DISPATCH
@@ -1640,5 +1672,13 @@ DEFINE_QPEL_RECIPE(mc10)
DEFINE_QPEL_RECIPE(mc30)
DEFINE_QPEL_RECIPE(mc01)
DEFINE_QPEL_RECIPE(mc03)
/* Diagonal positions (mc11..mc33): one recipe instantiation each. */
DEFINE_QPEL_RECIPE(mc11)
DEFINE_QPEL_RECIPE(mc12)
DEFINE_QPEL_RECIPE(mc13)
DEFINE_QPEL_RECIPE(mc21)
DEFINE_QPEL_RECIPE(mc23)
DEFINE_QPEL_RECIPE(mc31)
DEFINE_QPEL_RECIPE(mc32)
DEFINE_QPEL_RECIPE(mc33)
#undef DEFINE_QPEL_RECIPE
+98
View File
@@ -0,0 +1,98 @@
/*
* Standalone bit-exact C references for the 8 diagonal H.264 luma
* qpel positions (mc11, mc12, mc13, mc21, mc23, mc31, mc32, mc33).
* Each is the rounded average of two half-pel intermediates per
* H.264 §8.4.2.2.1 / Table 8-4, decomposed to match the FFmpeg .S
* reference structure (see comments in mc{11,12,21,...}_neon in
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S).
*
* Position decompositions (verified against the .S):
* mc11 (e, ¼¼): avg(mc20[r,c], mc02[r,c])
* mc12 (i, ¼½): avg(mc22[r,c], mc02[r,c])
* mc13 (p, ¼¾): avg(mc20[r+1,c], mc02[r,c])
* mc21 (f, ½¼): avg(mc22[r,c], mc20[r,c])
* mc23 (q, ½¾): avg(mc22[r,c], mc20[r+1,c])
* mc31 (g, ¾¼): avg(mc20[r,c], mc02[r,c+1])
* mc32 (k, ¾½): avg(mc22[r,c], mc02[r,c+1])
* mc33 (r, ¾¾): avg(mc20[r+1,c], mc02[r,c+1])
*
* (The mc20[r,c] notation means "the mc20-style horizontal half-pel
* result at source-relative integer position (r, c)"; analogously
* for mc02 and mc22.)
*
* Single-stride convention; same edge-context contract as the simpler
* variants (the cells "[r+1,c]" etc. demand one extra row/col of
* source context beyond what mc20/mc02 alone would need).
*
* License: LGPL-2.1-or-later.
*/
#include <stdint.h>
#include <stddef.h>
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
/* Single-cell helpers — same arithmetic as the dedicated mc20/mc02
* refs but computed point-by-point so the diagonal refs can mix them
* cheaply. Each returns a u8 (already clipped). */
/* Horizontal half-pel sample at (r, c): the (1, -5, 20, 20, -5, 1)
 * 6-tap filter over row r, columns c-2..c+3, rounded with +16,
 * shifted right by 5 and clipped to 0..255. */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *centre = s + (r*stride + c);

    /* The kernel is symmetric, so pair up the taps that share a weight. */
    int outer = (int) centre[-2] + (int) centre[3];
    int mid   = (int) centre[-1] + (int) centre[2];
    int inner = (int) centre[0]  + (int) centre[1];

    return (uint8_t) clip_u8((outer - 5 * mid + 20 * inner + 16) >> 5);
}
/* Vertical half-pel sample at (r, c): the (1, -5, 20, 20, -5, 1)
 * 6-tap filter down column c, rows r-2..r+3, rounded with +16,
 * shifted right by 5 and clipped to 0..255. */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *centre = s + (r*stride + c);

    /* The kernel is symmetric, so pair up the taps that share a weight. */
    int outer = (int) centre[-2*stride] + (int) centre[3*stride];
    int mid   = (int) centre[-stride]   + (int) centre[2*stride];
    int inner = (int) centre[0]         + (int) centre[stride];

    return (uint8_t) clip_u8((outer - 5 * mid + 20 * inner + 16) >> 5);
}
/* hpel_hv — 2D half-pel sample at (r, c), the "j" position of
 * H.264 §8.4.2.2.1. For each of the six rows r-2..r+3 an unrounded
 * horizontal 6-tap sum over columns c-2..c+3 is formed (kept at int
 * precision, no intermediate clip); the same 6-tap kernel is then
 * applied vertically across those six sums. The result is rounded
 * with +512, shifted right by 10 and clipped to 0..255. */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int tap[6] = { 1, -5, 20, 20, -5, 1 };

    int acc = 512;                      /* rounding term for the >> 10 */
    for (int j = 0; j < 6; j++) {
        const uint8_t *row = s + ((r - 2 + j)*stride + c);
        int h = 0;                      /* horizontal sum for row r-2+j */
        for (int k = 0; k < 6; k++)
            h += tap[k] * (int) row[k - 2];
        acc += tap[j] * h;
    }
    return (uint8_t) clip_u8(acc >> 10);
}
/* Rounded average (a + b + 1) >> 1. Both inputs are u8, so the result
 * can never exceed 255 and needs no clip. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    int sum = a + b + 1;
    return (uint8_t)(sum >> 1);
}
/* Generates daedalus_put_h264_qpel8_<NAME>_ref: an 8x8 "put" where each
 * output sample is avg2() of two half-pel samples. A_EXPR and B_EXPR
 * are expanded inside the loops and may refer to src, stride and the
 * loop indices r (row) and c (column). */
#define DEFINE_DIAG_REF(NAME, A_EXPR, B_EXPR)                           \
    void daedalus_put_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,         \
            const uint8_t *src, ptrdiff_t stride)                       \
    {                                                                   \
        for (int r = 0; r < 8; r++) {                                   \
            uint8_t *out = dst + r*stride;                              \
            for (int c = 0; c < 8; c++)                                 \
                out[c] = avg2((A_EXPR), (B_EXPR));                      \
        }                                                               \
    }

DEFINE_DIAG_REF(mc11, hpel_h (src, r,   c, stride), hpel_v(src, r, c,   stride))
DEFINE_DIAG_REF(mc12, hpel_hv(src, r,   c, stride), hpel_v(src, r, c,   stride))
DEFINE_DIAG_REF(mc13, hpel_h (src, r+1, c, stride), hpel_v(src, r, c,   stride))
DEFINE_DIAG_REF(mc21, hpel_hv(src, r,   c, stride), hpel_h(src, r, c,   stride))
DEFINE_DIAG_REF(mc23, hpel_hv(src, r,   c, stride), hpel_h(src, r+1, c, stride))
DEFINE_DIAG_REF(mc31, hpel_h (src, r,   c, stride), hpel_v(src, r, c+1, stride))
DEFINE_DIAG_REF(mc32, hpel_hv(src, r,   c, stride), hpel_v(src, r, c+1, stride))
DEFINE_DIAG_REF(mc33, hpel_h (src, r+1, c, stride), hpel_v(src, r, c+1, stride))

#undef DEFINE_DIAG_REF
+36
View File
@@ -44,6 +44,14 @@ extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
/* C reference implementations for the 8 diagonal qpel positions. */
extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
@@ -548,6 +556,33 @@ static int test_qpel_quarter_axis_all(void)
return fail;
}
/* Runs all 8 diagonal qpel positions through run_quarter_axis_qpel,
 * pairing each C reference with its recipe dispatch entry point.
 * Returns the bitwise OR of the individual results. */
static int test_qpel_diag_all(void)
{
    /* Each diagonal output averages TWO half-pel intermediates, and
     * some of those are sampled at (r+1,c) or (r,c+1), so one extra
     * row and column of source context is needed. The geometry used by
     * run_quarter_axis_qpel (SRC_ROW=3, SRC_COL=3, 16x16 tile) already
     * leaves plenty of margin, so that harness is reused as-is. */
#define CHECK_DIAG(pos)                                                 \
    (failed |= run_quarter_axis_qpel(#pos,                              \
                 daedalus_put_h264_qpel8_ ## pos ## _ref,               \
                 daedalus_recipe_dispatch_h264_qpel_ ## pos))

    int failed = 0;

    CHECK_DIAG(mc11);
    CHECK_DIAG(mc12);
    CHECK_DIAG(mc13);
    CHECK_DIAG(mc21);
    CHECK_DIAG(mc23);
    CHECK_DIAG(mc31);
    CHECK_DIAG(mc32);
    CHECK_DIAG(mc33);

#undef CHECK_DIAG
    return failed;
}
int main(void)
{
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
@@ -581,5 +616,6 @@ int main(void)
fail |= test_qpel_mc02();
fail |= test_qpel_mc22();
fail |= test_qpel_quarter_axis_all();
fail |= test_qpel_diag_all();
return fail;
}