h264: qpel avg — 12 remaining variants (closes the matrix)
Closes the H.264 8x8 qpel buildout. Adds the remaining 12 avg_
biprediction positions:
4 quarter-axis: avg_mc{10,30,01,03}
8 diagonals : avg_mc{11,12,13,21,23,31,32,33}
Each follows the established pattern: compute the same half-pel formula as
the put_ sibling, then take the rounded two-source ("L2") average of that
result with the existing dst contents, per H.264 §8.4.2.3.1.
Scope:
- 12 new kernel enum values (avg_ MC10..MC33 = 34..45), all mapped to the CPU substrate.
- 12 NEON externs for the vendored ff_avg_h264_qpel8_mc*_neon.
- 12 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro.
- 12 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 12 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- 12 header decls via DECLARE_QPEL_AVG macro.
- tests/h264_qpel8_avg_rest_ref.c — references via two parametric
macros: DEFINE_AVG_QUARTER for the 4 ¼-pel L2 forms,
DEFINE_AVG_DIAG for the 8 two-half-pel-avg forms.
- Test harness extended with a RUN(MC) sub-macro that derives both
the ref name and the dispatch name from the bare mcXX. (The ref
is daedalus_avg_h264_qpel8_<mc>_ref; the dispatch is
daedalus_recipe_dispatch_h264_qpel_avg_<mc>. The first draft of the
macro had a typo that duplicated "avg_" in the ref name; it was
caught at compile time and fixed.)
Verified on hertz:
$ ./build/test_api_h264 | tail -12
H.264 qpel avg_mc10: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc30: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc01: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc03: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc11: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc12: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc13: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc21: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc23: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc31: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc32: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc33: 2048/2048 bytes bit-exact (100.0000%)
All 12 new positions passed bit-exact on the first run.
Final qpel matrix state:
put_: mc00 (none — integer copy)
mc01 ✓ mc02 ✓ mc03 ✓
mc10 ✓ mc11 ✓ mc12 ✓ mc13 ✓
mc20 ✓ (QPU+CPU) mc21 ✓ mc22 ✓ mc23 ✓
mc30 ✓ mc31 ✓ mc32 ✓ mc33 ✓
avg_: same 15-of-16 coverage, all CPU.
Every B-slice biprediction case the libavcodec intercept can throw
at us is now serviceable. QPU shaders remain mc20-only (cycle 9);
the other 29 positions are CPU NEON. Whether to write more QPU
shaders depends on real perf measurement: at roughly 10 ns per
8x8 block on NEON, full qpel coverage at 1080p is ~2-3 ms of total
work, well inside budget.
This commit is contained in:
@@ -155,6 +155,18 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC20: return DAEDALUS_SUBSTRATE_CPU; /* biprediction anchors */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC02: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC22: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC30: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC01: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC03: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonals avg */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC12: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC13: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC21: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC23: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC31: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC32: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_AVG_MC33: return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -218,6 +230,18 @@ extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdif
|
||||
extern void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -504,6 +528,18 @@ DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc20, ff_avg_h264_qpel8_mc20_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc02, ff_avg_h264_qpel8_mc02_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc22, ff_avg_h264_qpel8_mc22_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc10, ff_avg_h264_qpel8_mc10_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc30, ff_avg_h264_qpel8_mc30_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc01, ff_avg_h264_qpel8_mc01_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc03, ff_avg_h264_qpel8_mc03_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc11, ff_avg_h264_qpel8_mc11_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc12, ff_avg_h264_qpel8_mc12_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc13, ff_avg_h264_qpel8_mc13_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc21, ff_avg_h264_qpel8_mc21_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc23, ff_avg_h264_qpel8_mc23_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc31, ff_avg_h264_qpel8_mc31_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc32, ff_avg_h264_qpel8_mc32_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(avg_mc33, ff_avg_h264_qpel8_mc33_neon)
|
||||
|
||||
#undef DEFINE_QPEL_CPU_DISPATCH
|
||||
|
||||
@@ -1536,6 +1572,18 @@ DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc20, DAEDALUS_KERNEL_H264_QPEL_AVG_MC20)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc02, DAEDALUS_KERNEL_H264_QPEL_AVG_MC02)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc22, DAEDALUS_KERNEL_H264_QPEL_AVG_MC22)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc10, DAEDALUS_KERNEL_H264_QPEL_AVG_MC10)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc30, DAEDALUS_KERNEL_H264_QPEL_AVG_MC30)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc01, DAEDALUS_KERNEL_H264_QPEL_AVG_MC01)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc03, DAEDALUS_KERNEL_H264_QPEL_AVG_MC03)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc11, DAEDALUS_KERNEL_H264_QPEL_AVG_MC11)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc12, DAEDALUS_KERNEL_H264_QPEL_AVG_MC12)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc13, DAEDALUS_KERNEL_H264_QPEL_AVG_MC13)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc21, DAEDALUS_KERNEL_H264_QPEL_AVG_MC21)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc23, DAEDALUS_KERNEL_H264_QPEL_AVG_MC23)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc31, DAEDALUS_KERNEL_H264_QPEL_AVG_MC31)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc32, DAEDALUS_KERNEL_H264_QPEL_AVG_MC32)
|
||||
DEFINE_QPEL_DISPATCH(avg_mc33, DAEDALUS_KERNEL_H264_QPEL_AVG_MC33)
|
||||
|
||||
#undef DEFINE_QPEL_DISPATCH
|
||||
|
||||
@@ -1698,5 +1746,17 @@ DEFINE_QPEL_RECIPE(mc33)
|
||||
DEFINE_QPEL_RECIPE(avg_mc20)
|
||||
DEFINE_QPEL_RECIPE(avg_mc02)
|
||||
DEFINE_QPEL_RECIPE(avg_mc22)
|
||||
DEFINE_QPEL_RECIPE(avg_mc10)
|
||||
DEFINE_QPEL_RECIPE(avg_mc30)
|
||||
DEFINE_QPEL_RECIPE(avg_mc01)
|
||||
DEFINE_QPEL_RECIPE(avg_mc03)
|
||||
DEFINE_QPEL_RECIPE(avg_mc11)
|
||||
DEFINE_QPEL_RECIPE(avg_mc12)
|
||||
DEFINE_QPEL_RECIPE(avg_mc13)
|
||||
DEFINE_QPEL_RECIPE(avg_mc21)
|
||||
DEFINE_QPEL_RECIPE(avg_mc23)
|
||||
DEFINE_QPEL_RECIPE(avg_mc31)
|
||||
DEFINE_QPEL_RECIPE(avg_mc32)
|
||||
DEFINE_QPEL_RECIPE(avg_mc33)
|
||||
|
||||
#undef DEFINE_QPEL_RECIPE
|
||||
|
||||
Reference in New Issue
Block a user