h264: qpel diagonals — 8 positions (mc11/12/13/21/23/31/32/33)
Closes the qpel buildout. All 8 remaining diagonal positions land
in one PR. Each is the rounded average of two half-pel intermediates
per H.264 §8.4.2.2.1 / Table 8-4, with the decomposition matching
the FFmpeg .S reference structure (verified by reading
external/ffmpeg-snapshot/.../h264qpel_neon.S lines 622-758).
Decomposition table (the formula for each output cell at (r,c)):
mc11 ¼¼ : avg(mc20[r, c], mc02[r, c])
mc12 ¼½ : avg(mc22[r, c], mc02[r, c])
mc13 ¼¾ : avg(mc20[r+1, c], mc02[r, c])
mc21 ½¼ : avg(mc22[r, c], mc20[r, c])
mc23 ½¾ : avg(mc22[r, c], mc20[r+1, c])
mc31 ¾¼ : avg(mc20[r, c], mc02[r, c+1])
mc32 ¾½ : avg(mc22[r, c], mc02[r, c+1])
mc33 ¾¾ : avg(mc20[r+1, c], mc02[r, c+1])
The r+1 / c+1 offsets capture the position-dependent shift that
the FFmpeg .S encodes by pre-incrementing x1 (src pointer) before
branching into the common mc11/mc21 code paths.
Scope (tightly macro-ised):
- 8 new kernel enums (MC11..MC33 = 23..30) → CPU.
- 8 NEON externs for the vendored ff_put_h264_qpel8_mc*_neon.
- 8 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro.
- 8 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 8 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- Header decls condensed via a DECLARE_QPEL_DIAG macro that
expands to both recipe + dispatch decls per name.
- C references via DEFINE_DIAG_REF macro: each ref is a 6-line
wrapper around the per-cell hpel_h / hpel_v / hpel_hv helpers
(the last of these being the per-cell version of mc22's 13-row int16
tmp[] computation).
- Test wrapper: test_qpel_diag_all() drives all 8 through the
existing run_quarter_axis_qpel() harness.
Verified on hertz (Pi 5 / V3D 7.1):
$ ./build/test_api_h264 | tail -8
H.264 qpel mc11: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc12: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc13: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc21: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc23: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc31: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc32: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc33: 2048/2048 bytes bit-exact (100.0000%)
All 8 diagonal positions passed bit-exact on the first try. This is
meaningful because the position-dependent r+1 / c+1 source offsets are
easy to get wrong in transcription, and any such mistake would surface
immediately on random inputs.
After this PR the H.264 qpel 8x8 put_ matrix is complete:
mc00 mc01 mc02 mc03
mc10 mc11 mc12 mc13
mc20 mc21 mc22 mc23
mc30 mc31 mc32 mc33
15 of 16 positions exposed through the daedalus API; mc00 is just
integer copy and rarely needs a dispatch wrapper (libavcodec sets
the function pointer table directly). mc20 retains its QPU shader
(cycle 9 / v3d_h264_qpel_mc20.spv); all other 14 are CPU NEON.
What this does NOT cover (still in backlog):
- avg_ variants (the "add" form for biprediction, 16 more
positions). Currently the API only exposes put_.
- 16x16 qpel (separate function family in FFmpeg; the 8x8 path
can be applied to each of the four 8x8 quadrants to substitute when 16x16 isn't critical).
- QPU shaders for any qpel position other than mc20.
This commit is contained in:
@@ -144,6 +144,14 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC30: return DAEDALUS_SUBSTRATE_CPU; /* ¾-H L2 */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC01: return DAEDALUS_SUBSTRATE_CPU; /* ¼-V L2 */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC03: return DAEDALUS_SUBSTRATE_CPU; /* ¾-V L2 */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC11: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¼ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC12: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼½ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC13: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¼¾ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC21: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¼ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC23: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ½¾ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC31: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¼ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC32: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾½ */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC33: return DAEDALUS_SUBSTRATE_CPU; /* diagonal ¾¾ */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -196,6 +204,14 @@ extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -468,6 +484,14 @@ DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc11, ff_put_h264_qpel8_mc11_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc12, ff_put_h264_qpel8_mc12_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc13, ff_put_h264_qpel8_mc13_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc21, ff_put_h264_qpel8_mc21_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc23, ff_put_h264_qpel8_mc23_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc31, ff_put_h264_qpel8_mc31_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc32, ff_put_h264_qpel8_mc32_neon)
|
||||
DEFINE_QPEL_CPU_DISPATCH(mc33, ff_put_h264_qpel8_mc33_neon)
|
||||
|
||||
#undef DEFINE_QPEL_CPU_DISPATCH
|
||||
|
||||
@@ -1489,6 +1513,14 @@ DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
|
||||
DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
|
||||
DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
|
||||
DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
|
||||
DEFINE_QPEL_DISPATCH(mc11, DAEDALUS_KERNEL_H264_QPEL_MC11)
|
||||
DEFINE_QPEL_DISPATCH(mc12, DAEDALUS_KERNEL_H264_QPEL_MC12)
|
||||
DEFINE_QPEL_DISPATCH(mc13, DAEDALUS_KERNEL_H264_QPEL_MC13)
|
||||
DEFINE_QPEL_DISPATCH(mc21, DAEDALUS_KERNEL_H264_QPEL_MC21)
|
||||
DEFINE_QPEL_DISPATCH(mc23, DAEDALUS_KERNEL_H264_QPEL_MC23)
|
||||
DEFINE_QPEL_DISPATCH(mc31, DAEDALUS_KERNEL_H264_QPEL_MC31)
|
||||
DEFINE_QPEL_DISPATCH(mc32, DAEDALUS_KERNEL_H264_QPEL_MC32)
|
||||
DEFINE_QPEL_DISPATCH(mc33, DAEDALUS_KERNEL_H264_QPEL_MC33)
|
||||
|
||||
#undef DEFINE_QPEL_DISPATCH
|
||||
|
||||
@@ -1640,5 +1672,13 @@ DEFINE_QPEL_RECIPE(mc10)
|
||||
DEFINE_QPEL_RECIPE(mc30)
|
||||
DEFINE_QPEL_RECIPE(mc01)
|
||||
DEFINE_QPEL_RECIPE(mc03)
|
||||
DEFINE_QPEL_RECIPE(mc11)
|
||||
DEFINE_QPEL_RECIPE(mc12)
|
||||
DEFINE_QPEL_RECIPE(mc13)
|
||||
DEFINE_QPEL_RECIPE(mc21)
|
||||
DEFINE_QPEL_RECIPE(mc23)
|
||||
DEFINE_QPEL_RECIPE(mc31)
|
||||
DEFINE_QPEL_RECIPE(mc32)
|
||||
DEFINE_QPEL_RECIPE(mc33)
|
||||
|
||||
#undef DEFINE_QPEL_RECIPE
|
||||
|
||||
Reference in New Issue
Block a user