01f782cfaf
Closes the H.264 8x8 qpel buildout. Adds the remaining 12 avg_
biprediction positions:
4 quarter-axis: avg_mc{10,30,01,03}
8 diagonals : avg_mc{11,12,13,21,23,31,32,33}
Each follows the established pattern: same half-pel formula as the
put_ sibling, then L2 average with the existing dst contents per
H.264 §8.4.2.3.1.
Scope:
- 12 new kernel enums (MC10..MC33 avg_ = 34..45) → CPU.
- 12 NEON externs for the vendored ff_avg_h264_qpel8_mc*_neon.
- 12 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro.
- 12 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 12 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- 12 header decls via DECLARE_QPEL_AVG macro.
- tests/h264_qpel8_avg_rest_ref.c — references via two parametric
macros: DEFINE_AVG_QUARTER for the 4 ¼-pel L2 forms,
DEFINE_AVG_DIAG for the 8 two-half-pel-avg forms.
- Test harness extended with a RUN(MC) sub-macro that derives both
the ref name and dispatch name from the bare mcXX. (The ref
is daedalus_avg_h264_qpel8_<mc>_ref; the dispatch is
daedalus_recipe_dispatch_h264_qpel_avg_<mc>. Macro had a typo
on first try that duplicated "avg_" in the ref name — caught at
compile, fixed.)
Verified on hertz:
$ ./build/test_api_h264 | tail -12
H.264 qpel avg_mc10: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc30: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc01: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc03: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc11: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc12: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc13: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc21: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc23: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc31: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc32: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc33: 2048/2048 bytes bit-exact (100.0000%)
All 12 new positions bit-exact PASS first try.
Final qpel matrix state:
put_: mc00 (none — integer copy)
mc01 ✓ mc02 ✓ mc03 ✓
mc10 ✓ mc11 ✓ mc12 ✓ mc13 ✓
mc20 ✓ (QPU+CPU) mc21 ✓ mc22 ✓ mc23 ✓
mc30 ✓ mc31 ✓ mc32 ✓ mc33 ✓
avg_: same 15-of-16 coverage, all CPU.
Every B-slice biprediction case the libavcodec intercept can throw
at us is now serviceable. QPU shaders remain mc20-only (cycle 9);
the other 29 positions are CPU NEON. Whether to write more QPU
shaders depends on real perf measurement — at NEON ~10 ns per
8x8 block, full qpel coverage at 1080p is ~2-3 ms of total work,
well inside budget.
98 lines
4.8 KiB
C
98 lines
4.8 KiB
C
/*
 * Standalone bit-exact C references for the 12 remaining avg_
 * biprediction qpel positions (B-slice list0 + list1 averaging):
 *   4 quarter-axis: avg_mc{10,30,01,03}
 *   8 diagonals   : avg_mc{11,12,13,21,23,31,32,33}
 *
 * Each is the put_ formula (per H.264 §8.4.2.2.1 / Table 8-4) with
 * a final L2 average against the existing dst contents per §8.4.2.3.1.
 * Caller pre-loads dst with the list0 prediction; the avg_ call
 * folds in list1.
 *
 * Mirrors FFmpeg's `ff_avg_h264_qpel8_mc{XY}_neon` (in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
 * — same `\type=avg` expansion as the put_ functions).
 *
 * License: LGPL-2.1-or-later.
 */
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
|
|
/* Clamp v to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
|
|
/* Mean of two 8-bit samples with ties rounded up: (a + b + 1) >> 1. */
static inline uint8_t avg2(uint8_t a, uint8_t b)
{
    const unsigned sum = (unsigned) a + (unsigned) b + 1u;

    return (uint8_t) (sum >> 1);
}
|
|
|
|
/*
 * Horizontal half-sample value on row r, midway between columns c and c+1.
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to columns c-2 .. c+3 of
 * row r, adds the rounding term 16, shifts right by 5 and clamps to 8 bits.
 * The caller must make sure those six samples are readable.
 *
 * NOTE: the sum can be negative before the shift; the result then relies on
 * >> being an arithmetic shift (implementation-defined in ISO C) so that
 * clip_u8() still sees a negative value and returns 0.
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    const uint8_t *px = &s[r*stride + c-2];   /* leftmost tap */
    int acc = 16;                             /* rounding term */

    for (int k = 0; k < 6; k++) {
        acc += taps[k] * (int) px[k];
    }
    return (uint8_t) clip_u8(acc >> 5);
}
|
|
/*
 * Vertical half-sample value in column c, midway between rows r and r+1.
 *
 * Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to rows r-2 .. r+3 of
 * column c, adds the rounding term 16, shifts right by 5 and clamps to
 * 8 bits. The caller must make sure those six samples are readable.
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    const uint8_t *px = &s[(r-2)*stride + c];   /* topmost tap */
    int acc = 16;                               /* rounding term */

    for (int k = 0; k < 6; k++) {
        acc += taps[k] * (int) px[k*stride];
    }
    return (uint8_t) clip_u8(acc >> 5);
}
|
|
/*
 * Centre half-sample value (half a sample right of column c and half a
 * sample below row r).
 *
 * The 6-tap kernel (1, -5, 20, 20, -5, 1) is first run horizontally over
 * columns c-2 .. c+3 for each of the rows r-2 .. r+3, keeping the
 * unrounded, unclamped sums. The same kernel is then run vertically over
 * those six sums; the result gets a single rounding term of 512, is
 * shifted right by 10 and clamped to 8 bits. The caller must make sure the
 * whole 6x6 neighbourhood is readable.
 */
static inline uint8_t hpel_hv(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    static const int taps[6] = { 1, -5, 20, 20, -5, 1 };
    int acc = 512;   /* rounding term for the combined >> 10 */

    for (int j = 0; j < 6; j++) {
        const uint8_t *row = &s[(r - 2 + j)*stride + c-2];
        int row_sum = 0;   /* unrounded horizontal filter output */

        for (int k = 0; k < 6; k++) {
            row_sum += taps[k] * (int) row[k];
        }
        acc += taps[j] * row_sum;
    }
    return (uint8_t) clip_u8(acc >> 10);
}
|
|
|
|
/*
 * Quarter-sample positions that lie on the horizontal or vertical axis.
 *
 * DEFINE_AVG_QUARTER(NAME, A_EXPR, INT_EXPR) emits
 *     void daedalus_avg_h264_qpel8_<NAME>_ref(dst, src, stride)
 * which, for every sample of an 8x8 block:
 *   1. evaluates A_EXPR (a half-sample value) and INT_EXPR (a full-sample
 *      source pixel) and takes their rounded average;
 *   2. rounded-averages that prediction with the value already in dst and
 *      stores the result back into dst.
 * Both expressions are expanded inside the loops, so they may refer to
 * src, stride, r (row, 0..7) and c (column, 0..7). dst and src are
 * addressed with the same stride.
 */
#define DEFINE_AVG_QUARTER(NAME, A_EXPR, INT_EXPR)                        \
void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,               \
                                              const uint8_t *src,         \
                                              ptrdiff_t stride)           \
{                                                                         \
    for (int r = 0; r < 8; r++) {                                         \
        uint8_t *out_row = dst + r*stride;                                \
        for (int c = 0; c < 8; c++) {                                     \
            const uint8_t half_pel = (A_EXPR);                            \
            const uint8_t full_pel = (INT_EXPR);                          \
            out_row[c] = avg2(out_row[c], avg2(half_pel, full_pel));      \
        }                                                                 \
    }                                                                     \
}

/* Horizontal half-sample averaged with the pixel at column c. */
DEFINE_AVG_QUARTER(mc10, hpel_h(src, r, c, stride), src[r*stride + c])
/* Horizontal half-sample averaged with the pixel at column c+1. */
DEFINE_AVG_QUARTER(mc30, hpel_h(src, r, c, stride), src[r*stride + c + 1])
/* Vertical half-sample averaged with the pixel at row r. */
DEFINE_AVG_QUARTER(mc01, hpel_v(src, r, c, stride), src[r*stride + c])
/* Vertical half-sample averaged with the pixel at row r+1. */
DEFINE_AVG_QUARTER(mc03, hpel_v(src, r, c, stride), src[(r + 1)*stride + c])

#undef DEFINE_AVG_QUARTER
|
|
|
|
/*
 * Off-axis ("diagonal") quarter-sample positions.
 *
 * DEFINE_AVG_DIAG(NAME, A_EXPR, B_EXPR) emits
 *     void daedalus_avg_h264_qpel8_<NAME>_ref(dst, src, stride)
 * which, for every sample of an 8x8 block:
 *   1. evaluates A_EXPR and B_EXPR (two half-sample values) and takes
 *      their rounded average;
 *   2. rounded-averages that prediction with the value already in dst and
 *      stores the result back into dst.
 * Both expressions are expanded inside the loops, so they may refer to
 * src, stride, r (row, 0..7) and c (column, 0..7). dst and src are
 * addressed with the same stride.
 */
#define DEFINE_AVG_DIAG(NAME, A_EXPR, B_EXPR)                             \
void daedalus_avg_h264_qpel8_ ## NAME ## _ref(uint8_t *dst,               \
                                              const uint8_t *src,         \
                                              ptrdiff_t stride)           \
{                                                                         \
    for (int r = 0; r < 8; r++) {                                         \
        uint8_t *out_row = dst + r*stride;                                \
        for (int c = 0; c < 8; c++) {                                     \
            const uint8_t first  = (A_EXPR);                              \
            const uint8_t second = (B_EXPR);                              \
            out_row[c] = avg2(out_row[c], avg2(first, second));           \
        }                                                                 \
    }                                                                     \
}

/* Horizontal half-sample on row r   + vertical half-sample in column c.   */
DEFINE_AVG_DIAG(mc11, hpel_h(src, r, c, stride),   hpel_v(src, r, c, stride))
/* Centre half-sample                + vertical half-sample in column c.   */
DEFINE_AVG_DIAG(mc12, hpel_hv(src, r, c, stride),  hpel_v(src, r, c, stride))
/* Horizontal half-sample on row r+1 + vertical half-sample in column c.   */
DEFINE_AVG_DIAG(mc13, hpel_h(src, r+1, c, stride), hpel_v(src, r, c, stride))
/* Centre half-sample                + horizontal half-sample on row r.    */
DEFINE_AVG_DIAG(mc21, hpel_hv(src, r, c, stride),  hpel_h(src, r, c, stride))
/* Centre half-sample                + horizontal half-sample on row r+1.  */
DEFINE_AVG_DIAG(mc23, hpel_hv(src, r, c, stride),  hpel_h(src, r+1, c, stride))
/* Horizontal half-sample on row r   + vertical half-sample in column c+1. */
DEFINE_AVG_DIAG(mc31, hpel_h(src, r, c, stride),   hpel_v(src, r, c+1, stride))
/* Centre half-sample                + vertical half-sample in column c+1. */
DEFINE_AVG_DIAG(mc32, hpel_hv(src, r, c, stride),  hpel_v(src, r, c+1, stride))
/* Horizontal half-sample on row r+1 + vertical half-sample in column c+1. */
DEFINE_AVG_DIAG(mc33, hpel_h(src, r+1, c, stride), hpel_v(src, r, c+1, stride))

#undef DEFINE_AVG_DIAG
|