h264: qpel single-axis quarter-pel — mc10/mc30/mc01/mc03 (CPU/NEON)
Closes the 4 single-axis quarter-pel positions in one PR. Each is
a half-pel lowpass clipped to u8 followed by L2 rounded-average
with an integer-aligned source pixel per H.264 §8.4.2.2.1:
mc10 ¼-H ("a" pos): clip255(mc20(s)) avg src[r,c]
mc30 ¾-H ("c" pos): clip255(mc20(s)) avg src[r,c+1]
mc01 ¼-V ("d" pos): clip255(mc02(s)) avg src[r,c]
mc03 ¾-V ("n" pos): clip255(mc02(s)) avg src[r+1,c]
The mc10/mc30 pair and mc01/mc03 pair only differ in WHICH integer
source pixel they average with — the half-pel computation is the
same. Putting them in one PR is justified by that uniformity.
Scope:
- 4 new kernel enums: MC10=19, MC30=20, MC01=21, MC03=22 → CPU.
- 4 NEON externs for the vendored ff_put_h264_qpel8_mc{10,30,01,03}_neon.
- 4 CPU dispatch wrappers via DEFINE_QPEL_CPU_DISPATCH macro
(collapses ~50 LOC of repetition).
- 4 public dispatch fns via DEFINE_QPEL_DISPATCH macro.
- 4 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- tests/h264_qpel8_quarter_axis_ref.c covers all four via shared
hpel_h() / hpel_v() inlines + per-mode L2 average.
- Test refactor: generic run_quarter_axis_qpel() harness exercises
all 4 positions through a single helper (~50 LOC for 4 tests vs
~200 if each was hand-rolled).
Verified on hertz:
$ ./build/test_api_h264 | tail -8
H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc22: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc10: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc30: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc01: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel mc03: 2048/2048 bytes bit-exact (100.0000%)
All 4 new positions bit-exact PASS first try.
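Coverage matrix update (position name is mc<x><y>; columns = horizontal
quarter-pel offset x, rows = vertical quarter-pel offset y;
✓ = done before this PR, + = added by this PR, — = not yet done):
put_   x=0  x=1  x=2  x=3
y=0     —    +    ✓    +     ← mc10 / mc20 / mc30
y=1     +    —    —    —     ← mc01
y=2     ✓    —    ✓    —     ← mc02 + mc22 anchor
y=3     +    —    —    —     ← mc03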
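After this PR: 7 of 16 positions done (the six single-axis positions
plus the mc22 centre).
Remaining 9 are the integer-position copy (mc00) plus the eight
off-axis quarter-pel combinations
(mc11/mc12/mc13/mc21/mc23/mc31/mc32/mc33). The four diagonal ones
(mc11/mc13/mc31/mc33) L2-average a horizontal with a vertical
1D-lowpass output; the other four (mc12/mc21/mc23/mc32) L2-average the
2D-lowpass centre (mc22) with a 1D-lowpass output.
Next PR scope.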
Why no QPU shaders: same R-band logic as the prior CPU additions.
At ~10 ns per 8x8 NEON block, all 16 qpel positions together
would land in ~1.3 ms/frame at 1080p worst case — comfortably
inside the 33 ms budget. QPU shader for mc20 already exists
(cycle 9 / v3d_h264_qpel_mc20.spv); the other 15 follow once a
clear perf reason emerges.
This commit is contained in:
@@ -525,6 +525,7 @@ add_executable(test_api_h264
|
|||||||
tests/h264_qpel8_mc20_ref.c
|
tests/h264_qpel8_mc20_ref.c
|
||||||
tests/h264_qpel8_mc02_ref.c
|
tests/h264_qpel8_mc02_ref.c
|
||||||
tests/h264_qpel8_mc22_ref.c
|
tests/h264_qpel8_mc22_ref.c
|
||||||
|
tests/h264_qpel8_quarter_axis_ref.c
|
||||||
)
|
)
|
||||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||||
|
|||||||
@@ -436,6 +436,45 @@ int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
/* H.264 luma single-axis quarter-pel qpel positions ("put"):
|
||||||
|
* mc10 ¼-H ("a" position): clip255(mc20(s)) avg src[r,c]
|
||||||
|
* mc30 ¾-H ("c" position): clip255(mc20(s)) avg src[r,c+1]
|
||||||
|
* mc01 ¼-V ("d" position): clip255(mc02(s)) avg src[r,c]
|
||||||
|
* mc03 ¾-V ("n" position): clip255(mc02(s)) avg src[r+1,c]
|
||||||
|
*
|
||||||
|
* Each is a half-pel lowpass clipped to u8 then averaged with an
|
||||||
|
* integer-aligned source pixel (rounded +1 >> 1). Same edge
|
||||||
|
* context contract as mc20/mc02. CPU-only for now; QPU shaders
|
||||||
|
* not yet implemented. An explicit SUBSTRATE_QPU request returns -1
* when the context has a QPU, and falls back to CPU when it has none.
|
||||||
|
*/
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc10(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_qpel_mc10(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc30(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_qpel_mc30(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc01(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_qpel_mc01(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc03(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_qpel_mc03(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* Recipe query — what does the API recommend for each kernel?
|
* Recipe query — what does the API recommend for each kernel?
|
||||||
* ----------------------------------------------------------------- */
|
* ----------------------------------------------------------------- */
|
||||||
@@ -458,6 +497,10 @@ typedef enum {
|
|||||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||||
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
||||||
DAEDALUS_KERNEL_H264_QPEL_MC22 = 18,
|
DAEDALUS_KERNEL_H264_QPEL_MC22 = 18,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC10 = 19,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC30 = 20,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC01 = 21,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC03 = 22,
|
||||||
} daedalus_kernel;
|
} daedalus_kernel;
|
||||||
|
|
||||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||||
|
|||||||
@@ -140,6 +140,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
|||||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC22: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc22 shader pending (hv lowpass) */
|
case DAEDALUS_KERNEL_H264_QPEL_MC22: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc22 shader pending (hv lowpass) */
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC10: return DAEDALUS_SUBSTRATE_CPU; /* ¼-H L2 */
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC30: return DAEDALUS_SUBSTRATE_CPU; /* ¾-H L2 */
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC01: return DAEDALUS_SUBSTRATE_CPU; /* ¼-V L2 */
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC03: return DAEDALUS_SUBSTRATE_CPU; /* ¾-V L2 */
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU;
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
@@ -184,6 +188,14 @@ extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
|||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
|
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
extern void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
|
||||||
/* -------------------- CPU dispatch implementations -------------- */
|
/* -------------------- CPU dispatch implementations -------------- */
|
||||||
|
|
||||||
@@ -437,6 +449,28 @@ static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The four single-axis quarter-pel CPU dispatches are uniform; the
|
||||||
|
* macro collapses ~50 LOC of repetition. */
|
||||||
|
#define DEFINE_QPEL_CPU_DISPATCH(suffix, neon_fn) \
|
||||||
|
static int dispatch_h264_qpel_ ## suffix ## _cpu(daedalus_ctx *ctx, \
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||||||
|
{ \
|
||||||
|
(void) ctx; \
|
||||||
|
for (size_t i = 0; i < n_blocks; i++) { \
|
||||||
|
neon_fn(dst + meta[i].dst_off, src + meta[i].src_off, \
|
||||||
|
(ptrdiff_t) stride); \
|
||||||
|
} \
|
||||||
|
return 0; \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_QPEL_CPU_DISPATCH(mc10, ff_put_h264_qpel8_mc10_neon)
|
||||||
|
DEFINE_QPEL_CPU_DISPATCH(mc30, ff_put_h264_qpel8_mc30_neon)
|
||||||
|
DEFINE_QPEL_CPU_DISPATCH(mc01, ff_put_h264_qpel8_mc01_neon)
|
||||||
|
DEFINE_QPEL_CPU_DISPATCH(mc03, ff_put_h264_qpel8_mc03_neon)
|
||||||
|
|
||||||
|
#undef DEFINE_QPEL_CPU_DISPATCH
|
||||||
|
|
||||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -1436,6 +1470,28 @@ int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
|
return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define DEFINE_QPEL_DISPATCH(suffix, kernel) \
|
||||||
|
int daedalus_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \
|
||||||
|
daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||||||
|
{ \
|
||||||
|
daedalus_substrate eff = sub; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||||||
|
eff = daedalus_recipe_substrate_for(kernel); \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
|
||||||
|
return dispatch_h264_qpel_ ## suffix ## _cpu(ctx, dst, src, stride, \
|
||||||
|
n_blocks, meta); \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_QPEL_DISPATCH(mc10, DAEDALUS_KERNEL_H264_QPEL_MC10)
|
||||||
|
DEFINE_QPEL_DISPATCH(mc30, DAEDALUS_KERNEL_H264_QPEL_MC30)
|
||||||
|
DEFINE_QPEL_DISPATCH(mc01, DAEDALUS_KERNEL_H264_QPEL_MC01)
|
||||||
|
DEFINE_QPEL_DISPATCH(mc03, DAEDALUS_KERNEL_H264_QPEL_MC03)
|
||||||
|
|
||||||
|
#undef DEFINE_QPEL_DISPATCH
|
||||||
|
|
||||||
/* -------------------- Recipe convenience wrappers --------------- */
|
/* -------------------- Recipe convenience wrappers --------------- */
|
||||||
|
|
||||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||||
@@ -1570,3 +1626,19 @@ int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
|
|||||||
return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
return daedalus_dispatch_h264_qpel_mc22(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||||
dst, src, stride, n_blocks, meta);
|
dst, src, stride, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define DEFINE_QPEL_RECIPE(suffix) \
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_ ## suffix(daedalus_ctx *ctx, \
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride, \
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta) \
|
||||||
|
{ \
|
||||||
|
return daedalus_dispatch_h264_qpel_ ## suffix(ctx, DAEDALUS_SUBSTRATE_AUTO,\
|
||||||
|
dst, src, stride, n_blocks, meta); \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_QPEL_RECIPE(mc10)
|
||||||
|
DEFINE_QPEL_RECIPE(mc30)
|
||||||
|
DEFINE_QPEL_RECIPE(mc01)
|
||||||
|
DEFINE_QPEL_RECIPE(mc03)
|
||||||
|
|
||||||
|
#undef DEFINE_QPEL_RECIPE
|
||||||
|
|||||||
@@ -0,0 +1,82 @@
|
|||||||
|
/*
|
||||||
|
* Standalone bit-exact C references for the four single-axis quarter-
|
||||||
|
* pel luma qpel positions (H.264 §8.4.2.2.1, "put" variants). Each
|
||||||
|
* is a half-pel lowpass clipped to u8 followed by an L2 rounded-average
|
||||||
|
* with an integer-position source pixel.
|
||||||
|
*
|
||||||
|
* mc10 ("a" pos, ¼ horiz): a = clip255(mc20(s)); dst = (a + s[r,c] + 1) >> 1
|
||||||
|
* mc30 ("c" pos, ¾ horiz): a = clip255(mc20(s)); dst = (a + s[r,c+1] + 1) >> 1
|
||||||
|
* mc01 ("d" pos, ¼ vert ): a = clip255(mc02(s)); dst = (a + s[r, c] + 1) >> 1
|
||||||
|
* mc03 ("n" pos, ¾ vert ): a = clip255(mc02(s)); dst = (a + s[r+1,c] + 1) >> 1
|
||||||
|
*
|
||||||
|
* Mirror FFmpeg's `ff_put_h264_qpel8_mc{10,30,01,03}_neon` (in
|
||||||
|
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||||
|
* lines 587, 603, 611, 729 — each tail-calls the corresponding
|
||||||
|
* lowpass_l2 helper).
|
||||||
|
*
|
||||||
|
* Same single-stride convention as mc20/mc02 — dst and src share the
|
||||||
|
* same stride; src + src_off points at row 0 col 0 of the output
|
||||||
|
* block, with appropriate edge context already in-buffer.
|
||||||
|
*
|
||||||
|
* License: LGPL-2.1-or-later.
|
||||||
|
*/
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/* Clamp v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/*
 * Finish a 6-tap half-pel filter: clip255((sum + 16) >> 5).
 *
 * With taps (1, -5, 20, 20, -5, 1) on u8 input the sum spans
 * -2550..+10710, so it can be negative. Right-shifting a negative int is
 * implementation-defined in C, so negative values are clamped to 0 before
 * the shift. That is the same result an arithmetic shift followed by the
 * clip produces, but it no longer depends on the compiler.
 */
static inline uint8_t hpel_round_clip(int sum)
{
    const int biased = sum + 16;

    if (biased < 0)
        return 0;
    return (uint8_t) clip_u8(biased >> 5);
}

/*
 * Horizontal half-pel sample at (r, c) — same filter as mc20.
 * Reads s[r, c-2 .. c+3]; the caller must make those columns addressable.
 */
static inline uint8_t hpel_h(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *p = s + (r * stride + c);
    const int sum = (int) p[-2] - 5 * (int) p[-1]
                  + 20 * (int) p[0] + 20 * (int) p[1]
                  - 5 * (int) p[2] + (int) p[3];

    return hpel_round_clip(sum);
}

/*
 * Vertical half-pel sample at (r, c) — same filter as mc02.
 * Reads s[r-2 .. r+3, c]; the caller must make those rows addressable.
 */
static inline uint8_t hpel_v(const uint8_t *s, int r, int c, ptrdiff_t stride)
{
    const uint8_t *p = s + (r * stride + c);
    const int sum = (int) p[-2 * stride] - 5 * (int) p[-stride]
                  + 20 * (int) p[0] + 20 * (int) p[stride]
                  - 5 * (int) p[2 * stride] + (int) p[3 * stride];

    return hpel_round_clip(sum);
}
|
||||||
|
|
||||||
|
/*
 * mc10 ("a" position, 1/4 horizontal), 8x8 "put" reference.
 * Each output is the rounded average of the horizontal half-pel sample and
 * the integer sample at the same (row, col). dst and src share one stride.
 */
void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *full = src + row * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            const int half = hpel_h(src, row, col, stride);

            out[col] = (uint8_t) ((half + full[col] + 1) >> 1);
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * mc30 ("c" position, 3/4 horizontal), 8x8 "put" reference.
 * Each output is the rounded average of the horizontal half-pel sample at
 * (row, col) and the integer sample one column to the right, (row, col+1).
 * dst and src share one stride.
 */
void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *right = src + row * stride + 1;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            const int half = hpel_h(src, row, col, stride);

            out[col] = (uint8_t) ((half + right[col] + 1) >> 1);
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * mc01 ("d" position, 1/4 vertical), 8x8 "put" reference.
 * Each output is the rounded average of the vertical half-pel sample and
 * the integer sample at the same (row, col). dst and src share one stride.
 */
void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *full = src + row * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            const int half = hpel_v(src, row, col, stride);

            out[col] = (uint8_t) ((half + full[col] + 1) >> 1);
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * mc03 ("n" position, 3/4 vertical), 8x8 "put" reference.
 * Each output is the rounded average of the vertical half-pel sample at
 * (row, col) and the integer sample one row below, (row+1, col).
 * dst and src share one stride.
 */
void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    for (int row = 0; row < 8; row++) {
        const uint8_t *below = src + (row + 1) * stride;
        uint8_t *out = dst + row * stride;

        for (int col = 0; col < 8; col++) {
            const int half = hpel_v(src, row, col, stride);

            out[col] = (uint8_t) ((half + below[col] + 1) >> 1);
        }
    }
}
|
||||||
@@ -36,6 +36,14 @@ extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
|||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
|
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
|
||||||
@@ -483,6 +491,63 @@ static int test_qpel_mc22(void)
|
|||||||
return diff == 0 ? 0 : 1;
|
return diff == 0 ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Generic harness for the 4 single-axis quarter-pel positions; same
|
||||||
|
* tile geometry as mc22 since each one reads the largest of the H/V
|
||||||
|
* lowpass windows (mc10/mc30 need cols -2..+3, mc01/mc03 need rows
|
||||||
|
* -2..+3; the averaged integer sample, at offset 0 or +1, lies inside that window). */
|
||||||
|
typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst,
|
||||||
|
const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
static int run_quarter_axis_qpel(const char *name,
|
||||||
|
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||||||
|
{
|
||||||
|
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||||
|
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||||
|
SRC_ROW = 3, SRC_COL = 3 };
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) return 1;
|
||||||
|
|
||||||
|
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||||
|
daedalus_h264_qpel_meta meta[N];
|
||||||
|
|
||||||
|
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
memset(dst, 0, sizeof(dst));
|
||||||
|
memset(dst_ref, 0, sizeof(dst_ref));
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||||
|
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++)
|
||||||
|
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||||||
|
|
||||||
|
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||||||
|
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||||||
|
int diff = 0;
|
||||||
|
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||||
|
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||||
|
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diff == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int test_qpel_quarter_axis_all(void)
|
||||||
|
{
|
||||||
|
int fail = 0;
|
||||||
|
fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc10);
|
||||||
|
fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc30);
|
||||||
|
fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc01);
|
||||||
|
fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc03);
|
||||||
|
return fail;
|
||||||
|
}
|
||||||
|
|
||||||
int main(void)
|
int main(void)
|
||||||
{
|
{
|
||||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||||
@@ -515,5 +580,6 @@ int main(void)
|
|||||||
fail |= test_qpel_mc20();
|
fail |= test_qpel_mc20();
|
||||||
fail |= test_qpel_mc02();
|
fail |= test_qpel_mc02();
|
||||||
fail |= test_qpel_mc22();
|
fail |= test_qpel_mc22();
|
||||||
|
fail |= test_qpel_quarter_axis_all();
|
||||||
return fail;
|
return fail;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user