h264: qpel avg anchors (avg_mc20/02/22, biprediction support)
Begins the avg_ qpel buildout for B-slice biprediction. Each avg_
form computes the same half-pel formula as its put_ sibling, then
L2-averages the result with the existing dst contents: the caller
pre-loads dst with the list0 prediction, and the avg_ call averages
the list1 prediction into it, per H.264 §8.4.2.3.1.
Scope (3 anchors, sets the pattern for the remaining 13 avg_
variants):
- 3 new kernel enums (AVG_MC20=31, AVG_MC02=32, AVG_MC22=33) → CPU.
- 3 NEON externs for the vendored ff_avg_h264_qpel8_{mc20,mc02,mc22}_neon.
- 3 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro
(the macro is type-agnostic so it didn't need changes for avg_).
- 3 public dispatches via DEFINE_QPEL_DISPATCH macro.
- 3 recipe wrappers via DEFINE_QPEL_RECIPE macro.
- tests/h264_qpel8_avg_anchors_ref.c — per-cell helpers + L2 avg.
- Test harness: run_avg_qpel() seeds dst with random content so
the L2 averaging is actually exercised (not just put_-style
overwrite that would silently pass).
Verified on hertz:
$ ./build/test_api_h264 | tail -3
H.264 qpel avg_mc20: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc02: 2048/2048 bytes bit-exact (100.0000%)
H.264 qpel avg_mc22: 2048/2048 bytes bit-exact (100.0000%)
All 3 anchors bit-exact PASS first try.
Why anchors only in this PR: the avg_ pattern is uniform across all
16 positions (each is just "put_ result + L2 with dst"). Landing
the anchors first confirms the macro pattern works for both put_
and avg_; the remaining 13 (avg_mc10/30/01/03 + avg_mc11..33) follow
the same template in a follow-up PR.
State of the qpel matrix after this PR:
put_ : 15 of 16 positions ✓ (mc00 is integer copy, no wrapper)
avg_ : 3 of 16 positions ✓ (mc20, mc02, mc22 anchors)
13 follow-up positions
This commit is contained in:
@@ -52,6 +52,9 @@ extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, p
|
||||
extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
/* Reference implementations for the three avg_ anchor positions
 * (mc20, mc02, mc22). They share the put_ reference signature
 * (dst, src, stride) and are passed as the `ref` callback to
 * run_avg_qpel() by test_qpel_avg_anchors().
 * NOTE(review): presumably defined in tests/h264_qpel8_avg_anchors_ref.c
 * per the commit description — confirm. */
extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -583,6 +586,62 @@ static int test_qpel_diag_all(void)
|
||||
return fail;
|
||||
}
|
||||
|
||||
/* Avg-form harness: pre-loads dst + dst_ref with the same random
|
||||
* content so we can verify the L2 averaging is happening (not just
|
||||
* put_-style overwrite). If the dispatch incorrectly overwrote
|
||||
* dst, the bit-exact compare would still catch the mismatch against
|
||||
* the avg_ reference. */
|
||||
static int run_avg_qpel(const char *name,
|
||||
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||||
{
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3, SRC_COL = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
/* Two random buffers: src for the qpel input, dst seeded with
|
||||
* different random content as the "list0 prediction" — both
|
||||
* dst and dst_ref get the SAME seed so the avg compare is fair. */
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < TOTAL; i++) {
|
||||
uint8_t v = (uint8_t)(xs() & 0xff);
|
||||
dst[i] = dst_ref[i] = v;
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||||
|
||||
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_avg_anchors(void)
|
||||
{
|
||||
int fail = 0;
|
||||
fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc20);
|
||||
fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc02);
|
||||
fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref,
|
||||
daedalus_recipe_dispatch_h264_qpel_avg_mc22);
|
||||
return fail;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||
@@ -617,5 +676,6 @@ int main(void)
|
||||
fail |= test_qpel_mc22();
|
||||
fail |= test_qpel_quarter_axis_all();
|
||||
fail |= test_qpel_diag_all();
|
||||
fail |= test_qpel_avg_anchors();
|
||||
return fail;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user