h264: qpel mc02 (vertical half-pel, CPU/NEON)
Mirror of cycle 9's mc20, transposed to vertical orientation. Wires
up the second qpel half-pel position via the vendored
ff_put_h264_qpel8_mc02_neon symbol and closes the "missing vertical
sibling" gap that mc20 has left open since cycle 9.
Scope:
- Public API: daedalus_dispatch_h264_qpel_mc02 + recipe wrapper.
- Internal: dispatch_h264_qpel_mc02_cpu calling the NEON entry.
- Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC02 = 17 → CPU.
Explicit SUBSTRATE_QPU returns -1 (no shader yet).
- C reference: tests/h264_qpel8_mc02_ref.c — vertical 6-tap
transpose of mc20 (reads src[(r±N)*stride + c] instead of
src[r*stride + c±N]).
- Test: test_qpel_mc02 in test_api_h264, 8 tiles × 16 cols
× 16 rows (2048 bytes), random input, bit-exact compare against the C ref.
Verified on hertz:
$ ./build/test_api_h264
...
H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
All 12 H.264 kernels in the api_smoke now bit-exact PASS.
Why CPU-only: same R-band logic as the deblock _h sibling pattern.
mc02 at ~7.6 ns per 8x8 block on NEON (per the cycle 9 baseline
measurements) gives ~700 us for 8160 MBs × 4 8x8 luma blocks at
1080p — comfortably inside the 33 ms budget. QPU shader is a
fast-follow once the V vs H shader work is consolidated (the
transpose for the V shader is not mechanical — different SIMD
access pattern than the H shader).
Coverage matrix update:
qpel position   put_ status   avg_ status
-------------   -----------   -----------
mc00 (copy)     not wired     not wired
mc10 (¼-H)      not wired     not wired
mc20 (½-H)      ✓ QPU+CPU     not wired
mc30 (¾-H)      not wired     not wired
mc01 (¼-V)      not wired     not wired
mc02 (½-V)      ✓ CPU         not wired   (this PR)
mc03 (¾-V)      not wired     not wired
mc11..mc33      not wired     not wired
13 more qpel positions to go for the full put_ matrix. Adding them
follows the same template; each is a small contained PR.
This commit is contained in:
@@ -32,6 +32,8 @@ extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -399,6 +401,46 @@ static int test_qpel_mc20(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc02(void)
|
||||
{
|
||||
/* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel
|
||||
* can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
|
||||
* SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
|
||||
* the tile) and rows 8..10 below (rows 11..13). */
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
TILE_STRIDE);
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
|
||||
TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||
@@ -429,5 +471,6 @@ int main(void)
|
||||
fail |= test_deblock_chroma_h();
|
||||
fail |= test_deblock_intra_all();
|
||||
fail |= test_qpel_mc20();
|
||||
fail |= test_qpel_mc02();
|
||||
return fail;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user