h264: qpel mc02 (vertical half-pel, CPU/NEON)
Mirror of cycle 9's mc20, transposed to vertical orientation. Wires
up the second qpel half-pel position via the vendored
ff_put_h264_qpel8_mc02_neon symbol and closes the "missing vertical
sibling" gap that mc20 has left open since cycle 9.
Scope:
- Public API: daedalus_dispatch_h264_qpel_mc02 + recipe wrapper.
- Internal: dispatch_h264_qpel_mc02_cpu calling the NEON entry.
- Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC02 = 17 → CPU.
Explicit SUBSTRATE_QPU returns -1 (no shader yet).
- C reference: tests/h264_qpel8_mc02_ref.c — vertical 6-tap
transpose of mc20 (reads src[(r±N)*stride + c] instead of
src[r*stride + c±N]).
- Test: test_qpel_mc02 in test_api_h264, 8 tiles × 16 cols
× 16 rows (2048 bytes), random input, bit-exact compare against the C ref.
Verified on hertz:
$ ./build/test_api_h264
...
H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)
All 12 H.264 kernels in the api_smoke now bit-exact PASS.
Why CPU-only: same R-band logic as the deblock _h sibling pattern.
mc02 at ~7.6 ns per 8x8 block on NEON (per the cycle 9 baseline
measurements) gives ~700 us for 8160 MBs × 4 8x8 luma blocks at
1080p — comfortably inside the 33 ms budget. QPU shader is a
fast-follow once the V vs H shader work is consolidated (the
transpose for the V shader is not mechanical — different SIMD
access pattern than the H shader).
Coverage matrix update:
qpel position    put_ status    avg_ status
-------------    -----------    -----------
mc00 (copy)      not wired      not wired
mc10 (¼-H)       not wired      not wired
mc20 (½-H)       ✓ QPU+CPU      not wired
mc30 (¾-H)       not wired      not wired
mc01 (¼-V)       not wired      not wired
mc02 (½-V)       ✓ CPU          not wired    (this PR)
mc03 (¾-V)       not wired      not wired
mc11..mc33       not wired      not wired
13 more qpel positions to go for the full put_ matrix. Adding them
follows the same template; each is a small contained PR.
This commit is contained in:
@@ -138,6 +138,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -178,6 +179,8 @@ extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stri
|
||||
int alpha, int beta);
|
||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -405,6 +408,19 @@ static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CPU path for the H.264 qpel mc02 kernel.
 *
 * For each of the n_blocks entries in meta, calls the vendored
 * ff_put_h264_qpel8_mc02_neon routine with dst advanced by that
 * entry's dst_off, src advanced by its src_off, and the shared stride.
 *
 * ctx is accepted only for signature parity and is not used here.
 * Always returns 0.
 */
static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    /* The NEON entry point takes a signed stride; convert once up front. */
    const ptrdiff_t line_stride = (ptrdiff_t) stride;
    const daedalus_h264_qpel_meta *blk = meta;
    size_t remaining = n_blocks;

    (void) ctx;

    while (remaining > 0) {
        uint8_t *out = dst + blk->dst_off;
        const uint8_t *in = src + blk->src_off;

        ff_put_h264_qpel8_mc02_neon(out, in, line_stride);

        blk++;
        remaining--;
    }

    return 0;
}
|
||||
|
||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||
|
||||
typedef struct {
|
||||
@@ -1376,6 +1392,20 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
n_blocks, meta);
|
||||
}
|
||||
|
||||
/*
 * Dispatch the H.264 qpel mc02 kernel on the requested substrate.
 *
 * ctx       dispatch context, forwarded to the CPU implementation.
 * sub       requested substrate; DAEDALUS_SUBSTRATE_AUTO is resolved through
 *           daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02).
 * dst, src  base pointers that the per-block offsets in meta are added to.
 * stride    stride forwarded to the kernel.
 * n_blocks  number of entries in meta.
 * meta      per-block descriptors.
 *
 * Returns -1 when the effective substrate is DAEDALUS_SUBSTRATE_QPU, since
 * there is no mc02 QPU shader yet; otherwise returns the result of the CPU
 * dispatch.
 */
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    daedalus_substrate eff = sub;

    if (eff == DAEDALUS_SUBSTRATE_AUTO) {
        eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02);
    }

    /*
     * No mc02 QPU shader yet — a QPU request fast-fails. This check is made
     * regardless of whether the context has a QPU: falling back to the CPU
     * first would make the same explicit request succeed on QPU-less
     * contexts and fail on QPU-capable ones.
     */
    if (eff == DAEDALUS_SUBSTRATE_QPU) {
        return -1;
    }

    return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||||
|
||||
/* -------------------- Recipe convenience wrappers --------------- */
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||
@@ -1494,3 +1524,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, src, stride, n_blocks, meta);
|
||||
}
|
||||
|
||||
/*
 * Recipe convenience wrapper for the H.264 qpel mc02 kernel.
 *
 * Forwards every argument unchanged to daedalus_dispatch_h264_qpel_mc02
 * with the substrate fixed to DAEDALUS_SUBSTRATE_AUTO, and returns that
 * call's result as-is.
 */
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src, size_t stride,
                                            size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const int rc = daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                                    dst, src, stride,
                                                    n_blocks, meta);
    return rc;
}
|
||||
|
||||
Reference in New Issue
Block a user