Phase 8c: H.264 luma qpel mc20 through public API
Extends daedalus-fourier with daedalus_recipe_dispatch_h264_qpel_mc20
so libavcodec.so can route H264QpelContext.put_h264_qpel_pixels_tab[1][2]
through the recipe layer instead of ff_put_h264_qpel8_mc20_neon directly.
API additions (header + library):
- daedalus_h264_qpel_meta { dst_off, src_off }
- daedalus_dispatch_h264_qpel_mc20(ctx, sub, dst, src, stride,
n_blocks, meta)
- daedalus_recipe_dispatch_h264_qpel_mc20(...) (AUTO wrapper)
- DAEDALUS_KERNEL_H264_QPEL_MC20 = 9 added to the recipe-query enum (daedalus_kernel)
- daedalus_recipe_substrate_for() returns DAEDALUS_SUBSTRATE_CPU (the NEON path) for DAEDALUS_KERNEL_H264_QPEL_MC20 (cycle 9)
The 6-tap horizontal half-pel filter signature matches FFmpeg's
H264QpelContext convention exactly: dst and src share a single stride,
and src already points at output column 0 (the filter reads cols -2..+3
relative to each output column).
The API is single-stride so that the marfrit-packages FFmpeg shim is a
straight pointer pass-through, with no buffer rearrangement.
Verdict per docs/k9_h264qpel_mc20.md: CPU NEON. Per-block 7.6 ns
gives 135x margin over 30 fps 1080p; QPU dispatch floor at ~250 ns
makes any V3D shader strictly worse. Recipe table reflects that —
the recipe_dispatch entry is a one-line forward to the CPU path.
CMakeLists changes:
- h264qpel_neon.S added to the daedalus_core static lib (only the
bench targets owned it before; now the public API needs it too)
- tests/h264_qpel8_mc20_ref.c added to the test_api_h264 target
Phase 8a/8b smoke gains a 4th case (test_qpel_mc20): 1024/1024
bytes bit-exact via daedalus_recipe_dispatch_h264_qpel_mc20.
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
This commit is contained in:
@@ -365,6 +365,7 @@ add_library(daedalus_core STATIC
|
|||||||
${FFC_MC_SOURCES}
|
${FFC_MC_SOURCES}
|
||||||
${FFASM_H264IDCT_SOURCES}
|
${FFASM_H264IDCT_SOURCES}
|
||||||
${FFASM_H264DSP_SOURCES}
|
${FFASM_H264DSP_SOURCES}
|
||||||
|
${FFASM_H264QPEL_SOURCES}
|
||||||
${DAV1D_CDEF_ASM_SOURCES}
|
${DAV1D_CDEF_ASM_SOURCES}
|
||||||
${DAV1D_CDEF_C_SOURCES}
|
${DAV1D_CDEF_C_SOURCES}
|
||||||
)
|
)
|
||||||
@@ -458,6 +459,7 @@ add_executable(test_api_h264
|
|||||||
tests/h264_idct4_ref.c
|
tests/h264_idct4_ref.c
|
||||||
tests/h264_idct8_ref.c
|
tests/h264_idct8_ref.c
|
||||||
tests/h264_deblock_ref.c
|
tests/h264_deblock_ref.c
|
||||||
|
tests/h264_qpel8_mc20_ref.c
|
||||||
)
|
)
|
||||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||||
|
|||||||
@@ -263,6 +263,39 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
|
|||||||
uint8_t *dst, size_t dst_stride,
|
uint8_t *dst, size_t dst_stride,
|
||||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
|
/* -------------------------------------------------------------------
 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
 * docs/k9_h264qpel_mc20.md for the R-band rationale).
 *
 * Per H.264 §8.4.2.2.1, horizontal half-pel luma 6-tap filter:
 *   dst[r,c] = clip255((s[r,c-2] - 5*s[r,c-1] + 20*s[r,c]
 *                       + 20*s[r,c+1] - 5*s[r,c+2] + s[r,c+3]
 *                       + 16) >> 5)
 *
 * Single-stride: dst and src share `stride`; this matches FFmpeg's
 * H264QpelContext.put_h264_qpel_pixels_tab[][] convention and the
 * vendored ff_put_h264_qpel8_mc20_neon signature.
 *
 * `src + src_off` points at the leftmost OUTPUT column (col 0); the
 * filter reads cols -2..+3 around every output column, so the caller
 * must guarantee src has at least 2 pixels of left context and
 * 3 pixels of right context per row.  (FFmpeg already maintains an
 * edge-emulated buffer for the frame boundary; this matches that
 * contract.)
 * ----------------------------------------------------------------- */

/* Per-block placement record: one entry per 8x8 block in a dispatch.
 * Both offsets are in bytes and are added to the base `dst` / `src`
 * pointers handed to the dispatch call. */
typedef struct {
    uint32_t dst_off;   /* byte offset into dst (block top-left)      */
    uint32_t src_off;   /* byte offset into src (col 0, row 0)        */
} daedalus_h264_qpel_meta;
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* Recipe query — what does the API recommend for each kernel?
|
* Recipe query — what does the API recommend for each kernel?
|
||||||
* ----------------------------------------------------------------- */
|
* ----------------------------------------------------------------- */
|
||||||
@@ -275,6 +308,7 @@ typedef enum {
|
|||||||
DAEDALUS_KERNEL_H264_IDCT4 = 6,
|
DAEDALUS_KERNEL_H264_IDCT4 = 6,
|
||||||
DAEDALUS_KERNEL_H264_IDCT8 = 7,
|
DAEDALUS_KERNEL_H264_IDCT8 = 7,
|
||||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
|
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
|
||||||
} daedalus_kernel;
|
} daedalus_kernel;
|
||||||
|
|
||||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
|||||||
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_IDCT4: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU;
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
@@ -117,6 +118,8 @@ extern void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride
|
|||||||
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
extern void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||||
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t *tc0);
|
int alpha, int beta, int8_t *tc0);
|
||||||
|
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
|
||||||
/* -------------------- CPU dispatch implementations -------------- */
|
/* -------------------- CPU dispatch implementations -------------- */
|
||||||
|
|
||||||
@@ -226,6 +229,22 @@ static int dispatch_h264_deblock_cpu(daedalus_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * CPU implementation of the H.264 luma qpel mc20 dispatch.
 *
 * Invokes ff_put_h264_qpel8_mc20_neon once per entry of `meta`, in
 * order, with the block's destination at `dst + meta[i].dst_off` and
 * its source at `src + meta[i].src_off`.
 *
 *   ctx       unused on this path.
 *   dst, src  base pointers the per-block byte offsets are applied to.
 *   stride    single line stride, passed unchanged for both dst and src.
 *   n_blocks  number of entries in `meta`; 0 makes this a no-op.
 *   meta      per-block offsets; must hold at least n_blocks entries.
 *
 * Always returns 0.  No argument validation is done here: pointers and
 * offsets are used as given.
 */
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    (void) ctx;

    /* FFmpeg's NEON entry uses a single stride for both dst and src
     * (H264QpelContext convention).  Caller already guarantees this
     * via the public API contract documented in daedalus.h. */
    const ptrdiff_t line_stride = (ptrdiff_t) stride;

    for (size_t blk = 0; blk < n_blocks; blk++) {
        const daedalus_h264_qpel_meta *m = &meta[blk];

        ff_put_h264_qpel8_mc20_neon(dst + m->dst_off,
                                    src + m->src_off,
                                    line_stride);
    }
    return 0;
}
|
||||||
|
|
||||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -811,6 +830,14 @@ int daedalus_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx, daedalus_substrate
|
|||||||
return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
|
return dispatch_h264_deblock_qpu(ctx, dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Explicit-substrate entry point for H.264 luma qpel mc20.
 *
 * The whole body is the ROUTE_CPU_ONLY macro, given the kernel id
 * DAEDALUS_KERNEL_H264_QPEL_MC20, the CPU implementation
 * dispatch_h264_qpel_mc20_cpu, and the forwarded buffer arguments.
 *
 * NOTE(review): `ctx` and `sub` are not passed to the macro explicitly,
 * and the function has no return statement of its own, so the macro
 * presumably reads both from the enclosing scope and supplies the
 * return value — confirm against the ROUTE_CPU_ONLY definition.
 */
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    ROUTE_CPU_ONLY(DAEDALUS_KERNEL_H264_QPEL_MC20, dispatch_h264_qpel_mc20_cpu,
                   dst, src, stride, n_blocks, meta);
}
|
||||||
|
|
||||||
/* -------------------- Recipe convenience wrappers --------------- */
|
/* -------------------- Recipe convenience wrappers --------------- */
|
||||||
|
|
||||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||||
@@ -881,3 +908,11 @@ int daedalus_recipe_dispatch_h264_deblock_luma_v(daedalus_ctx *ctx,
|
|||||||
return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
return daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||||
dst, dst_stride, n_edges, meta);
|
dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Recipe (AUTO) wrapper for H.264 luma qpel mc20.
 *
 * Forwards every argument unchanged to daedalus_dispatch_h264_qpel_mc20
 * with the substrate fixed to DAEDALUS_SUBSTRATE_AUTO, and returns that
 * call's result as-is.
 */
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src, size_t stride,
                                            size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                            dst, src, stride, n_blocks, meta);
}
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
|
|||||||
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t tc0[4]);
|
int alpha, int beta, int8_t tc0[4]);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
|
||||||
static uint64_t xs_state = 0xa11264ULL;
|
static uint64_t xs_state = 0xa11264ULL;
|
||||||
static inline uint64_t xs(void) {
|
static inline uint64_t xs(void) {
|
||||||
@@ -143,6 +145,46 @@ static int test_deblock(void)
|
|||||||
return diff == 0 ? 0 : 1;
|
return diff == 0 ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int test_qpel_mc20(void)
|
||||||
|
{
|
||||||
|
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||||||
|
* holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the
|
||||||
|
* cycle-9 bench convention so the same C reference and NEON .S can
|
||||||
|
* be compared. */
|
||||||
|
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
|
||||||
|
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||||
|
SRC_COL = 3 };
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) return 1;
|
||||||
|
|
||||||
|
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||||
|
daedalus_h264_qpel_meta meta[N];
|
||||||
|
|
||||||
|
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
memset(dst, 0, sizeof(dst));
|
||||||
|
memset(dst_ref, 0, sizeof(dst_ref));
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
|
||||||
|
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++)
|
||||||
|
daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
|
||||||
|
src + meta[i].src_off,
|
||||||
|
TILE_STRIDE);
|
||||||
|
|
||||||
|
int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
|
||||||
|
TILE_STRIDE, N, meta);
|
||||||
|
if (rc) { fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc); return 1; }
|
||||||
|
int diff = 0;
|
||||||
|
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||||
|
printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||||
|
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diff == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
int main(void)
|
int main(void)
|
||||||
{
|
{
|
||||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||||
@@ -152,10 +194,13 @@ int main(void)
|
|||||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
|
||||||
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
|
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
|
||||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
|
||||||
|
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
|
||||||
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
|
||||||
|
|
||||||
int fail = 0;
|
int fail = 0;
|
||||||
fail |= test_idct4();
|
fail |= test_idct4();
|
||||||
fail |= test_idct8();
|
fail |= test_idct8();
|
||||||
fail |= test_deblock();
|
fail |= test_deblock();
|
||||||
|
fail |= test_qpel_mc20();
|
||||||
return fail;
|
return fail;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user