h264: qpel mc22 (2D half-pel, CPU/NEON) #16
@@ -524,6 +524,7 @@ add_executable(test_api_h264
|
|||||||
tests/h264_intra_loop_filter_ref.c
|
tests/h264_intra_loop_filter_ref.c
|
||||||
tests/h264_qpel8_mc20_ref.c
|
tests/h264_qpel8_mc20_ref.c
|
||||||
tests/h264_qpel8_mc02_ref.c
|
tests/h264_qpel8_mc02_ref.c
|
||||||
|
tests/h264_qpel8_mc22_ref.c
|
||||||
)
|
)
|
||||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||||
|
|||||||
@@ -415,6 +415,27 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
/* H.264 luma qpel mc22 (2D half-pel "j" position per spec §8.4.2.2.1).
|
||||||
|
* Horizontal 6-tap cascaded into vertical 6-tap with intermediate
|
||||||
|
* 16-bit precision; final +512 >> 10 with clip255. Common position
|
||||||
|
* in real H.264 streams.
|
||||||
|
*
|
||||||
|
* src + src_off points at row 0 col 0 of the OUTPUT block; the
|
||||||
|
* cascade reads rows -2..+10 (13 rows of context) and cols -2..+10
|
||||||
|
* (13 cols of context). Caller must guarantee this region is readable.
|
||||||
|
*
|
||||||
|
* QPU shader not implemented yet (the HV lowpass is the meatiest
|
||||||
|
* qpel kernel; structurally distinct from the 1D mc20 shader).
|
||||||
|
* Recipe routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1 when the context has a QPU; without one it falls back to CPU.
|
||||||
|
*/
|
||||||
|
int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* Recipe query — what does the API recommend for each kernel?
|
* Recipe query — what does the API recommend for each kernel?
|
||||||
* ----------------------------------------------------------------- */
|
* ----------------------------------------------------------------- */
|
||||||
@@ -436,6 +457,7 @@ typedef enum {
|
|||||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
||||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||||
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC22 = 18,
|
||||||
} daedalus_kernel;
|
} daedalus_kernel;
|
||||||
|
|
||||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||||
|
|||||||
@@ -139,6 +139,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
|||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
||||||
|
case DAEDALUS_KERNEL_H264_QPEL_MC22: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc22 shader pending (hv lowpass) */
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU;
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
}
|
}
|
||||||
@@ -181,6 +182,8 @@ extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
|||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
extern void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
|
|
||||||
/* -------------------- CPU dispatch implementations -------------- */
|
/* -------------------- CPU dispatch implementations -------------- */
|
||||||
|
|
||||||
@@ -421,6 +424,19 @@ static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * CPU path for the H.264 luma qpel mc22 kernel.
 *
 * Calls ff_put_h264_qpel8_mc22_neon once per entry of meta[], passing
 * dst + dst_off and src + src_off for that entry together with the
 * shared stride. ctx is not used on this path. Always returns 0.
 */
static int dispatch_h264_qpel_mc22_cpu(daedalus_ctx *ctx,
                                       uint8_t *dst, const uint8_t *src, size_t stride,
                                       size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) stride;
    size_t idx = 0;

    (void) ctx;

    while (idx < n_blocks) {
        const daedalus_h264_qpel_meta *blk = &meta[idx];

        ff_put_h264_qpel8_mc22_neon(dst + blk->dst_off,
                                    src + blk->src_off,
                                    pitch);
        idx++;
    }

    return 0;
}
|
||||||
|
|
||||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -1406,6 +1422,20 @@ int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
|||||||
return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
|
return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Dispatch the H.264 luma qpel mc22 kernel on the requested substrate.
 *
 * DAEDALUS_SUBSTRATE_AUTO is resolved through
 * daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22).
 * There is no mc22 QPU shader yet, so a (resolved) QPU request on a
 * context that has a QPU fails fast with -1. A QPU request on a context
 * without a QPU, and every other substrate value, runs the CPU path and
 * returns its result.
 */
int daedalus_dispatch_h264_qpel_mc22(daedalus_ctx *ctx, daedalus_substrate sub,
                                     uint8_t *dst, const uint8_t *src, size_t stride,
                                     size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate wanted =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC22)
            : sub;

    /* The QPU capability is only queried when the QPU was actually chosen. */
    if (wanted == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx)) {
        return -1; /* No mc22 QPU shader yet — explicit QPU fast-fails. */
    }

    return dispatch_h264_qpel_mc22_cpu(ctx, dst, src, stride, n_blocks, meta);
}
|
||||||
|
|
||||||
/* -------------------- Recipe convenience wrappers --------------- */
|
/* -------------------- Recipe convenience wrappers --------------- */
|
||||||
|
|
||||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||||
@@ -1532,3 +1562,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
|
|||||||
return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||||
dst, src, stride, n_blocks, meta);
|
dst, src, stride, n_blocks, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Recipe convenience wrapper for mc22: forwards all arguments to
 * daedalus_dispatch_h264_qpel_mc22 with DAEDALUS_SUBSTRATE_AUTO so the
 * recipe table picks the substrate, and returns that call's result.
 */
int daedalus_recipe_dispatch_h264_qpel_mc22(daedalus_ctx *ctx,
                                            uint8_t *dst, const uint8_t *src, size_t stride,
                                            size_t n_blocks, const daedalus_h264_qpel_meta *meta)
{
    const daedalus_substrate pick = DAEDALUS_SUBSTRATE_AUTO;
    const int rc = daedalus_dispatch_h264_qpel_mc22(ctx, pick, dst, src, stride,
                                                    n_blocks, meta);

    return rc;
}
|
||||||
|
|||||||
@@ -0,0 +1,70 @@
|
|||||||
|
/*
|
||||||
|
* Standalone bit-exact C reference for H.264 luma qpel 8x8 mc22
|
||||||
|
* (2D half-pel, "put" variant). Cascade of horizontal 6-tap then
|
||||||
|
* vertical 6-tap with INTERMEDIATE 16-bit precision (no per-stage
|
||||||
|
* clip/round), final +512 >> 10 to scale back.
|
||||||
|
*
|
||||||
|
* Per H.264 §8.4.2.2.1, "j" position:
|
||||||
|
*
|
||||||
|
* tmp[r,c] = s[r,c-2] - 5*s[r,c-1] + 20*s[r,c] + 20*s[r,c+1]
|
||||||
|
* - 5*s[r,c+2] + s[r,c+3] (16-bit signed)
|
||||||
|
*
|
||||||
|
* dst[r,c] = clip255((tmp[r-2,c] - 5*tmp[r-1,c] + 20*tmp[r,c]
|
||||||
|
* + 20*tmp[r+1,c] - 5*tmp[r+2,c] + tmp[r+3,c]
|
||||||
|
* + 512) >> 10)
|
||||||
|
*
|
||||||
|
* The tmp[] array spans rows r-2 .. r+3 around each output row, so
|
||||||
|
* we need 13 intermediate rows (rows -2..+10 of the SOURCE
|
||||||
|
* neighbourhood) for 8 output rows. Caller's src must have 2 rows
|
||||||
|
* of top context + 3 rows of bottom context AND 2 cols of left +
|
||||||
|
* 3 cols of right context (FFmpeg's edge-emulated buffer provides
|
||||||
|
* this at the frame boundary; same contract as mc20).
|
||||||
|
*
|
||||||
|
* Mirrors FFmpeg `ff_put_h264_qpel8_mc22_neon` (in
|
||||||
|
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||||
|
* line 710, which tail-calls put_h264_qpel8_hv_lowpass_neon).
|
||||||
|
*
|
||||||
|
* Signature:
|
||||||
|
* void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
*
|
||||||
|
* Same single-stride convention as mc20/mc02.
|
||||||
|
*
|
||||||
|
* License: LGPL-2.1-or-later.
|
||||||
|
*/
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/* Clamp v to the 8-bit sample range: values below 0 become 0, values
 * above 255 become 255, anything else is returned unchanged. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
|
||||||
|
|
||||||
|
/*
 * Scalar reference for the 8x8 H.264 luma "put" at the 2D half-pel
 * position (mc22).
 *
 * dst    - top-left sample of the 8x8 output block.
 * src    - source sample aligned with dst's top-left; rows -2..+10 and
 *          columns -2..+10 around it are read.
 * stride - distance between consecutive rows, shared by src and dst.
 *
 * Stage 1 runs the (1, -5, 20, 20, -5, 1) filter horizontally over 13
 * source rows and keeps the unrounded sums in 16-bit storage. Stage 2
 * runs the same taps vertically over those sums, adds 512, shifts right
 * by 10 and clamps the result to 0..255.
 */
void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    enum { BLK = 8, TAPS = 6, H_ROWS = BLK + TAPS - 1 };
    static const int taps[TAPS] = { 1, -5, 20, 20, -5, 1 };

    /* hpass[j] holds the horizontally filtered source row (j - 2). */
    int16_t hpass[H_ROWS][BLK];

    for (int j = 0; j < H_ROWS; j++) {
        const uint8_t *row = src + (j - 2) * stride;

        for (int x = 0; x < BLK; x++) {
            int acc = 0;

            /* Tap k samples column x + k - 2, i.e. x-2 .. x+3. */
            for (int k = 0; k < TAPS; k++) {
                acc += taps[k] * (int) row[x + k - 2];
            }
            hpass[j][x] = (int16_t) acc;
        }
    }

    for (int y = 0; y < BLK; y++) {
        uint8_t *out = dst + y * stride;

        for (int x = 0; x < BLK; x++) {
            /* Start from the rounding term; output row y uses hpass[y..y+5]. */
            int acc = 512;

            for (int k = 0; k < TAPS; k++) {
                acc += taps[k] * hpass[y + k][x];
            }

            acc >>= 10;
            if (acc < 0) {
                acc = 0;
            } else if (acc > 255) {
                acc = 255;
            }
            out[x] = (uint8_t) acc;
        }
    }
}
|
||||||
@@ -34,6 +34,8 @@ extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
|||||||
int alpha, int beta, int8_t tc0[4]);
|
int alpha, int beta, int8_t tc0[4]);
|
||||||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
|
||||||
|
ptrdiff_t stride);
|
||||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
|
||||||
@@ -441,6 +443,46 @@ static int test_qpel_mc02(void)
|
|||||||
return diff == 0 ? 0 : 1;
|
return diff == 0 ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int test_qpel_mc22(void)
|
||||||
|
{
|
||||||
|
/* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows
|
||||||
|
* top + 3 rows bottom of context per 8x8 output. Tile is 16x16
|
||||||
|
* with output positioned at (SRC_ROW=3, SRC_COL=3) so the read
|
||||||
|
* range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */
|
||||||
|
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||||
|
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||||
|
SRC_ROW = 3, SRC_COL = 3 };
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) return 1;
|
||||||
|
|
||||||
|
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||||
|
daedalus_h264_qpel_meta meta[N];
|
||||||
|
|
||||||
|
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
memset(dst, 0, sizeof(dst));
|
||||||
|
memset(dst_ref, 0, sizeof(dst_ref));
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||||
|
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < N; i++)
|
||||||
|
daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off,
|
||||||
|
src + meta[i].src_off,
|
||||||
|
TILE_STRIDE);
|
||||||
|
|
||||||
|
int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src,
|
||||||
|
TILE_STRIDE, N, meta);
|
||||||
|
if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); return 1; }
|
||||||
|
int diff = 0;
|
||||||
|
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||||
|
printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||||
|
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diff == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
int main(void)
|
int main(void)
|
||||||
{
|
{
|
||||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||||
@@ -472,5 +514,6 @@ int main(void)
|
|||||||
fail |= test_deblock_intra_all();
|
fail |= test_deblock_intra_all();
|
||||||
fail |= test_qpel_mc20();
|
fail |= test_qpel_mc20();
|
||||||
fail |= test_qpel_mc02();
|
fail |= test_qpel_mc02();
|
||||||
|
fail |= test_qpel_mc22();
|
||||||
return fail;
|
return fail;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user