Merge pull request 'h264: qpel mc02 (vertical half-pel, CPU/NEON)' (#15) from noether/h264-qpel-mc02 into main
Reviewed-on: #15
This commit was merged in pull request #15.
This commit is contained in:
@@ -523,6 +523,7 @@ add_executable(test_api_h264
|
||||
tests/h264_chroma_loop_filter_ref.c
|
||||
tests/h264_intra_loop_filter_ref.c
|
||||
tests/h264_qpel8_mc20_ref.c
|
||||
tests/h264_qpel8_mc02_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||
target_compile_options(test_api_h264 PRIVATE -O2)
|
||||
|
||||
@@ -392,6 +392,29 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* H.264 luma qpel mc02 (vertical half-pel) — mirror of mc20.
|
||||
* 6-tap filter applied vertically:
|
||||
* dst[r,c] = clip255((s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5)
|
||||
*
|
||||
* Same single-stride convention as mc20. src + src_off points at
|
||||
* row 0 col 0 of the OUTPUT block; the filter reads rows -2..+3, so
|
||||
* the caller must guarantee 2 rows of top context and 3 rows of
|
||||
* bottom context per block (FFmpeg edge-emulated buffer handles
|
||||
* frame boundaries; same contract as mc20).
|
||||
*
|
||||
* QPU shader not implemented yet; recipe table routes AUTO to CPU
|
||||
* NEON. Explicit DAEDALUS_SUBSTRATE_QPU returns -1.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* Recipe query — what does the API recommend for each kernel?
|
||||
* ----------------------------------------------------------------- */
|
||||
@@ -412,6 +435,7 @@ typedef enum {
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC02 = 17,
|
||||
} daedalus_kernel;
|
||||
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
@@ -138,6 +138,7 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC02: return DAEDALUS_SUBSTRATE_CPU; /* QPU mc02 shader pending */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
}
|
||||
@@ -178,6 +179,8 @@ extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stri
|
||||
int alpha, int beta);
|
||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
/* -------------------- CPU dispatch implementations -------------- */
|
||||
|
||||
@@ -405,6 +408,19 @@ static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dispatch_h264_qpel_mc02_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
{
|
||||
(void) ctx;
|
||||
for (size_t i = 0; i < n_blocks; i++) {
|
||||
ff_put_h264_qpel8_mc02_neon(dst + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
(ptrdiff_t) stride);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* -------------------- IDCT QPU dispatch (cycle 1 v4 shader) ---- */
|
||||
|
||||
typedef struct {
|
||||
@@ -1376,6 +1392,20 @@ int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc02(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
{
|
||||
daedalus_substrate eff = sub;
|
||||
if (eff == DAEDALUS_SUBSTRATE_AUTO)
|
||||
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC02);
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
|
||||
eff = DAEDALUS_SUBSTRATE_CPU;
|
||||
if (eff == DAEDALUS_SUBSTRATE_QPU)
|
||||
return -1; /* No mc02 QPU shader yet — explicit QPU fast-fails. */
|
||||
return dispatch_h264_qpel_mc02_cpu(ctx, dst, src, stride, n_blocks, meta);
|
||||
}
|
||||
|
||||
/* -------------------- Recipe convenience wrappers --------------- */
|
||||
|
||||
int daedalus_recipe_dispatch_vp9_idct8(daedalus_ctx *ctx,
|
||||
@@ -1494,3 +1524,11 @@ int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
return daedalus_dispatch_h264_qpel_mc20(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, src, stride, n_blocks, meta);
|
||||
}
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc02(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
{
|
||||
return daedalus_dispatch_h264_qpel_mc02(ctx, DAEDALUS_SUBSTRATE_AUTO,
|
||||
dst, src, stride, n_blocks, meta);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 luma qpel 8×8 mc02
|
||||
* (vertical half-pel, "put" variant). Mirror of mc20 with rows
|
||||
* and columns transposed. 6-tap filter applied vertically:
|
||||
*
|
||||
* dst[r,c] = clip255( (s[r-2,c] - 5*s[r-1,c] + 20*s[r,c]
|
||||
* + 20*s[r+1,c] - 5*s[r+2,c] + s[r+3,c]
|
||||
* + 16) >> 5 )
|
||||
*
|
||||
* Mirrors FFmpeg `ff_put_h264_qpel8_mc02_neon` (in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264qpel_neon.S
|
||||
* line 678, which tail-calls put_h264_qpel8_v_lowpass_neon).
|
||||
*
|
||||
* Signature:
|
||||
* void(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||
*
|
||||
* Both dst and src use the SAME stride. src points at row 0 col 0
|
||||
* of the output block; the filter reads rows -2..+3 (2 rows of top
|
||||
* context, 3 rows of bottom context). Caller must guarantee the
|
||||
* source buffer has those rows available (FFmpeg's edge-emulated
|
||||
* buffer handles this at the frame boundary; matches the contract
|
||||
* documented for mc20).
|
||||
*
|
||||
* License: LGPL-2.1-or-later.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
|
||||
|
||||
void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
|
||||
{
|
||||
for (int r = 0; r < 8; r++) {
|
||||
for (int c = 0; c < 8; c++) {
|
||||
int s_m2 = src[(r - 2) * stride + c];
|
||||
int s_m1 = src[(r - 1) * stride + c];
|
||||
int s_0 = src[(r + 0) * stride + c];
|
||||
int s_p1 = src[(r + 1) * stride + c];
|
||||
int s_p2 = src[(r + 2) * stride + c];
|
||||
int s_p3 = src[(r + 3) * stride + c];
|
||||
int v = s_m2 - 5 * s_m1 + 20 * s_0 + 20 * s_p1 - 5 * s_p2 + s_p3 + 16;
|
||||
dst[r * stride + c] = (uint8_t) clip_u8(v >> 5);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -32,6 +32,8 @@ extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t
|
||||
int alpha, int beta);
|
||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -399,6 +401,46 @@ static int test_qpel_mc20(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc02(void)
|
||||
{
|
||||
/* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel
|
||||
* can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
|
||||
* SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
|
||||
* the tile) and rows 8..10 below (rows 11..13). */
|
||||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||||
SRC_ROW = 3 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_qpel_meta meta[N];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||||
memset(dst, 0, sizeof(dst));
|
||||
memset(dst_ref, 0, sizeof(dst_ref));
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++)
|
||||
daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
|
||||
src + meta[i].src_off,
|
||||
TILE_STRIDE);
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
|
||||
TILE_STRIDE, N, meta);
|
||||
if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||||
@@ -429,5 +471,6 @@ int main(void)
|
||||
fail |= test_deblock_chroma_h();
|
||||
fail |= test_deblock_intra_all();
|
||||
fail |= test_qpel_mc20();
|
||||
fail |= test_qpel_mc02();
|
||||
return fail;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user