Merge pull request 'h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)' (#10) from noether/h264-deblock-chroma into main
Reviewed-on: #10
This commit was merged in pull request #10.
This commit is contained in:
@@ -520,6 +520,7 @@ add_executable(test_api_h264
|
||||
tests/h264_idct8_ref.c
|
||||
tests/h264_deblock_ref.c
|
||||
tests/h264_h_loop_filter_luma_ref.c
|
||||
tests/h264_chroma_loop_filter_ref.c
|
||||
tests/h264_qpel8_mc20_ref.c
|
||||
)
|
||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||
|
||||
@@ -286,6 +286,35 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* H.264 chroma (4:2:0) loop filters — bS<4 variant. Chroma uses
|
||||
* the SAME daedalus_h264_deblock_meta struct as luma but on smaller
|
||||
* tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
|
||||
* rows for H (4 segments of 2 rows). Each segment has its own tc0
|
||||
* strength (tc0[s] applies to both cells in segment s).
|
||||
*
|
||||
* Algorithm difference vs luma: chroma updates only p0 and q0
|
||||
* (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
|
||||
* luma-style ap/aq side-condition bonus).
|
||||
*
|
||||
* QPU shaders for chroma deblock not implemented yet; recipe table
|
||||
* routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1 when the
* context has a QPU; on a context without one it falls back to CPU.
|
||||
*/
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, size_t dst_stride,
|
||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||
|
||||
/* -------------------------------------------------------------------
|
||||
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
|
||||
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
|
||||
@@ -333,6 +362,8 @@ typedef enum {
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
|
||||
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
|
||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
|
||||
} daedalus_kernel;
|
||||
|
||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||
|
||||
@@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
||||
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||
}
|
||||
return DAEDALUS_SUBSTRATE_CPU;
|
||||
@@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t *tc0);
|
||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||
ptrdiff_t stride);
|
||||
|
||||
@@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * CPU path for the chroma "v" loop filter: invokes the NEON kernel once for
 * every edge described in meta[0 .. n_edges).
 *
 *   ctx         unused on this path
 *   dst         base pointer; edge i is filtered at dst + meta[i].dst_off
 *   dst_stride  forwarded to the kernel as its stride argument
 *   n_edges     number of entries in meta
 *   meta        per-edge alpha, beta and four tc0 strengths
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    (void) ctx;

    const ptrdiff_t stride = (ptrdiff_t) dst_stride;

    for (size_t i = 0; i < n_edges; i++) {
        const daedalus_h264_deblock_meta *edge = &meta[i];

        /* The kernel takes a non-const int8_t * while meta is const, so it
         * is handed a scratch copy of the four strengths. */
        int8_t tc0_local[4] = { edge->tc0[0], edge->tc0[1],
                                edge->tc0[2], edge->tc0[3] };

        ff_h264_v_loop_filter_chroma_neon(dst + edge->dst_off, stride,
                                          edge->alpha, edge->beta, tc0_local);
    }
    return 0;
}
|
||||
|
||||
/*
 * CPU path for the chroma "h" loop filter: invokes the NEON kernel once for
 * every edge described in meta[0 .. n_edges).
 *
 *   ctx         unused on this path
 *   dst         base pointer; edge i is filtered at dst + meta[i].dst_off
 *   dst_stride  forwarded to the kernel as its stride argument
 *   n_edges     number of entries in meta
 *   meta        per-edge alpha, beta and four tc0 strengths
 *
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
                                              uint8_t *dst, size_t dst_stride,
                                              size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    (void) ctx;

    const ptrdiff_t stride = (ptrdiff_t) dst_stride;

    for (size_t i = 0; i < n_edges; i++) {
        const daedalus_h264_deblock_meta *edge = &meta[i];

        /* The kernel takes a non-const int8_t * while meta is const, so it
         * is handed a scratch copy of the four strengths. */
        int8_t tc0_local[4] = { edge->tc0[0], edge->tc0[1],
                                edge->tc0[2], edge->tc0[3] };

        ff_h264_h_loop_filter_chroma_neon(dst + edge->dst_off, stride,
                                          edge->alpha, edge->beta, tc0_local);
    }
    return 0;
}
|
||||
|
||||
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
|
||||
return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
/*
 * Dispatch the chroma "v" loop filter on the requested substrate.
 *
 * Substrate resolution:
 *   - DAEDALUS_SUBSTRATE_AUTO is replaced by the recipe table's choice for
 *     DAEDALUS_KERNEL_H264_DEBLOCK_CV.
 *   - DAEDALUS_SUBSTRATE_QPU on a context that has a QPU returns -1, because
 *     there is no chroma QPU shader yet.
 *   - DAEDALUS_SUBSTRATE_QPU on a context without a QPU, and every other
 *     value, runs the CPU path.
 *
 * Returns -1 in the QPU case above, otherwise the CPU path's return value.
 */
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate eff =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)
            : sub;

    /* No chroma QPU shader yet: a usable QPU request cannot be honoured. */
    if (eff == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Dispatch the chroma "h" loop filter on the requested substrate.
 *
 * Substrate resolution:
 *   - DAEDALUS_SUBSTRATE_AUTO is replaced by the recipe table's choice for
 *     DAEDALUS_KERNEL_H264_DEBLOCK_CH.
 *   - DAEDALUS_SUBSTRATE_QPU on a context that has a QPU returns -1, because
 *     there is no chroma QPU shader yet.
 *   - DAEDALUS_SUBSTRATE_QPU on a context without a QPU, and every other
 *     value, runs the CPU path.
 *
 * Returns -1 in the QPU case above, otherwise the CPU path's return value.
 */
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
                                            uint8_t *dst, size_t dst_stride,
                                            size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const daedalus_substrate eff =
        (sub == DAEDALUS_SUBSTRATE_AUTO)
            ? daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)
            : sub;

    /* No chroma QPU shader yet: a usable QPU request cannot be honoured. */
    if (eff == DAEDALUS_SUBSTRATE_QPU && daedalus_ctx_has_qpu(ctx))
        return -1;

    return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
@@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
|
||||
dst, dst_stride, n_edges, meta);
|
||||
}
|
||||
|
||||
/*
 * Recipe-routed entry point for the chroma "v" loop filter: forwards to
 * daedalus_dispatch_h264_deblock_chroma_v with DAEDALUS_SUBSTRATE_AUTO and
 * returns its result unchanged.
 */
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                                   dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
/*
 * Recipe-routed entry point for the chroma "h" loop filter: forwards to
 * daedalus_dispatch_h264_deblock_chroma_h with DAEDALUS_SUBSTRATE_AUTO and
 * returns its result unchanged.
 */
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
                                                   uint8_t *dst, size_t dst_stride,
                                                   size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO,
                                                   dst, dst_stride, n_edges, meta);
}
|
||||
|
||||
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Standalone bit-exact C reference for H.264 chroma loop filters
|
||||
* (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
|
||||
* when added). Covers both orientations:
|
||||
*
|
||||
* v_loop_filter_chroma: filter applied VERTICALLY across a
|
||||
* HORIZONTAL edge. Tile is 8 cols × 4 rows of context
|
||||
* (rows -2..+1); pix points to row 0 of the bottom block.
|
||||
* h_loop_filter_chroma: filter applied HORIZONTALLY across a
|
||||
* VERTICAL edge. Tile is 4 cols × 8 rows of context
|
||||
* (cols -2..+1); pix points to col 0 of the right block.
|
||||
*
|
||||
* Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
|
||||
* `ff_h264_h_loop_filter_chroma_neon` (line 430) in
|
||||
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||
*
|
||||
* Algorithm per H.264 §8.7.2.4 (chroma bS<4 inter):
|
||||
* - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
|
||||
* - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
|
||||
* - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
|
||||
* - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
|
||||
* - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
|
||||
*
|
||||
* tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
|
||||
(matches both 4:2:0 chroma edge geometries — 8 cols for V edge or
|
||||
* 8 rows for H edge).
|
||||
*
|
||||
* Signature (matches FFmpeg + the existing luma refs):
|
||||
* void(uint8_t *pix, ptrdiff_t stride,
|
||||
* int alpha, int beta, int8_t tc0[4]);
|
||||
*
|
||||
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Saturate v to the unsigned 8-bit range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0)
        return 0;
    if (v > 255)
        return 255;
    return v;
}
|
||||
/* Clamp v to [lo, hi]; the lower bound is tested first. */
static inline int clip3(int v, int lo, int hi)
{
    if (v < lo)
        return lo;
    return v > hi ? hi : v;
}
|
||||
/* Absolute value of x (negates x when it is below zero). */
static inline int abs_i(int x)
{
    if (x < 0)
        return -x;
    return x;
}
|
||||
|
||||
/* Per-cell chroma filter, vertical-direction access (one column across the
 * horizontal edge). Samples are read at pix[-2*stride] (p1),
 * pix[-1*stride] (p0), pix[0] (q0) and pix[+1*stride] (q1).
 *
 * The cell is left untouched unless |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. Otherwise
 *     tc    = tc0_s + 1
 *     delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc)
 * and only p0 and q0 are rewritten, as p0+delta and q0-delta saturated to
 * [0, 255]. */
static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
                               int alpha, int beta, int tc0_s)
{
    uint8_t *const p0_ptr = pix - stride;

    const int p1 = pix[-2 * stride];
    const int p0 = *p0_ptr;
    const int q0 = pix[0];
    const int q1 = pix[stride];

    /* Edge-activity gates, checked in the order alpha, beta (p), beta (q). */
    const int d_pq = p0 - q0;
    if ((d_pq < 0 ? -d_pq : d_pq) >= alpha)
        return;
    const int d_p = p1 - p0;
    if ((d_p < 0 ? -d_p : d_p) >= beta)
        return;
    const int d_q = q1 - q0;
    if ((d_q < 0 ? -d_q : d_q) >= beta)
        return;

    const int tc = tc0_s + 1;

    /* Rounded correction, limited to [-tc, tc] (lower bound tested first). */
    int delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    if (delta < -tc)
        delta = -tc;
    else if (delta > tc)
        delta = tc;

    int new_p0 = p0 + delta;
    int new_q0 = q0 - delta;
    if (new_p0 < 0) new_p0 = 0; else if (new_p0 > 255) new_p0 = 255;
    if (new_q0 < 0) new_q0 = 0; else if (new_q0 > 255) new_q0 = 255;

    *p0_ptr = (uint8_t) new_p0;
    pix[0]  = (uint8_t) new_q0;
}
|
||||
|
||||
/* Per-cell chroma filter, horizontal-direction access (one row across the
 * vertical edge). Samples are read at pix[-2] (p1), pix[-1] (p0),
 * pix[0] (q0) and pix[+1] (q1).
 *
 * The cell is left untouched unless |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. Otherwise
 *     tc    = tc0_s + 1
 *     delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc)
 * and only p0 and q0 are rewritten, as p0+delta and q0-delta saturated to
 * [0, 255]. */
static void h264_chroma_cell_h(uint8_t *pix,
                               int alpha, int beta, int tc0_s)
{
    const int p1 = pix[-2];
    const int p0 = pix[-1];
    const int q0 = pix[0];
    const int q1 = pix[1];

    /* Edge-activity gates, checked in the order alpha, beta (p), beta (q). */
    const int d_pq = p0 - q0;
    if ((d_pq < 0 ? -d_pq : d_pq) >= alpha)
        return;
    const int d_p = p1 - p0;
    if ((d_p < 0 ? -d_p : d_p) >= beta)
        return;
    const int d_q = q1 - q0;
    if ((d_q < 0 ? -d_q : d_q) >= beta)
        return;

    const int tc = tc0_s + 1;

    /* Rounded correction, limited to [-tc, tc] (lower bound tested first). */
    int delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    if (delta < -tc)
        delta = -tc;
    else if (delta > tc)
        delta = tc;

    int new_p0 = p0 + delta;
    int new_q0 = q0 - delta;
    if (new_p0 < 0) new_p0 = 0; else if (new_p0 > 255) new_p0 = 255;
    if (new_q0 < 0) new_q0 = 0; else if (new_q0 > 255) new_q0 = 255;

    pix[-1] = (uint8_t) new_p0;
    pix[0]  = (uint8_t) new_q0;
}
|
||||
|
||||
/*
 * C reference for the chroma "v" loop filter (bS < 4): applies the per-cell
 * filter to 8 consecutive columns starting at pix.
 *
 *   pix     first of the 8 columns; each column is handed to
 *           h264_chroma_cell_v together with stride
 *   stride  forwarded unchanged to the per-cell filter
 *   alpha   edge threshold; 0 disables the whole call
 *   beta    edge threshold; 0 disables the whole call
 *   tc0     four strengths, tc0[s] covering columns 2*s and 2*s+1;
 *           a negative entry skips that pair of columns
 */
void daedalus_h264_v_loop_filter_chroma_ref(
        uint8_t *pix, ptrdiff_t stride,
        int alpha, int beta, int8_t tc0[4])
{
    if (alpha == 0 || beta == 0)
        return;

    /* Nothing to do when every segment is disabled. */
    if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0)
        return;

    /* 8 cols divided into 4 segments of 2 cols each. */
    for (int seg = 0; seg < 4; seg++) {
        const int strength = tc0[seg];
        if (strength < 0)
            continue;

        uint8_t *first_col = pix + seg * 2;
        h264_chroma_cell_v(first_col,     stride, alpha, beta, strength);
        h264_chroma_cell_v(first_col + 1, stride, alpha, beta, strength);
    }
}
|
||||
|
||||
/*
 * C reference for the chroma "h" loop filter (bS < 4): applies the per-cell
 * filter to 8 consecutive rows starting at pix.
 *
 *   pix     row 0 of the edge; row r is filtered at pix + r * stride
 *   stride  distance between consecutive rows, in bytes
 *   alpha   edge threshold; 0 disables the whole call
 *   beta    edge threshold; 0 disables the whole call
 *   tc0     four strengths, tc0[s] covering rows 2*s and 2*s+1;
 *           a negative entry skips that pair of rows
 */
void daedalus_h264_h_loop_filter_chroma_ref(
        uint8_t *pix, ptrdiff_t stride,
        int alpha, int beta, int8_t tc0[4])
{
    if (alpha == 0 || beta == 0)
        return;

    /* Nothing to do when every segment is disabled. */
    if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0)
        return;

    /* 8 rows divided into 4 segments of 2 rows each. */
    for (int seg = 0; seg < 4; seg++) {
        const int strength = tc0[seg];
        if (strength < 0)
            continue;

        const int first_row = seg * 2;
        h264_chroma_cell_h(pix + first_row * stride,       alpha, beta, strength);
        h264_chroma_cell_h(pix + (first_row + 1) * stride, alpha, beta, strength);
    }
}
|
||||
@@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
|
||||
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||||
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||
int alpha, int beta, int8_t tc0[4]);
|
||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||
@@ -191,6 +195,89 @@ static int test_deblock_h(void)
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_v(void)
|
||||
{
|
||||
/* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
|
||||
* (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
|
||||
EDGE_OFF = EDGE_ROW * TILE_STRIDE };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_deblock_chroma_h(void)
|
||||
{
|
||||
/* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
|
||||
* (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
|
||||
enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
|
||||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
|
||||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||
if (!ctx) return 1;
|
||||
|
||||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||||
|
||||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||
meta[i].beta = (int)(xs() % 16) + 1;
|
||||
for (int s = 0; s < 4; s++) {
|
||||
int r = (int)(xs() % 8);
|
||||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N_EDGES; i++) {
|
||||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||||
daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||||
meta[i].alpha, meta[i].beta, tc0_local);
|
||||
}
|
||||
|
||||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
|
||||
N_EDGES, meta);
|
||||
if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; }
|
||||
int diff = 0;
|
||||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||
printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||||
daedalus_ctx_destroy(ctx);
|
||||
return diff == 0 ? 0 : 1;
|
||||
}
|
||||
|
||||
static int test_qpel_mc20(void)
|
||||
{
|
||||
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||||
@@ -245,12 +332,18 @@ int main(void)
|
||||
|
||||
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
|
||||
printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
||||
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
|
||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
||||
|
||||
int fail = 0;
|
||||
fail |= test_idct4();
|
||||
fail |= test_idct8();
|
||||
fail |= test_deblock();
|
||||
fail |= test_deblock_h();
|
||||
fail |= test_deblock_chroma_v();
|
||||
fail |= test_deblock_chroma_h();
|
||||
fail |= test_qpel_mc20();
|
||||
return fail;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user