Merge pull request 'h264: deblock chroma_v + chroma_h (CPU/NEON, bS<4)' (#10) from noether/h264-deblock-chroma into main

Reviewed-on: #10
This commit was merged in pull request #10.
This commit is contained in:
2026-05-24 21:55:57 +00:00
5 changed files with 315 additions and 0 deletions
+1
View File
@@ -520,6 +520,7 @@ add_executable(test_api_h264
tests/h264_idct8_ref.c tests/h264_idct8_ref.c
tests/h264_deblock_ref.c tests/h264_deblock_ref.c
tests/h264_h_loop_filter_luma_ref.c tests/h264_h_loop_filter_luma_ref.c
tests/h264_chroma_loop_filter_ref.c
tests/h264_qpel8_mc20_ref.c tests/h264_qpel8_mc20_ref.c
) )
target_link_libraries(test_api_h264 PRIVATE daedalus_core) target_link_libraries(test_api_h264 PRIVATE daedalus_core)
+31
View File
@@ -286,6 +286,35 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
uint8_t *dst, size_t dst_stride, uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta); size_t n_edges, const daedalus_h264_deblock_meta *meta);
/* H.264 chroma (4:2:0) loop filters — bS<4 variant. Chroma uses
* the SAME daedalus_h264_deblock_meta struct as luma but on smaller
* tiles: 8 cols × 4 rows for V (4 segments of 2 cols), 4 cols × 8
* rows for H (4 segments of 2 rows). Each segment has its own tc0
* strength (tc0[s] applies to both cells in segment s).
*
* Algorithm difference vs luma: chroma updates only p0 and q0
* (never p1/p2/q1/q2) and uses tC = tc0_seg + 1 directly (no
* luma-style ap/aq side-condition bonus).
*
* QPU shaders for chroma deblock not implemented yet; recipe table
* routes AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1.
*/
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
/* ------------------------------------------------------------------- /* -------------------------------------------------------------------
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9 * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
@@ -333,6 +362,8 @@ typedef enum {
DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8, DAEDALUS_KERNEL_H264_DEBLOCK_LV = 8,
DAEDALUS_KERNEL_H264_QPEL_MC20 = 9, DAEDALUS_KERNEL_H264_QPEL_MC20 = 9,
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
} daedalus_kernel; } daedalus_kernel;
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
+80
View File
@@ -131,6 +131,8 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */ case DAEDALUS_KERNEL_H264_IDCT8: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_idct8.spv */
case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */ case DAEDALUS_KERNEL_H264_DEBLOCK_LV: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264deblock.spv */
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */ case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */ case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
} }
return DAEDALUS_SUBSTRATE_CPU; return DAEDALUS_SUBSTRATE_CPU;
@@ -158,6 +160,10 @@ extern void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0); int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, extern void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0); int alpha, int beta, int8_t *tc0);
extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride); ptrdiff_t stride);
@@ -284,6 +290,36 @@ static int dispatch_h264_deblock_h_cpu(daedalus_ctx *ctx,
return 0; return 0;
} }
static int dispatch_h264_deblock_chroma_v_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
meta[i].tc0[2], meta[i].tc0[3] };
ff_h264_v_loop_filter_chroma_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta, tc0_local);
}
return 0;
}
static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
(void) ctx;
for (size_t i = 0; i < n_edges; i++) {
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
meta[i].tc0[2], meta[i].tc0[3] };
ff_h264_h_loop_filter_chroma_neon(dst + meta[i].dst_off,
(ptrdiff_t) dst_stride,
meta[i].alpha, meta[i].beta, tc0_local);
}
return 0;
}
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx, static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride, uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta) size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1206,6 +1242,34 @@ int daedalus_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx, daedalus_substrate
return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta); return dispatch_h264_deblock_h_cpu(ctx, dst, dst_stride, n_edges, meta);
} }
int daedalus_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
daedalus_substrate eff = sub;
if (eff == DAEDALUS_SUBSTRATE_AUTO)
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV);
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
eff = DAEDALUS_SUBSTRATE_CPU;
if (eff == DAEDALUS_SUBSTRATE_QPU)
return -1; /* No chroma QPU shader yet. */
return dispatch_h264_deblock_chroma_v_cpu(ctx, dst, dst_stride, n_edges, meta);
}
int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
daedalus_substrate eff = sub;
if (eff == DAEDALUS_SUBSTRATE_AUTO)
eff = daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH);
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx))
eff = DAEDALUS_SUBSTRATE_CPU;
if (eff == DAEDALUS_SUBSTRATE_QPU)
return -1;
return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
}
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
uint8_t *dst, const uint8_t *src, size_t stride, uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta) size_t n_blocks, const daedalus_h264_qpel_meta *meta)
@@ -1301,6 +1365,22 @@ int daedalus_recipe_dispatch_h264_deblock_luma_h(daedalus_ctx *ctx,
dst, dst_stride, n_edges, meta); dst, dst_stride, n_edges, meta);
} }
int daedalus_recipe_dispatch_h264_deblock_chroma_v(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
return daedalus_dispatch_h264_deblock_chroma_v(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, n_edges, meta);
}
int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
return daedalus_dispatch_h264_deblock_chroma_h(ctx, DAEDALUS_SUBSTRATE_AUTO,
dst, dst_stride, n_edges, meta);
}
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
uint8_t *dst, const uint8_t *src, size_t stride, uint8_t *dst, const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta) size_t n_blocks, const daedalus_h264_qpel_meta *meta)
+110
View File
@@ -0,0 +1,110 @@
/*
* Standalone bit-exact C reference for H.264 chroma loop filters
* (bS < 4 variant; "intra" / bS=4 variant lives in a separate file
* when added). Covers both orientations:
*
* v_loop_filter_chroma: filter applied VERTICALLY across a
* HORIZONTAL edge. Tile is 8 cols × 4 rows of context
* (rows -2..+1); pix points to row 0 of the bottom block.
* h_loop_filter_chroma: filter applied HORIZONTALLY across a
* VERTICAL edge. Tile is 4 cols × 8 rows of context
* (cols -2..+1); pix points to col 0 of the right block.
*
* Mirrors FFmpeg `ff_h264_v_loop_filter_chroma_neon` (line 412) and
* `ff_h264_h_loop_filter_chroma_neon` (line 430) in
* external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
*
* Algorithm per H.264 §8.7.2.4 (chroma bS<4 inter):
* - Same edge preconditions as luma: |p0-q0|<α, |p1-p0|<β, |q1-q0|<β.
* - tC = tc0_seg + 1 (chroma's tc has no luma-style ap/aq side bonus).
* - δ = clip3((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tC, tC).
* - p0' = clip255(p0+δ); q0' = clip255(q0-δ).
* - Chroma NEVER updates p1, p2, q1, q2 (unlike luma).
*
* tc0[4]: 4 segments × 2 cells per segment = 8 cells per edge
* (matches both 4:2:0 chroma plane geometry 8 cols for V edge or
* 8 rows for H edge).
*
* Signature (matches FFmpeg + the existing luma refs):
* void(uint8_t *pix, ptrdiff_t stride,
* int alpha, int beta, int8_t tc0[4]);
*
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
*/
#include <stdint.h>
#include <stddef.h>
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
static inline int clip3(int v, int lo, int hi) {
return v < lo ? lo : v > hi ? hi : v;
}
static inline int abs_i(int x) { return x < 0 ? -x : x; }
/* Per-cell chroma filter, vertical-direction access (one column
* across the horizontal edge). p1 is at pix[-2*stride], q1 at
* pix[+1*stride]. */
static void h264_chroma_cell_v(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int tc0_s)
{
int p1 = pix[-2*stride], p0 = pix[-1*stride];
int q0 = pix[ 0*stride], q1 = pix[ 1*stride];
if (abs_i(p0 - q0) >= alpha) return;
if (abs_i(p1 - p0) >= beta) return;
if (abs_i(q1 - q0) >= beta) return;
int tc = tc0_s + 1;
int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
pix[-1*stride] = (uint8_t) clip_u8(p0 + delta);
pix[ 0*stride] = (uint8_t) clip_u8(q0 - delta);
}
/* Same kernel, horizontal-direction access (one row across the
* vertical edge). p1 at pix[-2], q1 at pix[+1]. */
static void h264_chroma_cell_h(uint8_t *pix,
int alpha, int beta, int tc0_s)
{
int p1 = pix[-2], p0 = pix[-1];
int q0 = pix[ 0], q1 = pix[ 1];
if (abs_i(p0 - q0) >= alpha) return;
if (abs_i(p1 - p0) >= beta) return;
if (abs_i(q1 - q0) >= beta) return;
int tc = tc0_s + 1;
int delta = clip3(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
pix[-1] = (uint8_t) clip_u8(p0 + delta);
pix[ 0] = (uint8_t) clip_u8(q0 - delta);
}
void daedalus_h264_v_loop_filter_chroma_ref(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4])
{
if (alpha == 0 || beta == 0) return;
if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) return;
/* 8 cols divided into 4 segments of 2 cols each. */
for (int s = 0; s < 4; s++) {
int tc0_s = tc0[s];
if (tc0_s < 0) continue;
for (int c = 0; c < 2; c++) {
int col = s * 2 + c;
h264_chroma_cell_v(pix + col, stride, alpha, beta, tc0_s);
}
}
}
void daedalus_h264_h_loop_filter_chroma_ref(
uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4])
{
if (alpha == 0 || beta == 0) return;
if (tc0[0] < 0 && tc0[1] < 0 && tc0[2] < 0 && tc0[3] < 0) return;
/* 8 rows divided into 4 segments of 2 rows each. */
for (int s = 0; s < 4; s++) {
int tc0_s = tc0[s];
if (tc0_s < 0) continue;
for (int r = 0; r < 2; r++) {
int row = s * 2 + r;
h264_chroma_cell_h(pix + row * stride, alpha, beta, tc0_s);
}
}
}
+93
View File
@@ -18,6 +18,10 @@ extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t s
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]); int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]); int alpha, int beta, int8_t tc0[4]);
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
@@ -191,6 +195,89 @@ static int test_deblock_h(void)
return diff == 0 ? 0 : 1; return diff == 0 ? 0 : 1;
} }
static int test_deblock_chroma_v(void)
{
/* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
* (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
EDGE_OFF = EDGE_ROW * TILE_STRIDE };
daedalus_ctx *ctx = daedalus_ctx_create();
if (!ctx) return 1;
uint8_t dst[TOTAL], dst_ref[TOTAL];
daedalus_h264_deblock_meta meta[N_EDGES];
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
for (int i = 0; i < N_EDGES; i++) {
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
meta[i].alpha = (int)(xs() % 64) + 1;
meta[i].beta = (int)(xs() % 16) + 1;
for (int s = 0; s < 4; s++) {
int r = (int)(xs() % 8);
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
}
}
for (int i = 0; i < N_EDGES; i++) {
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
meta[i].alpha, meta[i].beta, tc0_local);
}
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
N_EDGES, meta);
if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; }
int diff = 0;
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
daedalus_ctx_destroy(ctx);
return diff == 0 ? 0 : 1;
}
static int test_deblock_chroma_h(void)
{
/* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
* (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
daedalus_ctx *ctx = daedalus_ctx_create();
if (!ctx) return 1;
uint8_t dst[TOTAL], dst_ref[TOTAL];
daedalus_h264_deblock_meta meta[N_EDGES];
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
for (int i = 0; i < N_EDGES; i++) {
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
meta[i].alpha = (int)(xs() % 64) + 1;
meta[i].beta = (int)(xs() % 16) + 1;
for (int s = 0; s < 4; s++) {
int r = (int)(xs() % 8);
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
}
}
for (int i = 0; i < N_EDGES; i++) {
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
meta[i].alpha, meta[i].beta, tc0_local);
}
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
N_EDGES, meta);
if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; }
int diff = 0;
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
daedalus_ctx_destroy(ctx);
return diff == 0 ? 0 : 1;
}
static int test_qpel_mc20(void) static int test_qpel_mc20(void)
{ {
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
@@ -245,12 +332,18 @@ int main(void)
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n", printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)); (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
int fail = 0; int fail = 0;
fail |= test_idct4(); fail |= test_idct4();
fail |= test_idct8(); fail |= test_idct8();
fail |= test_deblock(); fail |= test_deblock();
fail |= test_deblock_h(); fail |= test_deblock_h();
fail |= test_deblock_chroma_v();
fail |= test_deblock_chroma_h();
fail |= test_qpel_mc20(); fail |= test_qpel_mc20();
return fail; return fail;
} }