Files
daedalus-fourier/tests/test_api_h264.c
T
claude-noether c3301b0c2e h264: qpel mc02 (vertical half-pel, CPU/NEON)
Mirror of cycle 9's mc20 transposed to vertical orientation.  Wires
up the second qpel half-pel position via the vendored
ff_put_h264_qpel8_mc02_neon symbol, closes the "missing vertical
sibling" gap that mc20 left open since cycle 9.

Scope:
  - Public API: daedalus_dispatch_h264_qpel_mc02 + recipe wrapper.
  - Internal: dispatch_h264_qpel_mc02_cpu calling the NEON entry.
  - Recipe table: DAEDALUS_KERNEL_H264_QPEL_MC02 = 17 → CPU.
    Explicit SUBSTRATE_QPU returns -1 (no shader yet).
  - C reference: tests/h264_qpel8_mc02_ref.c — vertical 6-tap
    transpose of mc20 (reads src[(r±N)*stride + c] instead of
    src[r*stride + c±N]).
  - Test: test_qpel_mc02 in test_api_h264, 8 tiles × 16 cols
    × 16 rows, random input, bit-exact compare against the C ref.

Verified on hertz:

  $ ./build/test_api_h264
  ...
    H.264 qpel mc20: 1024/1024 bytes bit-exact (100.0000%)
    H.264 qpel mc02: 2048/2048 bytes bit-exact (100.0000%)

  All 12 H.264 kernels in the api_smoke now bit-exact PASS.

Why CPU-only: same R-band logic as the deblock _h sibling pattern.
mc02 at ~7.6 ns per 8x8 block on NEON (per the cycle 9 baseline
measurements) gives ~700 us for 8160 MBs × 4 8x8 luma blocks at
1080p — comfortably inside the 33 ms budget.  QPU shader is a
fast-follow once the V vs H shader work is consolidated (the
transpose for the V shader is not mechanical — different SIMD
access pattern than the H shader).

Coverage matrix update:

  qpel position  put_ status  avg_ status
  -------------  -----------  -----------
  mc00 (copy)    not wired    not wired
  mc10 (¼-H)     not wired    not wired
  mc20 (½-H)    ✓ QPU+CPU     not wired
  mc30 (¾-H)     not wired    not wired
  mc01 (¼-V)     not wired    not wired
  mc02 (½-V)    ✓ CPU         not wired (this PR)
  mc03 (¾-V)     not wired    not wired
  mc11..mc33     not wired    not wired

13 more qpel positions to go for the full put_ matrix.  Adding them
follows the same template; each is a small contained PR.
2026-05-25 00:47:37 +02:00

477 lines
20 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
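/*
* Phase 8a — H.264 kernels through the public API.
*
* Covers IDCT 4x4, IDCT 8x8, luma and chroma deblocking (vertical and
* horizontal, bS<4 and bS=4 intra), and qpel mc20 / mc02. Each kernel is
* exercised through daedalus_recipe_dispatch_* and compared byte-for-byte
* against its C reference. The recipe routes the original three kernels
* (IDCT 4x4, IDCT 8x8, deblock luma vertical) to CPU (per cycles 6+7+8
* verdicts).
*/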
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "../include/daedalus.h"
/* C reference implementations, defined outside this file. Each test below
 * runs one of these on a private copy of the input and compares the result
 * byte-for-byte against the output of the matching recipe dispatch. */

/* Inverse transform + add; `block` is non-const. */
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
/* Deblocking filters that take a per-edge tc0[4] table. */
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* Intra (bS=4) deblocking filters: no tc0 argument. */
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* Quarter-pel motion compensation, 8x8 "put" variants; src and dst share
 * one stride. */
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
/* Generator state shared by every test; the fixed seed keeps each run's
 * pseudo-random inputs reproducible. */
static uint64_t xs_state = 0xa11264ULL;

/*
 * Advance the generator one step (xor-shift by 13 left, 7 right, 17 left)
 * and return the new 64-bit state.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/*
 * IDCT 4x4 add through the recipe dispatch.
 *
 * Builds N random coefficient blocks and a random destination surface,
 * applies the C reference to one copy and the dispatch to the other, then
 * counts differing bytes over the whole surface.
 *
 * Returns 0 when the two surfaces are identical, 1 on any mismatch or when
 * context creation / dispatch fails.
 */
static int test_idct4(void)
{
    /* N = BX * BY blocks in an 8x8 grid. Each block is 4 rows x 4 cols, so
     * the surface is BY * 4 = 32 rows of STRIDE = 64 bytes (2048 bytes) and
     * block (bx, by) starts at byte offset by*4*STRIDE + bx*4. */
    enum { N = 64, STRIDE = 64, BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    /* Separate coefficient copies: both entry points take a non-const
     * block pointer, so neither run may see the other's leftovers. */
    int16_t coeffs[N * 16], coeffs_ref[N * 16];
    uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES];
    daedalus_h264_block_meta meta[N];

    for (int i = 0; i < FULL_BYTES; i++)
        big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff);
    /* Coefficients in [-512, 511]. */
    for (int i = 0; i < N * 16; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 4 * STRIDE + bx * 4;
        }
    }

    for (int i = 0; i < N; i++)
        daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off,
                                   coeffs_ref + i * 16, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct4 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < FULL_BYTES; i++)
        if (big_dst[i] != big_dst_ref[i]) diff++;
    printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
           FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * IDCT 8x8 add through the recipe dispatch.
 *
 * Same scheme as the 4x4 test: random surface and coefficients, C reference
 * on one copy, dispatch on the other, byte-wise comparison of the whole
 * surface.
 *
 * Returns 0 when bit-exact, 1 on any mismatch or when context creation /
 * dispatch fails.
 */
static int test_idct8(void)
{
    /* N = BX * BY = 16 blocks: 8 across, 2 down, each 8x8. The surface is
     * 32 rows x STRIDE = 2048 bytes; the blocks touch rows 0..15 only, the
     * remaining rows must come back unchanged. */
    enum { N = 16, STRIDE = 64, BX = 8, BY = 2, BYTES = (8 * 4) * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    int16_t coeffs[N * 64], coeffs_ref[N * 64];
    uint8_t dst[BYTES], dst_ref[BYTES];
    daedalus_h264_block_meta meta[N];

    for (int i = 0; i < BYTES; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    /* Coefficients in [-1024, 1023]. */
    for (int i = 0; i < N * 64; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 8 * STRIDE + bx * 8;
        }
    }

    for (int i = 0; i < N; i++)
        daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
                                    coeffs_ref + i * 64, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct8 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < BYTES; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
           BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Luma deblock, vertical variant with tc0 (bS<4), through the recipe
 * dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte,
 * 1 on any mismatch or when context creation / dispatch fails.
 */
static int test_deblock(void)
{
    /* One edge per 16x16 tile, placed at the start of row EDGE_ROW. */
    enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;    /* 1..16 */
        /* tc0 drawn from {-1, 0..6}; -1 comes up for one draw in eight. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0[]; hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Luma deblock, horizontal variant with tc0 (bS<4), through the recipe
 * dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte,
 * 1 on any mismatch or when context creation / dispatch fails.
 */
static int test_deblock_h(void)
{
    /* Mirror of test_deblock for the H variant. Each tile is 8 cols x 16
     * rows with one vertical edge between cols 3 and 4; EDGE_COL = 4 puts
     * dst_off on the leftmost column of the right-hand block so the
     * kernel's pix[-4..+3] accesses stay inside the tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;    /* 1..16 */
        /* tc0 drawn from {-1, 0..6}; -1 comes up for one draw in eight. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0[]; hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Chroma deblock, vertical variant with tc0 (bS<4), through the recipe
 * dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte,
 * 1 on any mismatch or when context creation / dispatch fails.
 */
static int test_deblock_chroma_v(void)
{
    /* Each tile is 8 cols x 4 rows with the edge between rows 1 and 2;
     * EDGE_ROW = 2 keeps the kernel's pix[-2..+1]*stride accesses inside
     * the tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;    /* 1..16 */
        /* tc0 drawn from {-1, 0..6}; -1 comes up for one draw in eight. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0[]; hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                               meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
                                                            N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Chroma deblock, horizontal variant with tc0 (bS<4), through the recipe
 * dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte,
 * 1 on any mismatch or when context creation / dispatch fails.
 */
static int test_deblock_chroma_h(void)
{
    /* Each tile is 4 cols x 8 rows with the edge between cols 1 and 2;
     * EDGE_COL = 2 keeps the kernel's pix[-2..+1] accesses inside the
     * tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;    /* 1..16 */
        /* tc0 drawn from {-1, 0..6}; -1 comes up for one draw in eight. */
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a non-const tc0[]; hand it a private copy. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                               meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
                                                            N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/* --- bS=4 intra-strength deblock tests ---
 * The four intra variants reuse the tile geometry of their bS<4
 * counterparts; only the reference and dispatch entry points differ, so
 * one table-driven runner covers them all. alpha/beta are kept non-zero
 * (the C ref + NEON both early-return when alpha|beta == 0).
 */
typedef struct {
    /* Label printed in the result and error lines. */
    const char *name;
    /* Number of edges, one per tile. */
    int n_edges;
    /* Bytes per tile row; also passed as the stride to ref and dispatch. */
    int tile_stride;
    /* Rows per tile. */
    int tile_rows;
    /* Byte offset of the edge from the start of its tile. */
    int edge_off;
    /* C reference filter for this variant. */
    void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
    /* Recipe dispatch entry point for this variant. */
    int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
                    size_t n_edges, const daedalus_h264_deblock_meta *meta);
} intra_test_spec;
/*
 * Run one intra (bS=4) deblock variant described by `t`.
 *
 * Allocates two identical random surfaces of n_edges tiles, applies
 * t->ref edge by edge to one and t->dispatch in a single call to the
 * other, then counts differing bytes.
 *
 * Returns 0 when the surfaces match, 1 on any mismatch or when context
 * creation, allocation, or dispatch fails. Every exit path past context
 * creation releases the context and all three buffers.
 */
static int run_intra_test(const intra_test_spec *t)
{
    int tile_bytes = t->tile_stride * t->tile_rows;
    int total = t->n_edges * tile_bytes;
    int result = 1;               /* pessimistic until the compare passes */
    int diff = 0;
    int rc;
    uint8_t *dst = NULL;
    uint8_t *dst_ref = NULL;
    daedalus_h264_deblock_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    dst = malloc((size_t) total);
    dst_ref = malloc((size_t) total);
    /* calloc so tc0[], which the intra filters do not use, stays zero. */
    meta = calloc((size_t) t->n_edges, sizeof(*meta));
    if (!dst || !dst_ref || !meta) goto out;

    for (int i = 0; i < total; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < t->n_edges; i++) {
        meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta = (int)(xs() % 16) + 1;    /* 1..16 */
    }

    for (int i = 0; i < t->n_edges; i++) {
        t->ref(dst_ref + meta[i].dst_off,
               (ptrdiff_t) t->tile_stride,
               meta[i].alpha, meta[i].beta);
    }

    rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
                     (size_t) t->n_edges, meta);
    if (rc) {
        fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc);
        goto out;
    }

    for (int i = 0; i < total; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
           t->name, total - diff, total, 100.0 * (total - diff) / total);
    result = diff == 0 ? 0 : 1;

out:
    /* free(NULL) is a no-op, so partial allocation failure is handled. */
    free(meta);
    free(dst_ref);
    free(dst);
    daedalus_ctx_destroy(ctx);
    return result;
}
static int test_deblock_intra_all(void)
{
intra_test_spec specs[] = {
{ "luma v intra", 8, 16, 8, 4 * 16,
daedalus_h264_v_loop_filter_luma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
{ "luma h intra", 8, 8, 16, 4,
daedalus_h264_h_loop_filter_luma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
{ "chroma v intra", 8, 8, 4, 2 * 8,
daedalus_h264_v_loop_filter_chroma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
{ "chroma h intra", 8, 4, 8, 2,
daedalus_h264_h_loop_filter_chroma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
};
int fail = 0;
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
fail |= run_intra_test(&specs[i]);
return fail;
}
/*
 * qpel mc20 (horizontal half-pel, 8x8 put) through the recipe dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte over
 * the whole destination buffer, 1 on any mismatch or when context creation
 * / dispatch fails.
 */
static int test_qpel_mc20(void)
{
    /* Cycle 9 — one 8x8 block per tile, 8 tiles, each tile 8 rows of 16
     * bytes. The block starts at column SRC_COL so the src[c-2..c+3] taps
     * match the cycle-9 bench convention and the same C reference and
     * NEON .S can be compared. */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* Both destinations start zeroed, so bytes outside the 8x8 outputs
     * only compare equal if neither side wrote to them. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * qpel mc02 (vertical half-pel, 8x8 put) through the recipe dispatch.
 *
 * Returns 0 when dispatch output equals the C reference byte-for-byte over
 * the whole destination buffer, 1 on any mismatch or when context creation
 * / dispatch fails.
 */
static int test_qpel_mc02(void)
{
    /* mc02: vertical 6-tap. Each tile is 16 cols x 16 rows so the kernel
     * can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer. With
     * SRC_ROW = 3 the two rows above the block are tile rows 1..2 and the
     * three rows below it are tile rows 11..13. */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_ROW = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* Both destinations start zeroed, so bytes outside the 8x8 outputs
     * only compare equal if neither side wrote to them. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;
    printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
int main(void)
{
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
printf(" H264_IDCT8 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
int fail = 0;
fail |= test_idct4();
fail |= test_idct8();
fail |= test_deblock();
fail |= test_deblock_h();
fail |= test_deblock_chroma_v();
fail |= test_deblock_chroma_h();
fail |= test_deblock_intra_all();
fail |= test_qpel_mc20();
fail |= test_qpel_mc02();
return fail;
}