Files
daedalus-fourier/tests/test_api_h264.c
T
claude-noether 01f782cfaf h264: qpel avg — 12 remaining variants (closes the matrix)
Closes the H.264 8x8 qpel buildout.  Adds the remaining 12 avg_
biprediction positions:
  4 quarter-axis: avg_mc{10,30,01,03}
  8 diagonals  : avg_mc{11,12,13,21,23,31,32,33}

Each follows the established pattern: same half-pel formula as the
put_ sibling, then L2 average with the existing dst contents per
H.264 §8.4.2.3.1.

Scope:
  - 12 new kernel enums (MC10..MC33 avg_ = 34..45) → CPU.
  - 12 NEON externs for the vendored ff_avg_h264_qpel8_mc*_neon.
  - 12 CPU dispatches via existing DEFINE_QPEL_CPU_DISPATCH macro.
  - 12 public dispatches via DEFINE_QPEL_DISPATCH macro.
  - 12 recipe wrappers via DEFINE_QPEL_RECIPE macro.
  - 12 header decls via DECLARE_QPEL_AVG macro.
  - tests/h264_qpel8_avg_rest_ref.c — references via two parametric
    macros: DEFINE_AVG_QUARTER for the 4 ¼-pel L2 forms,
    DEFINE_AVG_DIAG for the 8 two-half-pel-avg forms.
  - Test harness extended with a RUN(MC) sub-macro that derives both
    the ref name and dispatch name from the bare mcXX.  (The ref
    is daedalus_avg_h264_qpel8_<mc>_ref; the dispatch is
    daedalus_recipe_dispatch_h264_qpel_avg_<mc>.  Macro had a typo
    on first try that duplicated "avg_" in the ref name — caught at
    compile, fixed.)

Verified on hertz:

  $ ./build/test_api_h264 | tail -12
    H.264 qpel avg_mc10: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc30: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc01: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc03: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc11: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc12: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc13: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc21: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc23: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc31: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc32: 2048/2048 bytes bit-exact (100.0000%)
    H.264 qpel avg_mc33: 2048/2048 bytes bit-exact (100.0000%)

  All 12 new positions bit-exact PASS first try.

Final qpel matrix state:
  put_:  mc00 (none — integer copy)
         mc01 ✓  mc02 ✓  mc03 ✓
         mc10 ✓  mc11 ✓  mc12 ✓  mc13 ✓
         mc20 ✓ (QPU+CPU)  mc21 ✓  mc22 ✓  mc23 ✓
         mc30 ✓  mc31 ✓  mc32 ✓  mc33 ✓
  avg_:  same 15-of-16 coverage, all CPU.

Every B-slice biprediction case the libavcodec intercept can throw
at us is now serviceable.  QPU shaders remain mc20-only (cycle 9);
the other 29 positions are CPU NEON.  Whether to write more QPU
shaders depends on real perf measurement — at NEON ~10 ns per
8x8 block, full qpel coverage at 1080p is ~2-3 ms of total work,
well inside budget.
2026-05-25 08:49:42 +02:00

712 lines
32 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
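/*
* Phase 8a — H.264 kernels through the public API.
*
* Originally covered IDCT 4x4, IDCT 8x8 and deblock luma vertical; the
* file now also exercises the remaining deblock variants (luma H,
* chroma V/H and the bS=4 intra forms) and the 8x8 qpel put_ and avg_
* positions. Each kernel is exercised through
* daedalus_recipe_dispatch_* and compared byte-for-byte to the C
* reference. Recipe routes the first 3 kernels to CPU (per cycles
* 6+7+8 verdicts).
*/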
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "../include/daedalus.h"
/* C reference implementations used as the ground truth by the tests in
 * this file. They are only declared here; their definitions are not
 * part of this translation unit. */

/* IDCT + add references (4x4 and 8x8 blocks). */
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
/* Deblocking references taking a tc0[4] array: luma/chroma, v/h. */
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* Intra deblocking references (no tc0 argument). */
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t tc0[4]);
/* put_ qpel 8x8 references; all share the (dst, src, stride) signature
 * that qpel_ref_fn names further down. */
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* avg_ qpel 8x8 references, same signature as the put_ forms. */
extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
extern void daedalus_avg_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* put_ mc20 reference, used by test_qpel_mc20. */
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride);
/* Generator state for xs(); the fixed seed keeps every run of the test
 * program on the same input sequence. */
static uint64_t xs_state = 0xa11264ULL;

/*
 * 64-bit xorshift step (shifts 13 / 7 / 17).
 *
 * Advances xs_state by one step and returns the new state. All tests
 * in this file draw from this single stream, so the order in which
 * they call xs() determines the data each one sees.
 */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
/*
 * Bit-exact check of the 4x4 IDCT-add recipe dispatch against
 * daedalus_h264_idct_add_ref.
 *
 * Both destination buffers start from identical random bytes and both
 * paths consume identical coefficients. The whole destination buffer
 * is compared afterwards.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_idct4(void)
{
    /* Layout: BX x BY grid of 4x4 blocks. Block (bx, by) starts at
     * byte offset by*4*STRIDE + bx*4, so the buffer needs
     * BY * 4 rows of STRIDE bytes (8 * 4 * 64 = 2048). BX * BY == N. */
    enum { N = 64, STRIDE = 64, BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    int16_t coeffs[N * 16], coeffs_ref[N * 16];
    uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES];
    daedalus_h264_block_meta meta[N];

    /* Identical random seed content for both destinations. */
    for (int i = 0; i < FULL_BYTES; i++)
        big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff);
    /* Coefficients in [-512, 511]; separate copies for reference and dispatch. */
    for (int i = 0; i < N * 16; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 4 * STRIDE + bx * 4;
        }
    }

    /* Reference result, one block at a time. */
    for (int i = 0; i < N; i++)
        daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off,
                                   coeffs_ref + i * 16, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct4 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < FULL_BYTES; i++)
        if (big_dst[i] != big_dst_ref[i]) diff++;

    printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
           FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the 8x8 IDCT-add recipe dispatch against
 * daedalus_h264_idct8_add_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_idct8(void)
{
    /* 16 blocks laid out as BX = 8 columns x BY = 2 rows of 8x8 blocks,
     * so the blocks touch rows 0..15. The buffer holds 32 rows
     * (BYTES = 32 * 64 = 2048); the remaining rows are still compared,
     * which also checks that neither path writes outside its blocks. */
    enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE, BX = 8, BY = 2 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    int16_t coeffs[N * 64], coeffs_ref[N * 64];
    uint8_t dst[BYTES], dst_ref[BYTES];
    daedalus_h264_block_meta meta[N];

    /* Identical random seed content for both destinations. */
    for (int i = 0; i < BYTES; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
    /* Coefficients in [-1024, 1023]; separate copies for reference and dispatch. */
    for (int i = 0; i < N * 64; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++) {
            int i = by * BX + bx;
            meta[i].dst_off = by * 8 * STRIDE + bx * 8;
        }
    }

    for (int i = 0; i < N; i++)
        daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
                                    coeffs_ref + i * 64, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
                                                 coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct8 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < BYTES; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
           BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the luma vertical deblock recipe dispatch against
 * daedalus_h264_v_loop_filter_luma_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_deblock(void)
{
    /* One edge per 16x16 tile; the edge sits at row EDGE_ROW of its tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* tc0 in {-1, 0..6}: one draw in eight yields -1. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes a mutable int8_t[4]; hand it a private
         * copy so meta[] reaches the dispatch unchanged. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the luma horizontal deblock recipe dispatch
 * against daedalus_h264_h_loop_filter_luma_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_deblock_h(void)
{
    /* Mirror of test_deblock for the H variant. Each tile is 8 cols x
     * 16 rows with one vertical edge between cols 3 and 4; EDGE_COL = 4
     * puts dst_off at the leftmost output column of the right block so
     * the kernel's pix[-4..+3] read sits inside the tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* tc0 in {-1, 0..6}: one draw in eight yields -1. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* Private tc0 copy: the reference takes a mutable int8_t[4]. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                             meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
                                                          N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the chroma vertical deblock recipe dispatch
 * against daedalus_h264_v_loop_filter_chroma_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_deblock_chroma_v(void)
{
    /* Chroma V: each tile is 8 cols x 4 rows with the edge between
     * rows 1 and 2 (EDGE_ROW = 2 lets the kernel read
     * pix[-2..+1]*stride safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
           EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* tc0 in {-1, 0..6}: one draw in eight yields -1. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* Private tc0 copy: the reference takes a mutable int8_t[4]. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                               meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
                                                            N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the chroma horizontal deblock recipe dispatch
 * against daedalus_h264_h_loop_filter_chroma_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_deblock_chroma_h(void)
{
    /* Chroma H: each tile is 4 cols x 8 rows with the edge between
     * cols 1 and 2 (EDGE_COL = 2 lets the kernel read pix[-2..+1]
     * safely). */
    enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* tc0 in {-1, 0..6}: one draw in eight yields -1. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* Private tc0 copy: the reference takes a mutable int8_t[4]. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1],
                                meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                               meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
                                                            N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/* --- bS=4 intra-strength deblock tests ---
* Tile geometry per orientation matches the bS<4 variant; only the
* dispatch + reference function change. alpha/beta are non-trivial
* (the C ref + NEON both early-return when alpha|beta == 0).
*/
/* Description of one intra-deblock test case consumed by run_intra_test. */
typedef struct {
/* Label printed in the result line and in the dispatch error message. */
const char *name;
/* n_edges: number of edges (one per tile); tile_stride: bytes per tile
 * row, also passed as the stride to ref and dispatch; tile_rows: rows per
 * tile; edge_off: byte offset of the edge inside its tile. */
int n_edges, tile_stride, tile_rows, edge_off;
/* Reference filter, called once per edge on the reference buffer. */
void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
/* Recipe dispatch under test, called once for all edges. */
int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
size_t n_edges, const daedalus_h264_deblock_meta *meta);
} intra_test_spec;
/*
 * Run one intra-deblock test case described by *t.
 *
 * Allocates two destination buffers of n_edges tiles each, seeds them
 * with identical random bytes, applies t->ref per edge to one buffer
 * and t->dispatch to the other, then compares the buffers in full.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation or allocation failure, or when the dispatch reports an
 * error. All resources are released on every exit path.
 */
static int run_intra_test(const intra_test_spec *t)
{
    int result = 1;                       /* pessimistic until the compare passes */
    uint8_t *dst = NULL;
    uint8_t *dst_ref = NULL;
    daedalus_h264_deblock_meta *meta = NULL;
    int rc;
    int diff = 0;

    int tile_bytes = t->tile_stride * t->tile_rows;
    int total = t->n_edges * tile_bytes;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    dst = malloc((size_t) total);
    dst_ref = malloc((size_t) total);
    meta = calloc((size_t) t->n_edges, sizeof(*meta));
    if (!dst || !dst_ref || !meta)
        goto out;                         /* free(NULL) is a no-op, so partial failure is fine */

    for (int i = 0; i < total; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    for (int i = 0; i < t->n_edges; i++) {
        meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64, never 0 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16, never 0 */
        /* tc0[] unused for intra; leave at 0 from calloc. */
    }

    for (int i = 0; i < t->n_edges; i++) {
        t->ref(dst_ref + meta[i].dst_off,
               (ptrdiff_t) t->tile_stride,
               meta[i].alpha, meta[i].beta);
    }

    rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
                     (size_t) t->n_edges, meta);
    if (rc) {
        fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc);
        goto out;
    }

    for (int i = 0; i < total; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
           t->name, total - diff, total, 100.0 * (total - diff) / total);

    result = diff == 0 ? 0 : 1;

out:
    free(meta);
    free(dst_ref);
    free(dst);
    daedalus_ctx_destroy(ctx);
    return result;
}
static int test_deblock_intra_all(void)
{
intra_test_spec specs[] = {
{ "luma v intra", 8, 16, 8, 4 * 16,
daedalus_h264_v_loop_filter_luma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
{ "luma h intra", 8, 8, 16, 4,
daedalus_h264_h_loop_filter_luma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
{ "chroma v intra", 8, 8, 4, 2 * 8,
daedalus_h264_v_loop_filter_chroma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
{ "chroma h intra", 8, 4, 8, 2,
daedalus_h264_h_loop_filter_chroma_intra_ref,
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
};
int fail = 0;
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
fail |= run_intra_test(&specs[i]);
return fail;
}
/*
 * Bit-exact check of the put_ qpel mc20 recipe dispatch against
 * daedalus_put_h264_qpel8_mc20_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_qpel_mc20(void)
{
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
     * holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the
     * cycle-9 bench convention so the same C reference and NEON .S can
     * be compared. */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* put_ form: both destinations start zeroed. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    /* Source and destination use the same offset within their buffers. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the put_ qpel mc02 recipe dispatch against
 * daedalus_put_h264_qpel8_mc02_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_qpel_mc02(void)
{
    /* mc02: vertical 6-tap. Tile is 16 cols x 16 rows so the kernel
     * can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
     * SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
     * the tile) and rows 8..10 below (rows 11..13). */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_ROW = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* put_ form: both destinations start zeroed. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    /* Source and destination use the same offset within their buffers. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/*
 * Bit-exact check of the put_ qpel mc22 recipe dispatch against
 * daedalus_put_h264_qpel8_mc22_ref.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int test_qpel_mc22(void)
{
    /* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows
     * top + 3 rows bottom of context per 8x8 output. Tile is 16x16
     * with output positioned at (SRC_ROW=3, SRC_COL=3) so the read
     * range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_ROW = 3, SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* put_ form: both destinations start zeroed. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    /* Source and destination use the same offset within their buffers. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off,
                                         src + meta[i].src_off,
                                         TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src,
                                                     TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
/* Generic harness for the 4 single-axis quarter-pel positions; same
* tile geometry as mc22 since each one reads the largest of the H/V
* lowpass windows (mc10/mc30 need cols -2..+3, mc01/mc03 need rows
* -2..+3 OR +1..+3 on the integer side). */
/* Signature shared by every put_/avg_ qpel reference declared above. */
typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
/* Signature of the qpel recipe dispatch entry points as used by the
 * harnesses below: one call processes n_blocks blocks described by meta[].
 * Returns non-zero on failure (the harnesses treat any non-zero rc as an
 * error). */
typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst,
const uint8_t *src, size_t stride,
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
/*
 * Shared put_ qpel harness: runs `ref` per block and `dispatch` once
 * over the same random source, then compares the two zero-initialised
 * destinations in full.
 *
 *   name     - label used in the result line and the error message
 *   ref      - C reference for the position under test
 *   dispatch - recipe dispatch entry point for the same position
 *
 * Geometry: 8 tiles of 16x16 bytes, output block at row 3 / col 3 of
 * each tile.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int run_quarter_axis_qpel(const char *name,
                                 qpel_ref_fn ref, qpel_dispatch_fn dispatch)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_ROW = 3, SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* put_ form: both destinations start zeroed. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    /* Source and destination use the same offset within their buffers. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);

    int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "%s dispatch rc=%d\n", name, rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
           name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
static int test_qpel_quarter_axis_all(void)
{
int fail = 0;
fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref,
daedalus_recipe_dispatch_h264_qpel_mc10);
fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref,
daedalus_recipe_dispatch_h264_qpel_mc30);
fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref,
daedalus_recipe_dispatch_h264_qpel_mc01);
fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref,
daedalus_recipe_dispatch_h264_qpel_mc03);
return fail;
}
static int test_qpel_diag_all(void)
{
/* Diagonal positions need TWO half-pel intermediates per output;
* some of them read at (r+1,c) or (r,c+1) so the test geometry
* needs an extra row + col of context. run_quarter_axis_qpel
* already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile)
* — reusing that harness is fine. */
int fail = 0;
fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref,
daedalus_recipe_dispatch_h264_qpel_mc11);
fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref,
daedalus_recipe_dispatch_h264_qpel_mc12);
fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref,
daedalus_recipe_dispatch_h264_qpel_mc13);
fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref,
daedalus_recipe_dispatch_h264_qpel_mc21);
fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref,
daedalus_recipe_dispatch_h264_qpel_mc23);
fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref,
daedalus_recipe_dispatch_h264_qpel_mc31);
fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref,
daedalus_recipe_dispatch_h264_qpel_mc32);
fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref,
daedalus_recipe_dispatch_h264_qpel_mc33);
return fail;
}
/* Avg-form harness: pre-loads dst + dst_ref with the same random
* content so we can verify the L2 averaging is happening (not just
* put_-style overwrite). If the dispatch incorrectly overwrote
* dst, the bit-exact compare would still catch the mismatch against
* the avg_ reference. */
/*
 * Shared avg_ qpel harness: like the put_ harness, but both
 * destinations are pre-filled with the same random bytes so the result
 * depends on the prior destination content.
 *
 *   name     - label used in the result line and the error message
 *   ref      - C avg_ reference for the position under test
 *   dispatch - recipe dispatch entry point for the same position
 *
 * Geometry: 8 tiles of 16x16 bytes, output block at row 3 / col 3 of
 * each tile.
 *
 * Returns 0 when every byte matches, 1 on any mismatch, on context
 * creation failure, or when the dispatch reports an error.
 */
static int run_avg_qpel(const char *name,
                        qpel_ref_fn ref, qpel_dispatch_fn dispatch)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_ROW = 3, SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) return 1;

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    /* Two random buffers: src for the qpel input, dst seeded with
     * different random content as the "list0 prediction" — both
     * dst and dst_ref get the SAME seed so the avg compare is fair. */
    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < TOTAL; i++) {
        uint8_t v = (uint8_t)(xs() & 0xff);
        dst[i] = dst_ref[i] = v;
    }

    /* Source and destination use the same offset within their buffers. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);

    int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "%s dispatch rc=%d\n", name, rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++)
        if (dst[i] != dst_ref[i]) diff++;

    printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
           name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}
static int test_qpel_avg_anchors(void)
{
int fail = 0;
fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref,
daedalus_recipe_dispatch_h264_qpel_avg_mc20);
fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref,
daedalus_recipe_dispatch_h264_qpel_avg_mc02);
fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref,
daedalus_recipe_dispatch_h264_qpel_avg_mc22);
return fail;
}
static int test_qpel_avg_rest(void)
{
int fail = 0;
/* Ref fns are named daedalus_avg_h264_qpel8_<mcXX>_ref (no
* second "avg_"); dispatch fns are named ..._avg_mcXX. Macro
* builds both from the bare mcXX name. */
#define RUN(MC) fail |= run_avg_qpel("avg_" #MC, \
daedalus_avg_h264_qpel8_ ## MC ## _ref, \
daedalus_recipe_dispatch_h264_qpel_avg_ ## MC)
RUN(mc10); RUN(mc30); RUN(mc01); RUN(mc03);
RUN(mc11); RUN(mc12); RUN(mc13);
RUN(mc21); RUN(mc23);
RUN(mc31); RUN(mc32); RUN(mc33);
#undef RUN
return fail;
}
int main(void)
{
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
printf(" H264_IDCT8 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n",
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
int fail = 0;
fail |= test_idct4();
fail |= test_idct8();
fail |= test_deblock();
fail |= test_deblock_h();
fail |= test_deblock_chroma_v();
fail |= test_deblock_chroma_h();
fail |= test_deblock_intra_all();
fail |= test_qpel_mc20();
fail |= test_qpel_mc02();
fail |= test_qpel_mc22();
fail |= test_qpel_quarter_axis_all();
fail |= test_qpel_diag_all();
fail |= test_qpel_avg_anchors();
fail |= test_qpel_avg_rest();
return fail;
}