989818c2e6
Closes task #166 (re-measure R-bands on post-buffer-pool dispatch path). Now that all H.264 hot-path primitives have QPU shaders and the dispatch overhead has been hammered down (tasks #160 buffer pool, #161 persistent command buffer), bench_h264_primitives no longer measures one column. Two passes — CPU NEON and QPU V3D7 compute — with a side-by-side per-kernel comparison and ratio. Headline result on hertz (Pi 5 V3D 7.1, 30 iters x 5 warmup): kernel CPU ns/op QPU ns/op winner IDCT 4x4 luma 10.79 2.47 QPU 4.36x IDCT 8x8 luma 29.69 9.23 QPU 3.22x Deblock luma_v 17.58 10.21 QPU 1.72x Deblock luma_h 38.41 9.98 QPU 3.85x qpel mc20 (8x8) 28.24 9.66 QPU 2.92x qpel mc02 (8x8) 16.96 20.54 CPU 1.21x qpel mc22 (8x8) 71.58 9.64 QPU 7.43x 1080p worst-case sum (IDCT4 + deblock luma + qpel mc22): CPU NEON only: 5.57 ms QPU only: 1.30 ms (CPU/QPU sum ratio = 4.30x) Reverses PR #10's verdict (which had CPU NEON 4x faster than QPU for IDCT-only) — the buffer-pool + persistent-cmdbuf wins land hard. Only qpel mc02 still shows CPU ahead, marginally (single- axis vertical filter, row-strided memory pattern unfriendly to the WG layout — left as a follow-up for cycle-9-style targeted tuning). Substrate decree (2026-05-23) stays in force as policy — these numbers retroactively justify it. Also tightens test_api_h264's startup recipe print: the stale "(CPU)" / "(CPU, no QPU H shader yet)" / "(CPU, bS=4 set)" labels next to deblock_lh, deblock_cv, deblock_ch and deblock_*_intra are now wrong since PRs #28, #29, #35 (those kernels are on QPU).
712 lines
32 KiB
C
712 lines
32 KiB
C
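/*
 * Phase 8a — H.264 kernels through the public API.
 *
 * Covers IDCT 4x4 and 8x8, luma/chroma deblocking (v and h variants,
 * normal and bS=4 intra strength), and the 8x8 qpel put_/avg_
 * positions. Each kernel is exercised through
 * daedalus_recipe_dispatch_* and compared to the C reference.
 * The recipe originally routed the first 3 kernels (IDCT 4x4,
 * IDCT 8x8, deblock luma vertical) to CPU (per cycles 6+7+8
 * verdicts); main() prints the substrate the recipe reports at
 * startup, so check that output rather than this comment.
 */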
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <stdint.h>
|
||
#include <stddef.h>
|
||
#include <string.h>
|
||
|
||
#include "../include/daedalus.h"
|
||
|
||
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
|
||
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta, int8_t tc0[4]);
|
||
extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta, int8_t tc0[4]);
|
||
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta, int8_t tc0[4]);
|
||
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta);
|
||
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta);
|
||
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta);
|
||
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta);
|
||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||
int alpha, int beta, int8_t tc0[4]);
|
||
extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_avg_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||
ptrdiff_t stride);
|
||
|
||
/* State of the deterministic xorshift64 generator that feeds every test
 * buffer. The seed is fixed, so each run sees the same sequence. */
static uint64_t xs_state = 0xa11264ULL;

/* Advance the generator by one xorshift step (shifts 13 / 7 / 17), store
 * the result back into xs_state and return it. */
static inline uint64_t xs(void)
{
    uint64_t s = xs_state;

    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;

    xs_state = s;
    return s;
}
|
||
|
||
static int test_idct4(void)
|
||
{
|
||
enum { N = 64, STRIDE = 64, BYTES = 8 * STRIDE };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
int16_t coeffs[N * 16], coeffs_ref[N * 16];
|
||
uint8_t dst[BYTES], dst_ref[BYTES];
|
||
daedalus_h264_block_meta meta[N];
|
||
|
||
/* Layout: 8x8 grid of 4x4 blocks (each 4x4 occupies 4 rows x 4 cols).
|
||
* Block (bx, by) at byte offset by*4*STRIDE + bx*4. Need BYTES big
|
||
* enough: 8 row-blocks * 4 rows = 32 rows × 64 stride = 2048. Use
|
||
* 8 row-blocks. */
|
||
enum { BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE };
|
||
uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES];
|
||
for (int i = 0; i < FULL_BYTES; i++)
|
||
big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
|
||
for (int i = 0; i < N * 16; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);
|
||
|
||
for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) {
|
||
int i = by * BX + bx;
|
||
meta[i].dst_off = by * 4 * STRIDE + bx * 4;
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off,
|
||
coeffs_ref + i * 16, STRIDE);
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE,
|
||
coeffs, N, meta);
|
||
if (rc) { fprintf(stderr, "idct4 dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < FULL_BYTES; i++) if (big_dst[i] != big_dst_ref[i]) diff++;
|
||
printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_idct8(void)
|
||
{
|
||
enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
int16_t coeffs[N * 64], coeffs_ref[N * 64];
|
||
uint8_t dst[BYTES], dst_ref[BYTES];
|
||
daedalus_h264_block_meta meta[N];
|
||
|
||
for (int i = 0; i < BYTES; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N * 64; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);
|
||
|
||
/* 8 blocks per row × 4 row-blocks = 32 blocks. Use 8 cols × 2 rows-of-blocks
|
||
* for safety inside BYTES. Actually BYTES = 32*64 = 2048, supports 8*8=64
|
||
* blocks. Let me use 8 cols × 2 rows of blocks = 16 blocks. */
|
||
int BX = 8, BY = 2; /* 16 blocks total */
|
||
for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) {
|
||
int i = by * BX + bx;
|
||
meta[i].dst_off = by * 8 * STRIDE + bx * 8;
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
|
||
coeffs_ref + i * 64, STRIDE);
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
|
||
coeffs, N, meta);
|
||
if (rc) { fprintf(stderr, "idct8 dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < BYTES; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock(void)
|
||
{
|
||
/* One edge per 16x16 tile. */
|
||
enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
|
||
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4, EDGE_OFF = EDGE_ROW * TILE_STRIDE };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||
|
||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs() % 8);
|
||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||
daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
|
||
N_EDGES, meta);
|
||
if (rc) { fprintf(stderr, "deblock dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock_h(void)
|
||
{
|
||
/* Mirror of test_deblock but for the H variant. Per-tile layout
|
||
* is now 8 cols x 16 rows (one vertical edge between cols 3 and 4
|
||
* of the tile); EDGE_COL = 4 puts dst_off at the leftmost output
|
||
* column of the right block so the kernel's pix[-4..+3] read sits
|
||
* inside the tile. */
|
||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
|
||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||
|
||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs() % 8);
|
||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||
daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
|
||
N_EDGES, meta);
|
||
if (rc) { fprintf(stderr, "deblock_h dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock_chroma_v(void)
|
||
{
|
||
/* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2
|
||
* (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */
|
||
enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4,
|
||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||
TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2,
|
||
EDGE_OFF = EDGE_ROW * TILE_STRIDE };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||
|
||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs() % 8);
|
||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||
daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE,
|
||
N_EDGES, meta);
|
||
if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock_chroma_h(void)
|
||
{
|
||
/* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2
|
||
* (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */
|
||
enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8,
|
||
TILE_BYTES = TILE_STRIDE * TILE_ROWS,
|
||
TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_deblock_meta meta[N_EDGES];
|
||
|
||
for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
for (int s = 0; s < 4; s++) {
|
||
int r = (int)(xs() % 8);
|
||
meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < N_EDGES; i++) {
|
||
int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
|
||
daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
|
||
meta[i].alpha, meta[i].beta, tc0_local);
|
||
}
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE,
|
||
N_EDGES, meta);
|
||
if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
/* --- bS=4 intra-strength deblock tests ---
|
||
* Tile geometry per orientation matches the bS<4 variant; only the
|
||
* dispatch + reference function change. alpha/beta are non-trivial
|
||
* (the C ref + NEON both early-return when alpha|beta == 0).
|
||
*/
|
||
typedef struct {
|
||
const char *name;
|
||
int n_edges, tile_stride, tile_rows, edge_off;
|
||
void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||
int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
|
||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||
} intra_test_spec;
|
||
|
||
static int run_intra_test(const intra_test_spec *t)
|
||
{
|
||
int total = t->n_edges * t->tile_stride * t->tile_rows;
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t *dst = malloc((size_t) total);
|
||
uint8_t *dst_ref = malloc((size_t) total);
|
||
daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta));
|
||
if (!dst || !dst_ref || !meta) return 1;
|
||
|
||
for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||
int tile_bytes = t->tile_stride * t->tile_rows;
|
||
for (int i = 0; i < t->n_edges; i++) {
|
||
meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
|
||
meta[i].alpha = (int)(xs() % 64) + 1;
|
||
meta[i].beta = (int)(xs() % 16) + 1;
|
||
/* tc0[] unused for intra; leave at 0 from calloc. */
|
||
}
|
||
for (int i = 0; i < t->n_edges; i++) {
|
||
t->ref(dst_ref + meta[i].dst_off,
|
||
(ptrdiff_t) t->tile_stride,
|
||
meta[i].alpha, meta[i].beta);
|
||
}
|
||
int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
|
||
(size_t) t->n_edges, meta);
|
||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; }
|
||
|
||
int diff = 0;
|
||
for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
t->name, total - diff, total, 100.0 * (total - diff) / total);
|
||
|
||
free(meta); free(dst_ref); free(dst);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_deblock_intra_all(void)
|
||
{
|
||
intra_test_spec specs[] = {
|
||
{ "luma v intra", 8, 16, 8, 4 * 16,
|
||
daedalus_h264_v_loop_filter_luma_intra_ref,
|
||
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
|
||
{ "luma h intra", 8, 8, 16, 4,
|
||
daedalus_h264_h_loop_filter_luma_intra_ref,
|
||
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
|
||
{ "chroma v intra", 8, 8, 4, 2 * 8,
|
||
daedalus_h264_v_loop_filter_chroma_intra_ref,
|
||
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
|
||
{ "chroma h intra", 8, 4, 8, 2,
|
||
daedalus_h264_h_loop_filter_chroma_intra_ref,
|
||
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
|
||
};
|
||
int fail = 0;
|
||
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
|
||
fail |= run_intra_test(&specs[i]);
|
||
return fail;
|
||
}
|
||
|
||
static int test_qpel_mc20(void)
|
||
{
|
||
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||
* holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the
|
||
* cycle-9 bench convention so the same C reference and NEON .S can
|
||
* be compared. */
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
|
||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||
SRC_COL = 3 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_qpel_meta meta[N];
|
||
|
||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
memset(dst, 0, sizeof(dst));
|
||
memset(dst_ref, 0, sizeof(dst_ref));
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
|
||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
|
||
src + meta[i].src_off,
|
||
TILE_STRIDE);
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
|
||
TILE_STRIDE, N, meta);
|
||
if (rc) { fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_qpel_mc02(void)
|
||
{
|
||
/* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel
|
||
* can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer.
|
||
* SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of
|
||
* the tile) and rows 8..10 below (rows 11..13). */
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||
SRC_ROW = 3 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_qpel_meta meta[N];
|
||
|
||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
memset(dst, 0, sizeof(dst));
|
||
memset(dst_ref, 0, sizeof(dst_ref));
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE);
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off,
|
||
src + meta[i].src_off,
|
||
TILE_STRIDE);
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src,
|
||
TILE_STRIDE, N, meta);
|
||
if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_qpel_mc22(void)
|
||
{
|
||
/* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows
|
||
* top + 3 rows bottom of context per 8x8 output. Tile is 16x16
|
||
* with output positioned at (SRC_ROW=3, SRC_COL=3) so the read
|
||
* range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||
SRC_ROW = 3, SRC_COL = 3 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_qpel_meta meta[N];
|
||
|
||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
memset(dst, 0, sizeof(dst));
|
||
memset(dst_ref, 0, sizeof(dst_ref));
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off,
|
||
src + meta[i].src_off,
|
||
TILE_STRIDE);
|
||
|
||
int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src,
|
||
TILE_STRIDE, N, meta);
|
||
if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
/* Generic harness for the 4 single-axis quarter-pel positions; same
|
||
* tile geometry as mc22 since each one reads the largest of the H/V
|
||
* lowpass windows (mc10/mc30 need cols -2..+3, mc01/mc03 need rows
|
||
* -2..+3 OR +1..+3 on the integer side). */
|
||
typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||
typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst,
|
||
const uint8_t *src, size_t stride,
|
||
size_t n_blocks, const daedalus_h264_qpel_meta *meta);
|
||
|
||
static int run_quarter_axis_qpel(const char *name,
|
||
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||
{
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||
SRC_ROW = 3, SRC_COL = 3 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_qpel_meta meta[N];
|
||
|
||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
memset(dst, 0, sizeof(dst));
|
||
memset(dst_ref, 0, sizeof(dst_ref));
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||
|
||
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_qpel_quarter_axis_all(void)
|
||
{
|
||
int fail = 0;
|
||
fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc10);
|
||
fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc30);
|
||
fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc01);
|
||
fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc03);
|
||
return fail;
|
||
}
|
||
|
||
static int test_qpel_diag_all(void)
|
||
{
|
||
/* Diagonal positions need TWO half-pel intermediates per output;
|
||
* some of them read at (r+1,c) or (r,c+1) so the test geometry
|
||
* needs an extra row + col of context. run_quarter_axis_qpel
|
||
* already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile)
|
||
* — reusing that harness is fine. */
|
||
int fail = 0;
|
||
fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc11);
|
||
fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc12);
|
||
fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc13);
|
||
fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc21);
|
||
fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc23);
|
||
fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc31);
|
||
fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc32);
|
||
fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_mc33);
|
||
return fail;
|
||
}
|
||
|
||
/* Avg-form harness: pre-loads dst + dst_ref with the same random
|
||
* content so we can verify the L2 averaging is happening (not just
|
||
* put_-style overwrite). If the dispatch incorrectly overwrote
|
||
* dst, the bit-exact compare would still catch the mismatch against
|
||
* the avg_ reference. */
|
||
static int run_avg_qpel(const char *name,
|
||
qpel_ref_fn ref, qpel_dispatch_fn dispatch)
|
||
{
|
||
enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16,
|
||
TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
|
||
SRC_ROW = 3, SRC_COL = 3 };
|
||
daedalus_ctx *ctx = daedalus_ctx_create();
|
||
if (!ctx) return 1;
|
||
|
||
uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
|
||
daedalus_h264_qpel_meta meta[N];
|
||
|
||
/* Two random buffers: src for the qpel input, dst seeded with
|
||
* different random content as the "list0 prediction" — both
|
||
* dst and dst_ref get the SAME seed so the avg compare is fair. */
|
||
for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff);
|
||
for (int i = 0; i < TOTAL; i++) {
|
||
uint8_t v = (uint8_t)(xs() & 0xff);
|
||
dst[i] = dst_ref[i] = v;
|
||
}
|
||
|
||
for (int i = 0; i < N; i++) {
|
||
meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL);
|
||
}
|
||
|
||
for (int i = 0; i < N; i++)
|
||
ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE);
|
||
|
||
int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta);
|
||
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; }
|
||
int diff = 0;
|
||
for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++;
|
||
printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||
name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);
|
||
daedalus_ctx_destroy(ctx);
|
||
return diff == 0 ? 0 : 1;
|
||
}
|
||
|
||
static int test_qpel_avg_anchors(void)
|
||
{
|
||
int fail = 0;
|
||
fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_avg_mc20);
|
||
fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_avg_mc02);
|
||
fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref,
|
||
daedalus_recipe_dispatch_h264_qpel_avg_mc22);
|
||
return fail;
|
||
}
|
||
|
||
static int test_qpel_avg_rest(void)
|
||
{
|
||
int fail = 0;
|
||
/* Ref fns are named daedalus_avg_h264_qpel8_<mcXX>_ref (no
|
||
* second "avg_"); dispatch fns are named ..._avg_mcXX. Macro
|
||
* builds both from the bare mcXX name. */
|
||
#define RUN(MC) fail |= run_avg_qpel("avg_" #MC, \
|
||
daedalus_avg_h264_qpel8_ ## MC ## _ref, \
|
||
daedalus_recipe_dispatch_h264_qpel_avg_ ## MC)
|
||
RUN(mc10); RUN(mc30); RUN(mc01); RUN(mc03);
|
||
RUN(mc11); RUN(mc12); RUN(mc13);
|
||
RUN(mc21); RUN(mc23);
|
||
RUN(mc31); RUN(mc32); RUN(mc33);
|
||
#undef RUN
|
||
return fail;
|
||
}
|
||
|
||
int main(void)
|
||
{
|
||
printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
|
||
printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
|
||
printf(" H264_IDCT8 recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
|
||
printf(" H264_DEBLOCK_LV recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
|
||
printf(" H264_QPEL_MC20 recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));
|
||
|
||
printf(" H264_DEBLOCK_LH recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));
|
||
printf(" H264_DEBLOCK_CV recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
||
printf(" H264_DEBLOCK_CH recipe substrate: %d\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
||
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (bS=4 family, all on QPU)\n",
|
||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
|
||
|
||
int fail = 0;
|
||
fail |= test_idct4();
|
||
fail |= test_idct8();
|
||
fail |= test_deblock();
|
||
fail |= test_deblock_h();
|
||
fail |= test_deblock_chroma_v();
|
||
fail |= test_deblock_chroma_h();
|
||
fail |= test_deblock_intra_all();
|
||
fail |= test_qpel_mc20();
|
||
fail |= test_qpel_mc02();
|
||
fail |= test_qpel_mc22();
|
||
fail |= test_qpel_quarter_axis_all();
|
||
fail |= test_qpel_diag_all();
|
||
fail |= test_qpel_avg_anchors();
|
||
fail |= test_qpel_avg_rest();
|
||
return fail;
|
||
}
|