/* * Phase 8a — H.264 kernels through the public API. * * Covers IDCT 4x4, IDCT 8x8, deblock luma vertical. Each kernel * exercised through daedalus_recipe_dispatch_* and compared to * the C reference. Recipe routes all 3 to CPU (per cycles 6+7+8 * verdicts). */ #include #include #include #include #include #include "../include/daedalus.h" extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride); extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t tc0[4]); extern void daedalus_put_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc02_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc22_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc10_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc30_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc01_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc03_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc11_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc12_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc13_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc21_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc23_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc31_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc32_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_avg_h264_qpel8_mc33_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); static uint64_t xs_state = 0xa11264ULL; static inline uint64_t xs(void) { uint64_t x = xs_state; x ^= x << 13; x ^= x >> 7; x ^= x << 17; return xs_state = x; } static int test_idct4(void) { enum { N = 64, STRIDE = 64, BYTES = 8 * STRIDE }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; int16_t coeffs[N * 16], coeffs_ref[N * 16]; uint8_t dst[BYTES], dst_ref[BYTES]; daedalus_h264_block_meta meta[N]; /* Layout: 8x8 grid of 4x4 blocks (each 4x4 occupies 4 rows x 4 cols). * Block (bx, by) at byte offset by*4*STRIDE + bx*4. Need BYTES big * enough: 8 row-blocks * 4 rows = 32 rows × 64 stride = 2048. Use * 8 row-blocks. */ enum { BX = 8, BY = 8, FULL_BYTES = BY * 4 * STRIDE }; uint8_t big_dst[FULL_BYTES], big_dst_ref[FULL_BYTES]; for (int i = 0; i < FULL_BYTES; i++) big_dst[i] = big_dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N * 16; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512); for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) { int i = by * BX + bx; meta[i].dst_off = by * 4 * STRIDE + bx * 4; } for (int i = 0; i < N; i++) daedalus_h264_idct_add_ref(big_dst_ref + meta[i].dst_off, coeffs_ref + i * 16, STRIDE); int rc = daedalus_recipe_dispatch_h264_idct4(ctx, big_dst, STRIDE, coeffs, N, meta); if (rc) { fprintf(stderr, "idct4 dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < FULL_BYTES; i++) if (big_dst[i] != big_dst_ref[i]) diff++; printf(" H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n", FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_idct8(void) { enum { N = 16, STRIDE = 64, BYTES = (8 * 4) * STRIDE }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; int16_t coeffs[N * 64], coeffs_ref[N * 64]; uint8_t dst[BYTES], dst_ref[BYTES]; daedalus_h264_block_meta meta[N]; for (int i = 0; i < BYTES; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N * 64; i++) coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024); /* 8 blocks per row × 4 row-blocks = 32 blocks. Use 8 cols × 2 rows-of-blocks * for safety inside BYTES. Actually BYTES = 32*64 = 2048, supports 8*8=64 * blocks. Let me use 8 cols × 2 rows of blocks = 16 blocks. */ int BX = 8, BY = 2; /* 16 blocks total */ for (int by = 0; by < BY; by++) for (int bx = 0; bx < BX; bx++) { int i = by * BX + bx; meta[i].dst_off = by * 8 * STRIDE + bx * 8; } for (int i = 0; i < N; i++) daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off, coeffs_ref + i * 64, STRIDE); int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE, coeffs, N, meta); if (rc) { fprintf(stderr, "idct8 dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < BYTES; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n", BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock(void) { /* One edge per 16x16 tile. */ enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE, TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4, EDGE_OFF = EDGE_ROW * TILE_STRIDE }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_deblock_meta meta[N_EDGES]; for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N_EDGES; i++) { meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } for (int i = 0; i < N_EDGES; i++) { int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, meta[i].alpha, meta[i].beta, tc0_local); } int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE, N_EDGES, meta); if (rc) { fprintf(stderr, "deblock dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock_h(void) { /* Mirror of test_deblock but for the H variant. Per-tile layout * is now 8 cols x 16 rows (one vertical edge between cols 3 and 4 * of the tile); EDGE_COL = 4 puts dst_off at the leftmost output * column of the right block so the kernel's pix[-4..+3] read sits * inside the tile. */ enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16, TILE_BYTES = TILE_STRIDE * TILE_ROWS, TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_deblock_meta meta[N_EDGES]; for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N_EDGES; i++) { meta[i].dst_off = i * TILE_BYTES + EDGE_COL; meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } for (int i = 0; i < N_EDGES; i++) { int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, meta[i].alpha, meta[i].beta, tc0_local); } int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE, N_EDGES, meta); if (rc) { fprintf(stderr, "deblock_h dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock_chroma_v(void) { /* Chroma V: per-tile 8 cols × 4 rows, edge between rows 1 and 2 * (EDGE_ROW=2 lets the kernel read pix[-2..+1]*stride safely). */ enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 4, TILE_BYTES = TILE_STRIDE * TILE_ROWS, TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 2, EDGE_OFF = EDGE_ROW * TILE_STRIDE }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_deblock_meta meta[N_EDGES]; for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N_EDGES; i++) { meta[i].dst_off = i * TILE_BYTES + EDGE_OFF; meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } for (int i = 0; i < N_EDGES; i++) { int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; daedalus_h264_v_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, meta[i].alpha, meta[i].beta, tc0_local); } int rc = daedalus_recipe_dispatch_h264_deblock_chroma_v(ctx, dst, TILE_STRIDE, N_EDGES, meta); if (rc) { fprintf(stderr, "deblock_chroma_v dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 deblock chroma v: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock_chroma_h(void) { /* Chroma H: per-tile 4 cols × 8 rows, edge between cols 1 and 2 * (EDGE_COL=2 lets the kernel read pix[-2..+1] safely). */ enum { N_EDGES = 8, TILE_STRIDE = 4, TILE_ROWS = 8, TILE_BYTES = TILE_STRIDE * TILE_ROWS, TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 2 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_deblock_meta meta[N_EDGES]; for (int i = 0; i < TOTAL; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < N_EDGES; i++) { meta[i].dst_off = i * TILE_BYTES + EDGE_COL; meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; for (int s = 0; s < 4; s++) { int r = (int)(xs() % 8); meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1)); } } for (int i = 0; i < N_EDGES; i++) { int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] }; daedalus_h264_h_loop_filter_chroma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE, meta[i].alpha, meta[i].beta, tc0_local); } int rc = daedalus_recipe_dispatch_h264_deblock_chroma_h(ctx, dst, TILE_STRIDE, N_EDGES, meta); if (rc) { fprintf(stderr, "deblock_chroma_h dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 deblock chroma h: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } /* --- bS=4 intra-strength deblock tests --- * Tile geometry per orientation matches the bS<4 variant; only the * dispatch + reference function change. alpha/beta are non-trivial * (the C ref + NEON both early-return when alpha|beta == 0). */ typedef struct { const char *name; int n_edges, tile_stride, tile_rows, edge_off; void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta); } intra_test_spec; static int run_intra_test(const intra_test_spec *t) { int total = t->n_edges * t->tile_stride * t->tile_rows; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t *dst = malloc((size_t) total); uint8_t *dst_ref = malloc((size_t) total); daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta)); if (!dst || !dst_ref || !meta) return 1; for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); int tile_bytes = t->tile_stride * t->tile_rows; for (int i = 0; i < t->n_edges; i++) { meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off); meta[i].alpha = (int)(xs() % 64) + 1; meta[i].beta = (int)(xs() % 16) + 1; /* tc0[] unused for intra; leave at 0 from calloc. */ } for (int i = 0; i < t->n_edges; i++) { t->ref(dst_ref + meta[i].dst_off, (ptrdiff_t) t->tile_stride, meta[i].alpha, meta[i].beta); } int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride, (size_t) t->n_edges, meta); if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; } int diff = 0; for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n", t->name, total - diff, total, 100.0 * (total - diff) / total); free(meta); free(dst_ref); free(dst); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_deblock_intra_all(void) { intra_test_spec specs[] = { { "luma v intra", 8, 16, 8, 4 * 16, daedalus_h264_v_loop_filter_luma_intra_ref, daedalus_recipe_dispatch_h264_deblock_luma_v_intra }, { "luma h intra", 8, 8, 16, 4, daedalus_h264_h_loop_filter_luma_intra_ref, daedalus_recipe_dispatch_h264_deblock_luma_h_intra }, { "chroma v intra", 8, 8, 4, 2 * 8, daedalus_h264_v_loop_filter_chroma_intra_ref, daedalus_recipe_dispatch_h264_deblock_chroma_v_intra }, { "chroma h intra", 8, 4, 8, 2, daedalus_h264_h_loop_filter_chroma_intra_ref, daedalus_recipe_dispatch_h264_deblock_chroma_h_intra }, }; int fail = 0; for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++) fail |= run_intra_test(&specs[i]); return fail; } static int test_qpel_mc20(void) { /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile * holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the * cycle-9 bench convention so the same C reference and NEON .S can * be compared. */ enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8, TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, SRC_COL = 3 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_qpel_meta meta[N]; for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); memset(dst, 0, sizeof(dst)); memset(dst_ref, 0, sizeof(dst_ref)); for (int i = 0; i < N; i++) { meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL); meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL); } for (int i = 0; i < N; i++) daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src, TILE_STRIDE, N, meta); if (rc) { fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_qpel_mc02(void) { /* mc02: vertical 6-tap. Tile is 16 cols × 16 rows so the kernel * can read rows [SRC_ROW-2 .. SRC_ROW+7+3] inside the buffer. * SRC_ROW = 3 leaves rows -2..-1 above the output (rows 1..2 of * the tile) and rows 8..10 below (rows 11..13). */ enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, SRC_ROW = 3 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_qpel_meta meta[N]; for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); memset(dst, 0, sizeof(dst)); memset(dst_ref, 0, sizeof(dst_ref)); for (int i = 0; i < N; i++) { meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE); meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE); } for (int i = 0; i < N; i++) daedalus_put_h264_qpel8_mc02_ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); int rc = daedalus_recipe_dispatch_h264_qpel_mc02(ctx, dst, src, TILE_STRIDE, N, meta); if (rc) { fprintf(stderr, "qpel_mc02 dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 qpel mc02: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_qpel_mc22(void) { /* mc22: 2D HV lowpass. Needs 2 cols left + 3 cols right + 2 rows * top + 3 rows bottom of context per 8x8 output. Tile is 16x16 * with output positioned at (SRC_ROW=3, SRC_COL=3) so the read * range [SRC_*-2 .. SRC_*+7+3] stays inside the tile. */ enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, SRC_ROW = 3, SRC_COL = 3 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_qpel_meta meta[N]; for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); memset(dst, 0, sizeof(dst)); memset(dst_ref, 0, sizeof(dst_ref)); for (int i = 0; i < N; i++) { meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); } for (int i = 0; i < N; i++) daedalus_put_h264_qpel8_mc22_ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); int rc = daedalus_recipe_dispatch_h264_qpel_mc22(ctx, dst, src, TILE_STRIDE, N, meta); if (rc) { fprintf(stderr, "qpel_mc22 dispatch rc=%d\n", rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 qpel mc22: %d/%d bytes bit-exact (%.4f%%)\n", TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } /* Generic harness for the 4 single-axis quarter-pel positions; same * tile geometry as mc22 since each one reads the largest of the H/V * lowpass windows (mc10/mc30 need cols -2..+3, mc01/mc03 need rows * -2..+3 OR +1..+3 on the integer side). */ typedef void (*qpel_ref_fn)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); typedef int (*qpel_dispatch_fn)(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta); static int run_quarter_axis_qpel(const char *name, qpel_ref_fn ref, qpel_dispatch_fn dispatch) { enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, SRC_ROW = 3, SRC_COL = 3 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_qpel_meta meta[N]; for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); memset(dst, 0, sizeof(dst)); memset(dst_ref, 0, sizeof(dst_ref)); for (int i = 0; i < N; i++) { meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); } for (int i = 0; i < N; i++) ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta); if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n", name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_qpel_quarter_axis_all(void) { int fail = 0; fail |= run_quarter_axis_qpel("mc10", daedalus_put_h264_qpel8_mc10_ref, daedalus_recipe_dispatch_h264_qpel_mc10); fail |= run_quarter_axis_qpel("mc30", daedalus_put_h264_qpel8_mc30_ref, daedalus_recipe_dispatch_h264_qpel_mc30); fail |= run_quarter_axis_qpel("mc01", daedalus_put_h264_qpel8_mc01_ref, daedalus_recipe_dispatch_h264_qpel_mc01); fail |= run_quarter_axis_qpel("mc03", daedalus_put_h264_qpel8_mc03_ref, daedalus_recipe_dispatch_h264_qpel_mc03); return fail; } static int test_qpel_diag_all(void) { /* Diagonal positions need TWO half-pel intermediates per output; * some of them read at (r+1,c) or (r,c+1) so the test geometry * needs an extra row + col of context. run_quarter_axis_qpel * already provides plenty (SRC_ROW=3, SRC_COL=3, 16x16 tile) * — reusing that harness is fine. */ int fail = 0; fail |= run_quarter_axis_qpel("mc11", daedalus_put_h264_qpel8_mc11_ref, daedalus_recipe_dispatch_h264_qpel_mc11); fail |= run_quarter_axis_qpel("mc12", daedalus_put_h264_qpel8_mc12_ref, daedalus_recipe_dispatch_h264_qpel_mc12); fail |= run_quarter_axis_qpel("mc13", daedalus_put_h264_qpel8_mc13_ref, daedalus_recipe_dispatch_h264_qpel_mc13); fail |= run_quarter_axis_qpel("mc21", daedalus_put_h264_qpel8_mc21_ref, daedalus_recipe_dispatch_h264_qpel_mc21); fail |= run_quarter_axis_qpel("mc23", daedalus_put_h264_qpel8_mc23_ref, daedalus_recipe_dispatch_h264_qpel_mc23); fail |= run_quarter_axis_qpel("mc31", daedalus_put_h264_qpel8_mc31_ref, daedalus_recipe_dispatch_h264_qpel_mc31); fail |= run_quarter_axis_qpel("mc32", daedalus_put_h264_qpel8_mc32_ref, daedalus_recipe_dispatch_h264_qpel_mc32); fail |= run_quarter_axis_qpel("mc33", daedalus_put_h264_qpel8_mc33_ref, daedalus_recipe_dispatch_h264_qpel_mc33); return fail; } /* Avg-form harness: pre-loads dst + dst_ref with the same random * content so we can verify the L2 averaging is happening (not just * put_-style overwrite). If the dispatch incorrectly overwrote * dst, the bit-exact compare would still catch the mismatch against * the avg_ reference. */ static int run_avg_qpel(const char *name, qpel_ref_fn ref, qpel_dispatch_fn dispatch) { enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 16, TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES, SRC_ROW = 3, SRC_COL = 3 }; daedalus_ctx *ctx = daedalus_ctx_create(); if (!ctx) return 1; uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL]; daedalus_h264_qpel_meta meta[N]; /* Two random buffers: src for the qpel input, dst seeded with * different random content as the "list0 prediction" — both * dst and dst_ref get the SAME seed so the avg compare is fair. */ for (int i = 0; i < TOTAL; i++) src[i] = (uint8_t)(xs() & 0xff); for (int i = 0; i < TOTAL; i++) { uint8_t v = (uint8_t)(xs() & 0xff); dst[i] = dst_ref[i] = v; } for (int i = 0; i < N; i++) { meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_ROW * TILE_STRIDE + SRC_COL); } for (int i = 0; i < N; i++) ref(dst_ref + meta[i].dst_off, src + meta[i].src_off, TILE_STRIDE); int rc = dispatch(ctx, dst, src, TILE_STRIDE, N, meta); if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", name, rc); return 1; } int diff = 0; for (int i = 0; i < TOTAL; i++) if (dst[i] != dst_ref[i]) diff++; printf(" H.264 qpel %s: %d/%d bytes bit-exact (%.4f%%)\n", name, TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL); daedalus_ctx_destroy(ctx); return diff == 0 ? 0 : 1; } static int test_qpel_avg_anchors(void) { int fail = 0; fail |= run_avg_qpel("avg_mc20", daedalus_avg_h264_qpel8_mc20_ref, daedalus_recipe_dispatch_h264_qpel_avg_mc20); fail |= run_avg_qpel("avg_mc02", daedalus_avg_h264_qpel8_mc02_ref, daedalus_recipe_dispatch_h264_qpel_avg_mc02); fail |= run_avg_qpel("avg_mc22", daedalus_avg_h264_qpel8_mc22_ref, daedalus_recipe_dispatch_h264_qpel_avg_mc22); return fail; } static int test_qpel_avg_rest(void) { int fail = 0; /* Ref fns are named daedalus_avg_h264_qpel8__ref (no * second "avg_"); dispatch fns are named ..._avg_mcXX. Macro * builds both from the bare mcXX name. */ #define RUN(MC) fail |= run_avg_qpel("avg_" #MC, \ daedalus_avg_h264_qpel8_ ## MC ## _ref, \ daedalus_recipe_dispatch_h264_qpel_avg_ ## MC) RUN(mc10); RUN(mc30); RUN(mc01); RUN(mc03); RUN(mc11); RUN(mc12); RUN(mc13); RUN(mc21); RUN(mc23); RUN(mc31); RUN(mc32); RUN(mc33); #undef RUN return fail; } int main(void) { printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n"); printf(" H264_IDCT4 recipe substrate: %d (1=CPU, 2=QPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4)); printf(" H264_IDCT8 recipe substrate: %d\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8)); printf(" H264_DEBLOCK_LV recipe substrate: %d\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV)); printf(" H264_QPEL_MC20 recipe substrate: %d\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20)); printf(" H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH)); printf(" H264_DEBLOCK_CV recipe substrate: %d (CPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)); printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)); printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA)); int fail = 0; fail |= test_idct4(); fail |= test_idct8(); fail |= test_deblock(); fail |= test_deblock_h(); fail |= test_deblock_chroma_v(); fail |= test_deblock_chroma_h(); fail |= test_deblock_intra_all(); fail |= test_qpel_mc20(); fail |= test_qpel_mc02(); fail |= test_qpel_mc22(); fail |= test_qpel_quarter_axis_all(); fail |= test_qpel_diag_all(); fail |= test_qpel_avg_anchors(); fail |= test_qpel_avg_rest(); return fail; }