daedalus-fourier/tests/test_api_h264.c

/*
 * Phase 8a — H.264 kernels through the public API.
 *
 * Covers IDCT 4x4, IDCT 8x8, deblock luma (vertical and horizontal)
 * and qpel mc20. Each kernel is exercised through
 * daedalus_recipe_dispatch_* and compared to the C reference. The
 * recipe routes the first three (IDCT 4x4, IDCT 8x8, deblock luma
 * vertical) to CPU (per cycles 6+7+8 verdicts).
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include "../include/daedalus.h"

/*
 * C reference kernels, declared locally with extern. The tests below run
 * each one on a private copy of the input and use the result as the
 * expected output for the matching daedalus_recipe_dispatch_* call.
 *
 * `block` and `tc0` are non-const in these prototypes, which is why the
 * tests hand the reference its own copies (coeffs_ref, tc0_local) rather
 * than the buffers passed to the dispatch.
 */
extern void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_idct8_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride);
extern void daedalus_h264_h_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                   int alpha, int beta, int8_t tc0[4]);
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
                                                  int alpha, int beta, int8_t tc0[4]);
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t stride);

/* Generator state; the fixed seed makes every run produce the same data. */
static uint64_t xs_state = 0xa11264ULL;

/*
 * 64-bit xorshift step (shifts 13 / 7 / 17). Advances xs_state and
 * returns the new state. Shared by all tests, so the order in which the
 * tests draw numbers determines the data each one sees.
 */
static inline uint64_t xs(void) {
    uint64_t s = xs_state;
    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);
    xs_state = s;
    return s;
}

/*
 * H.264 IDCT 4x4 through the public API.
 *
 * Fills a destination picture and N coefficient blocks with pseudo-random
 * data, applies daedalus_h264_idct_add_ref block by block to one copy and
 * daedalus_recipe_dispatch_h264_idct4 to the other, then compares the two
 * pictures byte for byte.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure or any mismatch.
 */
static int test_idct4(void)
{
    /* BX x BY grid of 4x4 blocks. Block (bx, by) starts at byte offset
     * by*4*STRIDE + bx*4, so the picture needs BY*4 rows of STRIDE bytes
     * (8 * 4 * 64 = 2048). N is derived from the grid so the meta table
     * and the block count cannot drift apart. */
    enum { BX = 8, BY = 8, N = BX * BY, STRIDE = 64,
           FULL_BYTES = BY * 4 * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "idct4: daedalus_ctx_create failed\n");
        return 1;
    }

    int16_t coeffs[N * 16], coeffs_ref[N * 16];
    uint8_t dst[FULL_BYTES], dst_ref[FULL_BYTES];
    daedalus_h264_block_meta meta[N];

    /* Identical random background in both pictures. */
    for (int i = 0; i < FULL_BYTES; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    /* Coefficients in [-512, 511]; the reference gets its own copy because
     * its prototype takes the block as non-const. */
    for (int i = 0; i < N * 16; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 1024) - 512);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++)
            meta[by * BX + bx].dst_off = by * 4 * STRIDE + bx * 4;
    }

    for (int i = 0; i < N; i++)
        daedalus_h264_idct_add_ref(dst_ref + meta[i].dst_off,
                                    coeffs_ref + i * 16, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct4(ctx, dst, STRIDE,
                                                   coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct4 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < FULL_BYTES; i++) {
        if (dst[i] != dst_ref[i])
            diff++;
    }
    printf("  H.264 IDCT 4x4: %d/%d bytes bit-exact (%.4f%%)\n",
           FULL_BYTES - diff, FULL_BYTES, 100.0 * (FULL_BYTES - diff) / FULL_BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}

/*
 * H.264 IDCT 8x8 through the public API.
 *
 * Same scheme as the 4x4 test: random picture and coefficients,
 * daedalus_h264_idct8_add_ref on one copy,
 * daedalus_recipe_dispatch_h264_idct8 on the other, byte-exact compare
 * over the whole buffer.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure or any mismatch.
 */
static int test_idct8(void)
{
    /* BX x BY grid of 8x8 blocks (16 blocks) placed in the top BY*8 = 16
     * rows of a 32-row picture. The remaining rows are compared as well,
     * so they must come back unchanged. */
    enum { BX = 8, BY = 2, N = BX * BY, STRIDE = 64,
           BYTES = (8 * 4) * STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "idct8: daedalus_ctx_create failed\n");
        return 1;
    }

    int16_t coeffs[N * 64], coeffs_ref[N * 64];
    uint8_t dst[BYTES], dst_ref[BYTES];
    daedalus_h264_block_meta meta[N];

    for (int i = 0; i < BYTES; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    /* Coefficients in [-1024, 1023]; separate copy for the reference. */
    for (int i = 0; i < N * 64; i++)
        coeffs_ref[i] = coeffs[i] = (int16_t)((int)(xs() % 2048) - 1024);

    for (int by = 0; by < BY; by++) {
        for (int bx = 0; bx < BX; bx++)
            meta[by * BX + bx].dst_off = by * 8 * STRIDE + bx * 8;
    }

    for (int i = 0; i < N; i++)
        daedalus_h264_idct8_add_ref(dst_ref + meta[i].dst_off,
                                     coeffs_ref + i * 64, STRIDE);

    int rc = daedalus_recipe_dispatch_h264_idct8(ctx, dst, STRIDE,
                                                   coeffs, N, meta);
    if (rc) {
        fprintf(stderr, "idct8 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < BYTES; i++) {
        if (dst[i] != dst_ref[i])
            diff++;
    }
    printf("  H.264 IDCT 8x8: %d/%d bytes bit-exact (%.4f%%)\n",
           BYTES - diff, BYTES, 100.0 * (BYTES - diff) / BYTES);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}

/*
 * H.264 luma deblock, V variant, through the public API.
 *
 * Builds N_EDGES independent 16x16 tiles of random pixels with one edge
 * each, filters one copy with daedalus_h264_v_loop_filter_luma_ref and the
 * other with daedalus_recipe_dispatch_h264_deblock_luma_v, and compares
 * the buffers byte for byte.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure or any mismatch.
 */
static int test_deblock(void)
{
    /* One edge per 16x16 tile; the edge pointer sits at row EDGE_ROW,
     * column 0 of its tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_ROW = 4, EDGE_OFF = EDGE_ROW * TILE_STRIDE };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "deblock: daedalus_ctx_create failed\n");
        return 1;
    }

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    /* Per-edge parameters: alpha in 1..64, beta in 1..16, each tc0 entry
     * drawn from {-1, 0, 1, ..., 6}. */
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes tc0 as non-const; give it a private copy so
         * the meta handed to the dispatch stays untouched. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_v_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                              meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_v(ctx, dst, TILE_STRIDE,
                                                            N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i])
            diff++;
    }
    printf("  H.264 deblock luma v: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}

/*
 * H.264 luma deblock, H variant, through the public API.
 *
 * Mirror of test_deblock: random tiles, daedalus_h264_h_loop_filter_luma_ref
 * on one copy, daedalus_recipe_dispatch_h264_deblock_luma_h on the other,
 * byte-exact compare.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure or any mismatch.
 */
static int test_deblock_h(void)
{
    /* Per-tile layout is 8 cols x 16 rows (one vertical edge between cols
     * 3 and 4 of the tile); EDGE_COL = 4 puts dst_off at the leftmost
     * output column of the right block so the kernel's pix[-4..+3] read
     * sits inside the tile. */
    enum { N_EDGES = 8, TILE_STRIDE = 8, TILE_ROWS = 16,
           TILE_BYTES = TILE_STRIDE * TILE_ROWS,
           TOTAL = N_EDGES * TILE_BYTES, EDGE_COL = 4 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "deblock_h: daedalus_ctx_create failed\n");
        return 1;
    }

    uint8_t dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_deblock_meta meta[N_EDGES];

    for (int i = 0; i < TOTAL; i++)
        dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);

    /* Per-edge parameters: alpha in 1..64, beta in 1..16, each tc0 entry
     * drawn from {-1, 0, 1, ..., 6}. */
    for (int i = 0; i < N_EDGES; i++) {
        meta[i].dst_off = i * TILE_BYTES + EDGE_COL;
        meta[i].alpha = (int)(xs() % 64) + 1;
        meta[i].beta  = (int)(xs() % 16) + 1;
        for (int s = 0; s < 4; s++) {
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    for (int i = 0; i < N_EDGES; i++) {
        /* The reference takes tc0 as non-const; give it a private copy so
         * the meta handed to the dispatch stays untouched. */
        int8_t tc0_local[4] = { meta[i].tc0[0], meta[i].tc0[1], meta[i].tc0[2], meta[i].tc0[3] };
        daedalus_h264_h_loop_filter_luma_ref(dst_ref + meta[i].dst_off, TILE_STRIDE,
                                              meta[i].alpha, meta[i].beta, tc0_local);
    }

    int rc = daedalus_recipe_dispatch_h264_deblock_luma_h(ctx, dst, TILE_STRIDE,
                                                           N_EDGES, meta);
    if (rc) {
        fprintf(stderr, "deblock_h dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i])
            diff++;
    }
    printf("  H.264 deblock luma h: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}

/*
 * H.264 qpel mc20 (8x8 put) through the public API.
 *
 * Random source tiles, zeroed destinations; runs
 * daedalus_put_h264_qpel8_mc20_ref into one destination and
 * daedalus_recipe_dispatch_h264_qpel_mc20 into the other, then compares
 * the two destinations byte for byte.
 *
 * Returns 0 when every byte matches; 1 on context-creation failure,
 * dispatch failure or any mismatch.
 */
static int test_qpel_mc20(void)
{
    /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
     * holds rows 0..7; src[c-2..c+3] read via SRC_COL offset matches the
     * cycle-9 bench convention so the same C reference and NEON .S can
     * be compared. */
    enum { N = 8, TILE_STRIDE = 16, TILE_ROWS = 8,
           TILE_BYTES = TILE_ROWS * TILE_STRIDE, TOTAL = N * TILE_BYTES,
           SRC_COL = 3 };

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx) {
        fprintf(stderr, "qpel_mc20: daedalus_ctx_create failed\n");
        return 1;
    }

    uint8_t src[TOTAL], dst[TOTAL], dst_ref[TOTAL];
    daedalus_h264_qpel_meta meta[N];

    for (int i = 0; i < TOTAL; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    /* Both destinations start zeroed so bytes neither kernel writes still
     * compare equal. */
    memset(dst, 0, sizeof(dst));
    memset(dst_ref, 0, sizeof(dst_ref));

    /* Source and destination use the same in-tile offset. */
    for (int i = 0; i < N; i++) {
        meta[i].src_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
        meta[i].dst_off = (uint32_t)(i * TILE_BYTES + SRC_COL);
    }

    for (int i = 0; i < N; i++)
        daedalus_put_h264_qpel8_mc20_ref(dst_ref + meta[i].dst_off,
                                          src + meta[i].src_off,
                                          TILE_STRIDE);

    int rc = daedalus_recipe_dispatch_h264_qpel_mc20(ctx, dst, src,
                                                      TILE_STRIDE, N, meta);
    if (rc) {
        fprintf(stderr, "qpel_mc20 dispatch rc=%d\n", rc);
        daedalus_ctx_destroy(ctx);   /* do not leak the context on failure */
        return 1;
    }

    int diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (dst[i] != dst_ref[i])
            diff++;
    }
    printf("  H.264 qpel mc20: %d/%d bytes bit-exact (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    daedalus_ctx_destroy(ctx);
    return diff == 0 ? 0 : 1;
}

int main(void)
{
    printf("=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===\n");
    printf("  H264_IDCT4 recipe substrate:      %d (1=CPU, 2=QPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT4));
    printf("  H264_IDCT8 recipe substrate:      %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_IDCT8));
    printf("  H264_DEBLOCK_LV recipe substrate: %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV));
    printf("  H264_QPEL_MC20 recipe substrate:  %d\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_QPEL_MC20));

    printf("  H264_DEBLOCK_LH recipe substrate: %d (CPU, no QPU H shader yet)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LH));

    int fail = 0;
    fail |= test_idct4();
    fail |= test_idct8();
    fail |= test_deblock();
    fail |= test_deblock_h();
    fail |= test_qpel_mc20();
    return fail;
}