daedalus-fourier/tests/test_api_idct.c

/*
 * Phase 8 — first end-to-end test through the public API.
 *
 * Exercises `daedalus_dispatch_vp9_idct8` end-to-end:
 *   1. Create context.
 *   2. Generate random VP9 coefficient blocks + dst pixels.
 *   3. Compute reference output via the C ref (tests/vp9_idct8_ref.c).
 *   4. Run public API dispatch on a copy of dst.
 *   5. Assert bit-exact.
 *
 * In the Phase 8 skeleton, the API routes to CPU NEON (QPU dispatch
 * is not yet wired through the API).  The bit-exact gate against the
 * C ref still passes because the underlying NEON kernel was the
 * cycle 1 reference.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include "../include/daedalus.h"

/* C reference routine for the 8x8 IDCT-and-add, defined outside this file
 * and declared by hand here.  main() calls it once per block to build the
 * expected output surface.  Per the note in main(), it overwrites the
 * coefficient block it is given. */
extern void daedalus_vp9_idct_idct_8x8_add_ref(
    uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

/* Test surface geometry: a BLOCKS_W x BLOCKS_H grid of 8x8 blocks. */
#define BLOCKS_W 8
#define BLOCKS_H 8
/* Total number of blocks in the grid. */
#define N_BLOCKS (BLOCKS_W * BLOCKS_H)
/* Bytes per row of the destination surface (8 bytes per block column). */
#define DST_STRIDE (BLOCKS_W * 8)
/* Size of the whole destination surface in bytes (8 rows per block row). */
#define DST_BYTES (BLOCKS_H * 8 * DST_STRIDE)

/* Generator state.  The seed is fixed, so every run of the test produces
 * the same sequence of values. */
static uint64_t xs_state = 0xa57edbeef5717ULL;

/* Advance the generator one step (xor-shift by 13 left, 7 right, 17 left),
 * store the result back into xs_state and return it. */
static inline uint64_t xs(void) {
    uint64_t s = xs_state;

    s = s ^ (s << 13);
    s = s ^ (s >> 7);
    s = s ^ (s << 17);

    xs_state = s;
    return s;
}

/*
 * Run one dispatch through the public API and compare it with the
 * precomputed reference surface.
 *
 *   force        substrate selector passed straight to the dispatch call
 *   coeffs       coefficient data for N_BLOCKS blocks
 *   meta         per-block metadata, N_BLOCKS entries
 *   dst_initial  starting contents of the destination surface (DST_BYTES)
 *   dst_ref      expected contents after dispatch (DST_BYTES)
 *   label        tag printed in the log lines
 *
 * Returns 0 when every byte matches, or when the QPU was requested but the
 * context reports none (treated as a skip).  Returns 1 if the context could
 * not be created, the dispatch returned non-zero, or any byte differs.
 */
static int run_once(daedalus_substrate force,
                    const int16_t *coeffs,
                    const daedalus_idct8_meta *meta,
                    const uint8_t *dst_initial,
                    const uint8_t *dst_ref,
                    const char *label)
{
    uint8_t work[DST_BYTES];
    int status = 0;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (ctx == NULL) {
        fprintf(stderr, "ctx create failed\n");
        return 1;
    }

    const int qpu_present = daedalus_ctx_has_qpu(ctx);
    printf("  [%s] has_qpu=%d force=%d\n", label, qpu_present, (int) force);

    /* A forced-QPU run on a host without a QPU is a skip, not a failure. */
    if (force == DAEDALUS_SUBSTRATE_QPU && !qpu_present) {
        printf("    SKIP — QPU unavailable on this host\n");
        goto done;
    }

    /* Dispatch works on a private copy so the caller's buffer stays pristine. */
    memcpy(work, dst_initial, DST_BYTES);
    const int rc = daedalus_dispatch_vp9_idct8(ctx, force, work, DST_STRIDE,
                                               coeffs, N_BLOCKS, meta);
    if (rc != 0) {
        fprintf(stderr, "    dispatch rc=%d\n", rc);
        status = 1;
        goto done;
    }

    int mismatches = 0;
    for (int k = 0; k < DST_BYTES; k++) {
        mismatches += (work[k] != dst_ref[k]);
    }

    const int matched = DST_BYTES - mismatches;
    printf("    %d / %d bytes bit-exact (%.4f%%)\n",
           matched, DST_BYTES, 100.0 * matched / DST_BYTES);
    status = (mismatches != 0);

done:
    daedalus_ctx_destroy(ctx);
    return status;
}

int main(void)
{
    printf("=== Phase 8 API smoke: VP9 IDCT 8x8 via recipe dispatch ===\n");
    printf("  recipe substrate for VP9_IDCT8: %d (1=CPU, 2=QPU)\n",
           (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_VP9_IDCT8));

    /* Generate random VP9 IDCT inputs: 64-coef blocks + a dst surface. */
    int16_t coeffs[N_BLOCKS * 64];
    memset(coeffs, 0, sizeof(coeffs));
    for (int i = 0; i < N_BLOCKS; i++) {
        /* Sparse non-zero coefs to keep range realistic. */
        int n = 1 + (int)(xs() % 16);
        for (int j = 0; j < n; j++) {
            int pos = (int)(xs() % 64);
            int16_t v = (int16_t)((int)(xs() % 8192) - 4096);
            coeffs[i * 64 + pos] = v;
        }
    }

    uint8_t dst_ref[DST_BYTES], dst_initial[DST_BYTES];
    for (int i = 0; i < DST_BYTES; i++)
        dst_ref[i] = dst_initial[i] = (uint8_t)(xs() & 0xff);

    /* 8x8 grid of 8x8 blocks. Block (bx, by) at byte offset
     * by*8*stride + bx*8. */
    daedalus_idct8_meta meta[N_BLOCKS];
    for (int by = 0; by < BLOCKS_H; by++) {
        for (int bx = 0; bx < BLOCKS_W; bx++) {
            int i = by * BLOCKS_W + bx;
            meta[i].dst_off = (uint32_t)(by * 8 * DST_STRIDE + bx * 8);
            meta[i].block_x = (uint32_t) bx;
            meta[i].block_y = (uint32_t) by;
            meta[i]._pad = 0;
        }
    }

    /* Compute reference via the C ref (mutates a scratch copy of
     * coeffs because the C ref destroys its input). */
    int16_t scratch[64];
    for (int i = 0; i < N_BLOCKS; i++) {
        memcpy(scratch, coeffs + i * 64, 64 * sizeof(int16_t));
        daedalus_vp9_idct_idct_8x8_add_ref(dst_ref + meta[i].dst_off,
                                              DST_STRIDE, scratch, 64);
    }

    int fail = 0;
    fail |= run_once(DAEDALUS_SUBSTRATE_CPU, coeffs, meta, dst_initial, dst_ref, "CPU");
    fail |= run_once(DAEDALUS_SUBSTRATE_QPU, coeffs, meta, dst_initial, dst_ref, "QPU");
    fail |= run_once(DAEDALUS_SUBSTRATE_AUTO, coeffs, meta, dst_initial, dst_ref, "AUTO");
    return fail;
}