/*
 * Phase 8b — opportunistic-QPU dispatch paths through public API.
 *
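 * Verifies that cycles 3 (VP9 MC) and 8 (H.264 deblock) can be
 * force-routed to QPU via daedalus_dispatch_*(QPU, ...) and produce
 * bit-exact output vs the CPU path (which is the C ref proxy for each
 * kernel — see per-cycle Phase 7 docs). Cycle 5 (AV1 CDEF) belongs to
 * the same phase but is not exercised here; bench_v3d_cdef.c is the
 * authoritative gate for the QPU CDEF path.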
 *
 * AUTO/recipe path stays on CPU for these kernels — that's the
 * deployment shape. This test exercises the override-mode path
 * the integration layer would use for runtime-aware scheduling.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include "../include/daedalus.h"

/* Generator state; fixed seed so every run draws the same test vectors. */
static uint64_t xs_state = 0xab10b81cULL;

/*
 * Xorshift PRNG step (shift triple 13 / 7 / 17).
 * Advances xs_state and returns the new 64-bit state.
 */
static inline uint64_t xs(void)
{
    uint64_t v = xs_state;

    v = v ^ (v << 13);
    v = v ^ (v >> 7);
    v = v ^ (v << 17);

    xs_state = v;
    return v;
}

/*
 * VP9 MC check: dispatches the same N blocks through
 * daedalus_dispatch_vp9_mc_8h() on the CPU substrate and on the forced
 * QPU substrate, then compares the two destination buffers byte-for-byte.
 *
 * Returns 0 when the compared bytes are identical or when the context
 * reports no QPU (test skipped); returns 1 on any mismatch, or when the
 * context or a buffer could not be allocated. Every exit after context
 * creation goes through the single cleanup path, so nothing leaks on
 * failure.
 */
static int test_mc(void)
{
    enum { N = 32, DST_STRIDE = 16, DST_ROWS = 8 * 4, DST_BYTES = DST_ROWS * DST_STRIDE,
           SRC_STRIDE = 16, SRC_ROWS = 12, SRC_BYTES = SRC_ROWS * SRC_STRIDE * N,
           BLK_STRIDE = 8,                    /* dst stride handed to the dispatcher */
           BLK_BYTES = 8 * BLK_STRIDE,        /* 8 rows x 8 cols = 64 bytes per block */
           CMP_BYTES = N * BLK_BYTES };       /* region compared between substrates */
    int rc = 1;
    int diff = 0;
    uint8_t *src = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_mc_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx)
        return 1;
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf("  VP9 MC: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    /* Allocate per-block src tiles (12 rows x 16 cols each). The dst
     * buffers are zero-filled and sized DST_BYTES * N, which is larger
     * than the CMP_BYTES region compared below. */
    src = malloc(SRC_BYTES);
    dst_cpu = calloc(1, DST_BYTES * N);
    dst_qpu = calloc(1, DST_BYTES * N);
    meta = calloc(N, sizeof(*meta));
    if (!src || !dst_cpu || !dst_qpu || !meta) {
        fprintf(stderr, "  VP9 MC: allocation failed\n");
        goto out;
    }

    /* PRNG draw order matters for reproducibility: src bytes first,
     * then one mx value per block. */
    for (size_t i = 0; i < (size_t)SRC_BYTES; i++)
        src[i] = (uint8_t)(xs() & 0xff);
    for (int i = 0; i < N; i++) {
        meta[i].dst_off = i * BLK_BYTES;
        meta[i].src_off = i * SRC_STRIDE * SRC_ROWS;         /* RAW src offset; shader handles -3 */
        meta[i].mx = (int)(xs() & 15);
    }

    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, BLK_STRIDE, src, SRC_STRIDE, N, meta);
    daedalus_dispatch_vp9_mc_8h(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, BLK_STRIDE, src, SRC_STRIDE, N, meta);

    for (int i = 0; i < CMP_BYTES; i++) {
        if (dst_cpu[i] != dst_qpu[i])
            diff++;
    }
    printf("  VP9 MC (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           CMP_BYTES - diff, CMP_BYTES, 100.0 * (CMP_BYTES - diff) / CMP_BYTES);

    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so partially completed setup is safe here. */
    free(src);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}

/*
 * H.264 luma deblock check: runs daedalus_dispatch_h264_deblock_luma_v()
 * over N tiles on the CPU substrate and on the forced QPU substrate,
 * each working in place on its own copy of the same random input, then
 * compares the two buffers byte-for-byte over their full length.
 *
 * Returns 0 when the buffers are identical or when the context reports
 * no QPU (test skipped); returns 1 on any mismatch, or when the context
 * or a buffer could not be allocated. Every exit after context creation
 * goes through the single cleanup path, so nothing leaks on failure.
 */
static int test_deblock(void)
{
    enum { N = 8, TILE_STRIDE = 16, TILE_BYTES = 16 * TILE_STRIDE,
           TOTAL = N * TILE_BYTES, EDGE_OFF = 4 * TILE_STRIDE };
    int rc = 1;
    int diff = 0;
    uint8_t *master = NULL;
    uint8_t *dst_cpu = NULL;
    uint8_t *dst_qpu = NULL;
    daedalus_h264_deblock_meta *meta = NULL;

    daedalus_ctx *ctx = daedalus_ctx_create();
    if (!ctx)
        return 1;
    if (!daedalus_ctx_has_qpu(ctx)) {
        printf("  H.264 deblock: SKIP (no QPU)\n");
        rc = 0;
        goto out;
    }

    master  = malloc(TOTAL);
    dst_cpu = malloc(TOTAL);
    dst_qpu = malloc(TOTAL);
    meta = calloc(N, sizeof(*meta));
    if (!master || !dst_cpu || !dst_qpu || !meta) {
        fprintf(stderr, "  H.264 deblock: allocation failed\n");
        goto out;
    }

    /* Both substrates start from identical pixels, so the full-buffer
     * comparison below also covers bytes neither path should touch. */
    for (int i = 0; i < TOTAL; i++)
        master[i] = (uint8_t)(xs() & 0xff);
    memcpy(dst_cpu, master, TOTAL);
    memcpy(dst_qpu, master, TOTAL);

    for (int i = 0; i < N; i++) {
        /* Edge sits 4 rows into each 16x16 tile. */
        meta[i].dst_off = i * TILE_BYTES + EDGE_OFF;
        meta[i].alpha = (int)(xs() % 64) + 1;   /* 1..64 */
        meta[i].beta  = (int)(xs() % 16) + 1;   /* 1..16 */
        for (int s = 0; s < 4; s++) {
            /* tc0 is -1 for one draw in eight, otherwise 0..6.
             * NOTE(review): -1 presumably marks a segment left unfiltered —
             * confirm against the kernel. */
            int r = (int)(xs() % 8);
            meta[i].tc0[s] = (int8_t)(r == 0 ? -1 : (r - 1));
        }
    }

    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_CPU, dst_cpu, TILE_STRIDE, N, meta);
    daedalus_dispatch_h264_deblock_luma_v(ctx, DAEDALUS_SUBSTRATE_QPU, dst_qpu, TILE_STRIDE, N, meta);

    for (int i = 0; i < TOTAL; i++) {
        if (dst_cpu[i] != dst_qpu[i])
            diff++;
    }
    printf("  H.264 deblock (CPU vs QPU): %d/%d bytes match (%.4f%%)\n",
           TOTAL - diff, TOTAL, 100.0 * (TOTAL - diff) / TOTAL);

    rc = (diff == 0) ? 0 : 1;

out:
    /* free(NULL) is a no-op, so partially completed setup is safe here. */
    free(master);
    free(dst_cpu);
    free(dst_qpu);
    free(meta);
    daedalus_ctx_destroy(ctx);
    return rc;
}

/*
 * Test driver: prints the phase banner, runs the VP9 MC and H.264
 * deblock comparisons in that order, and exits non-zero if either
 * reported a failure.
 */
int main(void)
{
    printf("=== Phase 8b: opportunistic-QPU paths through API ===\n");

    /* Both tests always run; a failure in the first does not skip the second. */
    const int mc_rc = test_mc();
    const int deblock_rc = test_deblock();

    /* CDEF skipped here — tmp construction in C ref differs subtly
     * from dav1d NEON's; bench_v3d_cdef.c is the authoritative gate
     * for the QPU CDEF path. */
    return mc_rc | deblock_rc;
}