#!/usr/bin/env python3
"""tripwire_diff.py — PC-bucketed sequence diff of two tripwire CSVs.

Per Janet (2026-04-21): cross-index diff is destroyed the moment control
flow diverges (bitflip mode guarantees this). Group records by fn_name,
diff per bucket with difflib.SequenceMatcher. Long edit-distance buckets
get tagged SUSPECT and emit a raw side-by-side sub-sequence for human
triage — we do not try to auto-resolve.

Usage:
    tripwire_diff.py vendor.csv rebuilt.csv [--suspect-threshold 0.9]
                     [--show-identical] [--limit-per-bucket N]

Key for each record inside a bucket (tunable): region + addr + rw + val + size.
PC is excluded because codegen reg-alloc can shift individual load/store
PCs within a function without changing behavior. `seq` and `tick` are
excluded because they drift with any upstream path difference.
"""
import argparse
import difflib
import sys
from collections import defaultdict

from sim_tripwire import load_csv


def bucket_key(rec):
    """Build the comparison key for a record within one function bucket.

    The key is the tuple (region, addr, rw, val, size) read from `rec`.
    Other fields of the record (such as pc, seq) are deliberately left
    out of the key, so they never influence whether two records match.
    """
    key_fields = ("region", "addr", "rw", "val", "size")
    return tuple(rec[field] for field in key_fields)


def bucket_by_fn(records):
    """Group records by their "fn" field, preserving input order per group.

    Returns a defaultdict(list) mapping each fn value to the list of
    records that carry it, in the order they appeared in `records`.
    """
    grouped = defaultdict(list)
    for record in records:
        fn_records = grouped[record["fn"]]
        fn_records.append(record)
    return grouped


def ratio(seq_a, seq_b):
    """Similarity of two key sequences in [0, 1], cheapest checks first.

    Two empty sequences score 1.0, exactly one empty sequence scores 0.0,
    and equal sequences score 1.0 without building a matcher. Otherwise
    SequenceMatcher.quick_ratio() is consulted first; when that upper bound
    is already below 0.5 it is returned as-is (so the result is then an
    over-estimate of the true ratio) and the full ratio() is skipped.
    """
    if not (seq_a or seq_b):
        return 1.0
    if not (seq_a and seq_b):
        return 0.0
    if seq_a == seq_b:
        return 1.0

    matcher = difflib.SequenceMatcher(None, seq_a, seq_b, autojunk=False)
    # quick_ratio() is an upper bound on ratio(); a low bound is enough to
    # reject the pair without paying for the full alignment.
    upper_bound = matcher.quick_ratio()
    return upper_bound if upper_bound < 0.5 else matcher.ratio()


def _sm_cache(va, vb):
    """Return (key_a, key_b, cached SequenceMatcher) once, reuse for opcodes."""
    ka = [bucket_key(r) for r in va]
    kb = [bucket_key(r) for r in vb]
    if ka == kb:
        return ka, kb, None
    sm = difflib.SequenceMatcher(a=ka, b=kb, autojunk=False)
    return ka, kb, sm


def fmt_rec(rec):
    """Render one record as a single fixed-layout report line.

    Layout: region left-aligned in an 11-character column, rw, the address
    as zero-padded 8-digit hex, size, value in hex, and the pc in hex
    inside parentheses.
    """
    region_col = format(rec["region"], "11s")
    access = rec["rw"]
    addr_hex = format(rec["addr"], "08x")
    size = rec["size"]
    val_hex = format(rec["val"], "x")
    pc_hex = format(rec["pc"], "x")
    location = f"{region_col} {access} 0x{addr_hex}"
    payload = f"sz={size} val=0x{val_hex}  (pc=0x{pc_hex})"
    return location + " " + payload


def diff_bucket(name, va, vb, limit, show_identical):
    """Diff one function bucket and render its report fragment.

    Args:
        name: bucket label printed at the start of the header line.
        va: vendor-side records of the bucket, in trace order.
        vb: rebuilt-side records of the bucket, in trace order.
        limit: maximum number of insert/delete lines to render.
        show_identical: when True, identical buckets still get a one-line
            report instead of None.

    Returns:
        A (status, report) pair. `status` is "OK    " when the keyed
        sequences are equal, otherwise the similarity formatted as
        "{ratio:.3f} ". `report` is None for an identical bucket unless
        `show_identical` is set; for a differing bucket it is a header
        line followed by "-" (vendor only) and "+" (rebuilt only) lines.
    """
    ka = [bucket_key(r) for r in va]
    kb = [bucket_key(r) for r in vb]

    if ka == kb:
        # Identical buckets need neither a ratio nor a matcher.
        status = "OK    "
        if not show_identical:
            return status, None
        return status, (f"{name:22s} OK    {len(va)} records match "
                        "(showing on --show-identical)")

    status = f"{ratio(ka, kb):.3f} "
    sm = difflib.SequenceMatcher(a=ka, b=kb, autojunk=False)
    lines = [f"{name:22s} {status}  "
             f"vendor={len(va):5d}  rebuilt={len(vb):5d}"]
    shown = 0
    truncated = False
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == "equal":
            continue
        # Vendor-side removals first, then rebuilt-side additions.
        pending = [("-", "V", rec) for rec in va[i1:i2]]
        pending += [("+", "R", rec) for rec in vb[j1:j2]]
        for sign, side, rec in pending:
            if shown >= limit:
                # Only flag truncation when a record is actually dropped;
                # hitting the limit exactly on the last diff line is not
                # a truncation.
                truncated = True
                break
            lines.append(f"  {sign} [{side}#{rec['seq']:5d}] {fmt_rec(rec)}")
            shown += 1
        if truncated:
            break
    if truncated:
        lines.append(f"  ... (truncated at {limit} per bucket)")
    return status, "\n".join(lines)


def _render_edit_script(sm, va, rs, limit):
    """Render at most `limit` insert/delete lines from a matcher's opcodes.

    Returns (lines, truncated). `truncated` is True only when at least one
    differing record was left out because the limit had been reached.
    """
    lines = []
    for op_tag, i1, i2, j1, j2 in sm.get_opcodes():
        if op_tag == "equal":
            continue
        # Vendor-side removals first, then rebuilt-side additions.
        pending = [("-", "V", rec) for rec in va[i1:i2]]
        pending += [("+", "R", rec) for rec in rs[j1:j2]]
        for sign, side, rec in pending:
            if len(lines) >= limit:
                return lines, True
            lines.append(f"  {sign} [{side}#{rec['seq']:5d}] {fmt_rec(rec)}")
    return lines, False


def main():
    """CLI entry point: diff two tripwire CSVs bucket by bucket.

    Loads both files with load_csv, groups each side by function, and
    compares every function seen on either side. Buckets whose ratio is
    below --suspect-threshold are printed first (lowest ratio first) under
    a SUSPECT heading; remaining differing buckets, and identical ones
    when --show-identical is given, follow in function-name order.

    Returns:
        1 if any bucket was SUSPECT, otherwise 0 (used as the exit code).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("vendor")
    ap.add_argument("rebuilt")
    ap.add_argument("--suspect-threshold", type=float, default=0.9,
                    help="buckets with ratio below this get SUSPECT tag")
    ap.add_argument("--show-identical", action="store_true")
    ap.add_argument("--limit-per-bucket", type=int, default=20,
                    help="max insert/delete lines per bucket in the report")
    args = ap.parse_args()

    vrecs = load_csv(args.vendor)
    rrecs = load_csv(args.rebuilt)
    print(f"# vendor:  {len(vrecs):6d} records ({args.vendor})")
    print(f"# rebuilt: {len(rrecs):6d} records ({args.rebuilt})")

    vb = bucket_by_fn(vrecs)
    rb = bucket_by_fn(rrecs)
    fns = sorted(set(vb) | set(rb))

    print(f"# buckets: {len(fns)} functions touched across either side")
    print()
    ok = diff = 0
    reports = []    # identical (optional) and minor-diff buckets, fn order
    suspects = []   # (ratio, fn, report) so sorting puts the worst first
    for fn in fns:
        # A function present on one side only diffs against an empty list.
        va = vb.get(fn, [])
        rs = rb.get(fn, [])
        _, _, sm = _sm_cache(va, rs)
        if sm is None:
            ok += 1
            if args.show_identical:
                reports.append(f"{fn:22s} OK    {len(va)} records")
            continue

        # quick_ratio() is an upper bound on ratio(); when it is already
        # low, use it directly and skip the expensive full ratio.
        qr = sm.quick_ratio()
        r = qr if qr < 0.5 else sm.ratio()
        is_suspect = r < args.suspect_threshold
        tag = "SUSPECT" if is_suspect else "       "

        # Every differing bucket carries the tag and ratio, so SUSPECT
        # entries show the score they were ranked by.
        lines = [f"[{tag}] r={r:.3f}  "
                 f"{fn:22s}   vendor={len(va):5d}  rebuilt={len(rs):5d}"]
        edit_lines, truncated = _render_edit_script(
            sm, va, rs, args.limit_per_bucket)
        lines.extend(edit_lines)
        if truncated:
            lines.append(f"  ... (truncated at {args.limit_per_bucket})")
        rep = "\n".join(lines)

        if is_suspect:
            suspects.append((r, fn, rep))
        else:
            diff += 1
            reports.append(rep)

    susp = len(suspects)
    print(f"# OK: {ok}   minor-diff: {diff}   SUSPECT(<{args.suspect_threshold}): {susp}")
    print()
    if suspects:
        # fn is unique per entry, so tuple comparison never reaches the text.
        suspects.sort()
        print(f"## SUSPECT BUCKETS ({len(suspects)}) — human triage required")
        print()
        for _, fn, rep in suspects:
            print(rep)
            print()
    # Suspects live in their own list, so everything in `reports` is safe
    # to print as-is; no text-based filtering is needed.
    for line in reports:
        print(line)

    return 0 if not suspects else 1


# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())