# rk3588-ddr-analysis/patch_timeouts_v3.py

#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v3 — counted-loop trampolines with SITE SUBSET.

v2 BUG identified 2026-04-15: the original patcher copied at most ONE
body instruction beyond the LDR into the trampoline. For poll patterns of
form  `LDR; AND; CMP; B.cond`  (5 of the 16 sites), the CMP was dropped.
The trampoline's `B.inv_cond` then tested stale flags → random branch →
brick.

v3 fix: copy the **full** original loop body (LDR + every intermediate
instruction, in original order) into the trampoline, so the last
flag-setting instruction matches the original loop.

New trampoline layout (N = len(body) — usually 2 or 3):
    +0:           MOV   W16, #TIMEOUT
    +4   ..   +4N: <body[0] .. body[N-1]>    ; LDR + all test instructions
    +4N+4:        B.inv_cond .done
    +4N+8:        SUBS  W16, W16, #1
    +4N+12:       B.NE .retry (back to +4)
    +4N+16:       B return_addr       ; timeout path
  .done:
    +4N+20:       B return_addr       ; success path

Size per trampoline: 4 * (N + 6) bytes (N body words plus 6 fixed
instructions). A 2-inst body takes 4*8 = 32 bytes (same as v2); a 3-inst
body takes 4*9 = 36 bytes.

Site subset flags:
    --sites all | early | mid | late | none
    --sites 0,3,5-7                        # explicit index list

Site index is stable (determined by find_poll_loops' ascending-offset
ordering). As of rk3588_ddr_v1.19:
    0..7   EARLY cluster (0x07b78..0x07f08) — SGRF + PHY firmware fsm
    8..10  MID cluster   (0x09124..0x0aaf8) — DfiStatus / training start
    11..15 LATE cluster  (0x0d154..0x0d378) — UctWriteProt / CalBusy / late
"""

import struct
import sys
import argparse


# Iteration budget loaded into the countdown register at trampoline entry.
TIMEOUT_ITERATIONS = 1 << 14
# Register number used for the countdown (W16 in the trampoline layout).
COUNTER_REG = 16

# Condition mnemonics, indexed by the 4-bit cond field of a B.cond word.
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Named site-index groups accepted by --sites (see the module docstring for
# the blob offsets each cluster corresponds to).
CLUSTERS = {
    'early': [*range(8)],
    'mid':   [*range(8, 11)],
    'late':  [*range(11, 16)],
    'all':   [*range(16)],
    'none':  [],
}


def encode_movz(rd, imm16, shift=0):
    """Encode a 32-bit ``MOVZ Wd, #imm16, LSL #shift`` instruction word.

    Args:
        rd: Destination register number, 0-31.
        imm16: Unsigned 16-bit immediate, 0-0xFFFF.
        shift: Left shift applied to the immediate; 0 or 16.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If a field does not fit its bit-field. Without this
            check an oversized immediate would spill into the neighbouring
            ``hw``/opcode bits and silently encode a different instruction.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'register number out of range: {rd!r}')
    if not 0 <= imm16 <= 0xFFFF:
        raise ValueError(f'imm16 out of range: {imm16!r}')
    if shift not in (0, 16):
        raise ValueError(f'shift must be 0 or 16 for a W register: {shift!r}')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd

def encode_subs_imm(rd, rn, imm12):
    """Encode a 32-bit ``SUBS Wd, Wn, #imm12`` instruction word.

    Args:
        rd: Destination register number, 0-31.
        rn: Source register number, 0-31.
        imm12: Unsigned 12-bit immediate, 0-0xFFF (unshifted form).

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If a field does not fit its bit-field, which would
            otherwise corrupt adjacent fields of the encoding.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'rd out of range: {rd!r}')
    if not 0 <= rn <= 31:
        raise ValueError(f'rn out of range: {rn!r}')
    if not 0 <= imm12 <= 0xFFF:
        raise ValueError(f'imm12 out of range: {imm12!r}')
    return 0x71000000 | (imm12 << 10) | (rn << 5) | rd

def encode_b(from_offset, to_offset):
    """Encode an unconditional ``B`` from *from_offset* to *to_offset*.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int (opcode 0x14000000 | imm26).

    Raises:
        ValueError: If the distance is not a multiple of 4 or does not fit
            the signed 26-bit word offset. Previously such a distance was
            floored or wrapped silently, yielding a branch to the wrong
            place.
    """
    distance = to_offset - from_offset
    if distance % 4:
        raise ValueError(
            f'branch distance {distance} is not a multiple of 4')
    delta = distance // 4
    if not -(1 << 25) <= delta < (1 << 25):
        raise ValueError(f'branch distance {distance} exceeds B range')
    # Two's-complement truncation handles negative deltas as well.
    return 0x14000000 | (delta & 0x3FFFFFF)

def encode_bcond(cond, from_offset, to_offset):
    """Encode a conditional ``B.cond`` from *from_offset* to *to_offset*.

    Args:
        cond: 4-bit condition code (index into COND_NAMES), 0-15.
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int (0x54000000 | imm19 << 5 | cond).

    Raises:
        ValueError: If *cond* is not a 4-bit value, or the distance is not
            a multiple of 4 or does not fit the signed 19-bit word offset.
            Previously such inputs were truncated silently.
    """
    if not 0 <= cond <= 0xF:
        raise ValueError(f'condition code out of range: {cond!r}')
    distance = to_offset - from_offset
    if distance % 4:
        raise ValueError(
            f'branch distance {distance} is not a multiple of 4')
    delta = distance // 4
    if not -(1 << 18) <= delta < (1 << 18):
        raise ValueError(f'branch distance {distance} exceeds B.cond range')
    imm19 = delta & 0x7FFFF
    return 0x54000000 | (imm19 << 5) | cond

def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` from *from_offset* to *to_offset*.

    Thin wrapper over encode_bcond with the NE condition (code 1, the
    index of 'NE' in COND_NAMES).
    """
    ne_code = 0b0001
    word = encode_bcond(ne_code, from_offset, to_offset)
    return word

def invert_cond(cond):
    """Return the condition code paired with *cond* (lowest bit flipped).

    Codes come in even/odd pairs in COND_NAMES (EQ/NE, CS/CC, ...), so
    flipping bit 0 selects the other member of the pair. Note that codes
    14 and 15 ('AL'/'NV') simply map onto each other.
    """
    if cond & 1:
        return cond - 1
    return cond + 1


def find_poll_loops(blob):
    """Find tight backward-branch poll loops of form <body>; B.cond back.

    A site is reported for every ``B.cond`` word whose target lies 1 to 4
    instructions before it and whose loop body contains at least one
    unsigned-offset ``LDR`` (32- or 64-bit).

    Args:
        blob: Bytes-like object holding little-endian 32-bit instruction
            words, scanned at 4-byte strides from offset 0.

    Returns:
        A list of dicts in ascending branch-offset order, each with:
            'idx'            position of the site in the returned list,
            'branch_offset'  byte offset of the B.cond word,
            'cond'           its 4-bit condition code,
            'body'           the instruction words from the loop start up
                             to (excluding) the branch.
    """
    sites = []
    # NOTE(review): the scan stops 12 bytes short of the end, so a branch in
    # the last three words is never reported. The reason is not documented;
    # the bound is kept as-is so site indices stay stable.
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]
        # B.cond: top byte 0x54 and bit 4 clear.
        if (inst & 0xFF000010) != 0x54000000:
            continue
        imm19 = (inst >> 5) & 0x7FFFF
        if not (imm19 & 0x40000):
            continue  # sign bit clear -> forward branch, not a loop
        offset = (imm19 - (1 << 19)) * 4  # sign-extend, words -> bytes
        if not (-16 <= offset <= -4):
            continue

        loop_start = i + offset
        if loop_start < 0:
            # Target lies before the start of the blob. struct.unpack_from
            # would interpret a negative offset as relative to the END of
            # the buffer and hand back unrelated words as the loop body.
            continue

        # Body = every instruction from loop_start up to (but excluding)
        # the branch.
        body = [struct.unpack_from('<I', blob, j)[0]
                for j in range(loop_start, i, 4)]

        # Unsigned-offset LDR. The mask clears bit 30 (the 32/64-bit size
        # selector), so both the W and X forms compare equal to 0xB9400000.
        if not any((w & 0xBFC00000) == 0xB9400000 for w in body):
            continue  # require an LDR — filters out pure cmp/b.ne spin loops

        sites.append({
            'idx': len(sites),
            'branch_offset': i,
            'cond': inst & 0xF,
            'body': body,
        })
    return sites


def build_trampoline(site, tramp_offset, return_offset):
    """Assemble the counted-retry trampoline for one poll site.

    The emitted sequence is: load the countdown register, replay the
    site's whole loop body, leave via ``.done`` when the inverted loop
    condition holds, otherwise decrement and retry from the first body
    instruction; once the counter reaches zero, fall through to a branch
    back to the original code.

    Args:
        site: Site dict providing 'body' (instruction words) and 'cond'.
        tramp_offset: Byte offset at which the trampoline will be placed.
        return_offset: Byte offset both exit branches jump to.

    Returns:
        List of len(body) + 6 instruction words.
    """
    loop_words = site['body']
    count = len(loop_words)

    def slot(index):
        # Byte offset of the index-th instruction of this trampoline.
        return tramp_offset + 4 * index

    done_at = slot(count + 5)

    # Slot 0: MOVZ W16, #TIMEOUT
    head = [encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)]

    tail = [
        # Slot N+1: B.inv_cond -> .done (loop condition no longer holds)
        encode_bcond(invert_cond(site['cond']), slot(count + 1), done_at),
        # Slot N+2: SUBS W16, W16, #1
        encode_subs_imm(COUNTER_REG, COUNTER_REG, 1),
        # Slot N+3: B.NE -> retry at slot 1, the first body instruction
        encode_bne(slot(count + 3), slot(1)),
        # Slot N+4: B return_addr (timeout fallthrough)
        encode_b(slot(count + 4), return_offset),
        # Slot N+5 (.done): B return_addr (success)
        encode_b(done_at, return_offset),
    ]

    # Slots 1..N carry the original body, in its original order.
    return head + list(loop_words) + tail


def parse_sites(spec, max_index):
    """Turn a ``--sites`` specification into a sorted list of site indices.

    *spec* is a comma-separated list whose items may be a cluster name from
    CLUSTERS ('all', 'early', ...), a single index ('3') or an inclusive
    range ('5-7'). Empty items are ignored.

    Args:
        spec: The specification string.
        max_index: Highest valid site index; anything outside
            0..max_index is dropped from the result.

    Returns:
        A new sorted list of unique indices. A bare cluster name is
        filtered by *max_index* like every other form, and the lists
        stored in CLUSTERS are never handed out directly, so callers
        cannot mutate the shared table.

    Raises:
        ValueError: If an item is neither a cluster name, an integer nor
            an ``a-b`` range of integers.
    """
    selected = set()
    for item in spec.split(','):
        item = item.strip()
        if not item:
            continue
        if item in CLUSTERS:
            selected.update(CLUSTERS[item])
            continue
        first, dash, last = item.partition('-')
        try:
            if dash:
                selected.update(range(int(first), int(last) + 1))
            else:
                selected.add(int(first))
        except ValueError:
            raise ValueError(
                f'invalid site spec {item!r}: expected a cluster name, '
                f"an index, or a range like '5-7'") from None
    return sorted(i for i in selected if 0 <= i <= max_index)


def patch_blob(inpath, outpath, site_indices):
    """Patch the selected poll sites of a blob and write the result.

    For each selected site a trampoline is appended to the end of the blob
    and the site's backward ``B.cond`` is overwritten with an unconditional
    branch to that trampoline.

    Args:
        inpath: Path of the blob to read.
        outpath: Path the patched blob is written to.
        site_indices: Iterable of site indices (as numbered by
            find_poll_loops). Out-of-range and repeated indices are
            ignored.

    Returns:
        Tuple ``(number_of_patched_sites, final_blob_size_in_bytes)``.
        When no poll loop is found at all, nothing is written and
        ``(0, size)`` is returned; when no valid site is selected the
        unmodified blob is written.
    """
    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())

    orig_size = len(blob)
    all_sites = find_poll_loops(blob)
    if not all_sites:
        print('No poll loops found!', file=sys.stderr)
        return 0, len(blob)

    # Keep the caller's order but drop invalid indices (a negative index
    # would silently address a site from the end) and duplicates (which
    # would emit a second, orphaned trampoline for the same site).
    site_indices = list(dict.fromkeys(
        i for i in site_indices if 0 <= i < len(all_sites)))
    if not site_indices:
        print('No sites selected — writing unmodified blob.')
        with open(outpath, 'wb') as f:
            f.write(blob)
        return 0, len(blob)

    # Instructions must sit on 4-byte boundaries; pad first in case the
    # input length is not a multiple of 4.
    blob.extend(b'\x00' * (-len(blob) % 4))

    # Per-site trampoline sizes vary with body length: N body words plus
    # the 6 fixed instructions emitted by build_trampoline.
    tramp_sizes = [4 * (len(all_sites[i]['body']) + 6) for i in site_indices]

    tramp_offset = len(blob)
    blob.extend(bytes(sum(tramp_sizes)))

    patches = []
    for idx, slot_size in zip(site_indices, tramp_sizes):
        site = all_sites[idx]
        branch_offset = site['branch_offset']
        # Both trampoline exits resume at the instruction after the loop.
        return_offset = branch_offset + 4
        tramp_code = build_trampoline(site, tramp_offset, return_offset)
        for j, inst in enumerate(tramp_code):
            struct.pack_into('<I', blob, tramp_offset + j * 4, inst)
        # Redirect the original loop branch into the trampoline.
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))
        patches.append({
            'idx': idx,
            'addr': branch_offset,
            'cond': COND_NAMES[site['cond']],
            'body_len': len(site['body']),
            'trampoline': tramp_offset,
        })
        tramp_offset += slot_size

    with open(outpath, 'wb') as f:
        f.write(blob)

    total_sites = len(all_sites)
    print(f'Patched {len(patches)}/{total_sites} poll sites  (full-body trampolines):')
    print(f'  Timeout: {TIMEOUT_ITERATIONS} iter (~{TIMEOUT_ITERATIONS*10/1800:.0f}us @1.8GHz)')
    for p in patches:
        print(f'  site {p["idx"]:2d}: 0x{p["addr"]:05x} B.{p["cond"]:<2s} body={p["body_len"]}  → tramp@0x{p["trampoline"]:05x}')
    patched = {p["idx"] for p in patches}
    unpatched = [i for i in range(total_sites) if i not in patched]
    if unpatched:
        print(f'  UNPATCHED (stock): {unpatched}')
    print(f'\nBinary: {orig_size} → {len(blob)} bytes (+{len(blob)-orig_size})')
    print(f'Output: {outpath}')
    return len(patches), len(blob)


def _default_output_path(inpath):
    """Derive ``<input>_v3.bin`` from *inpath*, never returning *inpath*."""
    suffix = '.bin'
    # Strip only a trailing '.bin'. A plain str.replace would also rewrite
    # '.bin' inside directory names, and would leave a path without '.bin'
    # unchanged, so the output would overwrite the input file.
    stem = inpath[:-len(suffix)] if inpath.endswith(suffix) else inpath
    return stem + '_v3' + suffix


def main():
    """Command-line entry point: parse arguments and patch the blob.

    Reads the input once to count the poll sites (needed to bound the
    ``--sites`` selection), then delegates the patching to patch_blob.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--sites', default='all',
                    help="Sites to patch: 'all', 'early', 'mid', 'late', 'none',"
                         " or comma/hyphen list (e.g. '0,3,5-7'). Default: all.")
    ap.add_argument('input', help='Input DDR blob')
    ap.add_argument('output', nargs='?',
                    help='Output path (default: <input>_v3.bin)')
    args = ap.parse_args()

    outpath = args.output or _default_output_path(args.input)

    with open(args.input, 'rb') as f:
        all_sites = find_poll_loops(bytearray(f.read()))
    # With no sites found the bound stays 0; patch_blob then reports that
    # there is nothing to patch.
    max_idx = max(len(all_sites) - 1, 0)
    indices = parse_sites(args.sites, max_idx)

    patch_blob(args.input, outpath, indices)


# Run the patcher only when executed as a script, not when imported.
if __name__ == '__main__':
    main()