rk3588-ddr-analysis/patch_timeouts.py

#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v2 — converts infinite poll loops to counted loops.

v1 (WRONG): NOP'd the backward branch, removing the poll entirely.
v2 (CORRECT): Replaces infinite polls with bounded retry loops.

Strategy: Each tight poll loop (LDR + optional test + B.cond back) is
patched to jump to a trampoline in unused space at the end of the binary.
The trampoline counts iterations and returns to the instruction after the
original branch when either:
  - The poll condition is satisfied (normal path), OR
  - The retry count expires (timeout path — returns with condition NOT met,
    letting the existing error handling deal with it)

Trampoline structure (per patch site, one 32-byte slot = 8 instruction words):
    ; entered from original B.cond site via unconditional B
    MOV     W16, #0x4000        ; ~16384 iterations ≈ ~100us at 1.8GHz
  .retry:
    LDR     Wn, [Xbase, #off]   ; re-execute the original load
    <test>                       ; re-execute the original test (if any)
    B.cond  .done                ; original condition met? exit loop
    SUBS    W16, W16, #1         ; decrement counter
    B.NE    .retry               ; keep trying if counter > 0
  .done:
    B       return_addr          ; jump back after original branch site

W16 (IP0) is used as the counter — it's a scratch register in the AAPCS64
calling convention and not used by any of the poll loop bodies (which use
W0-W4 for register reads and X8-X12 for base addresses).
"""

import struct
import os
import sys

# AArch64 instruction encoding helpers
def encode_movz(rd, imm16, shift=0):
    """Encode ``MOVZ Wd, #imm16, LSL #shift`` (32-bit register form).

    Args:
        rd: Destination register number (0-31).
        imm16: Unsigned 16-bit immediate (0-0xFFFF).
        shift: Left shift applied to the immediate; 0 or 16 for a W register.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: If an operand does not fit its encoding field. Without
            this check an oversized immediate would spill into the shift and
            opcode bits and silently encode a different instruction.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'register number out of range: {rd}')
    if not 0 <= imm16 <= 0xFFFF:
        raise ValueError(f'MOVZ immediate does not fit in 16 bits: {imm16:#x}')
    if shift not in (0, 16):
        raise ValueError(f'MOVZ (32-bit) shift must be 0 or 16, got {shift}')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd

def encode_subs_imm(rd, rn, imm12):
    """Encode ``SUBS Wd, Wn, #imm12`` (32-bit, unshifted immediate).

    The operands are OR-ed into their fields as given; no range checking is
    performed.
    """
    word = 0x71000000      # SUBS (immediate), 32-bit, shift = 0
    word |= imm12 << 10    # imm12 field, bits [21:10]
    word |= rn << 5        # Rn field, bits [9:5]
    word |= rd             # Rd field, bits [4:0]
    return word

def encode_b(from_offset, to_offset):
    """Encode an unconditional ``B`` from *from_offset* to *to_offset*.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: If the displacement is not a multiple of 4 or does not
            fit the signed 26-bit word offset (+/-128 MiB). Previously such
            displacements were silently floored/wrapped, producing a branch
            to the wrong address.
    """
    delta = to_offset - from_offset
    if delta % 4:
        raise ValueError(f'branch displacement {delta} is not word-aligned')
    imm26 = delta // 4
    if not -(1 << 25) <= imm26 < (1 << 25):
        raise ValueError(f'branch displacement {delta} exceeds B range')
    # Two's-complement truncation to the 26-bit field handles negatives.
    return 0x14000000 | (imm26 & 0x3FFFFFF)

def encode_bcond(cond, from_offset, to_offset):
    """Encode ``B.cond`` from *from_offset* to *to_offset*.

    Args:
        cond: 4-bit AArch64 condition code (0-15).
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: If *cond* is not a 4-bit value, or the displacement is
            not a multiple of 4 or does not fit the signed 19-bit word offset
            (+/-1 MiB). Previously these were silently truncated into a
            different branch.
    """
    if not 0 <= cond <= 0xF:
        raise ValueError(f'condition code out of range: {cond}')
    delta = to_offset - from_offset
    if delta % 4:
        raise ValueError(f'branch displacement {delta} is not word-aligned')
    words = delta // 4
    if not -(1 << 18) <= words < (1 << 18):
        raise ValueError(f'branch displacement {delta} exceeds B.cond range')
    imm19 = words & 0x7FFFF
    return 0x54000000 | (imm19 << 5) | cond

def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` from *from_offset* to *to_offset*.

    Convenience wrapper around ``encode_bcond`` with the NE condition.
    """
    cond_ne = 0b0001  # NE
    return encode_bcond(cond_ne, from_offset, to_offset)

def invert_cond(cond):
    """Return the opposite AArch64 condition code by toggling bit 0.

    Codes come in even/odd pairs (EQ/NE, CS/CC, ...), so the partner of an
    even code is the next value and the partner of an odd code the previous
    one. AL (14) and NV (15) simply swap with each other.
    """
    return cond - 1 if cond & 1 else cond + 1

# Condition-code mnemonics, indexed by the 4-bit encoding (0 = EQ ... 15 = NV).
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Retry budget loaded into the counter register by every trampoline:
# 16384 iterations. The author's estimate is ~10 cycles per iteration on a
# 1.8GHz core, i.e. roughly 90 microseconds, which was judged generous for
# DDR PHY operations (ZQ cal typ 1us, max ~10us; DFI handshake
# sub-microsecond).
TIMEOUT_ITERATIONS = 1 << 14

# Register number used as the loop counter: W16 (IP0), a caller-saved
# scratch register in AAPCS64.
COUNTER_REG = 16


def find_poll_loops(blob):
    """Find tight backward-branch poll loops containing a load instruction.

    Scans *blob* one 32-bit little-endian word at a time for a ``B.cond``
    whose target lies 1 to 4 instructions before the branch, and keeps those
    whose loop body contains an unsigned-offset LDR/LDRSW.

    Args:
        blob: Bytes-like object holding the raw AArch64 code.

    Returns:
        A list of dicts, one per loop, in ascending address order:
            branch_offset: byte offset of the ``B.cond``
            branch_inst:   the ``B.cond`` instruction word
            loop_start:    byte offset of the first body instruction
            loop_offset:   signed branch displacement in bytes (-16..-4)
            cond:          4-bit condition code of the branch
            body:          list of ``(offset, word)`` for each body
                           instruction, in program order
            load_inst:     the last load word found in the body
            load_offset:   byte offset of that load
    """
    sites = []
    # The scan stops 12 bytes short of the end, so the last three words of
    # the blob are never considered as branch sites.
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]
        # B.cond: bits [31:24] == 0x54 and bit 4 == 0
        if (inst & 0xFF000010) != 0x54000000:
            continue
        imm19 = (inst >> 5) & 0x7FFFF
        if not (imm19 & 0x40000):
            continue  # sign bit clear: forward branch, skip
        offset = (imm19 - 0x80000) * 4  # sign-extend, convert words -> bytes
        if not (-16 <= offset <= -4):
            continue  # not a tight loop

        loop_start = i + offset
        if loop_start < 0:
            # The branch targets an address before the start of the blob.
            # struct.unpack_from() interprets negative offsets relative to
            # the END of the buffer, so scanning this "body" would read
            # unrelated words from the blob's tail and could report a
            # bogus poll loop.
            continue
        cond = inst & 0xF

        # Collect loop body instructions
        body = []
        load_inst = None
        load_offset = None
        for j in range(loop_start, i, 4):
            w = struct.unpack_from('<I', blob, j)[0]
            body.append((j, w))
            # Unsigned-offset loads: LDR Wt, LDR Xt, LDRSW Xt
            if (w & 0xFFC00000) in (0xB9400000, 0xF9400000, 0xB9800000):
                load_inst = w
                load_offset = j

        if load_inst is None:
            continue  # no load in the loop body, not a register poll

        sites.append({
            'branch_offset': i,
            'branch_inst': inst,
            'loop_start': loop_start,
            'loop_offset': offset,
            'cond': cond,
            'body': body,
            'load_inst': load_inst,
            'load_offset': load_offset,
        })

    return sites


def find_trampoline_space(blob, needed_bytes):
    """Find a byte offset at which trampolines can be placed.

    Walks backwards from the end of *blob*, one aligned 32-bit word at a
    time and over at most the last 4096 bytes, for as long as the words are
    AArch64 NOPs or zero.

    NOTE(review): trailing zero words are assumed to be unused padding. If
    the blob ends in zero-initialised data this region is not actually free
    -- confirm against the blob layout.

    Args:
        blob: Bytes-like object holding the binary.
        needed_bytes: Number of bytes required for all trampolines.

    Returns:
        The word-aligned start offset of the trailing NOP/zero run when it
        is at least *needed_bytes* long; otherwise ``len(blob)``, signalling
        that the caller must append space to the binary.
    """
    nop = 0xD503201F
    # Only whole, aligned words are inspected; a 1-3 byte tail is ignored.
    end = len(blob) & ~3
    # Never scan below offset 0: negative offsets make struct.unpack_from()
    # index from the end of the buffer (or raise) on blobs under 4096 bytes.
    floor = max(0, end - 4096)

    free_start = end
    pos = end - 4
    while pos >= floor:
        w = struct.unpack_from('<I', blob, pos)[0]
        if w != nop and w != 0:
            break
        free_start = pos
        pos -= 4

    if free_start < end and end - free_start >= needed_bytes:
        return free_start

    # Fallback: not enough free space inside the binary. The caller is
    # expected to extend the blob and place the trampolines at its old end.
    return len(blob)


def build_trampoline(site, tramp_offset, return_offset):
    """Build a counted-loop trampoline for one poll site.

    The whole original loop body is replayed verbatim and in its original
    order, so multi-instruction tests (e.g. LDR / AND / CMP) and bodies
    whose test precedes the load behave exactly as in the original loop.
    Note that LDR does not set condition flags: a body without a
    flag-setting instruction is copied as-is and no substitute test is
    invented.

    Layout for a body of n instructions (1 <= n <= 3):
        +0:          MOVZ  W16, #TIMEOUT
      .retry:
        +4:          <body, n instructions>
        +4+4n:       B.inv_cond .exit      ; poll satisfied -> leave
        +8+4n:       SUBS  W16, W16, #1    ; decrement counter
        +12+4n:      B.NE  .retry
      .exit:                               ; also reached on timeout
        +16+4n:      B     return_addr
        (NOP padding up to 8 words)

    For n == 4 there is no room for a separate exit target inside the
    8-word slot, so ``B.inv_cond`` branches straight to *return_offset*,
    which must then be within B.cond range (+/-1 MiB).

    On the timeout path the NZCV flags are those left by the final SUBS,
    not by the poll's own test. W16 is clobbered on every path.

    NOTE(review): body words are copied without relocation; PC-relative
    instructions (ADR/ADRP, literal loads, branches) inside a loop body are
    not detected and would resolve to different addresses from the
    trampoline.

    Args:
        site: Site dict as produced by ``find_poll_loops`` (uses ``body``
            and ``cond``).
        tramp_offset: Byte offset at which the trampoline will be placed.
        return_offset: Byte offset to resume at (the instruction after the
            original branch).

    Returns:
        A list of exactly 8 instruction words (32 bytes).

    Raises:
        ValueError: If the body has no instructions or more than 4, or if a
            4-instruction body's return address is out of B.cond range.
    """
    nop = 0xD503201F
    slot_words = 8

    body = [w for _, w in site['body']]
    if not 1 <= len(body) <= 4:
        raise ValueError(
            f'loop body of {len(body)} instructions does not fit a '
            f'{slot_words}-word trampoline')

    # The original branch loops while `cond` holds, so the poll is finished
    # when the inverted condition holds.
    inv_cond = invert_cond(site['cond'])

    # +0: counter initialisation, executed once on entry.
    code = [encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)]

    # .retry: verbatim copy of the original loop body.
    retry_offset = tramp_offset + 4
    code.extend(body)

    check_pc = tramp_offset + 4 * len(code)   # address of B.inv_cond
    exit_pc = check_pc + 12                   # address of the final B

    if len(body) <= 3:
        # Success shares the final unconditional B with the timeout path.
        code.append(encode_bcond(inv_cond, check_pc, exit_pc))
    else:
        # Slot is full: leave directly with the conditional branch.
        delta = return_offset - check_pc
        if not -(1 << 20) <= delta < (1 << 20):
            raise ValueError(
                f'return address {return_offset:#x} is out of B.cond range '
                f'from trampoline at {tramp_offset:#x}')
        code.append(encode_bcond(inv_cond, check_pc, return_offset))

    # Decrement the counter and retry while it is non-zero.
    code.append(encode_subs_imm(COUNTER_REG, COUNTER_REG, 1))
    code.append(encode_bne(check_pc + 8, retry_offset))

    # .exit: resume after the original branch (timeout falls through here).
    code.append(encode_b(exit_pc, return_offset))

    # Pad to the fixed slot size; padding follows an unconditional branch
    # and is never executed.
    code.extend([nop] * (slot_words - len(code)))
    return code


def patch_blob(inpath, outpath):
    """Patch every detected poll loop in a blob to use a bounded trampoline.

    Reads *inpath*, locates poll loops with ``find_poll_loops``, writes one
    32-byte trampoline per loop (into trailing free space, or appended to
    the binary), replaces each loop's backward ``B.cond`` with an
    unconditional ``B`` to its trampoline, and writes the result to
    *outpath*. A summary is printed to stdout.

    Args:
        inpath: Path of the blob to read.
        outpath: Path to write the patched blob to. Nothing is written when
            no poll loops are found.

    Returns:
        ``(patch_count, blob_size)`` -- the number of patched sites and the
        size in bytes of the (possibly extended) blob.

    Raises:
        ValueError: If a generated trampoline exceeds its 32-byte slot.
    """
    slot_bytes = 32  # fixed space reserved per trampoline

    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())

    orig_size = len(blob)
    sites = find_poll_loops(blob)

    if not sites:
        print("No poll loops found!")
        return 0, len(blob)

    tramp_total = len(sites) * slot_bytes
    tramp_start = find_trampoline_space(blob, tramp_total)

    # AArch64 instructions must be 4-byte aligned. Round the start up (this
    # matters when the blob length is not a multiple of 4) and grow the
    # binary by exactly the amount needed to hold every slot.
    tramp_start = (tramp_start + 3) & ~3
    shortfall = tramp_start + tramp_total - len(blob)
    if shortfall > 0:
        blob.extend(b'\x00' * shortfall)

    patches = []
    tramp_offset = tramp_start

    for site in sites:
        branch_offset = site['branch_offset']
        return_offset = branch_offset + 4  # instruction after the original branch

        # Build trampoline
        tramp_code = build_trampoline(site, tramp_offset, return_offset)
        if len(tramp_code) * 4 > slot_bytes:
            # Writing it would overrun into the next site's slot.
            raise ValueError(
                f'trampoline for 0x{branch_offset:05x} needs '
                f'{len(tramp_code) * 4} bytes, slot is {slot_bytes}')

        # Write trampoline to blob
        for idx, inst in enumerate(tramp_code):
            struct.pack_into('<I', blob, tramp_offset + idx * 4, inst)

        # Patch original branch site: unconditional B to trampoline
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))

        patches.append({
            'addr': branch_offset,
            'old': site['branch_inst'],
            'cond': COND_NAMES[site['cond']],
            'loop_offset': site['loop_offset'],
            'trampoline': tramp_offset,
        })

        tramp_offset += slot_bytes

    with open(outpath, 'wb') as f:
        f.write(blob)

    print(f'Patched {len(patches)} poll loops with counted-loop trampolines:')
    print(f'  Timeout: {TIMEOUT_ITERATIONS} iterations (~{TIMEOUT_ITERATIONS * 10 / 1800:.0f}us at 1.8GHz)')
    print(f'  Counter register: W{COUNTER_REG} (IP0, caller-saved)')
    print(f'  Trampoline area: 0x{tramp_start:05x} - 0x{tramp_offset:05x} ({tramp_offset - tramp_start} bytes)')
    print()
    for p in patches:
        print(f'  0x{p["addr"]:05x}: B.{p["cond"]} {p["loop_offset"]} -> B trampoline@0x{p["trampoline"]:05x}')

    if len(blob) != orig_size:
        print(f'\nWARNING: Binary grew from {orig_size} to {len(blob)} bytes (+{len(blob) - orig_size})')
        print(f'         Verify the DDR blob loader accepts the larger size.')
    else:
        print(f'\nBinary size unchanged: {len(blob)} bytes')

    return len(patches), len(blob)


def default_output_path(infile):
    """Derive the default output path by inserting ``_patched_v2`` before
    the file extension of *infile*.

    Only the final extension of the last path component is considered, so
    the result always differs from *infile* (even when it has no ``.bin``
    suffix) and directory names containing ``.bin`` are left untouched.
    """
    root, ext = os.path.splitext(infile)
    return f'{root}_patched_v2{ext}'


if __name__ == '__main__':
    # Usage: patch_timeouts.py [INPUT_BLOB [OUTPUT_BLOB]]
    infile = sys.argv[1] if len(sys.argv) > 1 else \
        '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin'
    outfile = sys.argv[2] if len(sys.argv) > 2 else \
        default_output_path(infile)

    patch_blob(infile, outfile)