Files
rk3588-ddr-analysis/patch_timeouts.py
T
test0r 05d0d8edd5 patch_timeouts v2: counted-loop trampolines instead of NOPs
v1 NOP approach was WRONG — removed the poll entirely, proceeding with
stale/incomplete register values. ZQ cal, DFI handshake, and PHY mailbox
all require actual polling until the hardware responds.

v2 uses trampoline functions appended to the binary:
- Each poll site jumps to a trampoline that retries with a W16 counter
- 16384 iterations (~91us at 1.8GHz) before timeout
- On timeout, returns with condition NOT met (hits existing error path)
- On success, returns normally (original behavior preserved)
- W16 (IP0) used as counter — caller-saved, not used by poll loop bodies

35 poll loops patched, 13 non-poll backward branches left intact.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 21:31:38 +02:00

292 lines
10 KiB
Python

#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v2 — converts infinite poll loops to counted loops.
v1 (WRONG): NOP'd the backward branch, removing the poll entirely.
v2 (CORRECT): Replaces infinite polls with bounded retry loops.
Strategy: Each tight poll loop (LDR + optional test + B.cond back) is
patched to jump to a trampoline in unused space at the end of the binary.
The trampoline counts iterations and returns to the instruction after the
original branch when either:
- The poll condition is satisfied (normal path), OR
- The retry count expires (timeout path — returns with condition NOT met,
letting the existing error handling deal with it)
Trampoline structure (per patch site; each trampoline occupies a fixed 32-byte slot, i.e. at most 8 instruction words):
; entered from original B.cond site via unconditional B
MOV W16, #0x4000 ; ~16384 iterations ≈ ~100us at 1.8GHz
.retry:
LDR Wn, [Xbase, #off] ; re-execute the original load
<test> ; re-execute the original test (if any)
B.<inv-cond> .done ; inverse of the original loop-back condition: poll satisfied, exit loop
SUBS W16, W16, #1 ; decrement counter
B.NE .retry ; keep trying if counter > 0
.done:
B return_addr ; jump back after original branch site
W16 (IP0) is used as the counter — it's a scratch register in the AAPCS64
calling convention and not used by any of the poll loop bodies (which use
W0-W4 for register reads and X8-X12 for base addresses).
"""
import struct
import os
import sys
# AArch64 instruction encoding helpers
def encode_movz(rd, imm16, shift=0):
    """Encode the 32-bit form ``MOVZ Wd, #imm16, LSL #shift``.

    Args:
        rd: Destination register number (0-31).
        imm16: Unsigned 16-bit immediate.
        shift: Left shift applied to the immediate; the 32-bit form only
            allows 0 or 16.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If any field does not fit its bit-field. Without this
            check an oversized value would spill into neighbouring fields
            and silently produce a different instruction.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'register number out of range: {rd}')
    if not 0 <= imm16 <= 0xFFFF:
        raise ValueError(f'imm16 out of range: {imm16:#x}')
    if shift not in (0, 16):
        raise ValueError(f'shift must be 0 or 16 for a W register: {shift}')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd
def encode_subs_imm(rd, rn, imm12):
    """Encode the 32-bit form ``SUBS Wd, Wn, #imm12`` (no shift).

    The three operands are OR-ed into the base opcode at their field
    positions: imm12 at bit 10, Rn at bit 5, Rd at bit 0.
    """
    word = 0x71000000
    for value, bit_pos in ((imm12, 10), (rn, 5), (rd, 0)):
        word |= value << bit_pos
    return word
def encode_b(from_offset, to_offset):
    """Encode an unconditional PC-relative ``B`` from one offset to another.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If the displacement is not a multiple of 4 or does not
            fit the signed 26-bit word offset (+/-128 MiB). Previously such
            a displacement was silently floored/truncated, yielding a
            branch to the wrong address.
    """
    displacement = to_offset - from_offset
    if displacement % 4:
        raise ValueError(
            f'branch displacement {displacement} is not word-aligned')
    words = displacement // 4
    if not -(1 << 25) <= words < (1 << 25):
        raise ValueError(
            f'branch displacement {displacement} out of range for B')
    # Two's-complement wrap into the 26-bit immediate field.
    return 0x14000000 | (words & 0x3FFFFFF)
def encode_bcond(cond, from_offset, to_offset):
    """Encode a PC-relative conditional branch ``B.cond``.

    Args:
        cond: 4-bit condition code (0-15).
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If ``cond`` is not a 4-bit value, or the displacement is
            not a multiple of 4 or does not fit the signed 19-bit word
            offset (+/-1 MiB). Previously these cases were silently
            truncated into a different instruction.
    """
    if not 0 <= cond <= 0xF:
        raise ValueError(f'condition code out of range: {cond}')
    displacement = to_offset - from_offset
    if displacement % 4:
        raise ValueError(
            f'branch displacement {displacement} is not word-aligned')
    words = displacement // 4
    if not -(1 << 18) <= words < (1 << 18):
        raise ValueError(
            f'branch displacement {displacement} out of range for B.cond')
    # Two's-complement wrap into the 19-bit immediate field at bit 5.
    return 0x54000000 | ((words & 0x7FFFF) << 5) | cond
def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` from ``from_offset`` to ``to_offset`` (byte offsets)."""
    ne_cond = 0b0001  # condition code for NE
    return encode_bcond(ne_cond, from_offset, to_offset)
def invert_cond(cond):
    """Return the opposite AArch64 condition code.

    Condition codes come in even/odd pairs (EQ/NE, CS/CC, ...), so the
    inverse of an even code is the next one up and the inverse of an odd
    code is the one below it.
    """
    if cond % 2 == 0:
        return cond + 1
    return cond - 1
# Mnemonic of each 4-bit AArch64 condition code, indexed by its encoding.
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Retry budget loaded into the counter register by every trampoline.
# The author's estimate: at ~10 cycles per iteration on a 1.8 GHz core,
# 16384 iterations is roughly 90 microseconds, judged generous for DDR PHY
# operations (ZQ calibration, DFI handshake).
# NOTE(review): confirm this bound is long enough for every patched poll.
TIMEOUT_ITERATIONS = 1 << 14

# Register number used as the retry counter: W16 (IP0), a caller-saved
# scratch register under AAPCS64.
COUNTER_REG = 16
def find_poll_loops(blob):
    """Find tight backward-branch loops whose body contains a load.

    The blob is scanned one 32-bit little-endian word at a time for a
    ``B.cond`` whose target lies 1 to 4 instructions before the branch.
    A loop is reported only if at least one body instruction is an
    unsigned-offset LDR (32-bit, 64-bit) or LDRSW.

    Args:
        blob: Bytes-like object holding the binary image.

    Returns:
        A list of dicts, one per matching loop, with the keys
        ``branch_offset`` (byte offset of the B.cond), ``branch_inst`` (its
        encoding), ``loop_start`` (byte offset of the branch target),
        ``loop_offset`` (negative byte displacement), ``cond`` (4-bit
        condition code), ``body`` (list of ``(offset, word)`` pairs between
        the target and the branch), and ``load_inst`` / ``load_offset``
        (the last load found in the body).
    """
    load_opcodes = (0xB9400000, 0xF9400000, 0xB9800000)
    sites = []
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]
        # B.cond: bits 31-24 == 0x54 and bit 4 == 0.
        if (inst & 0xFF000010) != 0x54000000:
            continue
        imm19 = (inst >> 5) & 0x7FFFF
        if not imm19 & 0x40000:
            continue  # sign bit clear: forward branch
        # Sign-extend the 19-bit word offset and convert to bytes.
        offset = (imm19 - 0x80000) * 4
        if not -16 <= offset <= -4:
            continue  # not a tight loop
        loop_start = i + offset
        if loop_start < 0:
            # The target lies before the start of the blob. unpack_from()
            # would interpret a negative offset as counting from the END of
            # the buffer and report a body read from unrelated bytes.
            continue

        body = []
        load_inst = None
        load_offset = None
        for j in range(loop_start, i, 4):
            w = struct.unpack_from('<I', blob, j)[0]
            body.append((j, w))
            if (w & 0xFFC00000) in load_opcodes:
                # Later loads overwrite earlier ones: the last load wins.
                load_inst = w
                load_offset = j
        if load_inst is None:
            continue  # no load in the loop body, not a register poll

        sites.append({
            'branch_offset': i,
            'branch_inst': inst,
            'loop_start': loop_start,
            'loop_offset': offset,
            'cond': inst & 0xF,
            'body': body,
            'load_inst': load_inst,
            'load_offset': load_offset,
        })
    return sites
def find_trampoline_space(blob, needed_bytes):
    """Pick the byte offset at which trampolines should be written.

    Scans backwards from the end of the blob, over at most the last 4096
    bytes, for a trailing run of words that are either NOP or zero. If that
    run is at least ``needed_bytes`` long its start offset is returned.
    Otherwise ``len(blob)`` is returned, which tells the caller to append
    space to the blob.

    NOTE(review): a trailing run of zero words is assumed to be unused
    padding; confirm it is not zero-initialised data the blob relies on.

    Args:
        blob: Bytes-like object holding the binary image.
        needed_bytes: Number of bytes the trampolines require.

    Returns:
        Offset of the free region inside the blob, or ``len(blob)`` when
        no sufficiently large region was found.
    """
    nop = 0xD503201F
    end = len(blob)
    # Instructions must sit on word boundaries, so scan aligned words only.
    aligned_end = end - (end % 4)
    # Never scan below offset 0: unpack_from() treats a negative offset as
    # relative to the end of the buffer, which would wrap the scan around
    # on blobs smaller than the 4096-byte window.
    scan_floor = max(0, end - 4096)

    free_start = aligned_end
    pos = aligned_end - 4
    while pos >= scan_floor:
        w = struct.unpack_from('<I', blob, pos)[0]
        if w != nop and w != 0:
            break
        free_start = pos
        pos -= 4

    if free_start < aligned_end and aligned_end - free_start >= needed_bytes:
        return free_start
    # Fallback: the caller extends the blob and places trampolines there.
    return end
def build_trampoline(site, tramp_offset, return_offset):
    """Build the counted-loop trampoline for one poll site.

    Layout for a loop body of k instructions (k <= 3):

        +0:        MOVZ W16, #TIMEOUT_ITERATIONS
        +4:        <body, k instructions, in their original order>   ; .retry
        +4+4k:     B.<inv-cond> .done    ; original loop condition false
        +8+4k:     SUBS W16, W16, #1
        +12+4k:    B.NE .retry
        +16+4k:    B return_offset       ; .done (also reached on timeout)
        ...        NOP padding up to 8 words

    The original B.cond branches back while the loop is still polling, so
    the inverted condition means the poll is satisfied. Success and timeout
    both return to ``return_offset``; they share one exit branch.

    The whole loop body is replayed, not only the load and one other
    instruction, because the flag-setting instruction the branch depends on
    may be the second or third non-load instruction (e.g. LDR / AND / CMP).

    NOTE(review): on timeout the last flag-setting instruction executed is
    the SUBS, so code after ``return_offset`` sees the counter's flags, not
    the poll test's.
    NOTE(review): body instructions are copied verbatim; this assumes none
    of them is PC-relative.

    Args:
        site: Poll-site dict as produced by ``find_poll_loops`` (uses the
            ``body``, ``load_inst`` and ``cond`` keys).
        tramp_offset: Byte offset at which the trampoline will be placed.
        return_offset: Byte offset to branch back to when the loop ends.

    Returns:
        A list of exactly 8 instruction words (32 bytes).

    Raises:
        ValueError: If the loop body has more than 3 instructions and so
            cannot be replayed inside the 32-byte slot. Emitting a
            truncated body would produce a poll that tests stale flags.
    """
    nop = 0xD503201F
    slot_words = 8
    fixed_words = 5  # MOVZ + B.cond + SUBS + B.NE + B

    body_words = [w for _, w in site['body']] or [site['load_inst']]
    if len(body_words) > slot_words - fixed_words:
        raise ValueError(
            f'poll loop body of {len(body_words)} instructions does not fit '
            f'a {slot_words * 4}-byte trampoline')

    # Counter initialisation, executed once on entry.
    code = [encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)]

    # .retry: replay the original loop body.
    retry_offset = tramp_offset + 4 * len(code)
    code.extend(body_words)

    check_offset = tramp_offset + 4 * len(code)
    done_offset = check_offset + 12

    # Poll satisfied -> leave the loop.
    code.append(encode_bcond(invert_cond(site['cond']),
                             check_offset, done_offset))
    # Otherwise burn one retry and loop while the counter is non-zero.
    code.append(encode_subs_imm(COUNTER_REG, COUNTER_REG, 1))
    code.append(encode_bne(check_offset + 8, retry_offset))
    # .done: shared exit for success and timeout.
    code.append(encode_b(done_offset, return_offset))

    # Pad so every trampoline fills its whole slot; the padding is never
    # executed because the preceding instruction is an unconditional branch.
    code.extend([nop] * (slot_words - len(code)))
    return code
def patch_blob(inpath, outpath):
    """Patch every detected poll loop in ``inpath`` and write ``outpath``.

    For each site returned by ``find_poll_loops`` a trampoline is written
    into the area chosen by ``find_trampoline_space`` (the blob is grown
    with zero bytes when no free area exists) and the site's conditional
    branch is overwritten with an unconditional branch to that trampoline.
    A summary is printed to stdout.

    Args:
        inpath: Path of the binary to read.
        outpath: Path the patched binary is written to. Nothing is written
            when no poll loops are found.

    Returns:
        A ``(patched_site_count, final_blob_size)`` tuple.
    """
    slot_bytes = 32  # distance between consecutive trampolines

    with open(inpath, 'rb') as src:
        blob = bytearray(src.read())
    orig_size = len(blob)

    sites = find_poll_loops(blob)
    if not sites:
        print("No poll loops found!")
        return 0, len(blob)

    reserve = slot_bytes * len(sites)
    tramp_start = find_trampoline_space(blob, reserve)
    if tramp_start >= len(blob):
        # No free area inside the image: grow it with zero bytes.
        blob.extend(bytes(reserve))

    patches = []
    tramp_offset = tramp_start
    for site in sites:
        site_addr = site['branch_offset']
        # The trampoline returns to the instruction after the branch.
        words = build_trampoline(site, tramp_offset, site_addr + 4)
        struct.pack_into(f'<{len(words)}I', blob, tramp_offset, *words)
        # Redirect the original conditional branch into the trampoline.
        struct.pack_into('<I', blob, site_addr,
                         encode_b(site_addr, tramp_offset))
        patches.append({
            'addr': site_addr,
            'old': site['branch_inst'],
            'cond': COND_NAMES[site['cond']],
            'loop_offset': site['loop_offset'],
            'trampoline': tramp_offset,
        })
        tramp_offset += slot_bytes

    with open(outpath, 'wb') as dst:
        dst.write(blob)

    print(f'Patched {len(patches)} poll loops with counted-loop trampolines:')
    print(f' Timeout: {TIMEOUT_ITERATIONS} iterations (~{TIMEOUT_ITERATIONS * 10 / 1800:.0f}us at 1.8GHz)')
    print(f' Counter register: W{COUNTER_REG} (IP0, caller-saved)')
    print(f' Trampoline area: 0x{tramp_start:05x} - 0x{tramp_offset:05x} ({tramp_offset - tramp_start} bytes)')
    print()
    for p in patches:
        print(f' 0x{p["addr"]:05x}: B.{p["cond"]} {p["loop_offset"]} -> B trampoline@0x{p["trampoline"]:05x}')

    if len(blob) == orig_size:
        print(f'\nBinary size unchanged: {len(blob)} bytes')
    else:
        print(f'\nWARNING: Binary grew from {orig_size} to {len(blob)} bytes (+{len(blob) - orig_size})')
        print(f' Verify the DDR blob loader accepts the larger size.')
    return len(patches), len(blob)
if __name__ == '__main__':
    # Usage: patch_timeouts.py [INFILE [OUTFILE]]
    infile = sys.argv[1] if len(sys.argv) > 1 else \
        '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin'
    if len(sys.argv) > 2:
        outfile = sys.argv[2]
    else:
        # Insert the suffix before the extension only. A plain
        # str.replace('.bin', ...) would rewrite every '.bin' in the path
        # and, for an input without '.bin', would leave the name unchanged
        # so the patched blob would overwrite the input file.
        stem, ext = os.path.splitext(infile)
        outfile = f'{stem}_patched_v2{ext or ".bin"}'
    n, size = patch_blob(infile, outfile)