05d0d8edd5
v1 NOP approach was WRONG — removed the poll entirely, proceeding with stale/incomplete register values. ZQ cal, DFI handshake, and PHY mailbox all require actual polling until the hardware responds. v2 uses trampoline functions appended to the binary: - Each poll site jumps to a trampoline that retries with a W16 counter - 16384 iterations (~91us at 1.8GHz) before timeout - On timeout, returns with condition NOT met (hits existing error path) - On success, returns normally (original behavior preserved) - W16 (IP0) used as counter — caller-saved, not used by poll loop bodies 35 poll loops patched, 13 non-poll backward branches left intact. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
292 lines
10 KiB
Python
292 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""RK3588 DDR Blob Patcher v2 — converts infinite poll loops to counted loops.
|
|
|
|
v1 (WRONG): NOP'd the backward branch, removing the poll entirely.
|
|
v2 (CORRECT): Replaces infinite polls with bounded retry loops.
|
|
|
|
Strategy: Each tight poll loop (LDR + optional test + B.cond back) is
|
|
patched to jump to a trampoline in unused space at the end of the binary.
|
|
The trampoline counts iterations and returns to the instruction after the
|
|
original branch when either:
|
|
- The poll condition is satisfied (normal path), OR
|
|
- The retry count expires (timeout path — returns with condition NOT met,
|
|
letting the existing error handling deal with it)
|
|
|
|
Trampoline structure (per patch site; each trampoline occupies a 32-byte slot):
|
|
; entered from original B.cond site via unconditional B
|
|
MOV W16, #0x4000 ; 16384 iterations ≈ ~90us at 1.8GHz (assuming ~10 cycles/iteration)
|
|
.retry:
|
|
LDR Wn, [Xbase, #off] ; re-execute the original load
|
|
<test> ; re-execute the original test (if any)
|
|
B.cond .done ; original condition met? exit loop
|
|
SUBS W16, W16, #1 ; decrement counter
|
|
B.NE .retry ; keep trying if counter > 0
|
|
.done:
|
|
B return_addr ; jump back after original branch site
|
|
|
|
W16 (IP0) is used as the counter — it's a scratch register in the AAPCS64
|
|
calling convention and not used by any of the poll loop bodies (which use
|
|
W0-W4 for register reads and X8-X12 for base addresses).
|
|
"""
|
|
|
|
import struct
|
|
import os
|
|
import sys
|
|
|
|
# AArch64 instruction encoding helpers
|
|
def encode_movz(rd, imm16, shift=0):
    """Return the 32-bit encoding of ``MOVZ Wd, #imm16, LSL #shift``.

    Args:
        rd: Destination register number, placed in the low bits of the word.
        imm16: Immediate value, placed starting at bit 5.
        shift: Left shift in bits; ``shift // 16`` is placed starting at
            bit 21 (the ``hw`` field).

    Returns:
        The instruction word as an int. No field is masked or range-checked,
        so out-of-range arguments spill into neighbouring fields.
    """
    opcode = 0x52800000
    hw_field = (shift // 16) << 21
    imm_field = imm16 << 5
    return opcode | hw_field | imm_field | rd
|
|
|
|
def encode_subs_imm(rd, rn, imm12):
    """Return the 32-bit encoding of ``SUBS Wd, Wn, #imm12``.

    Args:
        rd: Destination register number (low bits of the word).
        rn: Source register number (placed starting at bit 5).
        imm12: Immediate to subtract (placed starting at bit 10).

    Returns:
        The instruction word as an int. Fields are not masked.
    """
    word = 0x71000000
    word |= imm12 << 10
    word |= rn << 5
    word |= rd
    return word
|
|
|
|
def encode_b(from_offset, to_offset):
    """Return the encoding of an unconditional PC-relative ``B``.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word. The byte distance is floor-divided by 4 and
        truncated to 26 bits, which yields the two's-complement field for
        backward (negative) branches as well as forward ones.
    """
    word_delta = (to_offset - from_offset) // 4
    # Masking a negative Python int already gives its two's-complement
    # representation, so forward and backward branches share one path.
    imm26 = word_delta & 0x3FFFFFF
    return 0x14000000 | imm26
|
|
|
|
def encode_bcond(cond, from_offset, to_offset):
    """Return the encoding of a PC-relative conditional branch ``B.cond``.

    Args:
        cond: Condition code, placed in the low bits of the word.
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word. The word distance is truncated to 19 bits
        (two's complement for backward branches) and placed at bit 5.
    """
    word_delta = (to_offset - from_offset) // 4
    imm_field = (word_delta & 0x7FFFF) << 5
    return 0x54000000 | imm_field | cond
|
|
|
|
def encode_bne(from_offset, to_offset):
    """Return the encoding of ``B.NE`` from *from_offset* to *to_offset*.

    Thin wrapper around ``encode_bcond`` with the NE condition code (1).
    """
    ne_cond = 1
    return encode_bcond(ne_cond, from_offset, to_offset)
|
|
|
|
def invert_cond(cond):
    """Return the opposite AArch64 condition code.

    Condition codes are paired so that toggling the lowest bit turns a
    condition into its negation (e.g. EQ=0 <-> NE=1).
    """
    lowest_bit = 0b1
    return cond ^ lowest_bit
|
|
|
|
# Mnemonic for each 4-bit AArch64 condition code, indexed by the code value.
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Retry budget for every patched poll loop: 16384 iterations. Assuming
# roughly 10 cycles per iteration on a 1.8GHz core that is about 90
# microseconds, which the author considers generous for any DDR PHY
# operation (ZQ cal is typ 1us, max ~10us; DFI handshake is sub-microsecond).
TIMEOUT_ITERATIONS = 1 << 14

# Register number used as the retry counter: W16 (IP0), a caller-saved
# scratch register in AAPCS64.
COUNTER_REG = 16
|
|
|
|
|
|
def find_poll_loops(blob):
    """Find tight backward-branch poll loops containing a load instruction.

    The blob is scanned one 32-bit little-endian word at a time for a
    ``B.cond`` whose target lies 4 to 16 bytes *before* the branch. The
    instructions between the target and the branch form the loop body; the
    site is reported only if the body contains an LDR (unsigned-offset form).

    Args:
        blob: Bytes-like object holding the raw AArch64 code.

    Returns:
        A list of dicts, one per site, with the keys ``branch_offset``,
        ``branch_inst``, ``loop_start``, ``loop_offset`` (negative byte
        distance), ``cond``, ``body`` (list of ``(offset, word)`` tuples),
        ``load_inst`` and ``load_offset``. If a body holds several loads the
        last one is recorded.

    Note:
        The scan stops 12 bytes before the end of the blob, so a branch in
        the final three words is never examined.
    """
    sites = []
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]

        # B.cond: top byte 0x54 and bit 4 clear.
        if (inst & 0xFF000010) != 0x54000000:
            continue

        imm19 = (inst >> 5) & 0x7FFFF
        if not (imm19 & 0x40000):
            continue  # sign bit clear: forward branch, skip

        # Sign-extend the 19-bit word offset and convert it to bytes.
        offset = (imm19 - 0x80000) * 4
        if not (-16 <= offset <= -4):
            continue  # not a tight loop

        loop_start = i + offset
        if loop_start < 0:
            # The branch targets an address before the start of the blob.
            # struct.unpack_from() interprets a negative offset as relative
            # to the END of the buffer, so reading the "body" would silently
            # pick up unrelated trailing words. Never treat this as a loop.
            continue

        cond = inst & 0xF

        # Collect the loop body and remember the (last) load in it.
        body = []
        load_inst = None
        load_offset = None
        for j in range(loop_start, i, 4):
            w = struct.unpack_from('<I', blob, j)[0]
            body.append((j, w))
            # Unsigned-offset loads: 32-bit LDR, 64-bit LDR, and a third
            # pattern (0xB9800000, presumably LDRSW).
            if (w & 0xFFC00000) in (0xB9400000, 0xF9400000, 0xB9800000):
                load_inst = w
                load_offset = j

        if load_inst is None:
            continue  # no load in the loop body, not a register poll

        sites.append({
            'branch_offset': i,
            'branch_inst': inst,
            'loop_start': loop_start,
            'loop_offset': offset,
            'cond': cond,
            'body': body,
            'load_inst': load_inst,
            'load_offset': load_offset,
        })

    return sites
|
|
|
|
|
|
def find_trampoline_space(blob, needed_bytes):
    """Find unused space for trampolines.

    Walks backwards from the end of the blob over at most the last 4096
    bytes, accepting 32-bit words that are either an AArch64 NOP or zero.
    If that trailing run is at least *needed_bytes* long, its start offset
    is returned; otherwise ``len(blob)`` is returned and the caller is
    expected to extend the blob itself.

    Args:
        blob: Bytes-like object holding the binary.
        needed_bytes: Number of bytes required for all trampolines.

    Returns:
        A 4-byte-aligned offset inside the blob where the free run starts,
        or ``len(blob)`` when no sufficiently large run exists.

    Note:
        NOTE(review): trailing zero words are assumed to be unused padding;
        this function cannot tell padding from zero-initialised data.
    """
    nop = 0xD503201F

    # Only whole, 4-byte-aligned words can hold instructions; ignore up to
    # three stray bytes at the end of an oddly sized blob.
    aligned_end = len(blob) - (len(blob) % 4)

    # Never scan below offset 0: for blobs shorter than 4096 bytes an
    # unclamped floor is negative, and struct.unpack_from() would then read
    # relative to the end of the buffer (or raise struct.error).
    scan_floor = max(0, len(blob) - 4096)

    free_start = aligned_end
    pos = aligned_end - 4
    while pos >= scan_floor:
        w = struct.unpack_from('<I', blob, pos)[0]
        if w != nop and w != 0:
            break
        free_start = pos
        pos -= 4

    if aligned_end - free_start >= needed_bytes:
        return free_start

    # Fallback: tell the caller to append space to the binary. Extra bytes
    # at the end won't be executed unless jumped to (which is what the
    # trampolines do). The loader might check the size, so the caller warns.
    return len(blob)  # caller will extend the blob
|
|
|
|
|
|
def build_trampoline(site, tramp_offset, return_offset):
    """Build a counted-loop trampoline for one poll site.

    The whole original loop body is re-executed in its original order on
    every retry, so multi-instruction tests (e.g. LDR; AND; CMP) keep
    working and the flags seen by the exit branch are the ones the original
    ``B.cond`` would have seen.

    Layout (always padded to 8 words = 32 bytes, n = body length, 1..3):

        +0:        MOVZ W16, #TIMEOUT
        .retry:
        +4:        <copy of loop body, n instructions>
        +4+4n:     B.inv_cond .exit      ; original loop would have ended
        +8+4n:     SUBS W16, W16, #1     ; decrement retry counter
        +12+4n:    B.NE .retry           ; budget left -> poll again
        .exit:
        +16+4n:    B return_addr         ; both success and timeout land here
                   NOP ...               ; padding up to 32 bytes

    Success and timeout return to the same address (the instruction after
    the original branch). On timeout the flags are those left by SUBS.

    Args:
        site: Dict as produced by ``find_poll_loops``; uses ``body``,
            ``cond`` and (only if ``body`` is empty) ``load_inst``.
        tramp_offset: Byte offset in the blob where this trampoline starts.
        return_offset: Byte offset to branch back to when the loop ends.

    Returns:
        A list of exactly 8 instruction words.

    Raises:
        ValueError: If the loop body has more than 3 instructions, which
            cannot fit into the fixed 32-byte trampoline slot.

    Note:
        NOTE(review): body instructions are copied verbatim; a PC-relative
        instruction in the body would change meaning at the new address.
        NOTE(review): if the body contains no flag-setting instruction, the
        exit branch tests the flags left by SUBS on later iterations.
    """
    nop = 0xD503201F
    slot_words = 8       # patch_blob reserves 32 bytes per trampoline
    overhead_words = 5   # MOVZ, B.inv_cond, SUBS, B.NE, B

    # Copy every body instruction in program order. Copying only the load
    # plus a single "test" instruction would drop e.g. the CMP of an
    # LDR/AND/CMP sequence and reorder bodies whose load is not first.
    body_words = [w for _, w in site['body']] or [site['load_inst']]
    if len(body_words) > slot_words - overhead_words:
        raise ValueError(
            f'poll loop body has {len(body_words)} instructions; at most '
            f'{slot_words - overhead_words} fit in a '
            f'{slot_words * 4}-byte trampoline'
        )

    # +0: initialise the retry counter once, outside the retry loop.
    code = [encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)]

    # .retry: the body copy starts right after the MOVZ.
    retry_offset = tramp_offset + 4
    code.extend(body_words)

    # The original loop branches BACK while its condition is true (still
    # polling), so the INVERTED condition means polling is done.
    exit_branch_at = tramp_offset + 4 * len(code)
    exit_offset = exit_branch_at + 12  # skips B.inv_cond, SUBS, B.NE
    code.append(encode_bcond(invert_cond(site['cond']),
                             exit_branch_at, exit_offset))

    # Decrement the counter and retry while it has not reached zero.
    code.append(encode_subs_imm(COUNTER_REG, COUNTER_REG, 1))
    code.append(encode_bne(exit_branch_at + 8, retry_offset))

    # .exit: single return branch shared by the success and timeout paths.
    code.append(encode_b(exit_offset, return_offset))

    # Pad to the fixed slot size so every trampoline is 32 bytes long.
    code.extend([nop] * (slot_words - len(code)))
    return code
|
|
|
|
|
|
def patch_blob(inpath, outpath):
    """Patch every detected poll loop in *inpath* and write *outpath*.

    For each site found by ``find_poll_loops`` a trampoline is written into
    free space (or appended to the blob) and the original conditional
    branch is replaced by an unconditional ``B`` to that trampoline.

    Args:
        inpath: Path of the original DDR blob.
        outpath: Path the patched blob is written to.

    Returns:
        ``(number_of_patched_sites, final_blob_size_in_bytes)``. When no
        poll loop is found, nothing is written and ``(0, size)`` is
        returned.
    """
    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())

    orig_size = len(blob)
    sites = find_poll_loops(blob)

    if not sites:
        print("No poll loops found!")
        return 0, len(blob)

    # Each trampoline gets a fixed 32-byte slot.
    slot_bytes = 32
    tramp_total = len(sites) * slot_bytes
    tramp_start = find_trampoline_space(blob, tramp_total)

    if tramp_start % 4 or tramp_start >= len(blob):
        # No usable in-place region: append the trampolines. Instructions
        # must be 4-byte aligned (branch offsets are encoded in words), so
        # pad an oddly sized blob up to the next word boundary first.
        blob.extend(b'\x00' * (-len(blob) % 4))
        tramp_start = len(blob)
        blob.extend(b'\x00' * tramp_total)

    patches = []
    tramp_offset = tramp_start

    for site in sites:
        branch_offset = site['branch_offset']
        return_offset = branch_offset + 4  # instruction after the original branch

        tramp_code = build_trampoline(site, tramp_offset, return_offset)

        # Write the trampoline words into their slot.
        struct.pack_into(f'<{len(tramp_code)}I', blob, tramp_offset, *tramp_code)

        # Patch the original branch site: unconditional B to the trampoline.
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))

        patches.append({
            'addr': branch_offset,
            'old': site['branch_inst'],
            'cond': COND_NAMES[site['cond']],
            'loop_offset': site['loop_offset'],
            'trampoline': tramp_offset,
        })

        tramp_offset += slot_bytes

    with open(outpath, 'wb') as f:
        f.write(blob)

    print(f'Patched {len(patches)} poll loops with counted-loop trampolines:')
    print(f' Timeout: {TIMEOUT_ITERATIONS} iterations (~{TIMEOUT_ITERATIONS * 10 / 1800:.0f}us at 1.8GHz)')
    print(f' Counter register: W{COUNTER_REG} (IP0, caller-saved)')
    print(f' Trampoline area: 0x{tramp_start:05x} - 0x{tramp_offset:05x} ({tramp_offset - tramp_start} bytes)')
    print()
    for p in patches:
        print(f' 0x{p["addr"]:05x}: B.{p["cond"]} {p["loop_offset"]} -> B trampoline@0x{p["trampoline"]:05x}')

    if len(blob) != orig_size:
        print(f'\nWARNING: Binary grew from {orig_size} to {len(blob)} bytes (+{len(blob) - orig_size})')
        print(f' Verify the DDR blob loader accepts the larger size.')
    else:
        print(f'\nBinary size unchanged: {len(blob)} bytes')

    return len(patches), len(blob)
|
|
|
|
|
|
if __name__ == '__main__':
    # Usage: script [input_blob [output_blob]]
    infile = sys.argv[1] if len(sys.argv) > 1 else \
        '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin'

    if len(sys.argv) > 2:
        outfile = sys.argv[2]
    else:
        # Derive the default output name from the extension only. A plain
        # str.replace('.bin', ...) leaves the name unchanged when the input
        # has no '.bin' in it, which would overwrite the original blob, and
        # it also rewrites any '.bin' that occurs elsewhere in the path.
        stem, ext = os.path.splitext(infile)
        outfile = f'{stem}_patched_v2{ext or ".bin"}'

    patch_blob(infile, outfile)
|