Files
rk3588-ddr-analysis/patch_timeouts_v3.py
T
marfrit 694be88964 v3 patcher: full-body trampolines + site bisection subsets
Root cause of counted_v2 brick identified:

v2 copied only ONE non-load body instruction into each trampoline (picks
the first after the LDR). For poll patterns of form

    LDR   Wx, [Xbase, #off]
    AND   Wx, Wx, #mask     ; no flag update
    CMP   Wx, #expected     ; sets flags
    B.cond .retry

— 9 of the 16 sites in v1.19 have this shape — the final CMP was silently
dropped. The trampoline's B.inv_cond tested whatever flags happened to be
set before entry, producing effectively random branch decisions once
under the trampoline. Result: boot crashes before the UART banner,
observed as 'power LED off' brick.

Fix in v3: copy the ENTIRE loop body (LDR + all intermediate instructions,
in original order) into each trampoline. Size is now 4*(N+6) where N is
body length (32 bytes for body=2, 36 for body=3).

Also in v3:
- --sites subset flag for bisection (all/early/mid/late/none/index list)
- decode_sites.py helper that tries to identify which MMIO register each
  site polls (best effort — the materialized_base scanner is naive and
  picks up stale MOVZ targets, but cluster grouping by blob offset is
  reliable and sufficient for bisection)

Site clusters in v1.19:
  0..7   early (0x07b78..0x07f08): SGRF + PHY firmware state machine
  8..10  mid   (0x09124..0x0aaf8): DfiStatus / training start
  11..15 late  (0x0d154..0x0d378): UctWriteProt / CalBusy / late

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 01:06:51 +02:00

255 lines
8.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v3 — counted-loop trampolines with SITE SUBSET.
v2 BUG identified 2026-04-15: the original patcher copied at most ONE
body instruction beyond the LDR into the trampoline. For poll patterns of
form `LDR; AND; CMP; B.cond` (5 of the 16 sites), the CMP was dropped.
The trampoline's `B.inv_cond` then tested stale flags → random branch →
brick.
v3 fix: copy the **full** original loop body (LDR + every intermediate
instruction, in original order) into the trampoline, so the last
flag-setting instruction matches the original loop.
New trampoline layout (N = len(body) — usually 2 or 3):
+0: MOV W16, #TIMEOUT
+4 .. +4N: <body[0] .. body[N-1]> ; LDR + all test instructions
+4N+4: B.inv_cond .done
+4N+8: SUBS W16, W16, #1
+4N+12: B.NE .retry (back to +4)
+4N+16: B return_addr ; timeout path
.done:
+4N+20: B return_addr ; success path
Size per trampoline: 4 * (N + 6) bytes. With max body of 3 instructions
that's 4*9 = 36 bytes (v2 trampolines were 32 bytes); 2-inst bodies take 32 bytes.
Site subset flags:
--sites all | early | mid | late | none
--sites 0,3,5-7 # explicit index list
Site index is stable (determined by find_poll_loops' ascending-offset
ordering). As of rk3588_ddr_v1.19:
0..7 EARLY cluster (0x07b78..0x07f08) — SGRF + PHY firmware fsm
8..10 MID cluster (0x09124..0x0aaf8) — DfiStatus / training start
11..15 LATE cluster (0x0d154..0x0d378) — UctWriteProt / CalBusy / late
"""
import struct
import sys
import argparse
# Iteration budget loaded into the trampoline's countdown register.
TIMEOUT_ITERATIONS = 1 << 14

# Register number used as the countdown counter (W16 in the trampoline).
COUNTER_REG = 16

# Condition mnemonics, indexed by the 4-bit cond field of a B.cond word.
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Named groups of site indices accepted by --sites.
CLUSTERS = {
    'early': [*range(8)],
    'mid': [*range(8, 11)],
    'late': [*range(11, 16)],
    'all': [*range(16)],
    'none': [],
}
def encode_movz(rd, imm16, shift=0):
    """Encode the 32-bit form ``MOVZ Wd, #imm16, LSL #shift``.

    Args:
        rd: Destination register number, 0..31.
        imm16: Unsigned 16-bit immediate.
        shift: Left shift applied to the immediate; 0 or 16 for a W register.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If any field does not fit its encoding slot. Without the
            check an oversized value would silently overwrite neighbouring
            bit fields and produce a different instruction.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'register number out of range: {rd}')
    if not 0 <= imm16 <= 0xFFFF:
        raise ValueError(f'imm16 out of range: {imm16:#x}')
    if shift not in (0, 16):
        raise ValueError(f'shift must be 0 or 16 for a W register: {shift}')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd
def encode_subs_imm(rd, rn, imm12):
    """Encode the 32-bit form ``SUBS Wd, Wn, #imm12`` (unshifted immediate).

    Args:
        rd: Destination register number, 0..31.
        rn: Source register number, 0..31.
        imm12: Unsigned 12-bit immediate to subtract.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If a field does not fit its slot; an oversized value would
            otherwise spill into adjacent bits of the instruction word.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'rd out of range: {rd}')
    if not 0 <= rn <= 31:
        raise ValueError(f'rn out of range: {rn}')
    if not 0 <= imm12 <= 0xFFF:
        raise ValueError(f'imm12 out of range: {imm12:#x}')
    return 0x71000000 | (imm12 << 10) | (rn << 5) | rd
def encode_b(from_offset, to_offset):
    """Encode an unconditional ``B`` from one blob offset to another.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If the distance is not a multiple of 4 or does not fit the
            signed 26-bit word offset. Previously both cases were silently
            floored/masked, yielding a branch to the wrong address.
    """
    distance = to_offset - from_offset
    if distance % 4:
        raise ValueError(
            f'branch distance {distance} is not a multiple of 4 '
            f'(from {from_offset:#x} to {to_offset:#x})')
    words = distance // 4
    if not -(1 << 25) <= words < (1 << 25):
        raise ValueError(
            f'branch from {from_offset:#x} to {to_offset:#x} exceeds B range')
    # Two's-complement wrap into the imm26 field handles negative distances.
    return 0x14000000 | (words & 0x3FFFFFF)
def encode_bcond(cond, from_offset, to_offset):
    """Encode a conditional ``B.cond`` from one blob offset to another.

    Args:
        cond: 4-bit condition code (index into COND_NAMES), 0..15.
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.

    Returns:
        The instruction word as an int.

    Raises:
        ValueError: If ``cond`` is not 0..15, the distance is not a multiple
            of 4, or the distance does not fit the signed 19-bit word offset.
            Previously these were masked silently, producing a branch with a
            different condition or target.
    """
    if not 0 <= cond <= 0xF:
        raise ValueError(f'condition code out of range: {cond}')
    distance = to_offset - from_offset
    if distance % 4:
        raise ValueError(
            f'branch distance {distance} is not a multiple of 4 '
            f'(from {from_offset:#x} to {to_offset:#x})')
    words = distance // 4
    if not -(1 << 18) <= words < (1 << 18):
        raise ValueError(
            f'branch from {from_offset:#x} to {to_offset:#x} exceeds B.cond range')
    imm19 = words & 0x7FFFF
    return 0x54000000 | (imm19 << 5) | cond
def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` from ``from_offset`` to ``to_offset`` (byte offsets)."""
    ne = 1  # condition code 1 is NE (see COND_NAMES)
    return encode_bcond(ne, from_offset, to_offset)
def invert_cond(cond):
    """Return the condition code paired with ``cond`` by flipping its low bit.

    Codes are laid out in complementary pairs (EQ/NE, CS/CC, ...), so
    toggling bit 0 yields the opposite test.
    """
    flipped = cond ^ 0b1
    return flipped
def find_poll_loops(blob):
    """Find tight backward-branch poll loops: a short body with a load, then B.cond back.

    The blob is scanned word by word for ``B.cond`` instructions whose target
    lies 1..4 instructions before the branch. The words between the target and
    the branch form the loop body; the site is kept only if the body contains
    an ``LDR`` (unsigned-offset form, 32- or 64-bit).

    Args:
        blob: Bytes-like object holding little-endian 32-bit instruction words.

    Returns:
        A list of dicts in ascending branch-offset order, each with keys
        ``idx`` (position in the list), ``branch_offset`` (byte offset of the
        B.cond), ``cond`` (its 4-bit condition code) and ``body`` (list of
        instruction words from the loop start up to, excluding, the branch).
    """
    sites = []
    # NOTE(review): the final 12 bytes are never considered as branch sites;
    # the bound is kept unchanged so existing site indices stay stable.
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]
        # B.cond has top byte 0x54 and bit 4 clear.
        if (inst & 0xFF000010) != 0x54000000:
            continue
        imm19 = (inst >> 5) & 0x7FFFF
        if not imm19 & 0x40000:
            continue  # sign bit clear: forward branch, not a loop
        offset = (imm19 - 0x80000) * 4  # sign-extend imm19, convert to bytes
        if not -16 <= offset <= -4:
            continue
        loop_start = i + offset
        if loop_start < 0:
            # Target lies before the start of the blob. A negative offset
            # would make struct read from the END of the buffer and fabricate
            # a body out of unrelated words.
            continue
        # Body = every instruction from loop_start up to (but excluding) the branch.
        body = list(struct.unpack_from(f'<{(i - loop_start) // 4}I', blob, loop_start))
        # The mask drops bit 30 (the size bit), so one comparison covers both
        # the 32-bit (0xB94...) and 64-bit (0xF94...) LDR encodings.
        if not any((w & 0xBFC00000) == 0xB9400000 for w in body):
            continue  # require at least one LDR — filters out pure cmp/b.ne spin loops
        sites.append({
            'idx': len(sites),
            'branch_offset': i,
            'cond': inst & 0xF,
            'body': body,
        })
    return sites
def build_trampoline(site, tramp_offset, return_offset):
    """Assemble the counted-retry trampoline for one poll site.

    The trampoline holds N + 6 words, where N is the length of ``site['body']``:
    a counter load, the whole original loop body, an inverted-condition exit
    branch, the counter decrement, the retry branch, and two branches back to
    ``return_offset`` (timeout path and success path).

    Args:
        site: Site dict with ``body`` (instruction words) and ``cond``.
        tramp_offset: Blob byte offset where the trampoline will be placed.
        return_offset: Blob byte offset both exit branches jump to.

    Returns:
        List of instruction words in placement order.
    """
    loop_body = site['body']
    n = len(loop_body)

    def at(word_index):
        # Blob byte offset of the trampoline's word_index-th instruction.
        return tramp_offset + 4 * word_index

    done_at = at(n + 5)
    return [
        # +0: MOVZ W16, #TIMEOUT
        encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS),
        # +4 .. +4N: original body, unchanged and in order
        *loop_body,
        # +4N+4: B.inv_cond → .done (original loop would have exited)
        encode_bcond(invert_cond(site['cond']), at(n + 1), done_at),
        # +4N+8: SUBS W16, W16, #1
        encode_subs_imm(COUNTER_REG, COUNTER_REG, 1),
        # +4N+12: B.NE → first body instruction while the counter is non-zero
        encode_bne(at(n + 3), at(1)),
        # +4N+16: B return_addr (timeout — condition never met)
        encode_b(at(n + 4), return_offset),
        # +4N+20 (.done): B return_addr (success)
        encode_b(done_at, return_offset),
    ]
def parse_sites(spec, max_index):
    """Turn a ``--sites`` specification into a sorted list of site indices.

    ``spec`` is a comma-separated list whose entries are cluster names (keys
    of CLUSTERS), single indices (``3``) or inclusive ranges (``5-7``).
    Cluster names and explicit indices share one code path, so the result is
    always a fresh list limited to ``0..max_index``; the CLUSTERS lists are
    never handed out by reference.

    Args:
        spec: The specification string.
        max_index: Highest valid site index; anything outside is dropped.

    Returns:
        Sorted list of unique indices within ``0..max_index``.

    Raises:
        ValueError: If an entry is neither a cluster name, an integer nor an
            ``a-b`` range, or if a range is reversed (``7-5``).
    """
    selected = set()
    for token in spec.split(','):
        token = token.strip()
        if not token:
            continue
        if token in CLUSTERS:
            selected.update(CLUSTERS[token])
            continue
        first, dash, last = token.partition('-')
        try:
            lo = int(first)
            hi = int(last) if dash else lo
        except ValueError:
            raise ValueError(
                f'invalid --sites entry {token!r}: expected a cluster name, '
                f'an index, or a range like 5-7') from None
        if hi < lo:
            raise ValueError(f'invalid --sites range {token!r}: start exceeds end')
        # Clamp before materialising so a huge range cannot blow up the set.
        selected.update(range(max(lo, 0), min(hi, max_index) + 1))
    return sorted(i for i in selected if 0 <= i <= max_index)
def patch_blob(inpath, outpath, site_indices):
    """Patch selected poll loops in a DDR blob to use counted trampolines.

    Reads ``inpath``, locates poll sites with find_poll_loops, appends one
    trampoline per selected site to the end of the blob, and overwrites each
    site's ``B.cond`` with an unconditional branch to its trampoline. The
    result is written to ``outpath`` and a summary is printed.

    If no poll loops are found, an error is printed to stderr and nothing is
    written. If the selection is empty, the unmodified blob is written.

    Args:
        inpath: Path of the input blob.
        outpath: Path the (possibly patched) blob is written to.
        site_indices: Iterable of site indices to patch. Out-of-range and
            duplicate entries are ignored.

    Returns:
        Tuple ``(number_of_patched_sites, final_blob_length_in_bytes)``.
    """
    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())
    orig_size = len(blob)

    all_sites = find_poll_loops(blob)
    if not all_sites:
        print('No poll loops found!', file=sys.stderr)
        return 0, len(blob)
    total_sites = len(all_sites)

    # Keep caller order, but drop indices that do not name a site (including
    # negatives, which would index from the end) and repeats, which would
    # leave an orphaned trampoline behind.
    site_indices = list(dict.fromkeys(
        i for i in site_indices if 0 <= i < total_sites))
    if not site_indices:
        print('No sites selected — writing unmodified blob.')
        with open(outpath, 'wb') as f:
            f.write(blob)
        return 0, len(blob)

    # Branch encodings work in whole 4-byte words, so the trampoline area must
    # start on a 4-byte boundary even if the input length is not a multiple of 4.
    blob.extend(b'\x00' * (-len(blob) % 4))

    patches = []
    for idx in site_indices:
        site = all_sites[idx]
        branch_offset = site['branch_offset']
        # Each trampoline goes at the current end of the blob; its size comes
        # from build_trampoline itself rather than a duplicated formula.
        tramp_offset = len(blob)
        tramp_code = build_trampoline(site, tramp_offset, branch_offset + 4)
        blob.extend(struct.pack(f'<{len(tramp_code)}I', *tramp_code))
        # Redirect the original conditional branch into the trampoline.
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))
        patches.append({
            'idx': idx,
            'addr': branch_offset,
            'cond': COND_NAMES[site['cond']],
            'body_len': len(site['body']),
            'trampoline': tramp_offset,
        })

    with open(outpath, 'wb') as f:
        f.write(blob)

    print(f'Patched {len(patches)}/{total_sites} poll sites (full-body trampolines):')
    print(f' Timeout: {TIMEOUT_ITERATIONS} iter (~{TIMEOUT_ITERATIONS*10/1800:.0f}us @1.8GHz)')
    for p in patches:
        print(f' site {p["idx"]:2d}: 0x{p["addr"]:05x} B.{p["cond"]:<2s} body={p["body_len"]} → tramp@0x{p["trampoline"]:05x}')
    patched_idx = {p['idx'] for p in patches}
    unpatched = [i for i in range(total_sites) if i not in patched_idx]
    if unpatched:
        print(f' UNPATCHED (stock): {unpatched}')
    # The old and new sizes were previously printed with nothing between them.
    print(f'\nBinary: {orig_size} → {len(blob)} bytes (+{len(blob)-orig_size})')
    print(f'Output: {outpath}')
    return len(patches), len(blob)
def main():
    """Command-line entry point: parse arguments, select sites, patch the blob.

    When no output path is given, the default is derived from the input path:
    a trailing ``.bin`` becomes ``_v3.bin``, otherwise ``_v3.bin`` is appended.
    The default therefore never equals the input path, so the input blob is
    not overwritten by accident.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--sites', default='all',
                    help="Sites to patch: 'all', 'early', 'mid', 'late', 'none',"
                         " or comma/hyphen list (e.g. '0,3,5-7'). Default: all.")
    ap.add_argument('input', help='Input DDR blob')
    ap.add_argument('output', nargs='?',
                    help='Output path (default: <input>_v3.bin)')
    args = ap.parse_args()

    if args.output:
        outpath = args.output
    elif args.input.endswith('.bin'):
        # Only rewrite the final extension; str.replace would also touch
        # '.bin' occurring earlier in the path (e.g. in a directory name).
        outpath = args.input[:-len('.bin')] + '_v3.bin'
    else:
        outpath = args.input + '_v3.bin'

    # Scan once up front so the site spec can be bounded by the real site count.
    with open(args.input, 'rb') as f:
        all_sites = find_poll_loops(bytearray(f.read()))
    max_idx = len(all_sites) - 1 if all_sites else 0
    indices = parse_sites(args.sites, max_idx)
    patch_blob(args.input, outpath, indices)
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()