694be88964
Root cause of counted_v2 brick identified:
v2 copied only ONE non-load body instruction into each trampoline (picks
the first after the LDR). For poll patterns of form
LDR Wx, [Xbase, #off]
AND Wx, Wx, #mask ; no flag update
CMP Wx, #expected ; sets flags
B.cond .retry
— 9 of the 16 sites in v1.19 have this shape — the final CMP was silently
dropped. The trampoline's B.inv_cond tested whatever flags happened to be
set before entry, producing effectively random branch decisions once
under the trampoline. Result: boot crashes before the UART banner,
observed as 'power LED off' brick.
Fix in v3: copy the ENTIRE loop body (LDR + all intermediate instructions,
in original order) into each trampoline. Size is now 4*(N+6) where N is
body length (32 bytes for body=2, 36 for body=3).
Also in v3:
- --sites subset flag for bisection (all/early/mid/late/none/index list)
- decode_sites.py helper that tries to identify which MMIO register each
site polls (best effort — the materialized_base scanner is naive and
picks up stale MOVZ targets, but cluster grouping by blob offset is
reliable and sufficient for bisection)
Site clusters in v1.19:
0..7 early (0x07b78..0x07f08): SGRF + PHY firmware state machine
8..10 mid (0x09124..0x0aaf8): DfiStatus / training start
11..15 late (0x0d154..0x0d378): UctWriteProt / CalBusy / late
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
255 lines
8.4 KiB
Python
Executable File
255 lines
8.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""RK3588 DDR Blob Patcher v3 — counted-loop trampolines with SITE SUBSET.
|
|
|
|
v2 BUG identified 2026-04-15: the original patcher copied at most ONE
|
|
body instruction beyond the LDR into the trampoline. For poll patterns of
|
|
form `LDR; AND; CMP; B.cond` (5 of the 16 sites), the CMP was dropped.
|
|
The trampoline's `B.inv_cond` then tested stale flags → random branch →
|
|
brick.
|
|
|
|
v3 fix: copy the **full** original loop body (LDR + every intermediate
|
|
instruction, in original order) into the trampoline, so the last
|
|
flag-setting instruction matches the original loop.
|
|
|
|
New trampoline layout (N = len(body) — usually 2 or 3):
|
|
+0: MOV W16, #TIMEOUT
|
|
+4 .. +4N: <body[0] .. body[N-1]> ; LDR + all test instructions
|
|
+4N+4: B.inv_cond .done
|
|
+4N+8: SUBS W16, W16, #1
|
|
+4N+12: B.NE .retry (back to +4)
|
|
+4N+16: B return_addr ; timeout path
|
|
.done:
|
|
+4N+20: B return_addr ; success path
|
|
|
|
Size per trampoline: 4 * (N + 6) bytes. With max body of 3 instructions
|
|
that's 4*9 = 36 bytes (v2 used 32); 2-inst bodies take 32 bytes.
|
|
|
|
Site subset flags:
|
|
--sites all | early | mid | late | none
|
|
--sites 0,3,5-7 # explicit index list
|
|
|
|
Site index is stable (determined by find_poll_loops' ascending-offset
|
|
ordering). As of rk3588_ddr_v1.19:
|
|
0..7 EARLY cluster (0x07b78..0x07f08) — SGRF + PHY firmware fsm
|
|
8..10 MID cluster (0x09124..0x0aaf8) — DfiStatus / training start
|
|
11..15 LATE cluster (0x0d154..0x0d378) — UctWriteProt / CalBusy / late
|
|
"""
|
|
|
|
import struct
|
|
import sys
|
|
import argparse
|
|
|
|
|
|
# Value loaded into the counter register at the top of every trampoline;
# the trampoline re-runs the poll body at most this many times.
TIMEOUT_ITERATIONS = 1 << 14

# Register number used as the trampoline's down-counter (W16).
COUNTER_REG = 16

# Mnemonic for each 4-bit condition code, indexed by the code's value.
COND_NAMES = 'EQ NE CS CC MI PL VS VC HI LS GE LT GT LE AL NV'.split()

# Named groups of site indices accepted by --sites; each entry is a
# half-open [first, last) index range expanded into an explicit list.
CLUSTERS = {
    name: list(range(first, last))
    for name, first, last in (
        ('early', 0, 8),
        ('mid', 8, 11),
        ('late', 11, 16),
        ('all', 0, 16),
        ('none', 0, 0),
    )
}
|
|
|
|
|
|
def encode_movz(rd, imm16, shift=0):
    """Encode ``MOVZ W<rd>, #imm16, LSL #shift`` (32-bit form).

    Args:
        rd: destination register number, 0..31.
        imm16: unsigned 16-bit immediate.
        shift: left shift applied to the immediate; 0 or 16.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: if a field does not fit its slot. Without this check an
            oversized immediate would spill into the shift/opcode bits and
            silently encode a different instruction.
    """
    if not 0 <= rd <= 31:
        raise ValueError(f'register out of range: {rd}')
    if not 0 <= imm16 <= 0xFFFF:
        raise ValueError(f'imm16 out of range: {imm16:#x}')
    if shift not in (0, 16):
        raise ValueError(f'shift must be 0 or 16 for a W register: {shift}')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd
|
|
|
|
def encode_subs_imm(rd, rn, imm12):
    """Encode ``SUBS W<rd>, W<rn>, #imm12`` and return the instruction word.

    The three operands are OR-ed into the base opcode at bit positions
    10 (imm12), 5 (rn) and 0 (rd); no range checking is performed.
    """
    word = 0x71000000
    for value, lsb in ((imm12, 10), (rn, 5), (rd, 0)):
        word |= value << lsb
    return word
|
|
|
|
def encode_b(from_offset, to_offset):
    """Encode an unconditional ``B`` from one blob offset to another.

    Args:
        from_offset: byte offset of the branch instruction itself.
        to_offset: byte offset of the branch target.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: if the displacement is not a multiple of 4 or does not
            fit the signed 26-bit word-offset field. Masking such a value
            would silently produce a branch to the wrong place.
    """
    byte_delta = to_offset - from_offset
    if byte_delta % 4:
        raise ValueError(
            f'misaligned branch: {from_offset:#x} -> {to_offset:#x}')
    delta = byte_delta // 4
    if not -(1 << 25) <= delta < (1 << 25):
        raise ValueError(
            f'branch out of range: {from_offset:#x} -> {to_offset:#x}')
    # Two's-complement truncation handles negative displacements.
    return 0x14000000 | (delta & 0x3FFFFFF)
|
|
|
|
def encode_bcond(cond, from_offset, to_offset):
    """Encode a conditional ``B.<cond>`` from one blob offset to another.

    Args:
        cond: 4-bit condition code (0..15).
        from_offset: byte offset of the branch instruction itself.
        to_offset: byte offset of the branch target.

    Returns:
        The 32-bit instruction word as an int.

    Raises:
        ValueError: if ``cond`` is not a 4-bit value, or the displacement is
            misaligned or does not fit the signed 19-bit word-offset field
            (masking it would flip the branch direction silently).
    """
    if not 0 <= cond <= 0xF:
        raise ValueError(f'condition code out of range: {cond}')
    byte_delta = to_offset - from_offset
    if byte_delta % 4:
        raise ValueError(
            f'misaligned branch: {from_offset:#x} -> {to_offset:#x}')
    delta = byte_delta // 4
    if not -(1 << 18) <= delta < (1 << 18):
        raise ValueError(
            f'conditional branch out of range: '
            f'{from_offset:#x} -> {to_offset:#x}')
    imm19 = delta & 0x7FFFF
    return 0x54000000 | (imm19 << 5) | cond
|
|
|
|
def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` between two blob offsets (condition code 1)."""
    ne = 1  # position of 'NE' in COND_NAMES
    return encode_bcond(cond=ne, from_offset=from_offset, to_offset=to_offset)
|
|
|
|
def invert_cond(cond):
    """Return the condition code paired with ``cond`` by flipping its low bit.

    Codes come in adjacent even/odd pairs (EQ/NE, CS/CC, ... in COND_NAMES
    order), so an odd code maps to the even one below it and vice versa.
    """
    return cond - 1 if cond & 1 else cond + 1
|
|
|
|
|
|
def find_poll_loops(blob):
    """Find tight backward-branch poll loops: ``<body with an LDR>; B.cond back``.

    Scans ``blob`` one 32-bit little-endian word at a time for a ``B.cond``
    whose target lies 1 to 4 instructions before it, and keeps the match only
    if at least one word of the loop body is an LDR (unsigned-offset form,
    W or X destination). The final 12 bytes of the blob are not scanned.

    Args:
        blob: bytes-like object holding the raw binary.

    Returns:
        A list of dicts in ascending offset order, each with:
            'idx'           - position in the returned list,
            'branch_offset' - byte offset of the B.cond,
            'cond'          - 4-bit condition code of the B.cond,
            'body'          - instruction words from the branch target up to
                              (but excluding) the branch.
    """
    sites = []
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]
        # B.cond has 0x54 in the top byte and bit 4 clear.
        if (inst & 0xFF000010) != 0x54000000:
            continue
        imm19 = (inst >> 5) & 0x7FFFF
        if not (imm19 & 0x40000):
            continue  # sign bit clear: forward branch
        offset = (imm19 - 0x80000) * 4  # sign-extend, words -> bytes
        if not (-16 <= offset <= -4):
            continue

        loop_start = i + offset
        if loop_start < 0:
            # The target lies before the start of the blob. A negative offset
            # would make struct.unpack_from index from the END of the buffer
            # and fabricate a body out of unrelated trailing bytes.
            continue

        # Body = every instruction from loop_start up to (but excluding)
        # the branch.
        body = [struct.unpack_from('<I', blob, j)[0]
                for j in range(loop_start, i, 4)]

        # The mask clears bit 30 (the low size bit), so one comparison
        # accepts both the 32-bit and the 64-bit LDR unsigned-offset form.
        if not any((w & 0xBFC00000) == 0xB9400000 for w in body):
            continue  # require at least one LDR — filters out pure cmp/b.ne spin loops

        sites.append({
            'idx': len(sites),
            'branch_offset': i,
            'cond': inst & 0xF,
            'body': body,
        })
    return sites
|
|
|
|
|
|
def build_trampoline(site, tramp_offset, return_offset):
    """Assemble the counted-retry trampoline for one poll site.

    Layout, in instruction slots relative to ``tramp_offset`` (N = body length):

        0           MOVZ  W<counter>, #TIMEOUT_ITERATIONS
        1 .. N      the site's original loop body, unchanged and in order
        N+1         B.<inverted cond>  -> slot N+5 (.done)
        N+2         SUBS  W<counter>, W<counter>, #1
        N+3         B.NE  -> slot 1 (retry the body)
        N+4         B     return_offset      ; counter exhausted
        N+5 (.done) B     return_offset      ; poll condition satisfied

    Args:
        site: dict with at least 'body' (list of instruction words) and
            'cond' (condition code of the original loop branch).
        tramp_offset: blob offset at which the trampoline will be placed.
        return_offset: blob offset both exit branches jump to.

    Returns:
        List of N + 6 instruction words.
    """
    loop_body = site['body']
    n = len(loop_body)

    def slot(k):
        # Blob offset of the k-th instruction of this trampoline.
        return tramp_offset + 4 * k

    done = slot(n + 5)
    exit_cond = invert_cond(site['cond'])

    return [
        encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS),
        *loop_body,
        encode_bcond(exit_cond, slot(n + 1), done),
        encode_subs_imm(COUNTER_REG, COUNTER_REG, 1),
        encode_bne(slot(n + 3), slot(1)),
        encode_b(slot(n + 4), return_offset),
        encode_b(done, return_offset),
    ]
|
|
|
|
|
|
def parse_sites(spec, max_index):
    """Expand a ``--sites`` specification into a sorted list of site indices.

    ``spec`` is a comma-separated list whose items may each be a cluster
    name (a key of CLUSTERS), an inclusive range ``a-b``, or a single index.
    Empty items and surrounding whitespace are ignored.

    Args:
        spec: the specification string, e.g. ``'all'`` or ``'0,3,5-7'``.
        max_index: highest valid site index; anything outside
            ``0..max_index`` is dropped.

    Returns:
        A new sorted list of unique indices. The result is always a fresh
        list, so callers can never mutate the shared CLUSTERS entries, and
        cluster names are clamped to ``max_index`` like every other item.

    Raises:
        ValueError: if an item is neither a cluster name nor an integer/range.
    """
    result = set()
    for part in spec.split(','):
        part = part.strip()
        if not part:
            continue
        if part in CLUSTERS:
            result.update(CLUSTERS[part])
        elif '-' in part:
            a, b = part.split('-', 1)
            result.update(range(int(a), int(b) + 1))
        else:
            result.add(int(part))
    return sorted(i for i in result if 0 <= i <= max_index)
|
|
|
|
|
|
def patch_blob(inpath, outpath, site_indices):
    """Redirect selected poll loops of a blob through counted trampolines.

    Reads ``inpath``, locates poll loops with find_poll_loops(), appends one
    trampoline per selected site to the end of the image, and overwrites each
    site's loop branch with an unconditional ``B`` to its trampoline. Both
    trampoline exits return to the instruction after the original branch.

    Args:
        inpath: path of the blob to read.
        outpath: path the (possibly patched) blob is written to.
        site_indices: iterable of site indices to patch; out-of-range values
            and duplicates are ignored.

    Returns:
        ``(patched_site_count, output_size_in_bytes)``.

    Note:
        When no poll loops are found at all, a message goes to stderr and
        nothing is written to ``outpath``.
    """
    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())

    orig_size = len(blob)
    all_sites = find_poll_loops(blob)
    if not all_sites:
        print('No poll loops found!', file=sys.stderr)
        return 0, len(blob)

    total_sites = len(all_sites)
    # dict.fromkeys de-duplicates while keeping the caller's order; the lower
    # bound stops a negative value from indexing all_sites from the end.
    site_indices = [i for i in dict.fromkeys(site_indices)
                    if 0 <= i < total_sites]
    if not site_indices:
        print('No sites selected — writing unmodified blob.')
        with open(outpath, 'wb') as f:
            f.write(blob)
        return 0, len(blob)

    # Instructions must sit on a 4-byte boundary and the branch encoders
    # work in whole words, so pad before appending any trampoline.
    blob.extend(b'\x00' * (-len(blob) % 4))

    patches = []
    for idx in site_indices:
        site = all_sites[idx]
        branch_offset = site['branch_offset']
        # Each trampoline goes at the current end of the image; its size is
        # whatever build_trampoline emits, so no size formula is repeated here.
        tramp_offset = len(blob)
        tramp_code = build_trampoline(site, tramp_offset, branch_offset + 4)
        blob.extend(struct.pack(f'<{len(tramp_code)}I', *tramp_code))
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))
        patches.append({
            'idx': idx,
            'addr': branch_offset,
            'cond': COND_NAMES[site['cond']],
            'body_len': len(site['body']),
            'trampoline': tramp_offset,
        })

    with open(outpath, 'wb') as f:
        f.write(blob)

    print(f'Patched {len(patches)}/{total_sites} poll sites (full-body trampolines):')
    print(f'  Timeout: {TIMEOUT_ITERATIONS} iter (~{TIMEOUT_ITERATIONS*10/1800:.0f}us @1.8GHz)')
    for p in patches:
        print(f'  site {p["idx"]:2d}: 0x{p["addr"]:05x}  B.{p["cond"]:<2s}  body={p["body_len"]}  → tramp@0x{p["trampoline"]:05x}')
    patched = {p['idx'] for p in patches}
    unpatched = [i for i in range(total_sites) if i not in patched]
    if unpatched:
        print(f'  UNPATCHED (stock): {unpatched}')
    print(f'\nBinary: {orig_size} → {len(blob)} bytes (+{len(blob)-orig_size})')
    print(f'Output: {outpath}')
    return len(patches), len(blob)
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, select sites, patch the blob.

    The default output path inserts ``_v3`` before a trailing ``.bin``; when
    the input name has no ``.bin`` suffix, ``_v3.bin`` is appended instead.
    Either way the default can never equal the input path, so the original
    blob is not overwritten by accident.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--sites', default='all',
                    help="Sites to patch: 'all', 'early', 'mid', 'late', 'none',"
                         " or comma/hyphen list (e.g. '0,3,5-7'). Default: all.")
    ap.add_argument('input', help='Input DDR blob')
    ap.add_argument('output', nargs='?',
                    help='Output path (default: <input>_v3.bin)')
    args = ap.parse_args()

    suffix = '.bin'
    if args.output:
        outpath = args.output
    elif args.input.endswith(suffix):
        # Only the trailing suffix is rewritten; '.bin' occurring elsewhere
        # in the path (e.g. in a directory name) is left untouched.
        outpath = args.input[:-len(suffix)] + '_v3' + suffix
    else:
        outpath = args.input + '_v3' + suffix

    # Pre-scan the blob so index lists can be clamped to the sites that
    # actually exist in it.
    with open(args.input, 'rb') as f:
        all_sites = find_poll_loops(bytearray(f.read()))
    max_idx = len(all_sites) - 1 if all_sites else 0
    indices = parse_sites(args.sites, max_idx)

    patch_blob(args.input, outpath, indices)


if __name__ == '__main__':
    main()
|