#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v2 — converts infinite poll loops to counted loops.

v1 (WRONG): NOP'd the backward branch, removing the poll entirely.
v2 (CORRECT): Replaces infinite polls with bounded retry loops.

Strategy: Each tight poll loop (LDR + optional test + B.cond back) is
patched to jump to a trampoline in unused space at the end of the binary.
The trampoline counts iterations and returns to the instruction after the
original branch when either:
  - The poll condition is satisfied (normal path), OR
  - The retry count expires (timeout path — returns with condition NOT met,
    letting the existing error handling deal with it)

Trampoline structure (per patch site, 8 instructions = 32 bytes):
    ; entered from original B.cond site via unconditional B
    MOV   W16, #0x4000          ; ~16384 iterations ≈ ~100us at 1.8GHz
  .retry:
    LDR   Wn, [Xbase, #off]     ; re-execute the original load
    <test | NOP>                ; re-execute the original test (if any)
    B.<inverted cond> .done     ; poll condition met? exit loop
    SUBS  W16, W16, #1          ; decrement counter
    B.NE  .retry                ; keep trying if counter > 0
    B     return_addr           ; timeout: return with condition NOT met
  .done:
    B     return_addr           ; success: jump back after original branch site

W16 (IP0) is used as the counter — it's a scratch register in the AAPCS64
calling convention and not used by any of the poll loop bodies (which use
W0-W4 for register reads and X8-X12 for base addresses).
"""

import struct
import os
import sys


# AArch64 instruction encoding helpers
#
# Every encoder validates its operands before packing them. The encodings are
# OR-ed together from shifted fields, so an operand that is too wide would
# otherwise spill into a neighbouring field and yield a different, still
# well-formed instruction — a silent corruption of the patched blob.

def _require_field(name, value, width):
    """Raise ValueError unless *value* fits an unsigned field of *width* bits."""
    if not 0 <= value < (1 << width):
        raise ValueError(f'{name}={value!r} does not fit in {width} bits')


def _branch_field(from_offset, to_offset, width):
    """Return the two's-complement word displacement for a PC-relative branch.

    Args:
        from_offset: Byte offset of the branch instruction itself.
        to_offset: Byte offset of the branch target.
        width: Size in bits of the signed immediate field (26 for B,
            19 for B.cond).

    Raises:
        ValueError: If the displacement is not a multiple of 4 bytes or does
            not fit in a signed *width*-bit word count.
    """
    delta = to_offset - from_offset
    if delta % 4:
        # Floor division would silently round a misaligned target down.
        raise ValueError(
            f'branch displacement {delta} is not a multiple of 4 bytes')
    words = delta // 4
    half = 1 << (width - 1)
    if not -half <= words < half:
        # Masking an out-of-range value would wrap it to an unrelated target.
        raise ValueError(
            f'branch displacement {delta} bytes exceeds signed '
            f'{width}-bit word range')
    return words & ((1 << width) - 1)


def encode_movz(rd, imm16, shift=0):
    """Encode ``MOVZ Wd, #imm16, LSL #shift`` (32-bit variant).

    Args:
        rd: Destination register number, 0-31.
        imm16: Unsigned 16-bit immediate.
        shift: Left shift in bits; the 32-bit form only allows 0 or 16.

    Returns:
        The 32-bit instruction word.

    Raises:
        ValueError: If any operand is outside its encodable range.
    """
    _require_field('rd', rd, 5)
    _require_field('imm16', imm16, 16)
    if shift not in (0, 16):
        raise ValueError(f'shift={shift!r} must be 0 or 16 for a W register')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd


def encode_subs_imm(rd, rn, imm12):
    """Encode ``SUBS Wd, Wn, #imm12`` (32-bit, unshifted immediate).

    Args:
        rd: Destination register number, 0-31.
        rn: Source register number, 0-31.
        imm12: Unsigned 12-bit immediate.

    Returns:
        The 32-bit instruction word.

    Raises:
        ValueError: If any operand is outside its encodable range.
    """
    _require_field('rd', rd, 5)
    _require_field('rn', rn, 5)
    _require_field('imm12', imm12, 12)
    return 0x71000000 | (imm12 << 10) | (rn << 5) | rd


def encode_b(from_offset, to_offset):
    """Encode an unconditional PC-relative ``B`` (signed imm26, in words).

    Args:
        from_offset: Byte offset of the branch instruction.
        to_offset: Byte offset of the branch target.

    Returns:
        The 32-bit instruction word.

    Raises:
        ValueError: If the target is misaligned or out of range.
    """
    return 0x14000000 | _branch_field(from_offset, to_offset, 26)


def encode_bcond(cond, from_offset, to_offset):
    """Encode a PC-relative ``B.cond`` (signed imm19, in words).

    Args:
        cond: 4-bit condition code (index into COND_NAMES).
        from_offset: Byte offset of the branch instruction.
        to_offset: Byte offset of the branch target.

    Returns:
        The 32-bit instruction word.

    Raises:
        ValueError: If *cond* is not 0-15, or the target is misaligned or
            out of range.
    """
    _require_field('cond', cond, 4)
    imm19 = _branch_field(from_offset, to_offset, 19)
    return 0x54000000 | (imm19 << 5) | cond


def encode_bne(from_offset, to_offset):
    """Encode ``B.NE`` (cond=1) from *from_offset* to *to_offset*."""
    return encode_bcond(1, from_offset, to_offset)


def invert_cond(cond):
    """Return the logical inverse of an AArch64 condition code.

    Conditions come in pairs that differ only in bit 0 (EQ/NE, CS/CC, ...),
    so flipping bit 0 inverts them. AL (14) and NV (15) are the exception:
    both execute unconditionally, so neither has an inverse.

    Raises:
        ValueError: If *cond* is AL, NV, or not a 4-bit condition code.
    """
    _require_field('cond', cond, 4)
    if cond >= 14:
        raise ValueError(f'condition {COND_NAMES[cond]} has no inverse')
    return cond ^ 1


COND_NAMES = ['EQ', 'NE', 'CS', 'CC', 'MI', 'PL', 'VS', 'VC',
              'HI', 'LS', 'GE', 'LT', 'GT', 'LE', 'AL', 'NV']

# Timeout: ~16384 iterations. At ~10 cycles/iteration on a 1.8GHz core,
# this is about 90 microseconds — generous for any DDR PHY operation
# (ZQ cal is typ 1us, max ~10us; DFI handshake is sub-microsecond).
+TIMEOUT_ITERATIONS = 0x4000 + +# Counter register: W16 (IP0, caller-saved scratch in AAPCS64) +COUNTER_REG = 16 + + +def find_poll_loops(blob): + """Find tight backward-branch poll loops containing a load instruction.""" + sites = [] + for i in range(0, len(blob) - 12, 4): + inst = struct.unpack_from('> 5) & 0x7FFFF + if not (imm19 & 0x40000): + continue # forward branch, skip + offset = -((~imm19 & 0x7FFFF) + 1) * 4 + if not (-16 <= offset <= -4): + continue # not a tight loop + + loop_start = i + offset + cond = inst & 0xF + + # Collect loop body instructions + body = [] + load_inst = None + load_offset = None + for j in range(loop_start, i, 4): + w = struct.unpack_from('= len(blob) - 4096: + w = struct.unpack_from('= needed_bytes: + return free_start + + # Fallback: append space to the binary + # The DDR blob is loaded to SRAM at a fixed size. Extra bytes at the + # end won't be executed unless jumped to (which is what we do). + # However, the loader might check size. Pad conservatively. + return len(blob) # caller will extend the blob + + +def build_trampoline(site, tramp_offset, return_offset): + """Build a counted-loop trampoline for one poll site. + + Layout (6 instructions, 24 bytes): + +0: MOV W16, #TIMEOUT + +4: LDR Wn, [Xbase, #off] ; copy of original load + +8: ; copy of test (if body has one) + or NOP ; if no separate test (load sets flags) + +12: B.inv_cond .done (+12) ; condition MET → exit (inverted!) + +16: SUBS W16, W16, #1 ; decrement counter + +20: B.NE .retry (-16, back to +4) + ; fall through = timeout, condition NOT met + +24: B return_addr ; return (will hit error path) + .done: + +28: B return_addr ; return (success) + + Total: 8 instructions = 32 bytes (safe margin) + """ + code = [] + + # +0: MOV W16, #TIMEOUT + code.append(encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)) + + # +4: Copy the original load instruction + code.append(site['load_inst']) + + # +8: Copy any non-load body instructions (test/TST/AND/etc.) 
+ # If the loop body has instructions between the load and the branch + # that aren't the load itself, copy them as the test. + test_inst = None + for off, w in site['body']: + if off != site['load_offset']: + test_inst = w + break + if test_inst is not None: + code.append(test_inst) + else: + code.append(0xD503201F) # NOP (load itself sets condition flags) + + # +12: B.inv_cond → .done (jump forward to success exit) + # The original loop branches BACK when condition is true (still polling). + # We invert: if the INVERTED condition is true, polling is DONE. + inv_cond = invert_cond(site['cond']) + done_offset = tramp_offset + 28 # .done is at +28 + code.append(encode_bcond(inv_cond, tramp_offset + 12, done_offset)) + + # +16: SUBS W16, W16, #1 + code.append(encode_subs_imm(COUNTER_REG, COUNTER_REG, 1)) + + # +20: B.NE → .retry (back to +4) + retry_offset = tramp_offset + 4 + code.append(encode_bne(tramp_offset + 20, retry_offset)) + + # +24: B return_addr (timeout — condition NOT met, error path) + code.append(encode_b(tramp_offset + 24, return_offset)) + + # +28: B return_addr (success — condition met) + code.append(encode_b(tramp_offset + 28, return_offset)) + + return code + def patch_blob(inpath, outpath): with open(inpath, 'rb') as f: blob = bytearray(f.read()) - - patched = 0 + + orig_size = len(blob) + sites = find_poll_loops(blob) + + if not sites: + print("No poll loops found!") + return 0, len(blob) + + # Each trampoline needs 32 bytes + tramp_total = len(sites) * 32 + tramp_start = find_trampoline_space(blob, tramp_total) + + if tramp_start >= len(blob): + # Extend the binary + blob.extend(b'\x00' * tramp_total) + patches = [] - NOP = 0xD503201F - - for i in range(0, len(blob) - 12, 4): - inst = struct.unpack_from('> 5) & 0x7FFFF - if imm19 & 0x40000: - offset = -((~imm19 & 0x7FFFF) + 1) * 4 - if -16 <= offset <= -4: - loop_start = i + offset - has_load = False - for j in range(loop_start, i, 4): - w = struct.unpack_from(' NOP') - - return patched, 
len(blob) -infile = '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin' -outfile = '/opt/work/rk3588_ddr_v1.19_patched.bin' -n, size = patch_blob(infile, outfile) -orig_size = os.path.getsize(infile) -print(f'\nOriginal: {orig_size}, Patched: {size} ({"MATCH" if orig_size == size else "MISMATCH!"})') + print(f'Patched {len(patches)} poll loops with counted-loop trampolines:') + print(f' Timeout: {TIMEOUT_ITERATIONS} iterations (~{TIMEOUT_ITERATIONS * 10 / 1800:.0f}us at 1.8GHz)') + print(f' Counter register: W{COUNTER_REG} (IP0, caller-saved)') + print(f' Trampoline area: 0x{tramp_start:05x} - 0x{tramp_offset:05x} ({tramp_offset - tramp_start} bytes)') + print() + for p in patches: + print(f' 0x{p["addr"]:05x}: B.{p["cond"]} {p["loop_offset"]} -> B trampoline@0x{p["trampoline"]:05x}') + + if len(blob) != orig_size: + print(f'\nWARNING: Binary grew from {orig_size} to {len(blob)} bytes (+{len(blob) - orig_size})') + print(f' Verify the DDR blob loader accepts the larger size.') + else: + print(f'\nBinary size unchanged: {len(blob)} bytes') + + return len(patches), len(blob) + + +if __name__ == '__main__': + infile = sys.argv[1] if len(sys.argv) > 1 else \ + '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin' + outfile = sys.argv[2] if len(sys.argv) > 2 else \ + infile.replace('.bin', '_patched_v2.bin') + + n, size = patch_blob(infile, outfile) diff --git a/rk3588_ddr_v1.19_counted_v2.bin b/rk3588_ddr_v1.19_counted_v2.bin new file mode 100644 index 0000000..9fc8380 Binary files /dev/null and b/rk3588_ddr_v1.19_counted_v2.bin differ