#!/usr/bin/env python3
"""RK3588 DDR Blob Patcher v2 — converts infinite poll loops to counted loops.

v1 (WRONG):   NOP'd the backward branch, removing the poll entirely.
v2 (CORRECT): Replaces infinite polls with bounded retry loops.

Strategy:
  Each tight poll loop (LDR + optional test + B.cond back) is patched to
  jump to a trampoline in unused space at the end of the binary. The
  trampoline counts iterations and returns to the instruction after the
  original branch when either:
    - The poll condition is satisfied (normal path), OR
    - The retry count expires (timeout path — returns with condition NOT
      met, letting the existing error handling deal with it)

Trampoline structure (schematic; every patch site gets one fixed 32-byte
slot of 8 instructions — see build_trampoline for the exact layout):

    ; entered from the original B.cond site via an unconditional B
    MOV   W16, #0x4000          ; 16384 iterations
  .retry:
    LDR   Wn, [Xbase, #off]     ; re-execute the original load
    <test instruction>          ; re-execute the original test (if any)
    B.<inverted cond> .done     ; original loop-back condition false -> done
    SUBS  W16, W16, #1          ; decrement counter
    B.NE  .retry                ; keep trying while counter != 0
  .done:
    B     return_addr           ; jump back after the original branch site

W16 (IP0) is used as the counter — it's a scratch register in the AAPCS64
calling convention and not used by any of the poll loop bodies (which use
W0-W4 for register reads and X8-X12 for base addresses).
"""

import struct
import os
import sys


# AArch64 instruction encoding helpers

def _require_field(name, value, bits):
    """Raise ValueError unless *value* fits an unsigned field of *bits* bits."""
    if not 0 <= value < (1 << bits):
        raise ValueError(
            f'{name}={value!r} does not fit in a {bits}-bit field')


def _branch_imm(from_offset, to_offset, bits):
    """Return the two's-complement word displacement for a PC-relative branch.

    Raises ValueError when the displacement is not a multiple of 4 bytes or
    does not fit a signed field of *bits* bits; without this check an
    out-of-range branch would silently wrap to a different target.
    """
    distance = to_offset - from_offset
    if distance % 4:
        raise ValueError(
            f'branch 0x{from_offset:x} -> 0x{to_offset:x} is not word-aligned')
    delta = distance // 4
    limit = 1 << (bits - 1)
    if not -limit <= delta < limit:
        raise ValueError(
            f'branch 0x{from_offset:x} -> 0x{to_offset:x} is out of range '
            f'for a {bits}-bit displacement')
    return delta & ((1 << bits) - 1)


def encode_movz(rd, imm16, shift=0):
    """MOVZ Wd, #imm16, LSL #shift

    *shift* must be 0 or 16 (the only values the 32-bit form encodes).
    Raises ValueError for any operand that does not fit its field.
    """
    _require_field('rd', rd, 5)
    _require_field('imm16', imm16, 16)
    if shift not in (0, 16):
        raise ValueError(f'shift={shift!r} must be 0 or 16 for a W register')
    hw = shift // 16
    return 0x52800000 | (hw << 21) | (imm16 << 5) | rd


def encode_subs_imm(rd, rn, imm12):
    """SUBS Wd, Wn, #imm12

    Raises ValueError for any operand that does not fit its field.
    """
    _require_field('rd', rd, 5)
    _require_field('rn', rn, 5)
    _require_field('imm12', imm12, 12)
    return 0x71000000 | (imm12 << 10) | (rn << 5) | rd


def encode_b(from_offset, to_offset):
    """B (unconditional branch), PC-relative

    Both offsets are byte offsets in the same image. Raises ValueError if
    the displacement is misaligned or does not fit the 26-bit field.
    """
    return 0x14000000 | _branch_imm(from_offset, to_offset, 26)


def encode_bcond(cond, from_offset, to_offset):
    """B.cond, PC-relative

    *cond* is the 4-bit condition number (index into COND_NAMES). Raises
    ValueError if *cond* is not 0..15 or the displacement is misaligned or
    does not fit the 19-bit field.
    """
    _require_field('cond', cond, 4)
    return 0x54000000 | (_branch_imm(from_offset, to_offset, 19) << 5) | cond


def encode_bne(from_offset, to_offset):
    """B.NE (cond=1)"""
    return encode_bcond(1, from_offset, to_offset)


def invert_cond(cond):
    """Invert AArch64 condition code (flip bit 0)

    NOTE: this only yields the logical inverse for codes 0..13. AL (14)
    maps to NV (15), which is also "always" in A64, so callers must not
    rely on this to invert an unconditional branch.
    """
    return cond ^ 1


COND_NAMES = ['EQ', 'NE', 'CS', 'CC', 'MI', 'PL', 'VS', 'VC',
              'HI', 'LS', 'GE', 'LT', 'GT', 'LE', 'AL', 'NV']

# Timeout: ~16384 iterations. At ~10 cycles/iteration on a 1.8GHz core,
# this is about 90 microseconds — generous for any DDR PHY operation
# (ZQ cal is typ 1us, max ~10us; DFI handshake is sub-microsecond).
TIMEOUT_ITERATIONS = 0x4000

# Counter register: W16 (IP0, caller-saved scratch in AAPCS64)
COUNTER_REG = 16

# Every trampoline occupies one fixed-size slot.
TRAMPOLINE_WORDS = 8
TRAMPOLINE_BYTES = TRAMPOLINE_WORDS * 4

# A slot holds MOVZ + body + B.cond + SUBS + B.NE + at least one exit B,
# which leaves room for at most this many copied loop-body instructions.
MAX_BODY_WORDS = TRAMPOLINE_WORDS - 5

# A64 NOP, used to pad a one-instruction loop body.
NOP = 0xD503201F

# Only the tail of the blob is searched for unused (all-zero) space.
FREE_SPACE_WINDOW = 4096

# Input used when no path is given on the command line.
DEFAULT_INPUT = \
    '/opt/rkbin/bin/rk35/rk3588_ddr_lp4_2112MHz_lp5_2400MHz_v1.19.bin'


def _is_register_load(word):
    """Return True for LDR Wt/Xt, [Xn, #imm] (unsigned-offset form).

    NOTE(review): the original load-matching test was lost from this file;
    this matches only the LDR form named in the module docstring. Confirm
    whether LDRB/LDRH/LDUR polls must be recognised as well.
    """
    return (word & 0xBFC00000) == 0xB9400000


def find_poll_loops(blob):
    """Find tight backward-branch poll loops containing a load instruction.

    Scans *blob* word by word for a B.cond that jumps back 1..4
    instructions and whose loop body contains a load.

    Returns a list of dicts, one per site, with keys:
      branch_offset  byte offset of the B.cond
      loop_start     byte offset of the first body instruction
      cond           4-bit condition number of the B.cond
      body           list of (offset, word) for every body instruction
      load_inst      the first load word found in the body
      load_offset    byte offset of that load
    """
    sites = []
    for i in range(0, len(blob) - 12, 4):
        inst = struct.unpack_from('<I', blob, i)[0]

        # B.cond: bits [31:24] == 0x54 and bit 4 clear.
        # NOTE(review): this test was reconstructed — confirm the mask.
        if (inst & 0xFF000010) != 0x54000000:
            continue

        imm19 = (inst >> 5) & 0x7FFFF
        if not (imm19 & 0x40000):
            continue  # forward branch, skip
        offset = -((~imm19 & 0x7FFFF) + 1) * 4
        if not (-16 <= offset <= -4):
            continue  # not a tight loop

        loop_start = i + offset
        if loop_start < 0:
            # The branch targets an address before the blob. A negative
            # offset would make struct.unpack_from read from the END of the
            # buffer and invent a loop body out of unrelated words.
            continue

        cond = inst & 0xF
        if cond >= 14:
            # AL/NV: an unconditional backward branch is not a poll, and
            # flipping bit 0 of AL gives NV, which is also "always".
            continue

        # Collect loop body instructions
        body = []
        load_inst = None
        load_offset = None
        for j in range(loop_start, i, 4):
            w = struct.unpack_from('<I', blob, j)[0]
            body.append((j, w))
            if load_inst is None and _is_register_load(w):
                load_inst = w
                load_offset = j

        if load_inst is None:
            continue  # no load in the body, so nothing is being polled

        sites.append({
            'branch_offset': i,
            'loop_start': loop_start,
            'cond': cond,
            'body': body,
            'load_inst': load_inst,
            'load_offset': load_offset,
        })
    return sites


def find_trampoline_space(blob, needed_bytes):
    """Return a word-aligned byte offset where trampolines can be written.

    Looks for a run of all-zero words ending at the (word-aligned) end of
    *blob*, searching no further back than FREE_SPACE_WINDOW bytes. If the
    run holds *needed_bytes*, its start offset is returned. Otherwise the
    word-aligned end of the blob is returned and the caller must extend
    the blob up to ``returned_offset + needed_bytes``.

    NOTE(review): the scan loop was reconstructed from a truncated source;
    it assumes trailing zero words are unused padding — confirm for this
    blob format.
    """
    end = len(blob) & ~3
    floor = max(0, len(blob) - FREE_SPACE_WINDOW)

    free_start = end
    while free_start - 4 >= floor:
        w = struct.unpack_from('<I', blob, free_start - 4)[0]
        if w != 0:
            break
        free_start -= 4

    if end - free_start >= needed_bytes:
        return free_start

    # Fallback: append space to the binary
    # The DDR blob is loaded to SRAM at a fixed size. Extra bytes at the
    # end won't be executed unless jumped to (which is what we do).
    # However, the loader might check size. Pad conservatively.
    # Rounded up so the appended code is always word-aligned.
    return (len(blob) + 3) & ~3  # caller will extend the blob


def build_trampoline(site, tramp_offset, return_offset):
    """Build a counted-loop trampoline for one poll site.

    The whole loop body is copied in its original order, so multi-
    instruction tests and instructions preceding the load are preserved.

    Layout (always TRAMPOLINE_WORDS = 8 instructions, 32 bytes), for a
    body of n instructions (a 1-instruction body is padded with a NOP,
    so n is 2 or 3):
      +0:        MOV W16, #TIMEOUT
      +4:        <body instruction 0>        ; .retry
      ...        <body instruction n-1>
      +4+4n:     B.inv_cond .done            ; loop-back condition false
      +8+4n:     SUBS W16, W16, #1           ; decrement counter
      +12+4n:    B.NE .retry (back to +4)
      ...        B return_addr               ; timeout exit (fills any
                                             ;   slots before .done)
      +28:       B return_addr               ; .done

    For a [load, test] or [load] body this emits the same eight words as
    the previous fixed layout.

    Parameters:
      site           a dict as produced by find_poll_loops (uses 'body'
                     and 'cond')
      tramp_offset   byte offset in the blob where the trampoline starts
      return_offset  byte offset to branch back to when the loop ends

    Returns a list of TRAMPOLINE_WORDS 32-bit instruction words.

    Raises ValueError if the body has more than MAX_BODY_WORDS
    instructions (it cannot fit the slot) or the site's condition is
    AL/NV (it has no inverse).

    NOTE(review): the trampoline re-reads the polled register before its
    first condition check, so the register is read once more than in the
    unpatched loop — confirm none of the polled registers are
    clear-on-read. Copied body instructions must not be PC-relative.
    """
    cond = site['cond']
    if cond >= 14:
        raise ValueError(f'condition {cond} (AL/NV) cannot be inverted')

    body_words = [word for _, word in site['body']]
    if len(body_words) > MAX_BODY_WORDS:
        raise ValueError(
            f'loop body of {len(body_words)} instructions does not fit a '
            f'{TRAMPOLINE_WORDS}-instruction trampoline')
    while len(body_words) < 2:
        body_words.append(NOP)

    # +0: MOV W16, #TIMEOUT, followed by the copied loop body (.retry)
    code = [encode_movz(COUNTER_REG, TIMEOUT_ITERATIONS)]
    code.extend(body_words)

    # B.inv_cond -> .done (jump forward to the success exit)
    # The original loop branches BACK when its condition is true (still
    # polling). We invert: if the INVERTED condition is true, polling is
    # DONE. .done is always the last slot.
    done_offset = tramp_offset + TRAMPOLINE_BYTES - 4
    here = tramp_offset + 4 * len(code)
    code.append(encode_bcond(invert_cond(cond), here, done_offset))

    # SUBS W16, W16, #1
    code.append(encode_subs_imm(COUNTER_REG, COUNTER_REG, 1))

    # B.NE -> .retry (back to +4)
    here = tramp_offset + 4 * len(code)
    code.append(encode_bne(here, tramp_offset + 4))

    # Fall through = timeout (condition NOT met, existing error path).
    # Every remaining slot, including .done, returns to the caller.
    while len(code) < TRAMPOLINE_WORDS:
        here = tramp_offset + 4 * len(code)
        code.append(encode_b(here, return_offset))
    return code


def patch_blob(inpath, outpath):
    """Patch every poll loop in *inpath* and write the result to *outpath*.

    Each patchable site gets a trampoline and its backward B.cond is
    replaced by an unconditional B to that trampoline. Sites whose body
    cannot fit a trampoline are reported and left untouched.

    Returns ``(number_of_patches, final_blob_size)``. Nothing is written
    when there is nothing to patch.

    NOTE(review): the branch rewrite, the output write and the summary
    print lines were reconstructed from a truncated source; confirm the
    exact report format against the intended one.
    """
    with open(inpath, 'rb') as f:
        blob = bytearray(f.read())
    orig_size = len(blob)

    sites = find_poll_loops(blob)
    if not sites:
        print("No poll loops found!")
        return 0, len(blob)

    patchable = []
    for site in sites:
        if len(site['body']) > MAX_BODY_WORDS:
            print(f'Skipping poll loop at 0x{site["branch_offset"]:05x}: '
                  f'{len(site["body"])}-instruction body does not fit a '
                  f'trampoline')
        else:
            patchable.append(site)
    if not patchable:
        return 0, len(blob)

    # Each trampoline needs one fixed-size slot
    tramp_total = len(patchable) * TRAMPOLINE_BYTES
    tramp_start = find_trampoline_space(blob, tramp_total)
    shortfall = tramp_start + tramp_total - len(blob)
    if shortfall > 0:
        # Extend the binary (also covers alignment padding before the
        # first trampoline).
        blob.extend(bytes(shortfall))

    patches = []
    tramp_offset = tramp_start
    for site in patchable:
        branch_offset = site['branch_offset']
        return_offset = branch_offset + 4  # instruction after the branch

        # Build trampoline
        tramp_code = build_trampoline(site, tramp_offset, return_offset)

        # Write trampoline to blob
        for idx, inst in enumerate(tramp_code):
            struct.pack_into('<I', blob, tramp_offset + idx * 4, inst)

        # Redirect the original backward branch into the trampoline
        struct.pack_into('<I', blob, branch_offset,
                         encode_b(branch_offset, tramp_offset))

        patches.append({
            'offset': branch_offset,
            'cond': site['cond'],
            'trampoline': tramp_offset,
        })
        tramp_offset += TRAMPOLINE_BYTES

    with open(outpath, 'wb') as f:
        f.write(blob)

    print(f'Patched {len(patches)} poll loops -> {outpath}')
    for p in patches:
        print(f'  0x{p["offset"]:05x}: B.{COND_NAMES[p["cond"]]} back'
              f' -> B trampoline@0x{p["trampoline"]:05x}')

    if len(blob) != orig_size:
        print(f'\nWARNING: Binary grew from {orig_size} to {len(blob)} bytes (+{len(blob) - orig_size})')
        print(f'  Verify the DDR blob loader accepts the larger size.')
    else:
        print(f'\nBinary size unchanged: {len(blob)} bytes')

    return len(patches), len(blob)


def default_output_path(infile):
    """Return *infile* with ``_patched_v2`` inserted before its extension.

    Only the final extension is touched, so a ``.bin`` elsewhere in the
    path is left alone, and an input without a ``.bin`` suffix still gets
    a distinct output name instead of being overwritten in place.
    """
    root, ext = os.path.splitext(infile)
    return f'{root}_patched_v2{ext}'


if __name__ == '__main__':
    infile = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT
    outfile = sys.argv[2] if len(sys.argv) > 2 else \
        default_output_path(infile)

    n, size = patch_blob(infile, outfile)