From 694be88964c73ed3db7e6b5c3cf675d58fed2252 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Wed, 15 Apr 2026 01:06:51 +0200 Subject: [PATCH] v3 patcher: full-body trampolines + site bisection subsets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of counted_v2 brick identified: v2 copied only ONE non-load body instruction into each trampoline (picks the first after the LDR). For poll patterns of form LDR Wx, [Xbase, #off] AND Wx, Wx, #mask ; no flag update CMP Wx, #expected ; sets flags B.cond .retry — 9 of the 16 sites in v1.19 have this shape — the final CMP was silently dropped. The trampoline's B.inv_cond tested whatever flags happened to be set before entry, producing effectively random branch decisions once under the trampoline. Result: boot crashes before the UART banner, observed as 'power LED off' brick. Fix in v3: copy the ENTIRE loop body (LDR + all intermediate instructions, in original order) into each trampoline. Size is now 4*(N+6) where N is body length (32 bytes for body=2, 36 for body=3). 
Also in v3: - --sites subset flag for bisection (all/early/mid/late/none/index list) - decode_sites.py helper that tries to identify which MMIO register each site polls (best effort — the materialized_base scanner is naive and picks up stale MOVZ targets, but cluster grouping by blob offset is reliable and sufficient for bisection) Site clusters in v1.19: 0..7 early (0x07b78..0x07f08): SGRF + PHY firmware state machine 8..10 mid (0x09124..0x0aaf8): DfiStatus / training start 11..15 late (0x0d154..0x0d378): UctWriteProt / CalBusy / late Co-Authored-By: Claude Opus 4.6 (1M context) --- decode_sites.py | 194 +++++++++++++++++++++++++++++++++ patch_timeouts_v3.py | 254 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 448 insertions(+) create mode 100644 decode_sites.py create mode 100755 patch_timeouts_v3.py diff --git a/decode_sites.py b/decode_sites.py new file mode 100644 index 0000000..191db13 --- /dev/null +++ b/decode_sites.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +"""Decode each poll site: which base register + offset is being polled? + +For each site find_poll_loops() reports, we pull: + - the load instruction (LDR Wn, [Xbase, #off]) + - the base register number + - the immediate offset + - backward-scan for the MOV/MOVK/MOVZ sequence that materialised Xbase + - print a register-friendly label using the PHY register map from + BUG_ANALYSIS.md. +""" + +import struct +import sys + + +# Known base addresses from BUG_ANALYSIS / rk3588_ddr.h +KNOWN_BASES = { + 0xFE050000: 'SGRF_DDR', + 0xF2000000: 'DDR0_PHY', # per-channel PHY base guesses + 0xF3000000: 'DDR1_PHY', + 0xF4000000: 'DDR2_PHY', + 0xF5000000: 'DDR3_PHY', +} + +# Offsets within a PHY block, from BUG_ANALYSIS register table. +PHY_OFFSETS = { + 0x0684: 'CalBusy', + 0x0A24: 'DfiStatus', + 0x10080: 'MicroReset', + 0x10090: 'MicroContMuxSel', + 0x10514: 'UctWriteProtShadow', +} + +# SGRF offsets from BUG_ANALYSIS (MMIO absolute addresses for SGRF_DDR). 
SGRF_ABS_OFFSETS = {
    0xFE050054: 'SGRF_DDR_CON21',
    0xFE0500E0: 'SGRF_DDR_STATUS',
}


def decode_ldr_unsigned_imm(w):
    """Decode an A64 LDR (immediate, unsigned offset) instruction word.

    Encoding: size[31:30] | 111001 | 01 | imm12[21:10] | Rn[9:5] | Rt[4:0]

    Only the 32-bit (size=0b10, ``LDR Wt``) and 64-bit (size=0b11,
    ``LDR Xt``) forms are recognised.

    Returns:
        ``(rt, rn, imm_bytes, width)`` where ``imm_bytes`` is imm12 scaled
        by the access size (4 or 8) and ``width`` is 32 or 64, or ``None``
        for any other instruction.
    """
    # Compare all of bits [31:22]. Masking out bit 30 (as 0xBFC00000 does)
    # makes the 32-bit test also match the 64-bit form, so LDR Xt would be
    # reported as 32-bit with its offset scaled by 4 instead of 8.
    opcode = w & 0xFFC00000
    if opcode == 0xB9400000:      # LDR Wt, [Xn, #imm]
        scale, width = 4, 32
    elif opcode == 0xF9400000:    # LDR Xt, [Xn, #imm]
        scale, width = 8, 64
    else:
        return None
    imm12 = (w >> 10) & 0xFFF
    rn = (w >> 5) & 0x1F
    rt = w & 0x1F
    return (rt, rn, imm12 * scale, width)


def _decode_move_wide(w, opcode32, mnemonic):
    """Decode one move-wide-immediate form (shared by MOVZ and MOVK).

    Encoding: sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0]

    ``opcode32`` is the instruction's bit pattern with sf=0 and all operand
    fields zero. Returns ``(mnemonic, rd, imm16, hw, width)`` or ``None``.
    """
    # sf is deliberately excluded from the opcode match and read on its own,
    # so both register widths are recognised and reported correctly.
    if (w & 0x7F800000) != opcode32:
        return None
    width = 64 if (w & 0x80000000) else 32
    hw = (w >> 21) & 0x3
    if width == 32 and hw > 1:
        # sf=0 with hw<1>=1 is an unallocated encoding, not a real MOVZ/MOVK.
        return None
    imm16 = (w >> 5) & 0xFFFF
    rd = w & 0x1F
    return (mnemonic, rd, imm16, hw, width)


def decode_movz(w):
    """Decode MOVZ; return ``('MOVZ', rd, imm16, hw, width)`` or ``None``."""
    return _decode_move_wide(w, 0x52800000, 'MOVZ')


def decode_movk(w):
    """Decode MOVK; return ``('MOVK', rd, imm16, hw, width)`` or ``None``."""
    return _decode_move_wide(w, 0x72800000, 'MOVK')


# NOTE(review): in the patch text this line runs straight on into the header
# of materialized_base(); the remainder of that definition sits on the next
# line and is not intact in this view, so only the visible header is kept
# here for reference and nothing is reconstructed:
#
# def materialized_base(blob, load_offset, rn, window=64):
#     """Backward-scan up to `window` instructions looking for MOV/MOVZ/MOVK into Rn.
#     Reconstruct the full 64-bit immediate if it's a clean MOVZ+MOVK sequence.
#     Returns (addr, confidence) or (None, 'ind') if we can't pin it.
+ """ + addr = 0 + seen = {} + pos = load_offset - 4 + end = max(0, load_offset - window * 4) + hits = 0 + while pos >= end: + w = struct.unpack_from('> 5) & 0x7FFFF + if not (imm19 & 0x40000): + continue + offset = -((~imm19 & 0x7FFFF) + 1) * 4 + if not (-16 <= offset <= -4): + continue + + loop_start = i + offset + cond = inst & 0xF + load_inst = None + load_offset = None + for j in range(loop_start, i, 4): + w = struct.unpack_from('2} {'site':<7} {'br.cond':<8} {'base':<10} {'off':>6} target") + print('-' * 72) + for s in sites: + dec = decode_ldr_unsigned_imm(s['load_inst']) + if dec is None: + print(f"{s['idx']:>2} 0x{s['branch_offset']:05x} B.{COND_NAMES[s['cond']]:<6} ???? — unusual LDR form") + continue + rt, rn, off, sz = dec + base, hits = materialized_base(blob, s['load_offset'], rn, window=128) + base_str = f'0x{base:08x}' if base is not None else 'indirect' + label = classify(base, off) + print(f"{s['idx']:>2} 0x{s['branch_offset']:05x} B.{COND_NAMES[s['cond']]:<6} {base_str:<10} 0x{off:04x} {label} (X{rn}→W{rt}, {sz}b, {hits} mov)") + + +if __name__ == '__main__': + main() diff --git a/patch_timeouts_v3.py b/patch_timeouts_v3.py new file mode 100755 index 0000000..684865b --- /dev/null +++ b/patch_timeouts_v3.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +"""RK3588 DDR Blob Patcher v3 — counted-loop trampolines with SITE SUBSET. + +v2 BUG identified 2026-04-15: the original patcher copied at most ONE +body instruction beyond the LDR into the trampoline. For poll patterns of +form `LDR; AND; CMP; B.cond` (5 of the 16 sites), the CMP was dropped. +The trampoline's `B.inv_cond` then tested stale flags → random branch → +brick. + +v3 fix: copy the **full** original loop body (LDR + every intermediate +instruction, in original order) into the trampoline, so the last +flag-setting instruction matches the original loop. + +New trampoline layout (N = len(body) — usually 2 or 3): + +0: MOV W16, #TIMEOUT + +4 .. 
+4N: ; LDR + all test instructions + +4N+4: B.inv_cond .done + +4N+8: SUBS W16, W16, #1 + +4N+12: B.NE .retry (back to +4) + +4N+16: B return_addr ; timeout path + .done: + +4N+20: B return_addr ; success path + +Size per trampoline: 4 * (N + 6) bytes. With max body of 3 instructions +that's 4*9 = 36 bytes; 2-inst bodies take 32 bytes (same as v2). + +Site subset flags: + --sites all | early | mid | late | none + --sites 0,3,5-7 # explicit index list + +Site index is stable (determined by find_poll_loops' ascending-offset +ordering). As of rk3588_ddr_v1.19: + 0..7 EARLY cluster (0x07b78..0x07f08) — SGRF + PHY firmware fsm + 8..10 MID cluster (0x09124..0x0aaf8) — DfiStatus / training start + 11..15 LATE cluster (0x0d154..0x0d378) — UctWriteProt / CalBusy / late +""" + +import struct +import sys +import argparse + + +TIMEOUT_ITERATIONS = 0x4000 +COUNTER_REG = 16 + +COND_NAMES = ['EQ','NE','CS','CC','MI','PL','VS','VC', + 'HI','LS','GE','LT','GT','LE','AL','NV'] + +CLUSTERS = { + 'early': list(range(0, 8)), + 'mid': list(range(8, 11)), + 'late': list(range(11, 16)), + 'all': list(range(0, 16)), + 'none': [], +} + + +def encode_movz(rd, imm16, shift=0): + hw = shift // 16 + return 0x52800000 | (hw << 21) | (imm16 << 5) | rd + +def encode_subs_imm(rd, rn, imm12): + return 0x71000000 | (imm12 << 10) | (rn << 5) | rd + +def encode_b(from_offset, to_offset): + delta = (to_offset - from_offset) // 4 + if delta < 0: + delta = delta & 0x3FFFFFF + return 0x14000000 | (delta & 0x3FFFFFF) + +def encode_bcond(cond, from_offset, to_offset): + delta = (to_offset - from_offset) // 4 + imm19 = delta & 0x7FFFF + return 0x54000000 | (imm19 << 5) | cond + +def encode_bne(from_offset, to_offset): + return encode_bcond(1, from_offset, to_offset) + +def invert_cond(cond): + return cond ^ 1 + + +def find_poll_loops(blob): + """Find tight backward-branch poll loops of form ; B.cond back.""" + sites = [] + for i in range(0, len(blob) - 12, 4): + inst = struct.unpack_from('> 5) & 0x7FFFF + if 
not (imm19 & 0x40000): + continue + offset = -((~imm19 & 0x7FFFF) + 1) * 4 + if not (-16 <= offset <= -4): + continue + + loop_start = i + offset + cond = inst & 0xF + + # Body = every instruction from loop_start up to (but excluding) the branch. + body = [] + has_load = False + for j in range(loop_start, i, 4): + w = struct.unpack_from('