benchmark/: three-way RE-tool comparison + first real C-lift

Four small functions extracted from the v1.19 conservative blob with ground-truth C and per-tool (Ghidra / retdec / decomp.me) docs:

- 01_memset — byte memset, 28 B
- 02_memcpy32 — word-aligned memcpy, 36 B
- 03_magic_memset — magic check + tail-call to memset, 40 B
- 04_train_phy_block — first real poll-site function (104 B, 26 insts), contains poll sites 12-15

Results in RESULTS.md:

- Ghidra: A on all four. Auto-decompile is close to final.
- retdec: A on #3, F on #1 and #2 (no register-arg inference on raw), C on #4 (mistakes `& 0xF0000000` for `< 0x10000000`).

GRIND_LOG.md (in 04_train_phy_block/) records the matching-decomp iteration: 116-byte candidate.c at -Os vs vendor 104 bytes = 89.7% size match on first real iteration. Remaining gap is GCC's choice of `cmp w, w_const; b.ls` over vendor's `tst w, #imm; b.eq` for the mask tests.

gdb_debug/ holds a native-aarch64 GDB single-stepper for the three benchmark functions — boltzmann smoke test passed (memset: buf[10] 0x00→0xab).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 07:26:23 +02:00
parent 694be88964
commit 00d655187a
32 changed files with 1113 additions and 0 deletions
@@ -0,0 +1,58 @@
+# decomp.me recipe — 01_memset
+
+## Create a scratch
+
+Open https://decomp.me/ (or your self-hosted instance at
+http://192.168.88.64 if available). Click **New scratch**.
+
+- **Platform / Compiler:** `gcc 12.x aarch64-linux-gnu` (or whatever
+  aarch64-gcc is offered — exact version doesn't matter much for this
+  size).
+- **Compiler flags:** `-O2 -ffreestanding -nostdlib`
+- **Diff label:** `memset_byte`
+
+## Target asm
+
+Paste the following into the **"Target asm"** box:
+
+```asm
+memset_byte:
+    mov     x3, #0x0
+.Lloop:
+    cmp     x2, x3
+    b.ne    .Lbody
+    ret
+.Lbody:
+    strb    w1, [x0, x3]
+    add     x3, x3, #0x1
+    b       .Lloop
+```
+
+## Context (headers/decls)
+
+```c
+#include <stddef.h>
+#include <stdint.h>
+```
+
+## Source
+
+Paste the ground-truth C from `reference.c` (or write your own first
+and iterate).
+
+## Expected workflow
+
+- First compile: scorer usually reports a high similarity (>= 80%) if
+  the compiler picks the same `while (i != n)` pattern.
+- Fine-tune: try `i++` vs `i+=1`, try `while` vs `for`, try `uint8_t *`
+  cast placement. Each yields a distinct register-allocation order the
+  scorer rewards or punishes.
+- Perfect match possible if you hit the exact code shape GCC chose.
+
+## Benchmark notes
+
+- decomp.me's strength is the **compile-and-diff** feedback loop — every
+  edit immediately shows the byte-delta against the target.
+- Weakness for this target: the real blob was likely built with a
+  different compiler (ARMCC / Keil / vendor LLVM?). GCC may never match
+  exactly even with perfect C. Similarity >= 90% is the realistic ceiling.
@@ -0,0 +1,14 @@
+
+01_memset/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000000aac <.data>:
+ aac:	d2800003 	mov	x3, #0x0                   	// #0
+ ab0:	eb03005f 	cmp	x2, x3
+ ab4:	54000041 	b.ne	0xabc  // b.any
+ ab8:	d65f03c0 	ret
+ abc:	38236801 	strb	w1, [x0, x3]
+ ac0:	91000463 	add	x3, x3, #0x1
+ ac4:	17fffffb 	b	0xab0
@@ -0,0 +1,41 @@
+# Ghidra recipe — 01_memset
+
+## Load
+
+**File → Import File…** → `func.bin`.
+
+In the import dialog:
+- **Format:** Raw Binary
+- **Language:** AArch64:LE:64:v8A
+- **Base Address:** `0x0aac`  ← recommended. The branches in this slice are
+  PC-relative and it has no absolute-addr refs of its own, so it decodes
+  correctly at any base; loading at the blob offset keeps every address
+  identical to `func.s`, which matters for readability.
+
+After import, click **Yes** on the "Analyze now?" prompt; default
+analyzers are fine.
+
+## What to look for in Ghidra's decompiler output
+
+- Function automatically detected at 0xaac (the file starts there).
+- Decompiler should produce something like:
+  ```c
+  void FUN_00000aac(long param_1, byte param_2, long param_3) {
+      long local_10 = 0;
+      while (local_10 != param_3) {
+          *(byte *)(param_1 + local_10) = param_2;
+          local_10++;
+      }
+  }
+  ```
+- Idiomatic match rate: high for this pattern; Ghidra's decompiler
+  recognises the pre-test loop well.
+- Ghidra types: `byte` (uint8_t), `long` (the 64-bit register) — not
+  directly `uint8_t` / `size_t`. Manual retyping is usually needed.
+
+## Benchmark notes
+
+- Time to understandable output: ~seconds (auto-analysis).
+- Manual cleanup: rename `FUN_00000aac` → `memset_byte`; retype
+  `param_1` to `void *`, `param_2` to `uint8_t`, `param_3` to `size_t`.
+- Limits: Ghidra's decompiler is position-dependent on the load address
+  only for jump targets beyond the slice — irrelevant here.
@@ -0,0 +1,24 @@
+/* Ground-truth C for FUN_00000aac @ blob offset 0xaac (28 bytes / 7 insts).
+ *
+ * Pattern:  byte-wise memset with a simple counting loop.
+ * Signature:  void memset_byte(void *buf, uint8_t val, size_t len);
+ *
+ * AArch64 ABI:  X0 = buf, W1 = val (low byte), X2 = len
+ * Scratch:      X3 = index i
+ *
+ * Notes the decompiler should ideally recover:
+ *   - This is unambiguously "memset" semantics; bonus points for naming it so.
+ *   - The loop structure is pre-test (cmp before body) — tools should emit
+ *     `while (i != len)` or `for (; i < len; ...)`.
+ *   - W1 is truncated to a byte by the STRB; decompiler should mark val as u8.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+void memset_byte(void *buf, uint8_t val, size_t len) {  /* store val into each of the len bytes starting at buf */
+    size_t i = 0;                   /* byte index (X3 in the vendor code) */
+    while (i != len) {              /* pre-test `!=` compare, mirroring `cmp x2, x3 ; b.ne` */
+        ((uint8_t *)buf)[i] = val;  /* one byte store per iteration (`strb w1, [x0, x3]`) */
+        i++;
+    }
+}
@@ -0,0 +1,38 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0xaac - 0xac8
+int64_t entry_point(void) {
+    // 0xaac
+    int64_t result; // 0xaac
+    if (result == 0) {
+        // 0xab8
+        return result;
+    }
+    int64_t v1 = 0; // 0xac0
+    *(char *)(v1 + result) = (char)result;
+    v1++;
+    while (result != v1) {
+        // 0xabc
+        *(char *)(v1 + result) = (char)result;
+        v1++;
+    }
+    // 0xab8
+    return result;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: starforce (3.x)
+// Detected functions: 1
+
@@ -0,0 +1,38 @@
+# retdec recipe — 01_memset
+
+retdec runs fully automated — hand it the binary, ask for C.
+
+## Invocation (on the decompme container at pve4, or wherever retdec lives)
+
+```
+retdec --mode raw --arch arm --endian little --bit-size 64 \
+    --raw-entry-point 0x0aac \
+    --raw-section-vma 0x0aac \
+    func.bin -o retdec.c
+```
+
+The flags:
+- `--mode raw` — input is a flat binary, no PE/ELF headers.
+- `--arch arm --endian little --bit-size 64` — AArch64 LE.
+- `--raw-entry-point 0x0aac` — tell retdec where execution starts.
+- `--raw-section-vma 0x0aac` — load the binary at address 0x0aac so
+  branch targets resolve correctly.
+
+Output goes to `retdec.c`. retdec emits a .ll (LLVM IR) and a .dsm
+(disasm) alongside — all useful for comparison.
+
+## What to expect
+
+retdec is the least "smart" of the three tools. For a raw 28-byte blob
+with no headers, it will:
+- Detect the function at 0x0aac.
+- Produce a single C function. In the captured run (`retdec.c`) it is named
+  `entry_point`, not an address-based name such as `function_aac`.
+- Often inserts pseudo-intrinsics like `__asm_mov(x3, 0)` for instructions
+  it doesn't fold into C. For this tiny loop it usually manages clean C.
+
+## Benchmark notes
+
+- Strength: zero-touch, scriptable, good for bulk processing.
+- Weakness: no interactive refinement — you get what you get. Type
+  inference is conservative (`int32_t *` instead of `void *` / `uint8_t *`).
+- Often emits control flow as `goto` rather than structured loops.
@@ -0,0 +1,16 @@
+
+02_memcpy32/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000001200 <.data>:
+    1200:	927e7442 	and	x2, x2, #0xfffffffc
+    1204:	d2800003 	mov	x3, #0x0                   	// #0
+    1208:	eb02007f 	cmp	x3, x2
+    120c:	54000041 	b.ne	0x1214  // b.any
+    1210:	d65f03c0 	ret
+    1214:	b8636824 	ldr	w4, [x1, x3]
+    1218:	b8236804 	str	w4, [x0, x3]
+    121c:	91001063 	add	x3, x3, #0x4
+    1220:	17fffffa 	b	0x1208
@@ -0,0 +1,29 @@
+/* Ground-truth C for FUN_00001200 @ blob offset 0x1200 (36 bytes / 9 insts).
+ *
+ * Pattern:  word-aligned memcpy; length rounded down to word multiple.
+ * Signature:  void memcpy32(uint32_t *dst, const uint32_t *src, size_t len_bytes);
+ *
+ * AArch64 ABI:  X0 = dst, X1 = src, X2 = len (in bytes, rounded down to 4)
+ * Scratch:      X3 = byte index i, W4 = word register for transfer
+ *
+ * Notes the decompiler should ideally recover:
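+ *   - `AND x2, x2, #0xFFFFFFFC` operates on the 64-bit X2, so the mask is
+ *     0x00000000FFFFFFFC: it masks out the low 2 bits AND clears bits 63:32.
+ *     That is `len & 0xFFFFFFFC` (or `len & ~3u` with a 32-bit `3u`), not
+ *     `len & ~(size_t)3`, which would be 0xFFFFFFFFFFFFFFFC.
+ *     (Tools often render as `len & 0xFFFFFFFC` or `len & ~3`.)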
+ *   - Inner loop reads/writes 4 bytes at a time — tools should recognise
+ *     uint32_t pointers, or at least `*(u32*)(x0+i) = *(u32*)(x1+i)`.
+ *   - Addressing is byte-indexed with a step of 4 — some tools may emit
+ *     `for (i = 0; i < len; i += 4)` in bytes; others may normalise into
+ *     an index-based word loop.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+/* Copy len_bytes bytes from src to dst using one 32-bit load/store pair
+ * per iteration.
+ *
+ * The length is first reduced with the same mask the vendor code applies,
+ * `and x2, x2, #0xfffffffc` on the 64-bit X2 == 0x00000000FFFFFFFC: the low
+ * two bits are dropped (round down to a multiple of 4) and bits 63:32 are
+ * cleared as well. `~(size_t)3` would be 0xFFFFFFFFFFFFFFFC, which keeps
+ * the upper half and is therefore a different operation.
+ *
+ * dst, src:   buffers accessed as uint32_t at byte offset i.
+ * len_bytes:  byte count; exactly (len_bytes & 0xFFFFFFFC) bytes are copied.
+ */
+void memcpy32(uint32_t *dst, const uint32_t *src, size_t len_bytes) {
+    len_bytes &= (size_t)0xFFFFFFFCu;      /* round down to 4, drop bits 63:32 */
+    size_t i = 0;
+    while (i != len_bytes) {
+        *(uint32_t *)((uint8_t *)dst + i) =
+        *(const uint32_t *)((const uint8_t *)src + i);
+        i += 4;
+    }
+}
@@ -0,0 +1,38 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0x1200 - 0x1224
+int64_t entry_point(void) {
+    // 0x1200
+    int64_t result; // 0x1200
+    int64_t v1 = result & 0xfffffffc; // 0x1200
+    if (v1 == 0) {
+        // 0x1210
+        return result;
+    }
+    int64_t v2 = 0;
+    int64_t v3 = v2 + 4; // 0x121c
+    while (v3 != v1) {
+        // 0x1214
+        v2 = v3;
+        v3 = v2 + 4;
+    }
+    // 0x1210
+    return result;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: starforce (3.x)
+// Detected functions: 1
+
@@ -0,0 +1,17 @@
+
+03_magic_memset/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000000da4 <.data>:
+ da4:	b2731fe0 	mov	x0, #0x1fe000              	// #2088960
+ da8:	52800021 	mov	w1, #0x1                   	// #1
+ dac:	72aa8821 	movk	w1, #0x5441, lsl #16
+ db0:	b9400402 	ldr	w2, [x0, #4]
+ db4:	6b01005f 	cmp	w2, w1
+ db8:	54000081 	b.ne	0xdc8  // b.any
+ dbc:	d2806582 	mov	x2, #0x32c                 	// #812
+ dc0:	52800001 	mov	w1, #0x0                   	// #0
+ dc4:	17ffff3a 	b	0xaac
+ dc8:	d65f03c0 	ret
@@ -0,0 +1,44 @@
+/* Ground-truth C for FUN_00000da4 @ blob offset 0xda4 (40 bytes / 10 insts).
+ *
+ * Pattern:  magic-number check at absolute address, then tail-call to memset.
+ * Signature:  void check_and_zero(void);
+ *
+ * AArch64 ABI:  no args, no return value
+ * Scratch:      X0..X2, W1, W2
+ *
+ * Behaviour:
+ *   uint32_t *magic = (uint32_t *)0x1fe000;
+ *   if (magic[1] == 0x54410001)           // 'TA'\x01 — Trusted App header?
+ *       memset(magic, 0, 0x32c);          // tail-call to FUN_00000aac
+ *   // else: fall through, return
+ *
+ * Notes the decompiler should ideally recover:
+ *   - `orr x0, xzr, #0x1fe000` is an immediate-load idiom for `x0 = 0x1fe000`;
+ *     encoded as OR-with-zero so ARM assemblers can pack it.
+ *     Tools that don't know the ORR-imm trick may render this as
+ *     `x0 = 0 | 0x1fe000` or worse `x0 = 0 | 0x1FE000UL` with weird types.
+ *   - `MOV w1, #0x1 ; MOVK w1, #0x5441, LSL #16` composes a 32-bit literal
+ *     0x54410001. A good tool collapses both into `w1 = 0x54410001`.
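+ *   - `LDR w2, [X0, #0x4]` reads `magic[1]`, i.e. the second word at the
+ *     magic region. 0x54410001 is the value of ATAG_CORE in the ARM Linux
+ *     boot-tag (ATAGS) format, where every tag starts with { size, tag } —
+ *     so the word at +4 is presumably the tag field of an ATAG_CORE header
+ *     rather than a "Trusted Application" signature. TODO: confirm against
+ *     Rockchip's rk_atags.h.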
+ *   - `B 0xaac` is a tail-call: control transfers to memset with X0, W1, X2
+ *     already set up; no BL / return path. Tools should emit this as
+ *     `return memset(x0, w1, x2);` or at least a clear call — not an
+ *     inlined body.
+ *
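+ * Address 0x1fe000 is 0x001f_e000 (2 MiB - 8 KiB). That is NOT inside a
+ * region starting at 0x1fe0_0000, so the earlier attribution to "RK3588
+ * PMU-SRAM (0x1fe0_0000–…)" does not follow from the constant — TODO:
+ * confirm which memory actually backs this address (presumably plain RAM).
+ * Not MMIO in the strict sense — it's memory — but tools may flag it as
+ * special because of the large constant.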
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+extern void memset_byte(void *buf, uint8_t val, size_t len); /* FUN_00000aac */
+
+void check_and_zero(void) {                       /* takes nothing, returns nothing */
+    uint32_t *magic = (uint32_t *)0x1fe000UL;     /* fixed absolute address (X0 in the vendor code) */
+    if (magic[1] == 0x54410001U) {                /* 32-bit word at 0x1fe004 (`ldr w2, [x0, #4]`) */
+        memset_byte(magic, 0, 0x32c);             /* zero 0x32c (812) bytes from 0x1fe000; a tail-call `b 0xaac` in the vendor code */
+    }
+}
@@ -0,0 +1,30 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+int64_t unknown_aac(int64_t a1, int64_t a2, int64_t a3);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0xda4 - 0xdcc
+int64_t entry_point(void) {
+    // 0xda4
+    if (*(int32_t *)0x1fe004 == 0x54410001) {
+        // 0xdbc
+        return unknown_aac(0x1fe000, 0, 812);
+    }
+    // 0xdc8
+    return 0x1fe000;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: molebox (2.0)
+// Detected functions: 1
+
@@ -0,0 +1,80 @@
+# GRIND_LOG — first real-blob C-lift
+
+Function: **FUN_0000d328** @ blob offset 0xd328 (104 bytes / 26 insts).
+Contains 4 of our 16 timeout-less polls (sites 12, 13, 14, 15).
+Semantics: **PHY block training step** — poke CTL, wait for two STAT
+bits, apply two CFG values with HANDSHAKE acks, ack via CTL.
+
+## Tools tried (single-pass, no iteration yet)
+
+| tool | output file | grade |
+|---|---|---|
+| Ghidra 11.3 (auto-decompile) | `ghidra.c` | **A.** All 4 polls correctly modeled as `do {} while`. Collapsed the `(base + 0x8000) + offset` arithmetic into a single offset (`lVar1 + 0x8110` etc.) — actually MORE useful than a hand-written reference because it surfaces the absolute register addresses. Type cleanup needed (`undefined4`/`uint`/`long`). |
+| retdec v5.0 (zero-touch raw mode) | `retdec.c` | **C.** Recognised the function and the polls but: misread bitmask tests as comparisons (`*v6 % 4 == 0` for `& 3`, `< 0x10000000` for `& 0xF0000000`). Fabricated a return value for a void function. Loop bodies marked as `continue ->` comments. Usable as a sanity-check second opinion, not as a basis for rewriting. |
+| ground truth (hand-written) | `reference.c` | n/a — this is the canonical interpretation we judge against. |
+
+## Matching-decomp candidate iterations (the actual grind)
+
+Goal: a `.c` file that compiles to bytes close to the original 104-byte
+slice. Score = `min(candidate_size, vendor_size) / max(candidate_size, vendor_size)`
+after instruction-by-instruction diff (manual until objdiff is installed).
+
+### Iteration 1: cast-on-each-access, `-O2`
+- Pattern: `*(volatile u32 *)(base + offset)` per access.
+- GCC behavior: materialised each `0x8XXX` offset into its own register
+  (`mov x2, #0x8120; add x2, x3, x2; ldr w0, [x2]`), exploding code size.
+- Result: ~160 bytes. **~65% size match (104/160). Bad.**
+
+### Iteration 2 (current best): pre-adjust base outside volatile chain, `-Os`
+- Pattern: `unsigned char *phy = base + 0x8000` once, then `*(u32v *)(phy + small)`.
+- `-Os` instead of `-O2` — drops loop-alignment NOPs.
+- Result: **116 bytes (29 insts)**. **89.7% size match (104/116).** See `candidate.c`.
+
+### Remaining gap to vendor (12 bytes = 3 instructions)
+
+1. GCC turns `(x & 0xF0000000) == 0` into `cmp w, w_loaded_const; b.ls`
+   instead of vendor's `tst w, #imm; b.eq`. Costs 4 bytes per loop, twice
+   = 8 bytes.
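+2. GCC's `[base+0x184]` accesses inside the handshake loop are
+   `add x1, x0, #0x200; ldur x2, [x1, #-124]`. This is not a scheduling
+   choice: `candidate.c` reads HANDSHAKE through a 64-bit (`u64v`) pointer,
+   and 0x184 is not a multiple of 8, so the scaled-offset
+   `ldr x, [x0, #imm]` form cannot encode it, and it is outside LDUR's
+   -256..255 range from `x0`. The vendor does a 32-bit
+   `ldr w1, [x0, #0x184]`; switching the candidate to a 32-bit load should
+   remove the extra `add`. Costs ~4 bytes.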
+
+### Next iteration ideas
+
+- **Inline-asm** for the mask-tests to force TST encoding directly. Cheap
+  win, gets us to ~108 bytes.
+- **Clang** (different scheduler, sometimes nicer with TST-style
+  comparisons). Try `clang -Oz -ffreestanding -target aarch64-none-elf`.
+- **ARMCC** — the most likely vendor compiler. Sourcing armclang for
+  AArch64 requires an Arm Developer account; backlog item.
+- **objdiff** — once installed, automate the byte-diff scoring instead
+  of eyeballing.
+
+## Workflow validation
+
+- ✓ Function extracted from blob as standalone .bin slice.
+- ✓ Three decompiler views captured (Ghidra, retdec, hand-written reference).
+- ✓ Candidate compiles + runs (matches reference semantics).
+- ✓ Single-pass byte-comparison done by hand; got 89.7% (104/116) on iteration 2.
+- ✗ objdiff not installed — would automate the scoring.
+- ✗ decomp.me self-host not yet running on pve4 — would crowdsource the
+  grind via the standard interface.
+- ✗ ARMCC not installed — perfect-match unattainable without it.
+
+**The pipeline works.** Each future poll-site function follows the
+same 4-step recipe: extract → Ghidra-clean → write candidate → iterate
+until ≥90 % match. Estimated ~2-3 h per function for the small ones.
+
+## How this connects to the v3fb work
+
+This function contains 4 of the 16 poll sites. Once we have a
+byte-matching (or functionally-equivalent) C version, we can:
+
+1. Add bounded-retry counters in the C source — much cleaner than the
+   asm trampoline patcher.
+2. Compile + link as a freestanding `.o` at the original blob offset.
+3. Splice into the blob, replacing `FUN_0000d328` entirely.
+
+That's the path to a maintainable replacement for the trampoline-based
+v3fb approach, **for at least these 4 sites**. The other 12 sites live
+in different functions and would each need their own lift.
@@ -0,0 +1,36 @@
+/* Best matching candidate so far for FUN_0000d328.
+ * Compile:  gcc -Os -ffreestanding -nostdlib -c candidate.c -o candidate.o
+ * Score:    last measured at 116 bytes vs vendor 104 bytes
+ *           (104/116 = 89.7% size match, 12 bytes / 3 insts over).
+ *           That measurement predates the HANDSHAKE load-width fix described
+ *           below; re-measure before quoting a new number.
+ *
+ * Gap vs vendor at the time of that measurement:
+ *   - GCC emits `cmp w, w_loaded_const ; b.ls` for `(x & 0xF0000000) == 0`
+ *     instead of vendor's `tst w, #0xF0000000 ; b.eq`. The load/test/branch
+ *     sequences are the same size, but vendor avoids materializing the
+ *     mask in a register; logged as 4 bytes per loop, twice = 8 bytes.
+ *   - The HANDSHAKE polls used to read `[base+0x184]` through a 64-bit
+ *     pointer. 0x184 is not a multiple of 8, so the scaled-offset
+ *     `ldr x, [x0, #imm]` form cannot encode it, and it lies outside LDUR's
+ *     -256..255 range; hence GCC's
+ *     `add x1, x0, #0x200 ; ldur x2, [x1, #-124]` (~4 bytes over).
+ *     The vendor performs a 32-bit `ldr w1, [x0, #0x184]`, so the polls
+ *     below now read 32 bits too. Apart from the size cost, the 64-bit read
+ *     covered 0x184..0x18b at a non-8-byte-aligned address, which is not
+ *     the access the vendor code makes.
+ *
+ * Next iterations to try:
+ *   1. Inline-asm for the mask-tests to force TST encoding.
+ *   2. `__builtin_expect((x & 0xF0000000) != 0, 0)` to hint loop direction.
+ *   3. Alternative compilers: clang, ARMCC (the latter is what Rockchip
+ *      almost certainly used; need to source it).
+ */
+typedef volatile unsigned int  u32v;
+
+/* ctx: address of the context object; the 64-bit value stored at ctx + 0xb8
+ * is the block base, and every register touched here sits at base + 0x8000.
+ * Returns nothing; spins (no timeout) in each of the four polls. */
+void train_phy_block(unsigned long ctx)
+{
+    unsigned char *phy = (unsigned char *)(*(unsigned long *)(ctx + 0xb8) + 0x8000);
+    *(u32v *)(phy + 0x110) = 0xf000f000u;
+    while ((*(u32v *)(phy + 0x118) & 0xf0000000u) == 0u) ;
+    while ((*(u32v *)(phy + 0x120) & 0xf0000000u) == 0u) ;
+    *(u32v *)(phy + 0x160) = 0x30003u;
+    *(u32v *)(phy + 0x154) = 0x30003u;
+    /* 32-bit load as in the vendor code, widened so the mask test is written
+     * as a 64-bit AND (vendor: `ldr w1 ; tst x1, #0x3`). */
+    while (((unsigned long)*(u32v *)(phy + 0x184) & 3ul) == 0ul) ;
+    *(u32v *)(phy + 0x154) = 0x30000u;
+    while (((unsigned long)*(u32v *)(phy + 0x184) & 3ul) != 0ul) ;
+    *(u32v *)(phy + 0x160) = 0x30000u;
+    *(u32v *)(phy + 0x110) = 0xf0000000u;
+}
@@ -0,0 +1,71 @@
+# decomp.me recipe — 04_train_phy_block
+
+This is the **first real-blob function we're lifting to byte-matching C.**
+Score target: ≥95% match. Perfect match unlikely (compiler unknown).
+
+## Target asm (paste into "Target asm" field)
+
+```asm
+train_phy_block:
+    ldr     x0, [x0, #0xb8]
+    mov     w1, #0xf000f000
+    add     x0, x0, #0x8000
+    str     w1, [x0, #0x110]
+.Lwait_a:
+    ldr     w1, [x0, #0x118]
+    tst     w1, #0xf0000000
+    b.eq    .Lwait_a
+.Lwait_b:
+    ldr     w1, [x0, #0x120]
+    tst     w1, #0xf0000000
+    b.eq    .Lwait_b
+    mov     w1, #0x30003
+    str     w1, [x0, #0x160]
+    str     w1, [x0, #0x154]
+.Lwait_hs1:
+    ldr     w1, [x0, #0x184]
+    tst     x1, #0x3
+    b.eq    .Lwait_hs1
+    mov     w1, #0x30000
+    str     w1, [x0, #0x154]
+.Lwait_hs2:
+    ldr     w1, [x0, #0x184]
+    tst     x1, #0x3
+    b.ne    .Lwait_hs2
+    mov     w1, #0x30000
+    str     w1, [x0, #0x160]
+    mov     w1, #0xf0000000
+    str     w1, [x0, #0x110]
+    ret
+```
+
+## Compiler
+
+`aarch64-linux-gnu gcc 12 -O2 -ffreestanding -nostdlib`
+(Try also `-Os`. Vendor blob's compiler unknown — could be ARMCC or older
+GCC. Optimal C may differ between targets; perfect byte-match probably
+unattainable.)
+
+## Context
+
+Use `reference.c` as the starting C. The W-vs-X register width in the two
+HANDSHAKE polls (`tst x1, #0x3` tests the 64-bit register even though the
+value was loaded with `ldr w1` — vendor quirk) suggests a particular
+intrinsic / pattern. May need to write the load as
+`(uint64_t)mmio_r(...)` and the test as a 64-bit AND to coax GCC into
+emitting `tst x1` instead of `tst w1`.
+
+## Things to iterate on
+
+- Order of writes to CFG_A vs CFG_B: vendor wrote CFG_B first
+  (`str w1, [x0, #0x160]` then `str w1, [x0, #0x154]`). C order matters.
+- The two `mov w1, #0x30000` near the end could be hoisted by GCC; vendor
+  emitted them inline. May need separate variables to prevent hoist.
+- `add x0, x0, #0x8000` vs `add x0, x0, #0x8, lsl #12` — same
+  instruction, GAS picks one. Either should round-trip.
+
+## Score expectations
+
+- 80%: rough loop structure + register usage matches.
+- 95%: instruction order + immediate forms match.
+- 100%: would require exact compiler/version match. Unlikely without
+  ARMCC.
@@ -0,0 +1,33 @@
+
+func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+000000000000d328 <.data>:
+    d328:	f9405c00 	ldr	x0, [x0, #184]
+    d32c:	32048fe1 	mov	w1, #0xf000f000            	// #-268374016
+    d330:	91402000 	add	x0, x0, #0x8, lsl #12
+    d334:	b9011001 	str	w1, [x0, #272]
+    d338:	b9411801 	ldr	w1, [x0, #280]
+    d33c:	72040c3f 	tst	w1, #0xf0000000
+    d340:	54ffffc0 	b.eq	0xd338  // b.none
+    d344:	b9412001 	ldr	w1, [x0, #288]
+    d348:	72040c3f 	tst	w1, #0xf0000000
+    d34c:	54ffffc0 	b.eq	0xd344  // b.none
+    d350:	320087e1 	mov	w1, #0x30003               	// #196611
+    d354:	b9016001 	str	w1, [x0, #352]
+    d358:	b9015401 	str	w1, [x0, #340]
+    d35c:	b9418401 	ldr	w1, [x0, #388]
+    d360:	f240043f 	tst	x1, #0x3
+    d364:	54ffffc0 	b.eq	0xd35c  // b.none
+    d368:	52a00061 	mov	w1, #0x30000               	// #196608
+    d36c:	b9015401 	str	w1, [x0, #340]
+    d370:	b9418401 	ldr	w1, [x0, #388]
+    d374:	f240043f 	tst	x1, #0x3
+    d378:	54ffffc1 	b.ne	0xd370  // b.any
+    d37c:	52a00061 	mov	w1, #0x30000               	// #196608
+    d380:	b9016001 	str	w1, [x0, #352]
+    d384:	52be0001 	mov	w1, #0xf0000000            	// #-268435456
+    d388:	b9011001 	str	w1, [x0, #272]
+    d38c:	d65f03c0 	ret
@@ -0,0 +1,18 @@
+/* Ghidra 11.3 default decompiler output for FUN_0000d328 — unmodified. */
+void FUN_0000d328(long param_1)
+{
+  long lVar1;
+
+  lVar1 = *(long *)(param_1 + 0xb8);
+  *(undefined4 *)(lVar1 + 0x8110) = 0xf000f000;
+  do { } while ((*(uint *)(lVar1 + 0x8118) & 0xf0000000) == 0);
+  do { } while ((*(uint *)(lVar1 + 0x8120) & 0xf0000000) == 0);
+  *(undefined4 *)(lVar1 + 0x8160) = 0x30003;
+  *(undefined4 *)(lVar1 + 0x8154) = 0x30003;
+  do { } while ((*(uint *)(lVar1 + 0x8184) & 3) == 0);
+  *(undefined4 *)(lVar1 + 0x8154) = 0x30000;
+  do { } while ((*(uint *)(lVar1 + 0x8184) & 3) != 0);
+  *(undefined4 *)(lVar1 + 0x8160) = 0x30000;
+  *(undefined4 *)(lVar1 + 0x8110) = 0xf0000000;
+  return;
+}
@@ -0,0 +1,89 @@
+/* Ground-truth C for FUN_0000d328 @ blob offset 0xd328 (104 bytes / 26 insts).
+ *
+ * **The first real poll-site function we lift to C.**
+ * Contains 4 of our 16 timeout-less polls (sites 12, 13, 14, 15).
+ *
+ * Pattern:  PHY-block training step — poke a control register, wait for
+ *           two status bits, apply two intermediate values with a
+ *           handshake on a state register, ack the event.
+ *
+ * Signature:  void train_phy_block(struct phy_ctx *ctx);
+ *             (X0 = ctx, returns void)
+ *
+ * Layout:
+ *   ctx (X0)       — opaque per-rank/per-channel context
+ *   ctx + 0xb8     — 64-bit pointer to a PHY block base (`ctx->block`)
+ *   block + 0x8000 — addressed sub-block (likely "Master" bank in DWC PUB)
+ *
+ * The sub-block at +0x8000 has these registers (offsets within +0x8000):
+ *   +0x110  CTL       — write 0xF000F000 to start, 0xF0000000 to clear
+ *   +0x118  STAT_A    — bit[31:28] non-zero = step A done
+ *   +0x120  STAT_B    — bit[31:28] non-zero = step B done
+ *   +0x154  CFG_A     — write training value
+ *   +0x160  CFG_B     — write training value
+ *   +0x184  HANDSHAKE — bits[1:0] toggle between 0 and !=0 to ack writes
+ *
+ * The 4 polls (in order):
+ *   site 12 (B.EQ): STAT_A bit[31:28] non-zero?
+ *   site 13 (B.EQ): STAT_B bit[31:28] non-zero?
+ *   site 14 (B.EQ): HANDSHAKE bits[1:0] non-zero?  (ack of step-1 writes)
+ *   site 15 (B.NE): HANDSHAKE bits[1:0] zero?       (ack of step-2 write)
+ */
+#include <stdint.h>
+
+struct phy_ctx {             /* models only the one field train_phy_block reads */
+    uint8_t pad[0xB8];       /* filler for bytes 0x00..0xB7 so that block lands at offset 0xB8 */
+    uint8_t *block;          /* base pointer used at +0xB8 in struct */
+    /* ... rest of struct unknown */
+};
+
+#define PHY_CTL          0x110
+#define PHY_STAT_A       0x118
+#define PHY_STAT_B       0x120
+#define PHY_CFG_A        0x154
+#define PHY_CFG_B        0x160
+#define PHY_HANDSHAKE    0x184
+
+#define PHY_CTL_GO       0xF000F000U
+#define PHY_CTL_CLR      0xF0000000U
+#define PHY_STAT_DONE    0xF0000000U
+#define PHY_CFG_VAL_RUN  0x00030003U
+#define PHY_CFG_VAL_END  0x00030000U
+#define PHY_HS_BUSY      0x3U
+
+static inline uint32_t mmio_r(volatile uint8_t *base, unsigned off) {  /* returns the 32-bit value at byte offset off from base */
+    return *(volatile uint32_t *)(base + off);  /* one volatile 32-bit load */
+}
+static inline void mmio_w(volatile uint8_t *base, unsigned off, uint32_t v) {  /* stores v at byte offset off from base */
+    *(volatile uint32_t *)(base + off) = v;  /* one volatile 32-bit store */
+}
+
+void train_phy_block(struct phy_ctx *ctx) {  /* runs the whole CTL/STAT/CFG/HANDSHAKE sequence; every poll is unbounded */
+    volatile uint8_t *phy = (volatile uint8_t *)(ctx->block + 0x8000);  /* sub-block at block + 0x8000 */
+
+    mmio_w(phy, PHY_CTL, PHY_CTL_GO);  /* CTL <- 0xF000F000 */
+
+    /* site 12 — wait for step A complete */
+    while ((mmio_r(phy, PHY_STAT_A) & PHY_STAT_DONE) == 0)  /* spin until any of bits 31:28 is set */
+        ;
+
+    /* site 13 — wait for step B complete */
+    while ((mmio_r(phy, PHY_STAT_B) & PHY_STAT_DONE) == 0)  /* same mask, register +0x120 */
+        ;
+
+    mmio_w(phy, PHY_CFG_B, PHY_CFG_VAL_RUN);  /* CFG_B (+0x160) first, then CFG_A (+0x154): same store order as func.s */
+    mmio_w(phy, PHY_CFG_A, PHY_CFG_VAL_RUN);
+
+    /* site 14 — wait for handshake to assert */
+    while ((mmio_r(phy, PHY_HANDSHAKE) & PHY_HS_BUSY) == 0)  /* spin while bits 1:0 are both clear */
+        ;
+
+    mmio_w(phy, PHY_CFG_A, PHY_CFG_VAL_END);  /* CFG_A <- 0x30000 */
+
+    /* site 15 — wait for handshake to deassert */
+    while ((mmio_r(phy, PHY_HANDSHAKE) & PHY_HS_BUSY) != 0)  /* spin while either of bits 1:0 is set */
+        ;
+
+    mmio_w(phy, PHY_CFG_B, PHY_CFG_VAL_END);  /* CFG_B <- 0x30000 */
+    mmio_w(phy, PHY_CTL, PHY_CTL_CLR);        /* CTL <- 0xF0000000 */
+}
@@ -0,0 +1,36 @@
+# RE-tool benchmark — three functions from the RK3588 DDR blob
+
+Three small, self-contained functions extracted from
+`rk3588_ddr_lp4_1848MHz_lp5_2112MHz_v1.19.bin`, each with canonical
+ground-truth semantics so you can judge decompiler output against a
+known answer.
+
+| dir | blob offset | size | ground truth |
+|-----|-------------|------|--------------|
+| `01_memset/`       | `0x0aac` | 28 B / 7 insts  | `memset(void*, u8, size_t)` byte-wise |
+| `02_memcpy32/`     | `0x1200` | 36 B / 9 insts  | `memcpy32(u32*, const u32*, size_t)` word-aligned |
+| `03_magic_memset/` | `0x0da4` | 40 B / 10 insts | `if (*(u32*)0x1fe004 == 0x54410001) memset(0x1fe000, 0, 0x32c);` |
+
+Each subdir contains:
+- `func.bin`    — raw little-endian AArch64 machine code
+- `func.s`      — objdump'd GNU asm, same absolute addresses as the blob
+- `reference.c` — ground-truth C (our belief)
+- `ghidra.md`   — load-in-Ghidra recipe + expected output
+- `decompme.md` — decomp.me scratch recipe (matching-decomp)
+- `retdec.md`   — retdec command line
+- `retdec.c`    — retdec's actual output (captured 2026-04-15)
+
+**Summary of findings**: see [`RESULTS.md`](RESULTS.md). Short version:
+- Ghidra got all three right with minor type-label cleanup needed.
+- retdec failed on #1 and #2 (can't infer register-passed arguments on
+  raw binary), did well on #3 (the one with absolute-address refs).
+- decomp.me is a matching-decomp comparator, not a decompiler — judged
+  on a different axis.
+
+## Load address matters
+
+All three functions are extracted as raw bytes starting at offset 0 in
+their `func.bin`. When loading into Ghidra / retdec, set the base
+address to the function's original blob offset (first column above),
+otherwise branch targets and absolute-address refs in function #3 will
+be off.
@@ -0,0 +1,171 @@
+# Three-way RE-tool benchmark — results
+
+Three AArch64 functions from the RK3588 DDR v1.19 conservative blob,
+decompiled by **Ghidra 11.3** (interactive, auto-analysis) and **retdec
+v5.0** (fully automated, `--mode raw`). **decomp.me** is a
+matching-decompilation comparator rather than a decompiler, so it's
+benchmarked on a different axis (time-to-match).
+
+## Function 01 — `memset_byte` (28 bytes, 7 insts)
+
+**Ground truth:** `void memset_byte(void *buf, uint8_t val, size_t len)` —
+byte-wise pre-test counting loop.
+
+### Ghidra output
+```c
+void FUN_00000aac(long param_1, undefined1 param_2, long param_3) {
+  long lVar1;
+  for (lVar1 = 0; param_3 != lVar1; lVar1 = lVar1 + 1) {
+    *(undefined1 *)(param_1 + lVar1) = param_2;
+  }
+}
+```
+**Grade: A.** Semantics perfect. Types `long`/`undefined1` instead of
+`void*`/`uint8_t`/`size_t` — one click to retype each. For-loop shape
+matches the canonical idiom exactly.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    int64_t result;
+    if (result == 0) return result;
+    int64_t v1 = 0;
+    *(char *)(v1 + result) = (char)result;
+    ...
+}
+```
+**Grade: F.** **No function arguments inferred** — treats X0/W1/X2 as
+uninitialised locals. The whole function signature is wrong. The loop
+body overwrites the wrong things. This is retdec's biggest weakness on
+raw binaries: without ELF symbol or DWARF hints, it can't tell which
+registers are live-in parameters.
+
+### decomp.me workflow
+N/A as a decompiler; as a matching-decomp tool: paste the asm + your
+candidate C → iterate on wording until the compiled output byte-matches.
+For memset this reaches 90%+ similarity in a handful of edits with GCC
+(exact match unlikely — original blob used a different compiler).
+
+---
+
+## Function 02 — `memcpy32` (36 bytes, 9 insts)
+
+**Ground truth:** word-aligned memcpy, `len &= ~3`, 4-byte stride.
+
+### Ghidra output
+```c
+void FUN_00001200(long param_1, long param_2, ulong param_3) {
+  ulong uVar1;
+  for (uVar1 = 0; uVar1 != (param_3 & 0xfffffffc); uVar1 = uVar1 + 4) {
+    *(undefined4 *)(param_1 + uVar1) = *(undefined4 *)(param_2 + uVar1);
+  }
+}
+```
+**Grade: A.** Semantics perfect. The `& 0xfffffffc` mask is in the
+correct position, the 4-byte stride is there, the `undefined4` (u32) copy
+is there. Again: only type annotations need manual cleanup.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    int64_t v1 = result & 0xfffffffc;
+    if (v1 == 0) return result;
+    int64_t v2 = 0;
+    int64_t v3 = v2 + 4;
+    while (v3 != v1) { v2 = v3; v3 = v2 + 4; }
+    return result;
+}
+```
+**Grade: F.** Same no-arguments failure mode. The memory-copy statements
+are completely absent — retdec failed to emit the LDR/STR pair as a
+load and store through the pointers. You get an infinite-looking counter
+increment and nothing else. Unusable.
+
+---
+
+## Function 03 — `magic_memset` (40 bytes, 10 insts)
+
+**Ground truth:** `if (*(u32*)0x1fe004 == 0x54410001) memset(0x1fe000, 0, 0x32c);`
+
+### Ghidra output
+```c
+void FUN_00000da4(void) {
+  if (_DAT_001fe004 == 0x54410001) {
+    FUN_00000aac(0x1fe000, 0, 0x32c);
+    return;
+  }
+  return;
+}
+```
+**Grade: A+.** Perfect. `_DAT_001fe004` is Ghidra's auto-named data
+symbol for the absolute address. The tail-call `B 0xaac` was correctly
+turned into a regular call-and-return, preserving the calling
+convention. The MOVZ+MOVK composed immediate `0x54410001` was collapsed
+into a single literal.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    if (*(int32_t *)0x1fe004 == 0x54410001) {
+        return unknown_aac(0x1fe000, 0, 812);
+    }
+    return 0x1fe000;
+}
+```
+**Grade: B+.** **Noticeably better than retdec's output for #1 and #2**:
+  - Absolute-address dereference correctly parsed.
+  - `MOVZ W1, #1 ; MOVK W1, #0x5441, LSL #16` collapsed to
+    `0x54410001` ✓
+  - Tail-call to 0xaac correctly recognised as a call to
+    `unknown_aac(0x1fe000, 0, 812)` — even got arity right by observing
+    X0/W1/X2 being set up just before the branch.
+  - **Weakness:** returns `0x1fe000` in the fall-through branch —
+    spurious, because the function returns `void` and retdec fabricated
+    a return value. Also `812` decimal instead of `0x32c` hex.
+
+---
+
+## Takeaways
+
+| dimension | Ghidra | retdec | decomp.me |
+|---|---|---|---|
+| Argument inference from raw binary | **yes** (intra-proc analysis) | **no** | n/a |
+| Absolute-address data refs | auto-named `_DAT_xxxxx` | raw cast `*(int32_t *)0x…` | n/a |
+| MOVZ+MOVK literal reconstruction | collapses | collapses | n/a |
+| Tail-call recognition | yes | yes | n/a |
+| Control-flow structure | clean structured loops | mix of `while` + `goto` | n/a |
+| Type inference | `long`/`undefined4` placeholders | cautious `int64_t` fallback | n/a |
+| Zero-touch automation | no (interactive) | **yes** | no (interactive) |
+| Matching-decomp workflow | no | no | **yes** |
+
+### When each tool wins
+
+- **Ghidra** is the default daily driver. Auto-analysis output is already
+  close to final for simple functions — mostly you rename params and
+  retype placeholders.
+- **retdec** shines when the target has **absolute-address data refs,
+  call tables, or embedded constants** (function #3). It falls over on
+  anything where register-passed parameters need inference from
+  surrounding context (functions #1 and #2). Fine for bulk batch
+  processing of a repo full of functions when you don't yet know which
+  signatures you care about — but verify each output.
+- **decomp.me** doesn't compete with the others; it's the **"did my
+  rewrite compile to the same bytes?"** tool. Complementary: take
+  Ghidra's output, paste the C into decomp.me, iterate until the
+  compiled asm matches the blob's bytes. That's how you'd produce a
+  maintainable C re-implementation.
+
+### Practical recipe for our DDR-blob work
+
+1. **Start with Ghidra's decompiler output** (already done in
+   `ddr_annotated.c`). Retype params, rename variables. ~2–4h per
+   non-trivial function.
+2. **Feed the cleaned C into decomp.me** with the original function's
+   bytes as target asm. Iterate until byte-match (or asymptotic
+   similarity). ~1–2h per function.
+3. **retdec** is useful only for functions with lots of absolute-address
+   refs we want a second opinion on — which is rare in the poll-loop
+   patches.
+
+For a production C re-implementation of the 20 poll sites, Ghidra →
+decomp.me is the correct pipeline. Skip retdec for those.
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+"""Slice named functions out of the RK3588 DDR v1.19 conservative blob."""
+import os
+
+BLOB = os.path.expanduser('~/projects/AMPere/rkbin/bin/rk35/rk3588_ddr_lp4_1848MHz_lp5_2112MHz_v1.19.bin')
+BASE = os.path.expanduser('~/projects/AMPere/benchmark')
+
+functions = [
+    ('01_memset',        0x0aac, 0x1c, 'byte memset'),
+    ('02_memcpy32',      0x1200, 0x24, 'word-aligned memcpy32'),
+    ('03_magic_memset',  0x0da4, 0x28, 'magic-check + tail-call to memset'),
+]
+
+blob = open(BLOB, 'rb').read()
+for name, off, sz, desc in functions:
+    d = os.path.join(BASE, name)
+    os.makedirs(d, exist_ok=True)
+    with open(os.path.join(d, 'func.bin'), 'wb') as f:
+        f.write(blob[off:off+sz])
+    print(f"{name}: {sz} bytes @ 0x{off:x}  — {desc}")
@@ -0,0 +1,26 @@
+# Parent directory of this one; the per-function benchmark subdirs live there.
+BENCH := $(abspath ..)
+
+.PHONY: all clean
+all: gdb_debug.elf
+
+# Wrap each benchmark function's raw bytes into an .o with predictable
+# symbols _binary_func_NN_bin_{start,end}, regardless of the cwd-dependent
+# symbol names that `ld -b binary` generates.
+#
+# WRAP_BIN(stem, subdir) expands to a rule that builds <stem>.o from
+# $(BENCH)/<subdir>/func.bin:
+#   1. copy func.bin to ./<stem>.bin,
+#   2. link it with `ld -r -b binary` into <stem>.o.raw,
+#   3. list the symbols with nm and let awk turn every name ending in
+#      _func_bin_start / _func_bin_end into an objcopy --redefine-sym
+#      option that renames it to _binary_<stem>_bin_{start,end},
+#   4. delete the temporary .bin and .raw files.
+# The template is expanded twice (once by $(call), once by $(eval)) before
+# the recipe runs, which is why `$` is doubled or quadrupled below.
+# NOTE(review): the input handed to ld is already named <stem>.bin, so the
+# awk patterns may never match and objcopy may act as a plain copy —
+# confirm with `nm <stem>.o`.
+define WRAP_BIN
+$1.o: $(BENCH)/$2/func.bin
+	cp $$< $1.bin
+	ld -r -b binary -o $$@.raw $1.bin
+	rm -f $1.bin
+	objcopy $$$$(nm $$@.raw | awk '/_func_bin_start$$$$/{printf " --redefine-sym %s=_binary_$1_bin_start",$$$$3} /_func_bin_end$$$$/{printf " --redefine-sym %s=_binary_$1_bin_end",$$$$3}') $$@.raw $$@
+	rm -f $$@.raw
+endef
+
+# Instantiate one wrapping rule per benchmark function.
+$(eval $(call WRAP_BIN,func_01,01_memset))
+$(eval $(call WRAP_BIN,func_02,02_memcpy32))
+$(eval $(call WRAP_BIN,func_03,03_magic_memset))
+
+# Link the C harness with the three wrapped function objects (unoptimised,
+# with debug info).
+gdb_debug.elf: harness.c func_01.o func_02.o func_03.o
+	gcc -O0 -g -Wall -o $@ $^
+
+# Removes the ELF, the wrapped objects and any leftover .bin copies in this
+# directory; a stale *.o.raw from an interrupted build is not covered.
+clean:
+	rm -f gdb_debug.elf func_*.o *.bin
@@ -0,0 +1,72 @@
+# gdb_debug — single-step the benchmark functions under GDB
+
+Wraps each of `01_memset` / `02_memcpy32` / `03_magic_memset` in a
+C harness, copies the raw bytes into an RWX buffer, and calls through
+a function pointer. GDB attached to the harness lets you step every
+machine instruction of the real blob code — **no QEMU needed because
+boltzmann (and ampere, ohm, hertz) are natively aarch64.**
+
+## Build
+
+```
+make                 # builds ./gdb_debug.elf natively on aarch64
+```
+
+There is no cross-build target in the Makefile yet. If you ever want to
+run on x86 oppenheimer via qemu-user, edit the Makefile to replace `gcc`
+with `aarch64-linux-gnu-gcc`, `ld` with `aarch64-linux-gnu-ld`, and
+`objcopy`/`nm` with their `aarch64-linux-gnu-` counterparts, then launch
+under `qemu-aarch64-static -g 1234 ./gdb_debug.elf 1` with
+`gdb-multiarch` attaching to `:1234`.
+
+## Run under GDB
+
+```
+gdb ./gdb_debug.elf
+(gdb) set pagination off
+(gdb) layout split            # TUI: source / asm / regs split
+(gdb) break call_func         # the dispatcher — one breakpoint catches all three
+(gdb) run 1                   # 1=memset  2=memcpy32  3=magic_memset
+(gdb) stepi                   # one machine instruction
+(gdb) info reg                # full register dump
+(gdb) x/8i $pc                # peek 8 upcoming instructions
+(gdb) display/i $pc           # auto-show next instruction on every stop
+(gdb) x/16bx $x0              # hex-dump 16 bytes from what X0 points at
+```
+
+## What to look for
+
+### Function 1 (memset)
+After `MOV X3, #0`, each iteration: `CMP X2, X3` → `B.NE` → `STRB W1, [X0, X3]`
+→ `ADD X3, X3, #1` → back. Watch `$x3` advance, inspect `x/16bx $x0` to see
+the buffer filling with `0xAB`.
+
+### Function 2 (memcpy32)
+First instruction is the alignment mask: `AND X2, X2, #0xfffffffc`.
+Set a watchpoint on `$x2` to catch the mask, then step the loop to watch
+4-byte transfers: `LDR W4, [X1, X3]` ; `STR W4, [X0, X3]` ; `ADD X3, X3, #4`.
+
+### Function 3 (magic_memset)
+Will **SIGSEGV** on `LDR W2, [X0, #4]` because `X0 = 0x1fe000` is unmapped
+in user mode. That crash **is** the verification — it proves the function
+really does target that absolute address. To execute the full path, add
+before `call_func`:
+
+```c
+mmap((void*)0x1fe000, 4096, PROT_READ|PROT_WRITE,
+     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
+*(uint32_t*)0x1fe004 = 0x54410001;
+```
+
+Then the magic check passes and GDB steps into the tail-call to memset.
+
+## Why this scaffold beats `ddr_emu2` for verifying trampolines
+
+`ddr_emu2` dies at PC=0x10a80 in the emulator because it can't model an
+MMIO register — a blind spot for us. Native GDB on an aarch64 host runs the
+*actual* CPU with full instruction fidelity; the limit becomes "can we
+fake the MMIO responses?" rather than "does the emulator know this
+instruction?". For compute-only code (functions 1 and 2), zero prep
+needed. For MMIO-touching code, `mmap(MAP_FIXED)` + a signal handler
+stub can serve as a synthetic PHY — **that's the path to single-stepping
+a patched trampoline through the real ISA with fake hardware replies**,
+which is exactly what the next round of v3fb verification would need.
@@ -0,0 +1,74 @@
+/* Generic harness for single-stepping one of the benchmark functions under GDB.
+ * Copies the raw bytes of funcNN.bin into an RWX buffer and calls through
+ * a function pointer. GDB stepi from the call site drops you right into the
+ * target function's first instruction. No QEMU needed — boltzmann is aarch64.
+ *
+ * Build: run `make` in this dir (native aarch64 only, for now).
+ * Run:   ./gdb_debug.elf {1|2|3}    (1=memset 2=memcpy32 3=magic_memset)
+ *
+ * Under GDB: see README.md.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+/* Start/end markers of the three raw code images linked into the harness;
+ * main() uses end - start as the number of bytes to copy. */
+extern uint8_t _binary_func_01_bin_start[], _binary_func_01_bin_end[];
+extern uint8_t _binary_func_02_bin_start[], _binary_func_02_bin_end[];
+extern uint8_t _binary_func_03_bin_start[], _binary_func_03_bin_end[];
+
+/* Call signatures call_func() casts the loaded code to:
+ * f1_t for index 1 (memset), f2_t for index 2 (memcpy32),
+ * f3_t for index 3 (magic_memset). */
+typedef void (*f1_t)(void *, uint8_t, uint64_t);
+typedef void (*f2_t)(uint32_t *, const uint32_t *, uint64_t);
+typedef void (*f3_t)(void);
+
+/* Copy `len` bytes of raw machine code from `src` into a fresh anonymous
+ * read/write/execute mapping and return the address of the copy.
+ *
+ * The mapping is at least 4096 bytes and grows with `len`, so an image
+ * larger than 4096 bytes cannot overrun it. The instruction cache is
+ * flushed over the copied range before returning, so the caller may jump
+ * straight into the result.
+ *
+ * Exits the process with status 1 if mmap() fails. The mapping is never
+ * unmapped.
+ */
+static void *rwx_copy(const void *src, size_t len) {
+    const size_t min_map = 4096;
+    size_t map_len = len > min_map ? len : min_map;
+    void *p = mmap(NULL, map_len, PROT_READ | PROT_WRITE | PROT_EXEC,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) { perror("mmap"); exit(1); }
+    memcpy(p, src, len);
+    /* Make the freshly written bytes visible to instruction fetch. */
+    __builtin___clear_cache(p, (char *)p + len);
+    return p;
+}
+
+/* Dispatcher that invokes the loaded code with arguments suited to the
+ * selected benchmark function. Kept noinline so a single breakpoint on
+ * call_func catches all three cases.
+ *
+ *   which == 1: fills the first 16 bytes of a zeroed 64-byte buffer with
+ *               0xAB and prints buf[10] before and after the call.
+ *   which == 2: copies eight 32-bit words 0xDEAD0000|i and prints dst[3].
+ *   which == 3: calls with no arguments.
+ * Any other value does nothing.
+ */
+static void __attribute__((noinline))
+call_func(void (*fn)(void), int which) {
+    if (which == 1) {
+        char scratch[64] = {0};
+        f1_t fill = (f1_t)fn;
+        printf("pre:  buf[10]=0x%02x\n", (uint8_t)scratch[10]);
+        fill(scratch, 0xAB, 16);
+        printf("post: buf[10]=0x%02x (expect 0xab)\n", (uint8_t)scratch[10]);
+        return;
+    }
+
+    if (which == 2) {
+        uint32_t out[8] = {0};
+        uint32_t in[8];
+        f2_t copy_words = (f2_t)fn;
+        for (uint32_t k = 0; k < 8; k++) {
+            in[k] = 0xDEAD0000U | k;
+        }
+        copy_words(out, in, sizeof out);
+        printf("dst[3]=0x%08x (expect 0xdead0003)\n", out[3]);
+        return;
+    }
+
+    if (which == 3) {
+        f3_t magic = (f3_t)fn;
+        printf("calling magic_memset — SIGSEGVs on LDR of 0x1fe004 in user mode.\n");
+        magic();
+    }
+}
+
+/* Entry point: ./gdb_debug.elf {1|2|3}
+ *
+ * Parses the function index, copies the matching embedded code image into
+ * executable memory with rwx_copy(), prints where it was loaded and hands
+ * it to call_func(). Returns 2 on a usage error or an unknown index,
+ * 0 otherwise.
+ */
+int main(int argc, char **argv) {
+    if (argc != 2) { fprintf(stderr, "usage: %s {1|2|3}\n", argv[0]); return 2; }
+
+    /* strtol rather than atoi so that trailing garbage ("1x") or an empty
+     * argument is rejected instead of being silently accepted. */
+    char *end = NULL;
+    long which = strtol(argv[1], &end, 10);
+    if (end == argv[1] || *end != '\0') {
+        fprintf(stderr, "usage: %s {1|2|3}\n", argv[0]);
+        return 2;
+    }
+
+    const uint8_t *start;
+    const uint8_t *stop;
+    switch (which) {
+    case 1: start = _binary_func_01_bin_start; stop = _binary_func_01_bin_end; break;
+    case 2: start = _binary_func_02_bin_start; stop = _binary_func_02_bin_end; break;
+    case 3: start = _binary_func_03_bin_start; stop = _binary_func_03_bin_end; break;
+    default: fprintf(stderr, "unknown index %ld\n", which); return 2;
+    }
+
+    void *code = rwx_copy(start, (size_t)(stop - start));
+    /* %p requires a void *, so print the data pointer rather than the
+     * function pointer derived from it. */
+    printf("function %ld loaded at %p\n", which, code);
+    call_func((void (*)(void))code, (int)which);
+    return 0;
+}