From 00d655187a247d9b64a2f96a79732513125791dc Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Wed, 15 Apr 2026 07:26:23 +0200
Subject: [PATCH] benchmark/: three-way RE-tool comparison + first real C-lift
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four functions (three small warm-ups plus the first real poll-site
function) extracted from the v1.19 conservative blob with
ground-truth C and per-tool (Ghidra / retdec / decomp.me) docs:
  01_memset        — byte memset, 28 B
  02_memcpy32      — word-aligned memcpy, 36 B
  03_magic_memset  — magic check + tail-call to memset, 40 B
  04_train_phy_block — first real poll-site function (104 B, 26 insts),
                       contains poll sites 12-15

Results in RESULTS.md:
  - Ghidra: A on all four. Auto-decompile is close to final.
  - retdec: A on #3, F on #1 and #2 (no register-arg inference on raw),
    C on #4 (mistakes & 0xF0000000 for < 0x10000000).

GRIND_LOG.md (in 04_train_phy_block/) records the matching-decomp
iteration: 116-byte candidate.c at -Os vs vendor 104 bytes = 89.7%
size match on first real iteration. Remaining gap is GCC's choice of
`cmp w, w_const; b.ls` over vendor's `tst w, #imm; b.eq` for the
mask tests.

gdb_debug/ holds a native-aarch64 GDB single-stepper for the three
benchmark functions — boltzmann smoke test passed (memset:
buf[10] 0x00→0xab).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmark/01_memset/decompme.md           |  58 ++++++++
 benchmark/01_memset/func.bin              | Bin 0 -> 28 bytes
 benchmark/01_memset/func.s                |  14 ++
 benchmark/01_memset/ghidra.md             |  41 ++++++
 benchmark/01_memset/reference.c           |  24 +++
 benchmark/01_memset/retdec.c              |  38 +++++
 benchmark/01_memset/retdec.md             |  38 +++++
 benchmark/02_memcpy32/func.bin            | Bin 0 -> 36 bytes
 benchmark/02_memcpy32/func.s              |  16 ++
 benchmark/02_memcpy32/reference.c         |  29 ++++
 benchmark/02_memcpy32/retdec.c            |  38 +++++
 benchmark/03_magic_memset/func.bin        | Bin 0 -> 40 bytes
 benchmark/03_magic_memset/func.s          |  17 +++
 benchmark/03_magic_memset/reference.c     |  44 ++++++
 benchmark/03_magic_memset/retdec.c        |  30 ++++
 benchmark/04_train_phy_block/GRIND_LOG.md |  80 ++++++++++
 benchmark/04_train_phy_block/candidate.c  |  36 +++++
 benchmark/04_train_phy_block/decompme.md  |  71 +++++++++
 benchmark/04_train_phy_block/func.bin     | Bin 0 -> 104 bytes
 benchmark/04_train_phy_block/func.s       |  33 +++++
 benchmark/04_train_phy_block/ghidra.c     |  18 +++
 benchmark/04_train_phy_block/reference.c  |  89 +++++++++++
 benchmark/README.md                       |  36 +++++
 benchmark/RESULTS.md                      | 171 ++++++++++++++++++++++
 benchmark/extract.py                      |  20 +++
 benchmark/gdb_debug/Makefile              |  26 ++++
 benchmark/gdb_debug/README.md             |  72 +++++++++
 benchmark/gdb_debug/func_01.o             | Bin 0 -> 648 bytes
 benchmark/gdb_debug/func_02.o             | Bin 0 -> 656 bytes
 benchmark/gdb_debug/func_03.o             | Bin 0 -> 656 bytes
 benchmark/gdb_debug/gdb_debug.elf         | Bin 0 -> 76152 bytes
 benchmark/gdb_debug/harness.c             |  74 ++++++++++
 32 files changed, 1113 insertions(+)
 create mode 100644 benchmark/01_memset/decompme.md
 create mode 100644 benchmark/01_memset/func.bin
 create mode 100644 benchmark/01_memset/func.s
 create mode 100644 benchmark/01_memset/ghidra.md
 create mode 100644 benchmark/01_memset/reference.c
 create mode 100644 benchmark/01_memset/retdec.c
 create mode 100644 benchmark/01_memset/retdec.md
 create mode 100644 benchmark/02_memcpy32/func.bin
 create mode 100644 benchmark/02_memcpy32/func.s
 create mode 100644 benchmark/02_memcpy32/reference.c
 create mode 100644 benchmark/02_memcpy32/retdec.c
 create mode 100644 benchmark/03_magic_memset/func.bin
 create mode 100644 benchmark/03_magic_memset/func.s
 create mode 100644 benchmark/03_magic_memset/reference.c
 create mode 100644 benchmark/03_magic_memset/retdec.c
 create mode 100644 benchmark/04_train_phy_block/GRIND_LOG.md
 create mode 100644 benchmark/04_train_phy_block/candidate.c
 create mode 100644 benchmark/04_train_phy_block/decompme.md
 create mode 100644 benchmark/04_train_phy_block/func.bin
 create mode 100644 benchmark/04_train_phy_block/func.s
 create mode 100644 benchmark/04_train_phy_block/ghidra.c
 create mode 100644 benchmark/04_train_phy_block/reference.c
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/RESULTS.md
 create mode 100644 benchmark/extract.py
 create mode 100644 benchmark/gdb_debug/Makefile
 create mode 100644 benchmark/gdb_debug/README.md
 create mode 100644 benchmark/gdb_debug/func_01.o
 create mode 100644 benchmark/gdb_debug/func_02.o
 create mode 100644 benchmark/gdb_debug/func_03.o
 create mode 100755 benchmark/gdb_debug/gdb_debug.elf
 create mode 100644 benchmark/gdb_debug/harness.c

diff --git a/benchmark/01_memset/decompme.md b/benchmark/01_memset/decompme.md
new file mode 100644
index 0000000..813f756
--- /dev/null
+++ b/benchmark/01_memset/decompme.md
@@ -0,0 +1,58 @@
+# decomp.me recipe — 01_memset
+
+## Create a scratch
+
+Open https://decomp.me/ (or your self-hosted instance at
+http://192.168.88.64 if available). Click **New scratch**.
+
+- **Platform / Compiler:** `gcc 12.x aarch64-linux-gnu` (or whatever
+  aarch64-gcc is offered — exact version doesn't matter much for this
+  size).
+- **Compiler flags:** `-O2 -ffreestanding -nostdlib`
+- **Diff label:** `memset_byte`
+
+## Target asm
+
+Paste the following into the **"Target asm"** box:
+
+```asm
+memset_byte:
+    mov     x3, #0x0
+.Lloop:
+    cmp     x2, x3
+    b.ne    .Lbody
+    ret
+.Lbody:
+    strb    w1, [x0, x3]
+    add     x3, x3, #0x1
+    b       .Lloop
+```
+
+## Context (headers/decls)
+
+```c
+#include <stddef.h>
+#include <stdint.h>
+```
+
+## Source
+
+Paste the ground-truth C from `reference.c` (or write your own first
+and iterate).
+
+## Expected workflow
+
+- First compile: scorer usually reports a high similarity (>= 80%) if
+  the compiler picks the same `while (i != n)` pattern.
+- Fine-tune: try `i++` vs `i+=1`, try `while` vs `for`, try `uint8_t *`
+  cast placement. Each yields a distinct register-allocation order the
+  scorer rewards or punishes.
+- Perfect match possible if you hit the exact code shape GCC chose.
+
+## Benchmark notes
+
+- decomp.me's strength is the **compile-and-diff** feedback loop — every
+  edit immediately shows the byte-delta against the target.
+- Weakness for this target: the real blob was likely built with a
+  different compiler (ARMCC / Keil / vendor LLVM?). GCC may never match
+  exactly even with perfect C. Similarity >= 90% is the realistic ceiling.
diff --git a/benchmark/01_memset/func.bin b/benchmark/01_memset/func.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a80e2e59272b578294cc50107974149fc8cd201a
GIT binary patch
literal 28
kcmZQ(Xt)&5!2H^gfg$7obNn^N3}uUC7KVwx|Nj>U0DLJ4`~Uy|

literal 0
HcmV?d00001

diff --git a/benchmark/01_memset/func.s b/benchmark/01_memset/func.s
new file mode 100644
index 0000000..7da6e22
--- /dev/null
+++ b/benchmark/01_memset/func.s
@@ -0,0 +1,14 @@
+
+01_memset/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000000aac <.data>:
+ aac:	d2800003 	mov	x3, #0x0                   	// #0
+ ab0:	eb03005f 	cmp	x2, x3
+ ab4:	54000041 	b.ne	0xabc  // b.any
+ ab8:	d65f03c0 	ret
+ abc:	38236801 	strb	w1, [x0, x3]
+ ac0:	91000463 	add	x3, x3, #0x1
+ ac4:	17fffffb 	b	0xab0
diff --git a/benchmark/01_memset/ghidra.md b/benchmark/01_memset/ghidra.md
new file mode 100644
index 0000000..bea3aec
--- /dev/null
+++ b/benchmark/01_memset/ghidra.md
@@ -0,0 +1,41 @@
+# Ghidra recipe — 01_memset
+
+## Load
+
+**File → Import File…** → `func.bin`.
+
+In the import dialog:
+- **Format:** Raw Binary
+- **Language:** AArch64:LE:64:v8A
+- **Base Address:** `0x0aac`  ← recommended; branches are PC-relative, so
+  they decode at any base, but loading at the real offset keeps addresses
+  identical to `func.s` (this slice has no absolute-address refs of its own).
+
+After import, click **Yes** on the "Analyze now?" prompt; default
+analyzers are fine.
+
+## What to look for in Ghidra's decompiler output
+
+- Function automatically detected at 0xaac (the file starts there).
+- Decompiler should produce something like:
+  ```c
+  void FUN_00000aac(long param_1, byte param_2, long param_3) {
+      long local_10 = 0;
+      while (local_10 != param_3) {
+          *(byte *)(param_1 + local_10) = param_2;
+          local_10++;
+      }
+  }
+  ```
+- Idiomatic match rate: high for this pattern; Ghidra's decompiler
+  recognises the pre-test loop well.
+- Ghidra types: `byte` (uint8_t), `long` (the 64-bit register) — not
+  directly `uint8_t` / `size_t`. Manual retyping is usually needed.
+
+## Benchmark notes
+
+- Time to understandable output: ~seconds (auto-analysis).
+- Manual cleanup: rename `FUN_00000aac` → `memset_byte`; retype
+  `param_1` to `void *`, `param_2` to `uint8_t`, `param_3` to `size_t`.
+- Limits: Ghidra's decompiler is position-dependent on the load address
+  only for jump targets beyond the slice — irrelevant here.
diff --git a/benchmark/01_memset/reference.c b/benchmark/01_memset/reference.c
new file mode 100644
index 0000000..ffa7633
--- /dev/null
+++ b/benchmark/01_memset/reference.c
@@ -0,0 +1,24 @@
+/* Ground-truth C for FUN_00000aac @ blob offset 0xaac (28 bytes / 7 insts).
+ *
+ * Pattern:  byte-wise memset with a simple counting loop.
+ * Signature:  void memset_byte(void *buf, uint8_t val, size_t len);
+ *
+ * AArch64 ABI:  X0 = buf, W1 = val (low byte), X2 = len
+ * Scratch:      X3 = index i
+ *
+ * Notes the decompiler should ideally recover:
+ *   - This is unambiguously "memset" semantics; bonus points for naming it so.
+ *   - The loop structure is pre-test (cmp before body) — tools should emit
+ *     `while (i != len)` or `for (; i < len; ...)`.
+ *   - W1 is truncated to a byte by the STRB; decompiler should mark val as u8.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+void memset_byte(void *buf, uint8_t val, size_t len) {   /* store val into buf[0] .. buf[len-1]; returns nothing */
+    size_t i = 0;                      /* byte index (X3 in the vendor code) */
+    while (i != len) {                 /* pre-test loop: len == 0 stores nothing */
+        ((uint8_t *)buf)[i] = val;     /* one byte per iteration (strb w1, [x0, x3]) */
+        i++;
+    }
+}
diff --git a/benchmark/01_memset/retdec.c b/benchmark/01_memset/retdec.c
new file mode 100644
index 0000000..40613ee
--- /dev/null
+++ b/benchmark/01_memset/retdec.c
@@ -0,0 +1,38 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0xaac - 0xac8
+int64_t entry_point(void) {
+    // 0xaac
+    int64_t result; // 0xaac
+    if (result == 0) {
+        // 0xab8
+        return result;
+    }
+    int64_t v1 = 0; // 0xac0
+    *(char *)(v1 + result) = (char)result;
+    v1++;
+    while (result != v1) {
+        // 0xabc
+        *(char *)(v1 + result) = (char)result;
+        v1++;
+    }
+    // 0xab8
+    return result;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: starforce (3.x)
+// Detected functions: 1
+
diff --git a/benchmark/01_memset/retdec.md b/benchmark/01_memset/retdec.md
new file mode 100644
index 0000000..bba7818
--- /dev/null
+++ b/benchmark/01_memset/retdec.md
@@ -0,0 +1,38 @@
+# retdec recipe — 01_memset
+
+retdec runs fully automated — hand it the binary, ask for C.
+
+## Invocation (on the decompme container at pve4, or wherever retdec lives)
+
+```
+retdec --mode raw --arch arm --endian little --bit-size 64 \
+    --raw-entry-point 0x0aac \
+    --raw-section-vma 0x0aac \
+    func.bin -o retdec.c
+```
+
+The flags:
+- `--mode raw` — input is a flat binary, no PE/ELF headers.
+- `--arch arm --endian little --bit-size 64` — AArch64 LE.
+- `--raw-entry-point 0x0aac` — tell retdec where execution starts.
+- `--raw-section-vma 0x0aac` — load the binary at address 0x0aac so
+  branch targets resolve correctly.
+
+Output goes to `retdec.c`. retdec emits a .ll (LLVM IR) and a .dsm
+(disasm) alongside — all useful for comparison.
+
+## What to expect
+
+retdec is the least "smart" of the three tools. For a raw 28-byte blob
+with no headers, it will:
+- Detect the function at 0x0aac.
+- Produce a C function named `function_aac` or similar.
+- Often inserts pseudo-intrinsics like `__asm_mov(x3, 0)` for instructions
+  it doesn't fold into C. For this tiny loop it usually manages clean C.
+
+## Benchmark notes
+
+- Strength: zero-touch, scriptable, good for bulk processing.
+- Weakness: no interactive refinement — you get what you get. Type
+  inference is conservative (`int32_t *` instead of `void *` / `uint8_t *`).
+- Often emits control flow as `goto` rather than structured loops.
diff --git a/benchmark/02_memcpy32/func.bin b/benchmark/02_memcpy32/func.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56cf856d445321ac043f3c969d691083381b63a0
GIT binary patch
literal 36
scmZ=Nshh;i&~T}qf$6m)14GCG=J;zW8Ob|XGL&~D3ouOl_5Z&(0PPbF761SM

literal 0
HcmV?d00001

diff --git a/benchmark/02_memcpy32/func.s b/benchmark/02_memcpy32/func.s
new file mode 100644
index 0000000..12e831b
--- /dev/null
+++ b/benchmark/02_memcpy32/func.s
@@ -0,0 +1,16 @@
+
+02_memcpy32/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000001200 <.data>:
+    1200:	927e7442 	and	x2, x2, #0xfffffffc
+    1204:	d2800003 	mov	x3, #0x0                   	// #0
+    1208:	eb02007f 	cmp	x3, x2
+    120c:	54000041 	b.ne	0x1214  // b.any
+    1210:	d65f03c0 	ret
+    1214:	b8636824 	ldr	w4, [x1, x3]
+    1218:	b8236804 	str	w4, [x0, x3]
+    121c:	91001063 	add	x3, x3, #0x4
+    1220:	17fffffa 	b	0x1208
diff --git a/benchmark/02_memcpy32/reference.c b/benchmark/02_memcpy32/reference.c
new file mode 100644
index 0000000..89978fd
--- /dev/null
+++ b/benchmark/02_memcpy32/reference.c
@@ -0,0 +1,29 @@
+/* Ground-truth C for FUN_00001200 @ blob offset 0x1200 (36 bytes / 9 insts).
+ *
+ * Pattern:  word-aligned memcpy; length rounded down to word multiple.
+ * Signature:  void memcpy32(uint32_t *dst, const uint32_t *src, size_t len_bytes);
+ *
+ * AArch64 ABI:  X0 = dst, X1 = src, X2 = len (in bytes, rounded down to 4)
+ * Scratch:      X3 = byte index i, W4 = word register for transfer
+ *
+ * Notes the decompiler should ideally recover:
+ *   - `AND x2, x2, #0xFFFFFFFC` is `len &= 0xFFFFFFFCu`: clears the low 2 bits
+ *     and bits [63:32]. (Tools often render as `len & 0xFFFFFFFC` or `len & ~3`.)
+ *   - Inner loop reads/writes 4 bytes at a time — tools should recognise
+ *     uint32_t pointers, or at least `*(u32*)(x0+i) = *(u32*)(x1+i)`.
+ *   - Addressing is byte-indexed with a step of 4 — some tools may emit
+ *     `for (i = 0; i < len; i += 4)` in bytes; others may normalise into
+ *     an index-based word loop.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+void memcpy32(uint32_t *dst, const uint32_t *src, size_t len_bytes) {   /* copy len_bytes (rounded down to 4) word by word */
+    len_bytes &= 0xFFFFFFFCu;              /* as `and x2, x2, #0xfffffffc`: keeps bits [31:2] only */
+    size_t i = 0;                          /* byte offset, advances in steps of 4 */
+    while (i != len_bytes) {               /* pre-test loop: fewer than 4 bytes copies nothing */
+        *(uint32_t *)((uint8_t *)dst + i) =
+        *(const uint32_t *)((const uint8_t *)src + i);
+        i += 4;
+    }
+}
diff --git a/benchmark/02_memcpy32/retdec.c b/benchmark/02_memcpy32/retdec.c
new file mode 100644
index 0000000..fb373ad
--- /dev/null
+++ b/benchmark/02_memcpy32/retdec.c
@@ -0,0 +1,38 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0x1200 - 0x1224
+int64_t entry_point(void) {
+    // 0x1200
+    int64_t result; // 0x1200
+    int64_t v1 = result & 0xfffffffc; // 0x1200
+    if (v1 == 0) {
+        // 0x1210
+        return result;
+    }
+    int64_t v2 = 0;
+    int64_t v3 = v2 + 4; // 0x121c
+    while (v3 != v1) {
+        // 0x1214
+        v2 = v3;
+        v3 = v2 + 4;
+    }
+    // 0x1210
+    return result;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: starforce (3.x)
+// Detected functions: 1
+
diff --git a/benchmark/03_magic_memset/func.bin b/benchmark/03_magic_memset/func.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a4473803f3bf19d5a9b538153702ec1f26f4f968
GIT binary patch
literal 40
vcmaDLU%W|?p&>}IV^t9oi^I-%2FC1028NKP)P_rpKyj=8|HTh5$6o^g9(WGV

literal 0
HcmV?d00001

diff --git a/benchmark/03_magic_memset/func.s b/benchmark/03_magic_memset/func.s
new file mode 100644
index 0000000..52bd85f
--- /dev/null
+++ b/benchmark/03_magic_memset/func.s
@@ -0,0 +1,17 @@
+
+03_magic_memset/func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+0000000000000da4 <.data>:
+ da4:	b2731fe0 	mov	x0, #0x1fe000              	// #2088960
+ da8:	52800021 	mov	w1, #0x1                   	// #1
+ dac:	72aa8821 	movk	w1, #0x5441, lsl #16
+ db0:	b9400402 	ldr	w2, [x0, #4]
+ db4:	6b01005f 	cmp	w2, w1
+ db8:	54000081 	b.ne	0xdc8  // b.any
+ dbc:	d2806582 	mov	x2, #0x32c                 	// #812
+ dc0:	52800001 	mov	w1, #0x0                   	// #0
+ dc4:	17ffff3a 	b	0xaac
+ dc8:	d65f03c0 	ret
diff --git a/benchmark/03_magic_memset/reference.c b/benchmark/03_magic_memset/reference.c
new file mode 100644
index 0000000..e52aa0f
--- /dev/null
+++ b/benchmark/03_magic_memset/reference.c
@@ -0,0 +1,44 @@
+/* Ground-truth C for FUN_00000da4 @ blob offset 0xda4 (40 bytes / 10 insts).
+ *
+ * Pattern:  magic-number check at absolute address, then tail-call to memset.
+ * Signature:  void check_and_zero(void);
+ *
+ * AArch64 ABI:  no args, no return value
+ * Scratch:      X0..X2, W1, W2
+ *
+ * Behaviour:
+ *   uint32_t *magic = (uint32_t *)0x1fe000;
+ *   if (magic[1] == 0x54410001)           // == ATAG_CORE ('T','A',0x00,0x01)
+ *       memset(magic, 0, 0x32c);          // tail-call to FUN_00000aac
+ *   // else: fall through, return
+ *
+ * Notes the decompiler should ideally recover:
+ *   - `orr x0, xzr, #0x1fe000` is an immediate-load idiom for `x0 = 0x1fe000`;
+ *     encoded as OR-with-zero so ARM assemblers can pack it.
+ *     Tools that don't know the ORR-imm trick may render this as
+ *     `x0 = 0 | 0x1fe000` or worse `x0 = 0 | 0x1FE000UL` with weird types.
+ *   - `MOV w1, #0x1 ; MOVK w1, #0x5441, LSL #16` composes a 32-bit literal
+ *     0x54410001. A good tool collapses both into `w1 = 0x54410001`.
+ *   - `LDR w2, [X0, #0x4]` reads `magic[1]`, i.e. the second word at the
+ *     magic region. 0x54410001 is the ARM boot-tag ATAG_CORE id and a tag
+ *     header is {size, tag} — presumably a Rockchip ATAGS list head (verify).
+ *   - `B 0xaac` is a tail-call: control transfers to memset with X0, W1, X2
+ *     already set up; no BL / return path. Tools should emit this as
+ *     `return memset(x0, w1, x2);` or at least a clear call — not an
+ *     inlined body.
+ *
+ * Address 0x1fe000 is 0x001f_e000 (2 MiB - 8 KiB), not 0x1fe0_0000; whether
+ * that is RK3588 PMU-SRAM or low DRAM is unverified (TODO: check the memory
+ * map). It is memory, not MMIO, but tools may flag the large constant.
+ */
+#include <stddef.h>
+#include <stdint.h>
+
+extern void memset_byte(void *buf, uint8_t val, size_t len); /* FUN_00000aac */
+
+void check_and_zero(void) {                       /* zero 0x32c bytes at 0x1fe000 iff the word at +4 is 0x54410001 */
+    uint32_t *magic = (uint32_t *)0x1fe000UL;     /* fixed absolute address (mov x0, #0x1fe000) */
+    if (magic[1] == 0x54410001U) {                /* magic[1] is the 32-bit word at 0x1fe004 */
+        memset_byte(magic, 0, 0x32c);             /* 0x32c = 812 bytes; the vendor code tail-calls (b 0xaac) */
+    }
+}
diff --git a/benchmark/03_magic_memset/retdec.c b/benchmark/03_magic_memset/retdec.c
new file mode 100644
index 0000000..813ab6a
--- /dev/null
+++ b/benchmark/03_magic_memset/retdec.c
@@ -0,0 +1,30 @@
+//
+// This file was generated by the Retargetable Decompiler
+// Website: https://retdec.com
+//
+
+#include <stdint.h>
+
+// ------------------- Function Prototypes --------------------
+
+int64_t entry_point(void);
+int64_t unknown_aac(int64_t a1, int64_t a2, int64_t a3);
+
+// ------------------------ Functions -------------------------
+
+// Address range: 0xda4 - 0xdcc
+int64_t entry_point(void) {
+    // 0xda4
+    if (*(int32_t *)0x1fe004 == 0x54410001) {
+        // 0xdbc
+        return unknown_aac(0x1fe000, 0, 812);
+    }
+    // 0xdc8
+    return 0x1fe000;
+}
+
+// --------------------- Meta-Information ---------------------
+
+// Detected compiler/packer: molebox (2.0)
+// Detected functions: 1
+
diff --git a/benchmark/04_train_phy_block/GRIND_LOG.md b/benchmark/04_train_phy_block/GRIND_LOG.md
new file mode 100644
index 0000000..06d3dc2
--- /dev/null
+++ b/benchmark/04_train_phy_block/GRIND_LOG.md
@@ -0,0 +1,80 @@
+# GRIND_LOG — first real-blob C-lift
+
+Function: **FUN_0000d328** @ blob offset 0xd328 (104 bytes / 26 insts).
+Contains 4 of our 16 timeout-less polls (sites 12, 13, 14, 15).
+Semantics: **PHY block training step** — poke CTL, wait for two STAT
+bits, apply two CFG values with HANDSHAKE acks, ack via CTL.
+
+## Tools tried (single-pass, no iteration yet)
+
+| tool | output file | grade |
+|---|---|---|
+| Ghidra 11.3 (auto-decompile) | `ghidra.c` | **A.** All 4 polls correctly modeled as `do {} while`. Collapsed the `(base + 0x8000) + offset` arithmetic into a single offset (`lVar1 + 0x8110` etc.) — actually MORE useful than a hand-written reference because it surfaces the absolute register addresses. Type cleanup needed (`undefined4`/`uint`/`long`). |
+| retdec v5.0 (zero-touch raw mode) | `retdec.c` | **C.** Recognised the function and the polls but: misread bitmask tests as comparisons (`*v6 % 4 == 0` for `& 3`, `< 0x10000000` for `& 0xF0000000`). Fabricated a return value for a void function. Loop bodies marked as `continue ->` comments. Usable as a sanity-check second opinion, not as a basis for rewriting. |
+| ground truth (hand-written) | `reference.c` | n/a — this is the canonical interpretation we judge against. |
+
+## Matching-decomp candidate iterations (the actual grind)
+
+Goal: a `.c` file that compiles to bytes close to the original 104-byte
+slice. Score = `min(candidate_size, vendor_size) / max(candidate_size, vendor_size)`
+after instruction-by-instruction diff (manual until objdiff is installed).
+
+### Iteration 1: cast-on-each-access, `-O2`
+- Pattern: `*(volatile u32 *)(base + offset)` per access.
+- GCC behavior: materialised each `0x8XXX` offset into its own register
+  (`mov x2, #0x8120; add x2, x3, x2; ldr w0, [x2]`), exploding code size.
+- Result: ~160 bytes. **~65% size match (104/160). Bad.**
+
+### Iteration 2 (current best): pre-adjust base outside volatile chain, `-Os`
+- Pattern: `unsigned char *phy = base + 0x8000` once, then `*(u32v *)(phy + small)`.
+- `-Os` instead of `-O2` — drops loop-alignment NOPs.
+- Result: **116 bytes (29 insts)**. **89.7% size match (104/116).** See `candidate.c`.
+
+### Remaining gap to vendor (12 bytes = 3 instructions)
+
+1. GCC turns `(x & 0xF0000000) == 0` into `cmp w, w_loaded_const; b.ls`
+   instead of vendor's `tst w, #imm; b.eq`. Costs 4 bytes per loop, twice
+   = 8 bytes.
+2. GCC's `[base+0x184]` accesses in the handshake loop are `add x1, x0, #0x200;
+   ldur x2, [x1, #-124]`: a 64-bit load whose offset 0x184 is not a multiple
+   of 8, so it has no scaled `ldr` form (vendor loads 32-bit). Costs ~4 bytes.
+
+### Next iteration ideas
+
+- **Inline-asm** for the mask-tests to force TST encoding directly. Cheap
+  win, gets us to ~108 bytes.
+- **Clang** (different scheduler, sometimes nicer with TST-style
+  comparisons). Try `clang -Oz -ffreestanding -target aarch64-none-elf`.
+- **ARMCC** — the most likely vendor compiler. Sourcing armclang for
+  AArch64 requires an Arm Developer account; backlog item.
+- **objdiff** — once installed, automate the byte-diff scoring instead
+  of eyeballing.
+
+## Workflow validation
+
+- ✓ Function extracted from blob as standalone .bin slice.
+- ✓ Three decompiler views captured (Ghidra, retdec, hand-written reference).
+- ✓ Candidate compiles + runs (matches reference semantics).
+- ✓ Single-pass byte-comparison done by hand; got 89.7% on iteration 2.
+- ✗ objdiff not installed — would automate the scoring.
+- ✗ decomp.me self-host not yet running on pve4 — would crowdsource the
+  grind via the standard interface.
+- ✗ ARMCC not installed — perfect-match unattainable without it.
+
+**The pipeline works.** Each future poll-site function follows the
+same 4-step recipe: extract → Ghidra-clean → write candidate → iterate
+until ≥90 % match. Estimated ~2-3 h per function for the small ones.
+
+## How this connects to the v3fb work
+
+This function contains 4 of the 16 poll sites. Once we have a
+byte-matching (or functionally-equivalent) C version, we can:
+
+1. Add bounded-retry counters in the C source — much cleaner than the
+   asm trampoline patcher.
+2. Compile + link as a freestanding `.o` at the original blob offset.
+3. Splice into the blob, replacing `FUN_0000d328` entirely.
+
+That's the path to a maintainable replacement for the trampoline-based
+v3fb approach, **for at least these 4 sites**. The other 12 sites live
+in different functions and would each need their own lift.
diff --git a/benchmark/04_train_phy_block/candidate.c b/benchmark/04_train_phy_block/candidate.c
new file mode 100644
index 0000000..2c20d00
--- /dev/null
+++ b/benchmark/04_train_phy_block/candidate.c
@@ -0,0 +1,36 @@
+/* Best matching candidate so far for FUN_0000d328.
+ * Compile:  gcc -Os -ffreestanding -nostdlib -c candidate.c -o candidate.o
+ * Score:    116 bytes vs vendor 104 bytes (89.7% size match, 12 bytes / 3
+ *           insts over) — measured while +0x184 was still read as a 64-bit
+ *           volatile; the polls below now use 32-bit loads, so re-measure.
+ *
+ * Remaining gap vs vendor:
+ *   - GCC emits `cmp w, w_loaded_const ; b.ls` for `(x & 0xF0000000) == 0`
+ *     instead of vendor's `tst w, #0xF0000000 ; b.eq`; the vendor form needs no
+ *     register for the mask, saving 4 bytes per loop, twice = 8 bytes.
+ *   - The former 64-bit read of +0x184 (offset not a multiple of 8) became
+ *     `add x1, x0, #0x200 ; ldur x2, [x1, #-124]` vs vendor `ldr w1, [x0, #0x184]`.
+ *
+ * Next iterations to try:
+ *   1. Inline-asm for the mask-tests to force TST encoding.
+ *   2. `__builtin_expect((x & 0xF0000000) != 0, 0)` to hint loop direction.
+ *   3. Alternative compilers: clang, ARMCC (the latter is what Rockchip
+ *      almost certainly used; need to source it).
+ */
+typedef volatile unsigned int  u32v;
+
+/* One PHY-block training step; ctx is an address, every poll is unbounded. */
+void train_phy_block(unsigned long ctx)
+{
+    unsigned char *phy = (unsigned char *)(*(unsigned long *)(ctx + 0xb8) + 0x8000);  /* pointer stored at ctx+0xb8, plus 0x8000 */
+    *(u32v *)(phy + 0x110) = 0xf000f000u;                     /* CTL: start */
+    while ((*(u32v *)(phy + 0x118) & 0xf0000000u) == 0u) ;    /* STAT_A: wait for any of bits [31:28] */
+    while ((*(u32v *)(phy + 0x120) & 0xf0000000u) == 0u) ;    /* STAT_B: wait for any of bits [31:28] */
+    *(u32v *)(phy + 0x160) = 0x30003u;                        /* CFG_B first, as the vendor does */
+    *(u32v *)(phy + 0x154) = 0x30003u;                        /* then CFG_A */
+    while (((unsigned long)*(u32v *)(phy + 0x184) & 3ul) == 0ul) ;  /* 32-bit load like `ldr w1`; widened to mirror `tst x1, #0x3` */
+    *(u32v *)(phy + 0x154) = 0x30000u;
+    while (((unsigned long)*(u32v *)(phy + 0x184) & 3ul) != 0ul) ;  /* wait for both low bits to clear again */
+    *(u32v *)(phy + 0x160) = 0x30000u;
+    *(u32v *)(phy + 0x110) = 0xf0000000u;                     /* CTL: clear */
+}
diff --git a/benchmark/04_train_phy_block/decompme.md b/benchmark/04_train_phy_block/decompme.md
new file mode 100644
index 0000000..070b01d
--- /dev/null
+++ b/benchmark/04_train_phy_block/decompme.md
@@ -0,0 +1,71 @@
+# decomp.me recipe — 04_train_phy_block
+
+This is the **first real-blob function we're lifting to byte-matching C.**
+Score target: ≥95% match. Perfect match unlikely (compiler unknown).
+
+## Target asm (paste into "Target asm" field)
+
+```asm
+train_phy_block:
+    ldr     x0, [x0, #0xb8]
+    mov     w1, #0xf000f000
+    add     x0, x0, #0x8000
+    str     w1, [x0, #0x110]
+.Lwait_a:
+    ldr     w1, [x0, #0x118]
+    tst     w1, #0xf0000000
+    b.eq    .Lwait_a
+.Lwait_b:
+    ldr     w1, [x0, #0x120]
+    tst     w1, #0xf0000000
+    b.eq    .Lwait_b
+    mov     w1, #0x30003
+    str     w1, [x0, #0x160]
+    str     w1, [x0, #0x154]
+.Lwait_hs1:
+    ldr     w1, [x0, #0x184]
+    tst     x1, #0x3
+    b.eq    .Lwait_hs1
+    mov     w1, #0x30000
+    str     w1, [x0, #0x154]
+.Lwait_hs2:
+    ldr     w1, [x0, #0x184]
+    tst     x1, #0x3
+    b.ne    .Lwait_hs2
+    mov     w1, #0x30000
+    str     w1, [x0, #0x160]
+    mov     w1, #0xf0000000
+    str     w1, [x0, #0x110]
+    ret
+```
+
+## Compiler
+
+`aarch64-linux-gnu gcc 12 -O2 -ffreestanding -nostdlib`
+(Try also `-Os`. Vendor blob's compiler unknown — could be ARMCC or older
+GCC. Optimal C may differ between targets; perfect byte-match probably
+unattainable.)
+
+## Context
+
+Use `reference.c` as the starting C. The CMP-vs-TST distinction at the
+end (`tst x1, #0x3` uses 64-bit reg even though w1 was loaded — vendor
+quirk) suggests a particular intrinsic / pattern. May need to write the
+load as `(uint64_t)mmio_r(...)` and the test as a 64-bit AND to coax
+GCC into emitting `tst x1` instead of `tst w1`.
+
+## Things to iterate on
+
+- Order of writes to CFG_A vs CFG_B: vendor wrote CFG_B first
+  (`str w1, [x0, #0x160]` then `str w1, [x0, #0x154]`). C order matters.
+- The two `mov w1, #0x30000` near the end could be hoisted by GCC; vendor
+  emitted them inline. May need separate variables to prevent hoist.
+- `add x0, x0, #0x8000` vs `add x0, x0, #0x8, lsl #12` — same
+  instruction, GAS picks one. Either should round-trip.
+
+## Score expectations
+
+- 80%: rough loop structure + register usage matches.
+- 95%: instruction order + immediate forms match.
+- 100%: would require exact compiler/version match. Unlikely without
+  ARMCC.
diff --git a/benchmark/04_train_phy_block/func.bin b/benchmark/04_train_phy_block/func.bin
new file mode 100644
index 0000000000000000000000000000000000000000..13f29592a5b53bf57cd0a5ad6c8c7a109b05a5bb
GIT binary patch
literal 104
zcmZRGarpVLpT&ql!C@k!0OL+Z3CErGJS;^A{{IhQR6wvFwlf$pCIIC_fVc%H&f@S1
bq$ZJJK@hUoL6{gwEyKPbkeLUV<F5e#*+M5l

literal 0
HcmV?d00001

diff --git a/benchmark/04_train_phy_block/func.s b/benchmark/04_train_phy_block/func.s
new file mode 100644
index 0000000..4cce4b0
--- /dev/null
+++ b/benchmark/04_train_phy_block/func.s
@@ -0,0 +1,33 @@
+
+func.bin:     file format binary
+
+
+Disassembly of section .data:
+
+000000000000d328 <.data>:
+    d328:	f9405c00 	ldr	x0, [x0, #184]
+    d32c:	32048fe1 	mov	w1, #0xf000f000            	// #-268374016
+    d330:	91402000 	add	x0, x0, #0x8, lsl #12
+    d334:	b9011001 	str	w1, [x0, #272]
+    d338:	b9411801 	ldr	w1, [x0, #280]
+    d33c:	72040c3f 	tst	w1, #0xf0000000
+    d340:	54ffffc0 	b.eq	0xd338  // b.none
+    d344:	b9412001 	ldr	w1, [x0, #288]
+    d348:	72040c3f 	tst	w1, #0xf0000000
+    d34c:	54ffffc0 	b.eq	0xd344  // b.none
+    d350:	320087e1 	mov	w1, #0x30003               	// #196611
+    d354:	b9016001 	str	w1, [x0, #352]
+    d358:	b9015401 	str	w1, [x0, #340]
+    d35c:	b9418401 	ldr	w1, [x0, #388]
+    d360:	f240043f 	tst	x1, #0x3
+    d364:	54ffffc0 	b.eq	0xd35c  // b.none
+    d368:	52a00061 	mov	w1, #0x30000               	// #196608
+    d36c:	b9015401 	str	w1, [x0, #340]
+    d370:	b9418401 	ldr	w1, [x0, #388]
+    d374:	f240043f 	tst	x1, #0x3
+    d378:	54ffffc1 	b.ne	0xd370  // b.any
+    d37c:	52a00061 	mov	w1, #0x30000               	// #196608
+    d380:	b9016001 	str	w1, [x0, #352]
+    d384:	52be0001 	mov	w1, #0xf0000000            	// #-268435456
+    d388:	b9011001 	str	w1, [x0, #272]
+    d38c:	d65f03c0 	ret
diff --git a/benchmark/04_train_phy_block/ghidra.c b/benchmark/04_train_phy_block/ghidra.c
new file mode 100644
index 0000000..5f99efa
--- /dev/null
+++ b/benchmark/04_train_phy_block/ghidra.c
@@ -0,0 +1,18 @@
+/* Ghidra 11.3 default decompiler output for FUN_0000d328 — unmodified. */
+void FUN_0000d328(long param_1)
+{
+  long lVar1;
+
+  lVar1 = *(long *)(param_1 + 0xb8);
+  *(undefined4 *)(lVar1 + 0x8110) = 0xf000f000;
+  do { } while ((*(uint *)(lVar1 + 0x8118) & 0xf0000000) == 0);
+  do { } while ((*(uint *)(lVar1 + 0x8120) & 0xf0000000) == 0);
+  *(undefined4 *)(lVar1 + 0x8160) = 0x30003;
+  *(undefined4 *)(lVar1 + 0x8154) = 0x30003;
+  do { } while ((*(uint *)(lVar1 + 0x8184) & 3) == 0);
+  *(undefined4 *)(lVar1 + 0x8154) = 0x30000;
+  do { } while ((*(uint *)(lVar1 + 0x8184) & 3) != 0);
+  *(undefined4 *)(lVar1 + 0x8160) = 0x30000;
+  *(undefined4 *)(lVar1 + 0x8110) = 0xf0000000;
+  return;
+}
diff --git a/benchmark/04_train_phy_block/reference.c b/benchmark/04_train_phy_block/reference.c
new file mode 100644
index 0000000..3d01b6f
--- /dev/null
+++ b/benchmark/04_train_phy_block/reference.c
@@ -0,0 +1,89 @@
+/* Ground-truth C for FUN_0000d328 @ blob offset 0xd328 (104 bytes / 26 insts).
+ *
+ * **The first real poll-site function we lift to C.**
+ * Contains 4 of our 16 timeout-less polls (sites 12, 13, 14, 15).
+ *
+ * Pattern:  PHY-block training step — poke a control register, wait for
+ *           two status bits, apply two intermediate values with a
+ *           handshake on a state register, ack the event.
+ *
+ * Signature:  void train_phy_block(struct phy_ctx *ctx);
+ *             (X0 = ctx, returns void)
+ *
+ * Layout:
+ *   ctx (X0)       — opaque per-rank/per-channel context
+ *   ctx->block (+0xb8) — 64-bit pointer to a PHY block base
+ *   block + 0x8000 — addressed sub-block (likely "Master" bank in DWC PUB)
+ *
+ * The sub-block at +0x8000 has these registers (offsets within +0x8000):
+ *   +0x110  CTL       — write 0xF000F000 to start, 0xF0000000 to clear
+ *   +0x118  STAT_A    — bit[31:28] non-zero = step A done
+ *   +0x120  STAT_B    — bit[31:28] non-zero = step B done
+ *   +0x154  CFG_A     — write training value
+ *   +0x160  CFG_B     — write training value
+ *   +0x184  HANDSHAKE — bits[1:0] toggle between 0 and !=0 to ack writes
+ *
+ * The 4 polls (in order):
+ *   site 12 (B.EQ): STAT_A bit[31:28] non-zero?
+ *   site 13 (B.EQ): STAT_B bit[31:28] non-zero?
+ *   site 14 (B.EQ): HANDSHAKE bits[1:0] non-zero?  (ack of step-1 writes)
+ *   site 15 (B.NE): HANDSHAKE bits[1:0] zero?       (ack of step-2 write)
+ */
+#include <stdint.h>
+
+struct phy_ctx {
+    uint8_t pad[0xB8];       /* unknown leading fields; present only to place `block` at +0xB8 */
+    uint8_t *block;          /* PHY block base; func.s loads it with `ldr x0, [x0, #184]` (184 == 0xB8) */
+    /* ... rest of struct unknown */
+};
+
+#define PHY_CTL          0x110   /* byte offsets in this group are relative to block + 0x8000 */
+#define PHY_STAT_A       0x118
+#define PHY_STAT_B       0x120
+#define PHY_CFG_A        0x154
+#define PHY_CFG_B        0x160
+#define PHY_HANDSHAKE    0x184
+
+#define PHY_CTL_GO       0xF000F000U   /* first value written to CTL */
+#define PHY_CTL_CLR      0xF0000000U   /* last value written to CTL */
+#define PHY_STAT_DONE    0xF0000000U   /* mask: bits[31:28] of STAT_A / STAT_B */
+#define PHY_CFG_VAL_RUN  0x00030003U   /* written to CFG_B then CFG_A before the site-14 poll */
+#define PHY_CFG_VAL_END  0x00030000U   /* written to CFG_A before, and CFG_B after, the site-15 poll */
+#define PHY_HS_BUSY      0x3U          /* mask: bits[1:0] of HANDSHAKE */
+
+static inline uint32_t mmio_r(volatile uint8_t *base, unsigned off) {
+    return *(volatile uint32_t *)(base + off);   /* 32-bit volatile load at byte offset `off` from `base` */
+}
+static inline void mmio_w(volatile uint8_t *base, unsigned off, uint32_t v) {
+    *(volatile uint32_t *)(base + off) = v;      /* 32-bit volatile store of `v` at byte offset `off` from `base` */
+}
+
+void train_phy_block(struct phy_ctx *ctx) {   /* C lift of FUN_0000d328; trailing hex values are func.s addresses */
+    volatile uint8_t *phy = (volatile uint8_t *)(ctx->block + 0x8000);   /* d328, d330 */
+
+    mmio_w(phy, PHY_CTL, PHY_CTL_GO);            /* d32c, d334 */
+
+    /* site 12 (d338-d340) — spin, with no timeout, until STAT_A bits[31:28] read non-zero */
+    while ((mmio_r(phy, PHY_STAT_A) & PHY_STAT_DONE) == 0)
+        ;
+
+    /* site 13 (d344-d34c) — same untimed spin on STAT_B bits[31:28] */
+    while ((mmio_r(phy, PHY_STAT_B) & PHY_STAT_DONE) == 0)
+        ;
+
+    mmio_w(phy, PHY_CFG_B, PHY_CFG_VAL_RUN);     /* d350, d354 — CFG_B is stored before CFG_A */
+    mmio_w(phy, PHY_CFG_A, PHY_CFG_VAL_RUN);     /* d358 */
+
+    /* site 14 (d35c-d364) — spin until HANDSHAKE bits[1:0] read non-zero */
+    while ((mmio_r(phy, PHY_HANDSHAKE) & PHY_HS_BUSY) == 0)
+        ;
+
+    mmio_w(phy, PHY_CFG_A, PHY_CFG_VAL_END);     /* d368, d36c */
+
+    /* site 15 (d370-d378) — spin until HANDSHAKE bits[1:0] read back as zero */
+    while ((mmio_r(phy, PHY_HANDSHAKE) & PHY_HS_BUSY) != 0)
+        ;
+
+    mmio_w(phy, PHY_CFG_B, PHY_CFG_VAL_END);     /* d37c, d380 */
+    mmio_w(phy, PHY_CTL, PHY_CTL_CLR);           /* d384, d388 */
+}
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..5b3e250
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,36 @@
+# RE-tool benchmark — three functions from the RK3588 DDR blob
+
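+Three small, self-contained functions extracted from
+`rk3588_ddr_lp4_1848MHz_lp5_2112MHz_v1.19.bin`, each with canonical
+ground-truth semantics so you can judge decompiler output against a
+known answer. A fourth, `04_train_phy_block/` (blob offset `0xd328`, 104 B / 26 insts), is described in its own `reference.c`.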
+
+| dir | blob offset | size | ground truth |
+|-----|-------------|------|--------------|
+| `01_memset/`       | `0x0aac` | 28 B / 7 insts  | `memset(void*, u8, size_t)` byte-wise |
+| `02_memcpy32/`     | `0x1200` | 36 B / 9 insts  | `memcpy32(u32*, const u32*, size_t)` word-aligned |
+| `03_magic_memset/` | `0x0da4` | 40 B / 9 insts  | `if (*(u32*)0x1fe004 == 0x54410001) memset(0x1fe000, 0, 0x32c);` |
+
+Each subdir contains:
+- `func.bin`    — raw little-endian AArch64 machine code
+- `func.s`      — objdump'd GNU asm, same absolute addresses as the blob
+- `reference.c` — ground-truth C (our belief)
+- `ghidra.md`   — load-in-Ghidra recipe + expected output
+- `decompme.md` — decomp.me scratch recipe (matching-decomp)
+- `retdec.md`   — retdec command line
+- `retdec.c`    — retdec's actual output (captured 2026-04-15)
+
+**Summary of findings**: see [`RESULTS.md`](RESULTS.md). Short version:
+- Ghidra got all three right with minor type-label cleanup needed.
+- retdec failed on #1 and #2 (can't infer register-passed arguments on
+  raw binary), did well on #3 (the one with absolute-address refs).
+- decomp.me is a matching-decomp comparator, not a decompiler — judged
+  on a different axis.
+
+## Load address matters
+
+All three functions are extracted as raw bytes starting at offset 0 in
+their `func.bin`. When loading into Ghidra / retdec, set the base
+address to the function's original blob offset (first column above),
+otherwise branch targets and absolute-address refs in function #3 will
+be off.
diff --git a/benchmark/RESULTS.md b/benchmark/RESULTS.md
new file mode 100644
index 0000000..f576f53
--- /dev/null
+++ b/benchmark/RESULTS.md
@@ -0,0 +1,171 @@
+# Three-way RE-tool benchmark — results
+
+Three AArch64 functions from the RK3588 DDR v1.19 conservative blob,
+decompiled by **Ghidra 11.3** (interactive, auto-analysis) and **retdec
+v5.0** (fully automated, `--mode raw`). **decomp.me** is a
+matching-decompilation comparator rather than a decompiler, so it's
+benchmarked on a different axis (time-to-match).
+
+## Function 01 — `memset_byte` (28 bytes, 7 insts)
+
+**Ground truth:** `void memset_byte(void *buf, uint8_t val, size_t len)` —
+byte-wise pre-test counting loop.
+
+### Ghidra output
+```c
+void FUN_00000aac(long param_1, undefined1 param_2, long param_3) {
+  long lVar1;
+  for (lVar1 = 0; param_3 != lVar1; lVar1 = lVar1 + 1) {
+    *(undefined1 *)(param_1 + lVar1) = param_2;
+  }
+}
+```
+**Grade: A.** Semantics perfect. Types `long`/`undefined1` instead of
+`void*`/`uint8_t`/`size_t` — one click to retype each. For-loop shape
+matches the canonical idiom exactly.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    int64_t result;
+    if (result == 0) return result;
+    int64_t v1 = 0;
+    *(char *)(v1 + result) = (char)result;
+    ...
+}
+```
+**Grade: F.** **No function arguments inferred** — treats X0/W1/X2 as
+uninitialised locals. The whole function signature is wrong. The loop
+body overwrites the wrong things. This is retdec's biggest weakness on
+raw binaries: without ELF symbol or DWARF hints, it can't tell which
+registers are live-in parameters.
+
+### decomp.me workflow
+N/A as a decompiler; as a matching-decomp tool: paste the asm + your
+candidate C → iterate on wording until the compiled output byte-matches.
+For memset this reaches 90%+ similarity in a handful of edits with GCC
+(exact match unlikely — original blob used a different compiler).
+
+---
+
+## Function 02 — `memcpy32` (36 bytes, 9 insts)
+
+**Ground truth:** word-aligned memcpy, `len &= ~3`, 4-byte stride.
+
+### Ghidra output
+```c
+void FUN_00001200(long param_1, long param_2, ulong param_3) {
+  ulong uVar1;
+  for (uVar1 = 0; uVar1 != (param_3 & 0xfffffffc); uVar1 = uVar1 + 4) {
+    *(undefined4 *)(param_1 + uVar1) = *(undefined4 *)(param_2 + uVar1);
+  }
+}
+```
+**Grade: A.** Semantics perfect. The `& 0xfffffffc` mask is in the
+correct position, the 4-byte stride is there, the `undefined4` (u32) copy
+is there. Again: only type annotations need manual cleanup.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    int64_t v1 = result & 0xfffffffc;
+    if (v1 == 0) return result;
+    int64_t v2 = 0;
+    int64_t v3 = v2 + 4;
+    while (v3 != v1) { v2 = v3; v3 = v2 + 4; }
+    return result;
+}
+```
+**Grade: F.** Same no-arguments failure mode. The memory-copy statements
+are completely absent — retdec failed to emit the LDR/STR pair as a
+load and store through the pointers. You get an empty counter-increment
+loop and nothing else. Unusable.
+
+---
+
+## Function 03 — `magic_memset` (40 bytes, 9 insts)
+
+**Ground truth:** `if (*(u32*)0x1fe004 == 0x54410001) memset(0x1fe000, 0, 0x32c);`
+
+### Ghidra output
+```c
+void FUN_00000da4(void) {
+  if (_DAT_001fe004 == 0x54410001) {
+    FUN_00000aac(0x1fe000, 0, 0x32c);
+    return;
+  }
+  return;
+}
+```
+**Grade: A+.** Perfect. `_DAT_001fe004` is Ghidra's auto-named data
+symbol for the absolute address. The tail-call `B 0xaac` was correctly
+turned into a regular call-and-return, preserving the calling
+convention. The MOVZ+MOVK composed immediate `0x54410001` was collapsed
+into a single literal.
+
+### retdec output
+```c
+int64_t entry_point(void) {
+    if (*(int32_t *)0x1fe004 == 0x54410001) {
+        return unknown_aac(0x1fe000, 0, 812);
+    }
+    return 0x1fe000;
+}
+```
+**Grade: B+.** **Noticeably better than retdec's output for #1 and #2**:
+  - Absolute-address dereference correctly parsed.
+  - `MOVZ W1, #1 ; MOVK W1, #0x5441, LSL #16` collapsed to
+    `0x54410001` ✓
+  - Tail-call to 0xaac correctly recognised as a call to
+    `unknown_aac(0x1fe000, 0, 812)` — even got arity right by observing
+    X0/W1/X2 being set up just before the branch.
+  - **Weakness:** returns `0x1fe000` in the fall-through branch —
+    spurious, because the function returns `void` and retdec fabricated
+    a return value. Also `812` decimal instead of `0x32c` hex.
+
+---
+
+## Takeaways
+
+| dimension | Ghidra | retdec | decomp.me |
+|---|---|---|---|
+| Argument inference from raw binary | **yes** (intra-proc analysis) | **no** | n/a |
+| Absolute-address data refs | auto-named `_DAT_xxxxx` | raw cast `*(int32_t *)0x…` | n/a |
+| MOVZ+MOVK literal reconstruction | collapses | collapses | n/a |
+| Tail-call recognition | yes | yes | n/a |
+| Control-flow structure | clean structured loops | mix of `while` + `goto` | n/a |
+| Type inference | `long`/`undefined4` placeholders | cautious `int64_t` fallback | n/a |
+| Zero-touch automation | no (interactive) | **yes** | no (interactive) |
+| Matching-decomp workflow | no | no | **yes** |
+
+### When each tool wins
+
+- **Ghidra** is the default daily driver. Auto-analysis output is already
+  close to final for simple functions — mostly you rename params and
+  retype placeholders.
+- **retdec** shines when the target has **absolute-address data refs,
+  call tables, or embedded constants** (function #3). It falls over on
+  anything where register-passed parameters need inference from
+  surrounding context (functions #1 and #2). Fine for bulk batch
+  processing of a repo full of functions whose signatures you don't yet
+  know you care about — but verify each output.
+- **decomp.me** doesn't compete with the others; it's the **"did my
+  rewrite compile to the same bytes?"** tool. Complementary: take
+  Ghidra's output, paste the C into decomp.me, iterate until the
+  compiled asm matches the blob's bytes. That's how you'd produce a
+  maintainable C re-implementation.
+
+### Practical recipe for our DDR-blob work
+
+1. **Start with Ghidra's decompiler output** (already done in
+   `ddr_annotated.c`). Retype params, rename variables. ~2–4h per
+   non-trivial function.
+2. **Feed the cleaned C into decomp.me** with the original function's
+   bytes as target asm. Iterate until byte-match (or asymptotic
+   similarity). ~1–2h per function.
+3. **retdec** is useful only for functions with lots of absolute-address
+   refs we want a second opinion on — which is rare in the poll-loop
+   patches.
+
+For a production C re-implementation of the 20 poll sites, Ghidra →
+decomp.me is the correct pipeline. Skip retdec for those.
diff --git a/benchmark/extract.py b/benchmark/extract.py
new file mode 100644
index 0000000..4d9eb67
--- /dev/null
+++ b/benchmark/extract.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+"""Slice named functions out of the RK3588 DDR v1.19 conservative blob."""
+import os
+
+BLOB = os.path.expanduser('~/projects/AMPere/rkbin/bin/rk35/rk3588_ddr_lp4_1848MHz_lp5_2112MHz_v1.19.bin')
+BASE = os.path.expanduser('~/projects/AMPere/benchmark')
+
+functions = [
+    ('01_memset',        0x0aac, 0x1c, 'byte memset'),
+    ('02_memcpy32',      0x1200, 0x24, 'word-aligned memcpy32'),
+    ('03_magic_memset',  0x0da4, 0x28, 'magic-check + tail-call to memset'),
+]
+
+blob = open(BLOB, 'rb').read()
+for name, off, sz, desc in functions:
+    d = os.path.join(BASE, name)
+    os.makedirs(d, exist_ok=True)
+    with open(os.path.join(d, 'func.bin'), 'wb') as f:
+        f.write(blob[off:off+sz])
+    print(f"{name}: {sz} bytes @ 0x{off:x}  — {desc}")
diff --git a/benchmark/gdb_debug/Makefile b/benchmark/gdb_debug/Makefile
new file mode 100644
index 0000000..dffc28e
--- /dev/null
+++ b/benchmark/gdb_debug/Makefile
@@ -0,0 +1,26 @@
+BENCH := $(abspath ..)
+
+.PHONY: all clean
+all: gdb_debug.elf
+
+# Wrap each benchmark function's raw bytes into an .o exporting _binary_func_NN_bin_{start,end}.
+# The bytes are first copied to ./func_NN.bin so that `ld -b binary` sees a cwd-relative input name.
+# NOTE(review): the awk patterns (_func_bin_start / _func_bin_end) look like they never match those symbols, making objcopy a plain copy — confirm.
+define WRAP_BIN
+$1.o: $(BENCH)/$2/func.bin
+	cp $$< $1.bin
+	ld -r -b binary -o $$@.raw $1.bin
+	rm -f $1.bin
+	objcopy $$$$(nm $$@.raw | awk '/_func_bin_start$$$$/{printf " --redefine-sym %s=_binary_$1_bin_start",$$$$3} /_func_bin_end$$$$/{printf " --redefine-sym %s=_binary_$1_bin_end",$$$$3}') $$@.raw $$@
+	rm -f $$@.raw
+endef
+
+$(eval $(call WRAP_BIN,func_01,01_memset))
+$(eval $(call WRAP_BIN,func_02,02_memcpy32))
+$(eval $(call WRAP_BIN,func_03,03_magic_memset))
+
+gdb_debug.elf: harness.c func_01.o func_02.o func_03.o
+	gcc -O0 -g -Wall -o $@ $^
+
+clean:
+	rm -f gdb_debug.elf func_*.o *.bin
diff --git a/benchmark/gdb_debug/README.md b/benchmark/gdb_debug/README.md
new file mode 100644
index 0000000..019d3d3
--- /dev/null
+++ b/benchmark/gdb_debug/README.md
@@ -0,0 +1,72 @@
+# gdb_debug — single-step the benchmark functions under GDB
+
+Wraps each of `01_memset` / `02_memcpy32` / `03_magic_memset` in a
+C harness, copies the raw bytes into an RWX buffer, and calls through
+a function pointer. GDB attached to the harness lets you step every
+machine instruction of the real blob code — **no QEMU needed because
+boltzmann (and ampere, ohm, hertz) are natively aarch64.**
+
+## Build
+
+```
+make                 # builds ./gdb_debug.elf natively on aarch64
+```
+
+Cross-building (if you ever want to run on x86 oppenheimer via
+qemu-user) is not wired into the Makefile yet: replace `gcc` with
+`aarch64-linux-gnu-gcc`, and `ld`/`nm`/`objcopy` with their
+`aarch64-linux-gnu-` counterparts, then launch under
+`qemu-aarch64-static -g 1234 ./gdb_debug.elf 1` with `gdb-multiarch` attaching to `:1234`.
+
+## Run under GDB
+
+```
+gdb ./gdb_debug.elf
+(gdb) set pagination off
+(gdb) layout split            # TUI: source / asm / regs split
+(gdb) break call_func         # the dispatcher — one breakpoint catches all three
+(gdb) run 1                   # 1=memset  2=memcpy32  3=magic_memset
+(gdb) stepi                   # one machine instruction
+(gdb) info reg                # full register dump
+(gdb) x/8i $pc                # peek 8 upcoming instructions
+(gdb) display/i $pc           # auto-show next instruction on every stop
+(gdb) x/16bx $x0              # hex-dump 16 bytes from what X0 points at
+```
+
+## What to look for
+
+### Function 1 (memset)
+After `MOV X3, #0`, each iteration: `CMP X2, X3` → `B.NE` → `STRB W1, [X0, X3]`
+→ `ADD X3, X3, #1` → back. Watch `$x3` advance, inspect `x/16bx $x0` to see
+the buffer filling with `0xAB`.
+
+### Function 2 (memcpy32)
+First instruction is the alignment mask: `AND X2, X2, #0xfffffffc`.
+Set a watchpoint on `$x2` to catch the mask, then step the loop to watch
+4-byte transfers: `LDR W4, [X1, X3]` ; `STR W4, [X0, X3]` ; `ADD X3, X3, #4`.
+
+### Function 3 (magic_memset)
+Will **SIGSEGV** on `LDR W2, [X0, #4]` because `X0 = 0x1fe000` is unmapped
+in user mode. That crash **is** the verification — it proves the function
+really does target that absolute address. To execute the full path, add
+before `call_func`:
+
+```c
+mmap((void*)0x1fe000, 4096, PROT_READ|PROT_WRITE,
+     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
+*(uint32_t*)0x1fe004 = 0x54410001;
+```
+
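+Then the magic check passes and GDB reaches the tail-call `B 0xaac`. That branch is PC-relative, so inside the harness buffer it targets `buf - 0x2f8` (0xaac − 0xda4) rather than a memset copy — TODO confirm; a copy of `func_01` must sit at that address to step on into memset.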
+
+## Why this scaffold beats `ddr_emu2` for verifying trampolines
+
+`ddr_emu2` dies at PC=0x10a80 in the emulator because it can't model an
+MMIO register — blind spot for us. Native GDB on an aarch64 host runs the
+*actual* CPU with full instruction fidelity; the limit becomes "can we
+fake the MMIO responses?" rather than "does the emulator know this
+instruction?". For compute-only code (functions 1 and 2), zero prep
+needed. For MMIO-touching code, `mmap(MAP_FIXED)` + a signal handler
+stub can serve as a synthetic PHY — **that's the path to single-stepping
+a patched trampoline through the real ISA with fake hardware replies**,
+which is exactly what the next round of v3fb verification would need.
diff --git a/benchmark/gdb_debug/func_01.o b/benchmark/gdb_debug/func_01.o
new file mode 100644
index 0000000000000000000000000000000000000000..8bf623ce886888d1112a428e7991227254ce0e1d
GIT binary patch
literal 648
zcmb<-^>JfjWMqH=MuzPS2p&w7fx!bw&;cy$z`)AD!obYXa4DXF`L!bhL&yQ<_-l+A
z$`;8i3=@C<|1XZvjh)4e#U_w70zh#YG%b=~@sIz}#f_lq(UdbV#3yCuB^Fi2r<LX<
z#~T=exbdlZDOkmeGpkauOOzxQl`t^q6<6k#Bqjmrk|GG50b`{kmLy`>O@@FpP!GuO
z%xHQYpnP<HgVZ7e5T6NI0Lo5)(iKn|WIjk82b3=Xr8$8#y4@hD8&G*4C=H{~?H2?}
RVAHPxRj3Hl2&K{W0|0{TIf4KH

literal 0
HcmV?d00001

diff --git a/benchmark/gdb_debug/func_02.o b/benchmark/gdb_debug/func_02.o
new file mode 100644
index 0000000000000000000000000000000000000000..5c5e901dbbda9b05db6f24645cdde7edfb529049
GIT binary patch
literal 656
zcmb<-^>JfjWMqH=MuzPS2p&w7fgu1%&;cy$z`)AD!r)X=H;I{{;Zi*V(`!cthL8iy
z@z+!`l6SCVDDOxXV3_#p|9^3WDcD)eSZoAYCxAs<8Y-@WW|T2l{NsOgafbM$%)G>+
z%J{U>yySQTBM>*fxFoTt1glJHUJ6$6;>@a41_r(2%G{E~Bp_W<1fervtdzu(L^LOn
z#Rd5f<acH?-40Mby8l3GkpYO$ge(AMXF%x+C=D_nq>cm1mw?ioKpNd{kkkVZ0R%n}
Y3QnTiF9?*ure6oDP!X;Z!a&y#01}Hm+5i9m

literal 0
HcmV?d00001

diff --git a/benchmark/gdb_debug/func_03.o b/benchmark/gdb_debug/func_03.o
new file mode 100644
index 0000000000000000000000000000000000000000..473f24a0d5fd72157a2c417f482190fa2ad8b647
GIT binary patch
literal 656
zcmb<-^>JfjWMqH=MuzPS2p&w7fgu1%&;cy$z`)AD!tg-8c#|SSLy%&}sv;&9hn?{Z
zjM<G03?WUa4VM^!;#U9viyvT)zlJaglf{h1Mv!#^3?Kh%pedIEiz7J(DsKW6M_11f
zpOl%GSX3FGR+^U_Z(t1K#usN+rDB&TNh~VCDwCR*!oZ+cT$x*vm;|ItiXe0bjFpmD
zl8EM1vbfSfT_E2vqv>{l^3nZ=9ugov6PjKSHv=kG0i{9agVb?A`4Ui?6G)@m4U&2Q
cB7ndLLcvLN`vrkA*!1f_6)M7YLKx`!0TU-aq5uE@

literal 0
HcmV?d00001

diff --git a/benchmark/gdb_debug/gdb_debug.elf b/benchmark/gdb_debug/gdb_debug.elf
new file mode 100755
index 0000000000000000000000000000000000000000..3c8cc43ff762de8be527ac406dec12f2af604d53
GIT binary patch
literal 76152
zcmeHNeRLevb-%N-W672*%d%|w3u`19>`?UK4~%VWEXg*q!Cx4Wm<BRl?T)04SG$}2
zz%r&F2!|3NrxtJmX$bL2+Qtx0k4utMo07zQG))>#>ljSj7N~4U>XfuG&8Gmf{oOb3
z$+M#shjZGV(?8zxnYr)%-o5wT_j_+<=Uw@Zja#k`83w6h(qEHkP4SR+*)eEe&}pZY
zG=<t}AvMt?^oMaNy`XFhp%X>Pa<54yB-=eK`-5_x)KgN8&eD>7rNr;2d#IA+%|_fg
z*7SL=KYiZwsAN9}gp!t0*>QMmHze(bq@9xIWME3F`SINNO3V2L<wEpvQId7Ju;lBM
zoJUaF=;LCGyQj<59Y4***e+J>bp+*29!yHA@veg%&+nra>7I7V@h0N)fSk9Ik|r^?
zS`5ZCcWt|Y-uS|;_cTAXt@kS}tEwLUgYO>u{`3dVwI<>{t%+E3BA(0-Hb)|vX#a}k
zE!kAd5^q)5m-6>f<7Pb*yY`e?AfKU0b+nsgz!KjDS>w}D)ddsaizmRD+fsg>25#dD
zmdyYfgCN?gamg)F)Z2tsDgRR^!2bcbjVo9_4WN{N2Y4y}%zr7|odACwxQ#1VIsugO
z(=h=~>3l9rX*ZKeWyo=|xkz-66YbyQ^hV+datGr%isVvpbYT-kxj&P01|sn!WpgoD
zQExgEPv&}|5FLy-z42ru5x>=?0e2vp-baz1R3=9Q1Cex*^S*o}6C?J>+i(Sy-ke=q
zHg~RfmbNUpf>d?ca&k6zZ*?%V+ZWH~+)Ve@^@&u{?T+*$T#V5-kV?uZ9Z$8SqNFW^
zU22M*$~FGSZva(eY2e>IK>0bfaXm8p1#H1EJ%&0L13I@ss^|DyqTi&iOU%;`jp?iR
zqbXk)b_IJO8kFxrg})?!F9r*|Gw`fQ{lg+gqM-nOuu7=WfdKx1)PFR9pOX0D0Iu|p
z1aLL(@c^Edeoh7OT7F<q`enb0U$yU*`Ud7z=BbMNK_BjawH)%{lYRQfe0Ze~|A7ye
zp@`lQA71=?dhe?c;u-tNf<?KEpK-Ya!cVmi_y2y*_Tm2D0o#Ymr4@b_`*67x1m5Pu
z{qtJs!{w3-{dONdO(5QFlMh#WlzVpgaR2*amk*!e^RuqWJb8Zq_ntU!(r`CEGtTdS
z?$Gyu9W$L*eQI<LI_HSaP5+>4wEE1tVLMd+YQ1fqR9Ce@2d0PGN2^05`b2vY+Ks;U
z6tp{i?JBf;eC-;v@A9>0q5V}~doJ3~``Qc8KI3aQq8&0WjDHE*jlTA!Xm|SBEok?6
zZU0<L)t!ar$%|;{!~$GDHR;LIX8Y)wCK~=3cpCvP^oy=h6MbP^e_JR#atfEgN4iGa
ziN1BbP^e>06<<M#7tR6NZ4ToT^FRaSI-<9a5p|!2?}it<MkDZX&NQAR@GjOSpx;3B
zqsI5|c)6qU&ifnQ?|A4r=%0aY=ee%Y9$_0Avpv86yJEe6RY9XO;AeNw`c>@)T_MIg
z-ZgqB^hG?&+ed9$_v$}kE;bGA+}&fY`c*~y==uH69}3RpHO%k){_hCCBhzT~G<;hg
zA2sX4Lp#sF-k`fq;0<@;i_A?OjgHi{kJg9juMjJ?9_Bx^cKB8~RvPjQ#yUN%UFh|r
zeH!{Bz-*%XPh$<({u%69mgo1sAYv-K5c=`?{ZpPe|Jl&+NEzZNqoX6dK5uo68U%ma
z33BMfNLl;Q`bzrD!7}Pr^~!Qt4+BR`0xze};Kk9sil}xZY78UZ?ez$>K8&jzSLM*!
zk^4JVA)ZUc91gA>c6ol#eY326G!z;-ap#nwn%&^BNRO@>{v`D9T~b>p7*7-mp(hG;
zbQJH3TFmq4!IOWpvytf1k-DLy^y}MSKK&*QH}=~{FxR7aVNC~rN4Z`<pFZ>X4+`Df
zKN2Nx%<#|&G2Wu~qZi`3YIq&S!#kNjD-Tfny5VIUCt@VTruL)a>df|b!B;04Z!=%*
z;A0;2&+otH32;2j>p!;M^>mcid!!!mfUkoFxMR*=;b+2IGlD#WHN)74jopaVG@q=8
zzjo}4F6I{NN;VC%-uVv@*CNah@$(wlVfd5v%Pd(}Ys9!56MWT*J^X{NQ8E76F5W9Y
z5_^T?LHiRtXY9QOrTeVZJr&SJdyCYarF37Cx~~Ov(e9ACyng<<pFw<L&d0ntpTXRC
z&iJjUJ%fFFTK1D|4F3du+~)nf4gLsvYG3oaxBB_6QGWi_TKT1g=lE-{wI7WVUEV}j
zp8VQtbo5PQ<cgjxJ6^sUe(tuPpL<uBt_xA)eGb+%gf+VRw)TgVkH-BKCp#&8|2hi2
zeW+0A{>qi+$*;oS8RP5~(Mqbh3qCsS7v`>q52B{~I$_K2s2^fqun#+5agrzmdm6L<
z*LB|Si2pYx`1``&SzN@YBxE|{uD0!-eD5chwB5Y6ZSdl@rGxytnoebNs{^>b*d0u}
z(VX2j80q;KYvEj%&3$s2XSj0Erx0@^v9`9hWm1izkpzxSefB`4FCKO9RhxBl_L-rF
z?dvypUB9vG<5@eEw6|>7X{UPOeo3#}*0$V^C+&RJ&DaB}nA;+TqkJ~fhlnrE+WVK>
zw)D1Tf8>Qh`Q)BtYHt!IF?Z0uI3{80%_pO|IDB6mvlFRE%#GQRoPBYc`;B?#m3*G!
z`eUPoLK^aY$h#pgd$&+{6tW-kDadCbk3$}Vd<$~Jdxb*Sz{B-^p|B8g{s)D^2FNv#
zH$hfIhl_FRP6`h$Fy>X(3v4#9g&1QwcXpI5V7<$5rGd3EhQe4ot_Ogvf#hQ2;+OWf
zfNg@Ljw<`h;m6I+heF%lc>QJZ3Bz`SxZZ+}`o@}%_5-+H0mk!!UE7x@J#IN~ye@3G
zpXav{^GYCY^dB}lz8reo2<>|PjVj{)mvG&U`O1D_AG*<Vz%RF7;5rT-Dj`+jSeTzv
zz#mf}W4O4FhvIWK7yeH4f4d4$-!uc70nLDBKr^5j&<tn>Gy|Fe&46Y=GoTsJ3}^;4
z1DXNNfM!55pc&8%Xa+O`ngPv#W<WEb8PE)91~dbj0nLDBKr^5j&<tn>Gy|Fe&46Y=
zGoTsJ3}^;41DXNNfM!55pc&8%Xa+O`ngPv#W<WEb8PE)91~dbj0nLDBKr^5j&<tn>
zGy|Fe&46Y=GoTsJ3}^;41DXNNfM!55pc&8%Xa+O`ngPv#W<WEb8PE)91~dbj0nLDB
zKr^5j&<tn>Gy|Fe&46Y=GoTsJ3}^;41DXNNfM!55pc&8%Xa+O`ngPv#W<WEb8PE)9
z1~dbj0nLDBKr^5j&<tn>Gy|Fe&46Y=GoTsJ3}^;41DXNNfM!55pc&8%Xa+O`ngPv#
zW<WEb8PE)91~dbj0nLDBKr^5j&<tn>Gy|Fe&46Y=GoTsJ3}^;41DXNNfM!55pc&8%
zXa+O`ngPv#W<WEb8PE)91~dbj0nLDBKr^5j&<tn>Gy|Fe&46Y=GoTsJ3}^;41DXNN
zfM!55pc&8%Xa+O`ngPv#W<WEb8PE)91~dbj0nLDBKr^5j&<tn>{$Dd-S9syQG{uvK
zjP^DuRr?;<u9WgrxmTGg+iRwIl{vDVKJQu1lWnO^b7gx__RkOW|K@{2ilM_Hufvl4
zhr?cFl58KB{W5I&oy6sIXtr$6lu`*(R?Zf@_@o=M<wR>`f0>kZQXY|j!fhFdn%AS!
zezMe8<G)w%<^x!$ejwY&<-`iIt>QQ*+lOTTsBEiwN%#M2Vo0vTVe%q4SXJ!$0g0cI
zvbdK>^(*^ee_Gnrs*Ms<2(}f0|5s9u*DvETY3PJQ=Gz@a-N#JlRinRYWi(8~Shu>)
z-0cr?{~Z(>>tEO(JreG3Iufm>;dh}kQlEXgfrfTA-1}rE6mCE2kg?}Z_`1s-I$;1`
z4ITE|wSN6-dvVvc8$M<)xx8g*%MyEO+tL+nOWRh_WeCY5-`h+J<yzmQ_G*Bl-(qW6
zWC-OMe80%-nq|^*saDF|GRo&-2rE>!7yHx-mpuYytGw(%7RIFc=nLP1dgzk6WekO^
z#eizVRy>)b@Ko4@E}aF6U}GJD2pi}QHP73`bzBsBbPm5U!gnzK#d%zxS#JI%1cq(`
zSjF}5Aa3UD86W5RG<Y@V)}~Rf`aT0D)XaIcpXEBg1HwP8WB;>$0>x9-vhpGH8?OSn
zyey788Q0;qa`F?nmm5FCeU+$J8=r&XYEhqUG=Rh_Mcp>G!_gX1Uu=96^|hkjW(>jh
zDp6l)ypH-hQExZygHETYZ!&%jpX)_^hw(V-8$^AVQHlE1v%ZM?ZsSw1-BioH{l;^s
zUnAm7*F6qD*0rKB`=Dtwp}DpEYf#&37^rWn_!B4(8b8PVx|-Fv4;j1AyK~xh)bBD}
z+^-k<pEdY3bHlW6qW50ILjC5cUqbHz<M+5bb>9L0sBt;&yXX8J?gx!3Og|z*I%F(|
ze$V8uL-Vll9O}`TFG1&s@fWDaM0CfDp8@F=^%spzu<ftn<@kZ|I`Dz=ALD+U!v6w;
zNo8|>iJsXnp>8!-O_>d8K8${o;c{MWj&d@uAixJupYj{79pTziR(zgs%$CqTjf&7N
zBlFv=$jd!x1a8Z|RK;Oo>wObAcJ1dOgw0GuC~VeYC!s#4{26pzg4$e~bTdqi>DBYl
zn$!+^V|ufwzrghx5C4X!d-%Vx{tORqgOM>q;9;JMF|+zsK;|8|O?w!(5Yrgq(Lz7L
zjlyBJm{om{wf>y7o?$I<`ySVXmdX3vJi+>T9=WdiIP0IpZC=GpAmsy<ytRM97EB0L
zcOa^S!C)C4v=9>^dZtr#C%Q7C>mu%A{pnQk2*l*pDvob0!y-^7DkPR-ejobFr<hZB
za3}kvi=Kj`vVL<a-g|P`id&&IWtH&q;Fy;;p)5>%H{QCmU_SgCt1Wh?Ui1k#<zhVc
zA8=4P85*mrSRpi=sQe||ub#&pq2b;2o1to~ne27u%6gNuwvven4G)s}1WfEdV@FNK
zA^B#!;QOn17fyviQ;lG9_|)x{?5(MRoteY0+};qDUo5)-aM=lqvYvb|T~cLX40HI<
z9aM9ZRN|Krm$FzkN6j_t-5fr9ds#LUrDoe}-g`$)OscQ(v<VMPON}?;Tekx%=XHJ>
z#&Sb9CNN)Pj-XS$p6`~n#$!|2`<y1RM#KG66K;~`w7!XMa8s71xfNmw!37yp$*fx4
zU|m_Us$%num0%-e(dHQlavD`MlyfaSi&ZK-4LmIisiNYl3bP6i!E#^Ex;bm`rB(*0
ze0782X@_Pba>Lj>!+@X34RG9w5vI(587o(`80+RJ-;3sKp0Q>QD^^+}CZCU~K#ILq
zT_U2KHq%%nY^tTJhO#TIRSlIFoJ~gvR8wJ0H)_g=W=uCOu9{JCz^E8DCYkl~>KU%a
z7g?>GRXKiTFmpsXYldM~RaAHr=dkJoRg#CP6??E4voBDCSH&e5X>J2<^JdZ_OmF_!
z^sH4Ci{|{X0fyMoMa8%WE*Jx`5*$89);{|-VNvfVnn|@=5S$m<qB(xe25+W3Oru}t
zf<MI9gZ*sb<8cq_VZ(T@YAWGp10dl-rG65sbQP{0xbDIAIIeHt3K{jGX%*tTK-``M
zX1vI^m$~;f+)UnQ=D*_>{x810$G0$#Xf3D7A+=e}*@iXkTC4nSYd(d-#v1D~h|n6#
zYQ5H~?6ZspYw?m@mUW|Ln6=MXRc6w*R`gkuZEMQ4)`lBVxz?&_YGq(G3{6|un`^8F
zZe3%|^PEmuYfZlK#wLqpt7SA=4bNKT*IKi@-fa+}$0ym=+(`}A^cKsy%3AjKR?R1@
zYVN<rx~{=m*7)J=ss?L4E8hh#)}+Q8t;)5CJrn|sj+5%`b#jC+G<*HhWx<a$yLljz
ziT1CJWCm_o*&IPL5zplkZnK+=#Un}X?uoBWq*pAroBQnM?QP_2-tJ`FNX$v+GUP=2
zBk?3T+5S`}$1k63{8ksEIZhs5SSz^(b#ypAk*rIJRI<+&w|p`i?@PKdn?J;&kwn7j
z%_pPOy96c|w7AphjpNOcx-c#x1DJ^PWyhQgD4NeW(NrQofOs%yG!=8Bx4=G?h%pCQ
zSB14~sf-I#-{WEuB6>s^-IGuAB%I!i>t@_62Tr}|48rK8z5VfMzwo>_6VJKc^n2pT
zNM@gi-Dz9mpy_0DkxY(+OS@>698@-Pi|bM0BvR2mavn>`89=m@b~Bk2;(?+!hL~46
z<K7}^MhBb8@UPyyWg}&CF<21~eIb%%zDT^KSh1Xqya{J1l1s&DEI2sN$bbdmJbXOn
zI<ZJD5^(HIQG|6%F4B{56|?SOJV%+mgP=XVk9wD3w1I(08XE!BdxOKR*f(M)47dYP
zD2{O=To%n?shGn=xW}@SPv*034E%YOWfaNu-6FhttLbsw%L|U4=!KO{#PLHoAolJU
z+aj=1M2aACsazx>)+L7@vey38fZICIi>;82_Ped=OzP8aG?#7d*t!E-r?tmTM*ErW
z*1lMe6LWj=eFU15ZZ_KzrPh2l(;81k6Zsfa;le`_@t)ScXtXuLkK~Hw&H3b>WNL4+
zIT26h2b=qn`Bv|UrA){BY0d3RyV9HVU#yhfmnEUo($8B5>)p&xYSL@Q++I=1<fA!<
zx0lpPc@pt?-SK1}Dp+8)BkE|vE#l%$;q<zZTt4GEkz@=^Vq)I=9G?jvQcP&2OQ@JQ
zKJuq`u$W|y<x%{7CeCase^QKFr1C7q_#{&Kh+@2qR34)kFDI2BDaI$0%A*wHQ%L2L
zigEn1Pxd2T-6VW}sl<=3+oY+a@=!h)EL731X?{fcnMSedQg}5TD~V61Q`1Z7*O1D0
z`Fl;8K`PHxjMtLNyA<OyDJ}C-#rP~z`Kn^Pj&@`1gT+Mlt5m+mB$b^jwlK%CazR{W
z?}GUJVnDV`x@ZD?!322y1h_p8=jS6#?-YU^-u4*3AG|O0YI-HGjvhXK9eH~gI6tZN
zwdFvb{`mdFOnz8A%LDt9{e`e^6|VA(Ui{<tzlV?C&&?k??i4zUZ(@H@el`X6KYtK@
zSp2&K@wY>t;}7DU5<d{o_js!#m1l#RC^z8FepJ3$!s6@J@a<b(Z!Vsn@z2p-^cf|d
zFaGvr|0=J|KDc;(#y_V|qR*i5xTjxDXG`v%@z2)};inXzo}Vh(9pK+vmr7D!*c>Al
z@nw!^6|Rnn{ISmS3Lb$~oSOpgrJq5+R6J_jA)E;PMfo`t@WVNq5Azeu7khqY(`$&<
zUqV<=USEZWWv3eVSYTYv4wf2M>Aw=tpCkO^c@N+`OesH$pwE0N9z4IuEeb#H6oT*P
ztAHb`Gw%1|I;kIgZ+kqC|NZKNehs?{lzkK6pPK;Zq+KaLM}SvT9R>68$5}sANou_w
zljFY)+z!OT&2MA*Z1~w7SjRHp>|f>mm1(`igTFgh2z>nWvQgmg6c#H7Qho|}V*umk
zAn+1POOFaavuPv{=Y0aN8}~an47?^Z{=Ld+vuf<mpdU`tmg1R{o6JLypJNga^7GFV
z`1!5W53X0WS$bXSftTW84dbEl@7vA5OU3_(!q0`@HT68kC(z#qoad|lo}-rNA%VYB
z2!0VfHGv=gGejw#j|zV96w2Qbk+8rZ&kFzJ*X2by?m`ual&?$u;6D7hh^KN~dh$Pk
zGjBotcY)gh-`t!BqDt{u%Xnz~@A%va^y&I&Cby&|MGFUMAr9&ep9*r3J^~yEEYA9S
z-N+sX*-c<c#*jO3`{K##QO9W()ojkqIJp5Qid+EBGmaBWIem##4^CjQTq=`wBKbjz
zrUudpH|NG$kd-K<fz&}V?nE+~$UetS<}x^#W+DSP*yaZYaQ4J8f2`#|GgqW~;j}h(
zti6z@KwxeJ>4pf-eh50dFBxrC4I$yLH-)$`HJl!KVvck5&W^1cosHWzI1XAq59DmP
zaa+gM&FcXpPvdOdBy~4!*hx;;mhGJ#Tb%7zUw!??Zl}AWbIV4Dyz_H=*Otwl>mB^n
z?F!-(xWDZ<IIcy#bOa9m@#I+NvgI<<FowXq_}uF|d^_?q>hL7J89SLsvd_gayL&5+
zwrah)x31^a>gKdSz@0B`P;|!k@)Y8D?et``GFY%vLW?s?Oxs|Dmo<_=(;vT=2k|40
z>m@LF6Tp!DV<d3T*^eSck?D|$o8qB89%QnCHIRIX_G9lx;)x;`r2-a-Ajjz&NXe{_
zSoaG~`tp%XtjLQK%ch)uoSng7qz5UrqFy8_#F5?MoCl@zxolB)Q3(EARPmH#53|V`
znj#mHpkf2bfk*$?D#_%DgH=~snzuvg&A<<rN~HM*q?YWy0pyV&a~V(et6DOZb6b$}
zXz9tv6S3xaj9QSWa5HHVXn!QzPc5;1NqF!i(vhO)7B`cPr;<et2Yneg5n%&aODA&d
z5rMVj+(F#MK55CM#I9*^`{nDQKL%UXq81TWN9X2dey0h0@2%n4t2%~>48)^|AqCI4
zwTR(7*Wi#^@bVhKI|-q4ZU}b#NCEen{Ps}wNKX$kfU1RDkgMvTN>v%g?m)X^=hl`;
zE|-b-<a2QNKP`Mx#-A+4zCu)<M#<ps6n++pN()-}6dSWw`5YyKCvn#GmlbII7XMcS
zKck`?mK7z{@3R^=a7?6jSmA4RZ2aTWUda(TApVYmd<y<hgo_LR!G_BwT#S#6F9%Zj
zq~EcUuLR@Yfj<7fFv?!#o0L@Fi7I}jr{vAB=l@%y+A1%lB>U&W@w1MKpD`{8T<lxf
ztNfLc>i_qs@s+)rKmYFu?(4+GMdi7awB>|ZXBMuY{SYu#5r52qRQWI^)%u}J6cvYR
z??=UNKbAN14kGxjEiQp$t@yi9k-a+4OSy)L^ObgAovRPv^4qICoRW$sWfqM8L20k}
zSNS?6r?7KhQT!?BPf=kNWv}vjO5UUtr3{XLC}4j~5~L*m{|0{vj{h_|{m;M3BPnV7
zjZqJde+(EO$@v-LqVkF9^IkhRo@y)k64W@wpzKxNaquJBpMov#J!P-XzeCdgoSeVf
z25MZjk50puRg}Ft52t0uZV-L`qU@FAe1+d$<vY)QMEhR`?03tZuv>V=`O!Z(>S|mC
zbDo1S6~8(UoRaoI{owk&3tNuA4VS+t`-$G)Y{SJwMb$mTyZ=g}LBB!WnhETi#9xYu
z-c<`Hod@j~OkjUenWy=ifI*;1mrY<_GuhMZ4HyKPv{Kp!=Q$`J{7<Nb3B<4bte?RC
zANfD8p;Rcv3GA;&KZZ!l?lJLNm1nqc4Dl+#^=B>h?kQ5`L>9_f{&7WV#MO;U?I-nK
k<kxgyia=mD*RyX94CtXkMcg<iP|AL0y{8!s7zCR1-+dIML;wH)

literal 0
HcmV?d00001

diff --git a/benchmark/gdb_debug/harness.c b/benchmark/gdb_debug/harness.c
new file mode 100644
index 0000000..c9a2633
--- /dev/null
+++ b/benchmark/gdb_debug/harness.c
@@ -0,0 +1,74 @@
+/* Generic harness for single-stepping one of the benchmark functions under GDB.
+ * Copies the raw bytes of funcNN.bin into an RWX buffer and calls through
+ * a function pointer. GDB stepi from the call site drops you right into the
+ * target function's first instruction. No QEMU needed — boltzmann is aarch64.
+ *
+ * Build: run `make` in this dir (native aarch64 only, for now).
+ * Run:   ./gdb_debug.elf {1|2|3}    (1=memset 2=memcpy32 3=magic_memset)
+ *
+ * Under GDB: see README.md.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+extern uint8_t _binary_func_01_bin_start[], _binary_func_01_bin_end[];  /* byte range of blob 1 (memset) */
+extern uint8_t _binary_func_02_bin_start[], _binary_func_02_bin_end[];  /* byte range of blob 2 (memcpy32) */
+extern uint8_t _binary_func_03_bin_start[], _binary_func_03_bin_end[];  /* byte range of blob 3 (magic_memset) */
+/* NOTE(review): the _binary_* symbols are presumably emitted when the .bin files are linked in — confirm in the Makefile. */
+typedef void (*f1_t)(void *, uint8_t, uint64_t);  /* prototype used to call function 1 */
+typedef void (*f2_t)(uint32_t *, const uint32_t *, uint64_t);  /* prototype used to call function 2 */
+typedef void (*f3_t)(void);  /* prototype used to call function 3 */
+
+/* Copy len bytes from src into a fresh anonymous RWX mapping and return it; exits with status 1 if mmap fails. */
+static void *rwx_copy(const void *src, size_t len) {
+    void *p = mmap(NULL, len > 4096 ? len : 4096, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) { perror("mmap"); exit(1); }
+    memcpy(p, src, len);  /* safe: the mapping is sized from len (4096-byte floor), so blobs over 4096 bytes fit */
+    __builtin___clear_cache(p, (char *)p + len);  /* flush the instruction cache over the bytes just written */
+    return p;
+}
+
+/* Dispatch on `which` and call fn through the matching prototype (f1_t, f2_t or f3_t).
+ * Any other value of `which` falls through without calling anything. */
+static void __attribute__((noinline))
+call_func(void (*fn)(void), int which) {
+    if (which == 1) {
+        /* Zeroed 64-byte buffer; byte 10 is printed before and after the call. */
+        char scratch[64] = {0};
+        printf("pre:  buf[10]=0x%02x\n", (uint8_t)scratch[10]);
+        ((f1_t)fn)(scratch, 0xAB, 16);
+        printf("post: buf[10]=0x%02x (expect 0xab)\n", (uint8_t)scratch[10]);
+    } else if (which == 2) {
+        /* Source words are 0xDEAD0000 | index; the destination starts zeroed. */
+        /* NOTE(review): the length passed is sizeof to (32) — confirm the callee counts bytes, not words. */
+        uint32_t to[8] = {0}, from[8];
+        for (unsigned k = 0; k < 8; k++) from[k] = 0xDEAD0000U | k;
+        ((f2_t)fn)(to, from, sizeof to);
+        printf("dst[3]=0x%08x (expect 0xdead0003)\n", to[3]);
+    } else if (which == 3) {
+        /* Called with no arguments; the message warns that a fault is expected here. */
+        printf("calling magic_memset — SIGSEGVs on LDR of 0x1fe004 in user mode.\n");
+        ((f3_t)fn)();
+    }
+}
+
+/* Entry point: argv[1] selects benchmark function 1..3, which is copied into RWX memory and called. */
+int main(int argc, char **argv) {
+    if (argc != 2) { fprintf(stderr, "usage: %s {1|2|3}\n", argv[0]); return 2; }
+    uint8_t *const lo[] = { _binary_func_01_bin_start, _binary_func_02_bin_start,
+                            _binary_func_03_bin_start };
+    uint8_t *const hi[] = { _binary_func_01_bin_end, _binary_func_02_bin_end,
+                            _binary_func_03_bin_end };
+    char *end;  /* strtol (unlike atoi) lets us reject trailing junk such as "2x" */
+    long which = strtol(argv[1], &end, 10);
+    if (end == argv[1] || *end != '\0' || which < 1 || which > 3) { fprintf(stderr, "unknown index %s\n", argv[1]); return 2; }
+    /* Explicit casts: ISO C has no implicit void* <-> function-pointer conversion, and %p requires void*. */
+    void (*fn)(void) = (void (*)(void))rwx_copy(lo[which - 1],
+                                                (size_t)(hi[which - 1] - lo[which - 1]));
+    printf("function %d loaded at %p\n", (int)which, (void *)fn);
+    call_func(fn, (int)which);
+    return 0;
+}