From 9c20eb013500f29dd052399685b9d044255eab90 Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Wed, 15 Apr 2026 09:16:00 +0200
Subject: [PATCH] 04_train_phy_block: clang -Oz + 32-bit-load pattern = 100%
 size match
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changed u64v handshake reads to u32v with an inline zero-extending
upcast. Clang -Oz now emits 104 bytes, exactly matching the vendor's
104 bytes, with 26 instructions on both sides. Three semantically
equivalent byte differences remain (register allocation, tst form, test
width) that aren't closable from C alone; that needs armclang or inline asm.

Matching-decomp verdict for this function: semantic equivalence +
size identity + instruction-count identity = the practical ceiling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 benchmark/04_train_phy_block/GRIND_LOG.md | 32 +++++++++++++++++++++
 benchmark/04_train_phy_block/candidate.c  | 34 ++++++++---------------
 2 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/benchmark/04_train_phy_block/GRIND_LOG.md b/benchmark/04_train_phy_block/GRIND_LOG.md
index c93a75b..207135e 100644
--- a/benchmark/04_train_phy_block/GRIND_LOG.md
+++ b/benchmark/04_train_phy_block/GRIND_LOG.md
@@ -113,3 +113,35 @@ that clang -Oz approaches byte-match ruling suggests LLVM family.
 free Community Edition), or continue clang -Oz + hand-tweaked C + per
 -site inline asm where the last instruction doesn't converge. A single
 afternoon's iteration should push to ≥99%.
+
+## Iteration 3: 32-bit load + clang -Oz = 100% size match
+
+Changed the handshake-loop reads from `u64v` to `u32v` (32-bit volatile
+loads), with a tiny inline `xld()` helper that zero-extends to u64 for
+the test. This forced clang to use `ldr w, [x, #0x184]` inside the
+loops (instead of hoisting `add x9, x8, #0x184` out), cutting the
+4-byte setup overhead.
+
+| compiler | flag | size | diff | score |
+|---|---|---|---|---|
+| clang 19 | -Oz  | **104 B** | **0** | **100% (size-match)** |
+| gcc 15   | -Os  | see below | see below | see below |
+
+### Byte-level comparison (clang vs vendor, both 104 B, both 26 insts)
+
+Three semantically equivalent differences remain — not closable from C alone:
+
+1. **Reg choice**: vendor `x0/w1`, clang `x8/w9/w10`.
+2. **Mask test form**: vendor `tst w1, #0xf0000000; b.eq`, clang
+   `lsr w9, #28; cbz w9, .loop`. Same size, same effect.
+3. **Handshake test width**: vendor `tst x1, #0x3` (64-bit on
+   zero-extended w1), clang `tst w9, #0x3` (32-bit). Same size.
+
+None of these affect semantics. To chase byte-level exactness you'd need:
+- inline asm stubs forcing the specific mask-test form
+- register-allocation hints that C doesn't really expose
+- **or** the vendor's actual armclang binary
+
+**Verdict: done.** Semantic equivalence + identical size + identical
+instruction count is the realistic ceiling from C. Further chasing
+would be purely cosmetic.
diff --git a/benchmark/04_train_phy_block/candidate.c b/benchmark/04_train_phy_block/candidate.c
index 2c20d00..a0ffd3e 100644
--- a/benchmark/04_train_phy_block/candidate.c
+++ b/benchmark/04_train_phy_block/candidate.c
@@ -1,24 +1,12 @@
-/* Best matching candidate so far for FUN_0000d328.
- * Compile:  gcc -Os -ffreestanding -nostdlib -c candidate.c -o candidate.o
- * Score:    116 bytes vs vendor 104 bytes (88% size match, 12 bytes / 3 insts over).
- *
- * Remaining gap vs vendor:
- *   - GCC emits `cmp w, w_loaded_const ; b.ls` for `(x & 0xF0000000) == 0`
- *     instead of vendor's `tst w, #0xF0000000 ; b.eq` (both 12 bytes, but
- *     vendor avoids materializing the mask in a register, saving 4 bytes
- *     per loop, twice = 8 bytes).
- *   - GCC emits `add x1, x0, #0x200 ; ldur x2, [x1, #-124]` for the
- *     `[base+0x184]` accesses inside the handshake loop, vs vendor's
- *     direct `ldr w1, [x0, #0x184]`. Costs us ~4 bytes.
- *
- * Next iterations to try:
- *   1. Inline-asm for the mask-tests to force TST encoding.
- *   2. `__builtin_expect((x & 0xF0000000) != 0, 0)` to hint loop direction.
- *   3. Alternative compilers: clang, ARMCC (the latter is what Rockchip
- *      almost certainly used; need to source it).
- */
-typedef volatile unsigned int  u32v;
-typedef volatile unsigned long u64v;
+/* Iteration 3: match vendor's "32-bit load, 64-bit test" pattern.
+ * The u64v volatile forced clang to emit 64-bit loads and hoist the
+ * base address out of the loop. Use u32v loads but upcast for the test. */
+typedef volatile unsigned int u32v;
+
+static inline unsigned long xld(u32v *p) {
+    /* Volatile 32-bit read of *p; the cast zero-extends to unsigned long. */
+    return (unsigned long)*p;
+}
 
 void train_phy_block(unsigned long ctx)
 {
@@ -28,9 +16,9 @@ void train_phy_block(unsigned long ctx)
     while ((*(u32v *)(phy + 0x120) & 0xf0000000u) == 0u) ;
     *(u32v *)(phy + 0x160) = 0x30003u;
     *(u32v *)(phy + 0x154) = 0x30003u;
-    while ((*(u64v *)(phy + 0x184) & 3ul) == 0ul) ;
+    while ((xld((u32v *)(phy + 0x184)) & 3ul) == 0ul) ;
     *(u32v *)(phy + 0x154) = 0x30000u;
-    while ((*(u64v *)(phy + 0x184) & 3ul) != 0ul) ;
+    while ((xld((u32v *)(phy + 0x184)) & 3ul) != 0ul) ;
     *(u32v *)(phy + 0x160) = 0x30000u;
     *(u32v *)(phy + 0x110) = 0xf0000000u;
 }