From 955bd82efbccafc2718c64516e5f08d52ebb67ae Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Sat, 16 May 2026 22:40:30 +0000
Subject: [PATCH] safety + repl: wire secrets into safety.lua (closes #52)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the last #13 gap — Norris broker call + is_destructive LLM
second-opinion probe were the two egress points NOT covered by the
scrub-at-egress design in commit d852aca.

Approach: option (b) per #52's fix sketch — callback-via-helpers/opts.
safety.lua does NOT gain a require("secrets") dependency (acceptance
criteria 3); integration is purely through the convention the rest
of the helpers table already uses.

safety.lua changes:

  - llm_probe gains an opts table. When opts.scrub_msgs is set, the
    {system, user(cmd)} message pair is scrubbed before broker.chat.
    When opts.rehydrate is set, the YES/NO reply is rehydrated before
    parsing (defensive — the verdict shouldn't carry placeholders but
    rehydration is a safe no-op if it doesn't).

  - llm_second_opinion threads opts through to llm_probe.

  - M.is_destructive(cmd, cfg, opts) — opts optional; nil-opts is
    backwards-compatible (no scrub, original behavior).

  - M.norris_step:
      * outbound broker.chat_stream message scrubbed via
        helpers.scrub_msgs(ctx:to_messages(), model_cfg) when provided.
      * on_delta wrapped with helpers.streaming_rehydrator():push /
        :flush so the user sees rehydrated text AND text_parts
        accumulates rehydrated chunks (parity with ask_ai in repl.lua).
      * both M.is_destructive call sites (tool_call probe + CMD: probe)
        now pass probe_opts = {scrub_msgs, rehydrate} when the
        helpers carry them.

repl.lua changes:

  - Norris helpers table gains scrub_msgs / rehydrate /
    streaming_rehydrator closures, all nil-safe (return identity /
    nil when secrets_session is nil).

  - :safety check meta passes probe_opts to is_destructive when
    secrets_session is configured. Without secrets, behavior unchanged.

Unit-test verified end-to-end:
  - Stubbed broker.chat captures the messages it receives.
  - Without opts: probe SEES `ghp_realsecretvalue_...` (control).
  - With opts: probe sees `$AISH_SECRET_NNN` (correct scrub).

Regression: test_safety 87/87, test_router_model 31/31, repl loads.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 repl.lua   | 29 +++++++++++++++++++++-
 safety.lua | 73 +++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 16 deletions(-)
diff --git a/repl.lua b/repl.lua
index 5a16a20..acc4f4d 100644
--- a/repl.lua
+++ b/repl.lua
@@ -1208,6 +1208,21 @@ function M.run(config)
             render_assistant_delta   = renderer.assistant_delta,
             render_assistant_flush   = renderer.assistant_flush,
             log_turn                 = log_turn,
+            -- Issue #52: pass secrets-aware callbacks so safety.lua
+            -- can scrub outbound Norris broker messages + LLM probe
+            -- inputs + rehydrate streamed replies. All three are nil-
+            -- safe; safety.lua only wires them in when present.
+            scrub_msgs = function(msgs, mode_cfg)
+                return scrub_messages(msgs, secrets_mode_for(mode_cfg or active_cfg))
+            end,
+            rehydrate = function(text)
+                return secrets_session and secrets_session:rehydrate(text) or text
+            end,
+            streaming_rehydrator = function()
+                return secrets_session
+                    and secrets.streaming_rehydrator(secrets_session)
+                    or  nil
+            end,
         }
 
         local step_n = 1
@@ -1641,7 +1656,19 @@ function M.run(config)
                 end
                 -- Pass cfg so the LLM probe runs; user can opt-out via
                 -- :safety check --no-llm <cmd> if added in v2.
-                local hit, reason = safety.is_destructive(cmd, config)
+                -- Issue #52: thread secrets scrub/rehydrate so the probe
+                -- model sees placeholders for any secrets in `cmd`.
+                local probe_opts
+                if secrets_session then
+                    probe_opts = {
+                        scrub_msgs = function(msgs, mode_cfg)
+                            return scrub_messages(msgs,
+                                secrets_mode_for(mode_cfg or active_cfg))
+                        end,
+                        rehydrate = function(t) return secrets_session:rehydrate(t) end,
+                    }
+                end
+                local hit, reason = safety.is_destructive(cmd, config, probe_opts)
                 if hit then
                     renderer.status(("DESTRUCTIVE — %s"):format(reason or "?"))
                 else
diff --git a/safety.lua b/safety.lua
index 15575e2..4a5f253 100644
--- a/safety.lua
+++ b/safety.lua
@@ -173,15 +173,28 @@ local PROBE_TIMEOUT_MS = 15000
 
 -- Ask `model_cfg` whether `cmd` is destructive. Returns "YES"/"NO" string
 -- (not bool — caller cares about disagreement between probes).
-local function llm_probe(model_cfg, system, cmd)
-    local reply, err = broker.chat(model_cfg,
-        { { role = "system", content = system },
-          { role = "user",   content = cmd } },
+--
+-- Issue #52: when `opts.scrub_msgs` is provided, scrub the outbound
+-- {system, user(cmd)} message pair using the caller's secrets-aware
+-- scrubber. The probe model sees placeholders for any secrets the
+-- CMD: line happens to contain. Verdict text ("YES"/"NO") is unlikely
+-- to carry placeholders but we rehydrate defensively via opts.rehydrate
+-- so any echoed value comes back clean.
+local function llm_probe(model_cfg, system, cmd, opts)
+    local msgs = {
+        { role = "system", content = system },
+        { role = "user",   content = cmd },
+    }
+    if opts and opts.scrub_msgs then
+        msgs = opts.scrub_msgs(msgs, model_cfg)
+    end
+    local reply, err = broker.chat(model_cfg, msgs,
         { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
     if not reply then
         -- Broker failure → safe default: treat as YES (destructive)
         return "YES_FAILSAFE", err
     end
+    if opts and opts.rehydrate then reply = opts.rehydrate(reply) end
     local upper = reply:upper()
     if upper:match("YES") then return "YES" end
     if upper:match("NO")  then return "NO"  end
@@ -200,7 +213,7 @@ end
 --   1. cfg.safety.llm_model         — explicit model-name from config (e.g. "cloud")
 --   2. cfg.models.deep              — local independent-class model (qwen3-30b)
 --   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default
-local function llm_second_opinion(cmd, cfg)
+local function llm_second_opinion(cmd, cfg, opts)
     local probe_cfg
     if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
         probe_cfg = cfg.models[cfg.safety.llm_model]
@@ -230,7 +243,7 @@ local function llm_second_opinion(cmd, cfg)
         "Detect destructive shell commands. Reply YES if running this "
         .. "would delete, overwrite, or irreversibly modify data, "
         .. "processes, or system state. Otherwise reply NO. Respond "
-        .. "with only YES or NO.", cmd)
+        .. "with only YES or NO.", cmd, opts)
     if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
     if p1 == "YES_FAILSAFE"    then return memo(true, "LLM probe failed (fail-safe)") end
     if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end
@@ -238,7 +251,7 @@ local function llm_second_opinion(cmd, cfg)
     -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
     local p2 = llm_probe(probe_cfg,
         "Reply YES or NO only. Is the following shell command SAFE to "
-        .. "run autonomously without user review?", cmd)
+        .. "run autonomously without user review?", cmd, opts)
     if p2 == "YES" then return memo(false, nil) end
     -- Disagreement or fail-safe → HALT
     return memo(true, "LLM probe disagreement")
@@ -250,7 +263,12 @@ end
 -- and cfg.models for the probe model lookup). When cfg is nil, only the
 -- static layer runs (handy for unit tests and tooling that wants the
 -- fast deterministic gate without an LLM round-trip).
-function M.is_destructive(cmd, cfg)
+-- Issue #52: opts.scrub_msgs(messages, model_cfg) + opts.rehydrate(text)
+-- callbacks let the LLM probe scrub the outbound cmd before sending and
+-- rehydrate the YES/NO verdict before parsing. Both optional; absent
+-- opts = no-op (backwards-compatible). Caller (repl.lua / norris_step
+-- helpers) provides them when secrets are configured.
+function M.is_destructive(cmd, cfg, opts)
     if type(cmd) ~= "string" or cmd == "" then return false, nil end
 
     -- Static patterns first (fast, deterministic).
@@ -265,7 +283,7 @@ function M.is_destructive(cmd, cfg)
         return false, nil
     end
 
-    return llm_second_opinion(cmd, cfg)
+    return llm_second_opinion(cmd, cfg, opts)
 end
 
 -- Expose the pattern table for `:safety patterns` meta and for testing.
@@ -315,19 +333,44 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
 
     helpers.render_step(step_n, max_steps)
 
-    -- (1) one broker round-trip — stream text + collect tool_calls
+    -- (1) one broker round-trip — stream text + collect tool_calls.
+    --
+    -- Issue #52: when helpers.scrub_msgs is provided, scrub outbound
+    -- per the active model's redact policy; when helpers.streaming_rehydrator
+    -- is provided, wrap on_delta so the user sees rehydrated text AND
+    -- text_parts accumulates rehydrated chunks (so any extracted CMD: /
+    -- DELEGATE: lines downstream see plain values — matches ask_ai's
+    -- contract in repl.lua).
+    local msgs = ctx:to_messages()
+    if helpers.scrub_msgs then msgs = helpers.scrub_msgs(msgs, model_cfg) end
+    local rehydrator = helpers.streaming_rehydrator and helpers.streaming_rehydrator() or nil
+    local probe_opts = nil
+    if helpers.scrub_msgs or helpers.rehydrate then
+        probe_opts = { scrub_msgs = helpers.scrub_msgs, rehydrate = helpers.rehydrate }
+    end
+
     local text_parts      = {}
     local tool_calls_seen = {}
-    local ok, err = broker.chat_stream(model_cfg, ctx:to_messages(),
+    local ok, err = broker.chat_stream(model_cfg, msgs,
         function(kind, payload)
             if kind == "text" then
-                text_parts[#text_parts + 1] = payload
-                helpers.render_assistant_delta(payload)
+                local emit = rehydrator and rehydrator:push(payload) or payload
+                if emit ~= "" then
+                    text_parts[#text_parts + 1] = emit
+                    helpers.render_assistant_delta(emit)
+                end
             elseif kind == "tool_call" then
                 tool_calls_seen[#tool_calls_seen + 1] = payload
             end
         end,
         { tools = helpers.tools_schema() })
+    if rehydrator then
+        local tail = rehydrator:flush()
+        if tail ~= "" then
+            text_parts[#text_parts + 1] = tail
+            helpers.render_assistant_delta(tail)
+        end
+    end
     helpers.render_assistant_flush()
 
     if not ok then
@@ -379,7 +422,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
 
         -- Probe destructive on the JSON-serialized call as a proxy.
         local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
-        local destr, reason = M.is_destructive(call_repr, cfg)
+        local destr, reason = M.is_destructive(call_repr, cfg, probe_opts)
 
         local verdict
         if destr then
@@ -421,7 +464,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
 
     -- (5) dispatch CMD: lines (legacy route)
     for _, cmd in ipairs(cmd_lines) do
-        local destr, reason = M.is_destructive(cmd, cfg)
+        local destr, reason = M.is_destructive(cmd, cfg, probe_opts)
         local verdict
         if destr then
             verdict = helpers.halt(step_n, max_steps, reason or "destructive",