From 955bd82efbccafc2718c64516e5f08d52ebb67ae Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Sat, 16 May 2026 22:40:30 +0000 Subject: [PATCH] safety + repl: wire secrets into safety.lua (closes #52) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the last #13 gap — Norris broker call + is_destructive LLM second-opinion probe were the two egress points NOT covered by the scrub-at-egress design in commit d852aca. Approach: option (b) per #52's fix sketch — callback-via-helpers/opts. safety.lua does NOT gain a require("secrets") dependency (acceptance criteria 3); integration is purely through the convention the rest of the helpers table already uses. safety.lua changes: - llm_probe gains an opts table. When opts.scrub_msgs is set, the {system, user(cmd)} message pair is scrubbed before broker.chat. When opts.rehydrate is set, the YES/NO reply is rehydrated before parsing (defensive — the verdict shouldn't carry placeholders but rehydration is a safe no-op if it doesn't). - llm_second_opinion threads opts through to llm_probe. - M.is_destructive(cmd, cfg, opts) — opts optional; nil-opts is backwards-compatible (no scrub, original behavior). - M.norris_step: * outbound broker.chat_stream message scrubbed via helpers.scrub_msgs(ctx:to_messages(), model_cfg) when provided. * on_delta wrapped with helpers.streaming_rehydrator():push / :flush so the user sees rehydrated text AND text_parts accumulates rehydrated chunks (parity with ask_ai in repl.lua). * both M.is_destructive call sites (tool_call probe + CMD: probe) now pass probe_opts = {scrub_msgs, rehydrate} when the helpers carry them. repl.lua changes: - Norris helpers table gains scrub_msgs / rehydrate / streaming_rehydrator closures, all nil-safe (return identity / nil when secrets_session is nil). - :safety check meta passes probe_opts to is_destructive when secrets_session is configured. Without secrets, behavior unchanged. Unit-test verified end-to-end: - Stubbed broker.chat captures the messages it receives. - Without opts: probe SEES `ghp_realsecretvalue_...` (control). - With opts: probe sees `$AISH_SECRET_NNN` (correct scrub). Regression: test_safety 87/87, test_router_model 31/31, repl loads. Co-Authored-By: Claude Opus 4.7 (1M context) --- repl.lua | 29 +++++++++++++++++++++- safety.lua | 73 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/repl.lua b/repl.lua index 5a16a20..acc4f4d 100644 --- a/repl.lua +++ b/repl.lua @@ -1208,6 +1208,21 @@ function M.run(config) render_assistant_delta = renderer.assistant_delta, render_assistant_flush = renderer.assistant_flush, log_turn = log_turn, + -- Issue #52: pass secrets-aware callbacks so safety.lua + -- can scrub outbound Norris broker messages + LLM probe + -- inputs + rehydrate streamed replies. All three are nil- + -- safe; safety.lua only wires them in when present. + scrub_msgs = function(msgs, mode_cfg) + return scrub_messages(msgs, secrets_mode_for(mode_cfg or active_cfg)) + end, + rehydrate = function(text) + return secrets_session and secrets_session:rehydrate(text) or text + end, + streaming_rehydrator = function() + return secrets_session + and secrets.streaming_rehydrator(secrets_session) + or nil + end, } local step_n = 1 @@ -1641,7 +1656,19 @@ function M.run(config) end -- Pass cfg so the LLM probe runs; user can opt-out via -- :safety check --no-llm if added in v2. - local hit, reason = safety.is_destructive(cmd, config) + -- Issue #52: thread secrets scrub/rehydrate so the probe + -- model sees placeholders for any secrets in `cmd`. + local probe_opts + if secrets_session then + probe_opts = { + scrub_msgs = function(msgs, mode_cfg) + return scrub_messages(msgs, + secrets_mode_for(mode_cfg or active_cfg)) + end, + rehydrate = function(t) return secrets_session:rehydrate(t) end, + } + end + local hit, reason = safety.is_destructive(cmd, config, probe_opts) if hit then renderer.status(("DESTRUCTIVE — %s"):format(reason or "?")) else diff --git a/safety.lua b/safety.lua index 15575e2..4a5f253 100644 --- a/safety.lua +++ b/safety.lua @@ -173,15 +173,28 @@ local PROBE_TIMEOUT_MS = 15000 -- Ask `model_cfg` whether `cmd` is destructive. Returns "YES"/"NO" string -- (not bool — caller cares about disagreement between probes). -local function llm_probe(model_cfg, system, cmd) - local reply, err = broker.chat(model_cfg, - { { role = "system", content = system }, - { role = "user", content = cmd } }, +-- +-- Issue #52: when `opts.scrub_msgs` is provided, scrub the outbound +-- {system, user(cmd)} message pair using the caller's secrets-aware +-- scrubber. The probe model sees placeholders for any secrets the +-- CMD: line happens to contain. Verdict text ("YES"/"NO") is unlikely +-- to carry placeholders but we rehydrate defensively via opts.rehydrate +-- so any echoed value comes back clean. +local function llm_probe(model_cfg, system, cmd, opts) + local msgs = { + { role = "system", content = system }, + { role = "user", content = cmd }, + } + if opts and opts.scrub_msgs then + msgs = opts.scrub_msgs(msgs, model_cfg) + end + local reply, err = broker.chat(model_cfg, msgs, { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS }) if not reply then -- Broker failure → safe default: treat as YES (destructive) return "YES_FAILSAFE", err end + if opts and opts.rehydrate then reply = opts.rehydrate(reply) end local upper = reply:upper() if upper:match("YES") then return "YES" end if upper:match("NO") then return "NO" end @@ -200,7 +213,7 @@ end -- 1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud") -- 2. cfg.models.deep — local independent-class model (qwen3-30b) -- 3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default -local function llm_second_opinion(cmd, cfg) +local function llm_second_opinion(cmd, cfg, opts) local probe_cfg if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then probe_cfg = cfg.models[cfg.safety.llm_model] @@ -230,7 +243,7 @@ local function llm_second_opinion(cmd, cfg) "Detect destructive shell commands. Reply YES if running this " .. "would delete, overwrite, or irreversibly modify data, " .. "processes, or system state. Otherwise reply NO. Respond " - .. "with only YES or NO.", cmd) + .. "with only YES or NO.", cmd, opts) if p1 == "YES" then return memo(true, "LLM flagged as destructive") end if p1 == "YES_FAILSAFE" then return memo(true, "LLM probe failed (fail-safe)") end if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end @@ -238,7 +251,7 @@ local function llm_second_opinion(cmd, cfg) -- Probe 2 (NO → re-roll with inverted phrasing per R-B2): local p2 = llm_probe(probe_cfg, "Reply YES or NO only. Is the following shell command SAFE to " - .. "run autonomously without user review?", cmd) + .. "run autonomously without user review?", cmd, opts) if p2 == "YES" then return memo(false, nil) end -- Disagreement or fail-safe → HALT return memo(true, "LLM probe disagreement") @@ -250,7 +263,12 @@ end -- and cfg.models for the probe model lookup). When cfg is nil, only the -- static layer runs (handy for unit tests and tooling that wants the -- fast deterministic gate without an LLM round-trip). -function M.is_destructive(cmd, cfg) +-- Issue #52: opts.scrub_msgs(messages, model_cfg) + opts.rehydrate(text) +-- callbacks let the LLM probe scrub the outbound cmd before sending and +-- rehydrate the YES/NO verdict before parsing. Both optional; absent +-- opts = no-op (backwards-compatible). Caller (repl.lua / norris_step +-- helpers) provides them when secrets are configured. +function M.is_destructive(cmd, cfg, opts) if type(cmd) ~= "string" or cmd == "" then return false, nil end -- Static patterns first (fast, deterministic). @@ -265,7 +283,7 @@ function M.is_destructive(cmd, cfg) return false, nil end - return llm_second_opinion(cmd, cfg) + return llm_second_opinion(cmd, cfg, opts) end -- Expose the pattern table for `:safety patterns` meta and for testing. @@ -315,19 +333,44 @@ function M.norris_step(ctx, model_cfg, helpers, opts) helpers.render_step(step_n, max_steps) - -- (1) one broker round-trip — stream text + collect tool_calls + -- (1) one broker round-trip — stream text + collect tool_calls. + -- + -- Issue #52: when helpers.scrub_msgs is provided, scrub outbound + -- per the active model's redact policy; when helpers.streaming_rehydrator + -- is provided, wrap on_delta so the user sees rehydrated text AND + -- text_parts accumulates rehydrated chunks (so any extracted CMD: / + -- DELEGATE: lines downstream see plain values — matches ask_ai's + -- contract in repl.lua). + local msgs = ctx:to_messages() + if helpers.scrub_msgs then msgs = helpers.scrub_msgs(msgs, model_cfg) end + local rehydrator = helpers.streaming_rehydrator and helpers.streaming_rehydrator() or nil + local probe_opts = nil + if helpers.scrub_msgs or helpers.rehydrate then + probe_opts = { scrub_msgs = helpers.scrub_msgs, rehydrate = helpers.rehydrate } + end + local text_parts = {} local tool_calls_seen = {} - local ok, err = broker.chat_stream(model_cfg, ctx:to_messages(), + local ok, err = broker.chat_stream(model_cfg, msgs, function(kind, payload) if kind == "text" then - text_parts[#text_parts + 1] = payload - helpers.render_assistant_delta(payload) + local emit = rehydrator and rehydrator:push(payload) or payload + if emit ~= "" then + text_parts[#text_parts + 1] = emit + helpers.render_assistant_delta(emit) + end elseif kind == "tool_call" then tool_calls_seen[#tool_calls_seen + 1] = payload end end, { tools = helpers.tools_schema() }) + if rehydrator then + local tail = rehydrator:flush() + if tail ~= "" then + text_parts[#text_parts + 1] = tail + helpers.render_assistant_delta(tail) + end + end helpers.render_assistant_flush() if not ok then @@ -379,7 +422,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts) -- Probe destructive on the JSON-serialized call as a proxy. local call_repr = (call.name or "?") .. " " .. (call.arguments or "") - local destr, reason = M.is_destructive(call_repr, cfg) + local destr, reason = M.is_destructive(call_repr, cfg, probe_opts) local verdict if destr then @@ -421,7 +464,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts) -- (5) dispatch CMD: lines (legacy route) for _, cmd in ipairs(cmd_lines) do - local destr, reason = M.is_destructive(cmd, cfg) + local destr, reason = M.is_destructive(cmd, cfg, probe_opts) local verdict if destr then verdict = helpers.halt(step_n, max_steps, reason or "destructive",