safety + repl: wire secrets into safety.lua (closes #52)
Closes the last #13 gap — Norris broker call + is_destructive LLM
second-opinion probe were the two egress points NOT covered by the
scrub-at-egress design in commit d852aca.
Approach: option (b) per #52's fix sketch — callback-via-helpers/opts.
safety.lua does NOT gain a require("secrets") dependency (acceptance
criteria 3); integration is purely through the convention the rest
of the helpers table already uses.
safety.lua changes:
- llm_probe gains an opts table. When opts.scrub_msgs is set, the
{system, user(cmd)} message pair is scrubbed before broker.chat.
When opts.rehydrate is set, the YES/NO reply is rehydrated before
parsing (defensive — the verdict shouldn't carry placeholders but
rehydration is a safe no-op if it doesn't).
- llm_second_opinion threads opts through to llm_probe.
- M.is_destructive(cmd, cfg, opts) — opts optional; nil-opts is
backwards-compatible (no scrub, original behavior).
- M.norris_step:
* outbound broker.chat_stream message scrubbed via
helpers.scrub_msgs(ctx:to_messages(), model_cfg) when provided.
* on_delta wrapped with helpers.streaming_rehydrator():push /
:flush so the user sees rehydrated text AND text_parts
accumulates rehydrated chunks (parity with ask_ai in repl.lua).
* both M.is_destructive call sites (tool_call probe + CMD: probe)
now pass probe_opts = {scrub_msgs, rehydrate} when the
helpers carry them.
repl.lua changes:
- Norris helpers table gains scrub_msgs / rehydrate /
streaming_rehydrator closures, all nil-safe (return identity /
nil when secrets_session is nil).
- :safety check meta passes probe_opts to is_destructive when
secrets_session is configured. Without secrets, behavior unchanged.
Unit-test verified end-to-end:
- Stubbed broker.chat captures the messages it receives.
- Without opts: probe SEES `ghp_realsecretvalue_...` (control).
- With opts: probe sees `$AISH_SECRET_NNN` (correct scrub).
Regression: test_safety 87/87, test_router_model 31/31, repl loads.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1208,6 +1208,21 @@ function M.run(config)
|
|||||||
render_assistant_delta = renderer.assistant_delta,
|
render_assistant_delta = renderer.assistant_delta,
|
||||||
render_assistant_flush = renderer.assistant_flush,
|
render_assistant_flush = renderer.assistant_flush,
|
||||||
log_turn = log_turn,
|
log_turn = log_turn,
|
||||||
|
-- Issue #52: pass secrets-aware callbacks so safety.lua
|
||||||
|
-- can scrub outbound Norris broker messages + LLM probe
|
||||||
|
-- inputs + rehydrate streamed replies. All three are nil-
|
||||||
|
-- safe; safety.lua only wires them in when present.
|
||||||
|
scrub_msgs = function(msgs, mode_cfg)
|
||||||
|
return scrub_messages(msgs, secrets_mode_for(mode_cfg or active_cfg))
|
||||||
|
end,
|
||||||
|
rehydrate = function(text)
|
||||||
|
return secrets_session and secrets_session:rehydrate(text) or text
|
||||||
|
end,
|
||||||
|
streaming_rehydrator = function()
|
||||||
|
return secrets_session
|
||||||
|
and secrets.streaming_rehydrator(secrets_session)
|
||||||
|
or nil
|
||||||
|
end,
|
||||||
}
|
}
|
||||||
|
|
||||||
local step_n = 1
|
local step_n = 1
|
||||||
@@ -1641,7 +1656,19 @@ function M.run(config)
|
|||||||
end
|
end
|
||||||
-- Pass cfg so the LLM probe runs; user can opt-out via
|
-- Pass cfg so the LLM probe runs; user can opt-out via
|
||||||
-- :safety check --no-llm <cmd> if added in v2.
|
-- :safety check --no-llm <cmd> if added in v2.
|
||||||
local hit, reason = safety.is_destructive(cmd, config)
|
-- Issue #52: thread secrets scrub/rehydrate so the probe
|
||||||
|
-- model sees placeholders for any secrets in `cmd`.
|
||||||
|
local probe_opts
|
||||||
|
if secrets_session then
|
||||||
|
probe_opts = {
|
||||||
|
scrub_msgs = function(msgs, mode_cfg)
|
||||||
|
return scrub_messages(msgs,
|
||||||
|
secrets_mode_for(mode_cfg or active_cfg))
|
||||||
|
end,
|
||||||
|
rehydrate = function(t) return secrets_session:rehydrate(t) end,
|
||||||
|
}
|
||||||
|
end
|
||||||
|
local hit, reason = safety.is_destructive(cmd, config, probe_opts)
|
||||||
if hit then
|
if hit then
|
||||||
renderer.status(("DESTRUCTIVE — %s"):format(reason or "?"))
|
renderer.status(("DESTRUCTIVE — %s"):format(reason or "?"))
|
||||||
else
|
else
|
||||||
|
|||||||
+58
-15
@@ -173,15 +173,28 @@ local PROBE_TIMEOUT_MS = 15000
|
|||||||
|
|
||||||
-- Ask `model_cfg` whether `cmd` is destructive. Returns "YES"/"NO" string
|
-- Ask `model_cfg` whether `cmd` is destructive. Returns "YES"/"NO" string
|
||||||
-- (not bool — caller cares about disagreement between probes).
|
-- (not bool — caller cares about disagreement between probes).
|
||||||
local function llm_probe(model_cfg, system, cmd)
|
--
|
||||||
local reply, err = broker.chat(model_cfg,
|
-- Issue #52: when `opts.scrub_msgs` is provided, scrub the outbound
|
||||||
{ { role = "system", content = system },
|
-- {system, user(cmd)} message pair using the caller's secrets-aware
|
||||||
{ role = "user", content = cmd } },
|
-- scrubber. The probe model sees placeholders for any secrets the
|
||||||
|
-- CMD: line happens to contain. Verdict text ("YES"/"NO") is unlikely
|
||||||
|
-- to carry placeholders but we rehydrate defensively via opts.rehydrate
|
||||||
|
-- so any echoed value comes back clean.
|
||||||
|
local function llm_probe(model_cfg, system, cmd, opts)
|
||||||
|
local msgs = {
|
||||||
|
{ role = "system", content = system },
|
||||||
|
{ role = "user", content = cmd },
|
||||||
|
}
|
||||||
|
if opts and opts.scrub_msgs then
|
||||||
|
msgs = opts.scrub_msgs(msgs, model_cfg)
|
||||||
|
end
|
||||||
|
local reply, err = broker.chat(model_cfg, msgs,
|
||||||
{ max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
|
{ max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
|
||||||
if not reply then
|
if not reply then
|
||||||
-- Broker failure → safe default: treat as YES (destructive)
|
-- Broker failure → safe default: treat as YES (destructive)
|
||||||
return "YES_FAILSAFE", err
|
return "YES_FAILSAFE", err
|
||||||
end
|
end
|
||||||
|
if opts and opts.rehydrate then reply = opts.rehydrate(reply) end
|
||||||
local upper = reply:upper()
|
local upper = reply:upper()
|
||||||
if upper:match("YES") then return "YES" end
|
if upper:match("YES") then return "YES" end
|
||||||
if upper:match("NO") then return "NO" end
|
if upper:match("NO") then return "NO" end
|
||||||
@@ -200,7 +213,7 @@ end
|
|||||||
-- 1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
|
-- 1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
|
||||||
-- 2. cfg.models.deep — local independent-class model (qwen3-30b)
|
-- 2. cfg.models.deep — local independent-class model (qwen3-30b)
|
||||||
-- 3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default
|
-- 3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default
|
||||||
local function llm_second_opinion(cmd, cfg)
|
local function llm_second_opinion(cmd, cfg, opts)
|
||||||
local probe_cfg
|
local probe_cfg
|
||||||
if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
|
if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
|
||||||
probe_cfg = cfg.models[cfg.safety.llm_model]
|
probe_cfg = cfg.models[cfg.safety.llm_model]
|
||||||
@@ -230,7 +243,7 @@ local function llm_second_opinion(cmd, cfg)
|
|||||||
"Detect destructive shell commands. Reply YES if running this "
|
"Detect destructive shell commands. Reply YES if running this "
|
||||||
.. "would delete, overwrite, or irreversibly modify data, "
|
.. "would delete, overwrite, or irreversibly modify data, "
|
||||||
.. "processes, or system state. Otherwise reply NO. Respond "
|
.. "processes, or system state. Otherwise reply NO. Respond "
|
||||||
.. "with only YES or NO.", cmd)
|
.. "with only YES or NO.", cmd, opts)
|
||||||
if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
|
if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
|
||||||
if p1 == "YES_FAILSAFE" then return memo(true, "LLM probe failed (fail-safe)") end
|
if p1 == "YES_FAILSAFE" then return memo(true, "LLM probe failed (fail-safe)") end
|
||||||
if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end
|
if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end
|
||||||
@@ -238,7 +251,7 @@ local function llm_second_opinion(cmd, cfg)
|
|||||||
-- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
|
-- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
|
||||||
local p2 = llm_probe(probe_cfg,
|
local p2 = llm_probe(probe_cfg,
|
||||||
"Reply YES or NO only. Is the following shell command SAFE to "
|
"Reply YES or NO only. Is the following shell command SAFE to "
|
||||||
.. "run autonomously without user review?", cmd)
|
.. "run autonomously without user review?", cmd, opts)
|
||||||
if p2 == "YES" then return memo(false, nil) end
|
if p2 == "YES" then return memo(false, nil) end
|
||||||
-- Disagreement or fail-safe → HALT
|
-- Disagreement or fail-safe → HALT
|
||||||
return memo(true, "LLM probe disagreement")
|
return memo(true, "LLM probe disagreement")
|
||||||
@@ -250,7 +263,12 @@ end
|
|||||||
-- and cfg.models for the probe model lookup). When cfg is nil, only the
|
-- and cfg.models for the probe model lookup). When cfg is nil, only the
|
||||||
-- static layer runs (handy for unit tests and tooling that wants the
|
-- static layer runs (handy for unit tests and tooling that wants the
|
||||||
-- fast deterministic gate without an LLM round-trip).
|
-- fast deterministic gate without an LLM round-trip).
|
||||||
function M.is_destructive(cmd, cfg)
|
-- Issue #52: opts.scrub_msgs(messages, model_cfg) + opts.rehydrate(text)
|
||||||
|
-- callbacks let the LLM probe scrub the outbound cmd before sending and
|
||||||
|
-- rehydrate the YES/NO verdict before parsing. Both optional; absent
|
||||||
|
-- opts = no-op (backwards-compatible). Caller (repl.lua / norris_step
|
||||||
|
-- helpers) provides them when secrets are configured.
|
||||||
|
function M.is_destructive(cmd, cfg, opts)
|
||||||
if type(cmd) ~= "string" or cmd == "" then return false, nil end
|
if type(cmd) ~= "string" or cmd == "" then return false, nil end
|
||||||
|
|
||||||
-- Static patterns first (fast, deterministic).
|
-- Static patterns first (fast, deterministic).
|
||||||
@@ -265,7 +283,7 @@ function M.is_destructive(cmd, cfg)
|
|||||||
return false, nil
|
return false, nil
|
||||||
end
|
end
|
||||||
|
|
||||||
return llm_second_opinion(cmd, cfg)
|
return llm_second_opinion(cmd, cfg, opts)
|
||||||
end
|
end
|
||||||
|
|
||||||
-- Expose the pattern table for `:safety patterns` meta and for testing.
|
-- Expose the pattern table for `:safety patterns` meta and for testing.
|
||||||
@@ -315,19 +333,44 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
|
|||||||
|
|
||||||
helpers.render_step(step_n, max_steps)
|
helpers.render_step(step_n, max_steps)
|
||||||
|
|
||||||
-- (1) one broker round-trip — stream text + collect tool_calls
|
-- (1) one broker round-trip — stream text + collect tool_calls.
|
||||||
|
--
|
||||||
|
-- Issue #52: when helpers.scrub_msgs is provided, scrub outbound
|
||||||
|
-- per the active model's redact policy; when helpers.streaming_rehydrator
|
||||||
|
-- is provided, wrap on_delta so the user sees rehydrated text AND
|
||||||
|
-- text_parts accumulates rehydrated chunks (so any extracted CMD: /
|
||||||
|
-- DELEGATE: lines downstream see plain values — matches ask_ai's
|
||||||
|
-- contract in repl.lua).
|
||||||
|
local msgs = ctx:to_messages()
|
||||||
|
if helpers.scrub_msgs then msgs = helpers.scrub_msgs(msgs, model_cfg) end
|
||||||
|
local rehydrator = helpers.streaming_rehydrator and helpers.streaming_rehydrator() or nil
|
||||||
|
local probe_opts = nil
|
||||||
|
if helpers.scrub_msgs or helpers.rehydrate then
|
||||||
|
probe_opts = { scrub_msgs = helpers.scrub_msgs, rehydrate = helpers.rehydrate }
|
||||||
|
end
|
||||||
|
|
||||||
local text_parts = {}
|
local text_parts = {}
|
||||||
local tool_calls_seen = {}
|
local tool_calls_seen = {}
|
||||||
local ok, err = broker.chat_stream(model_cfg, ctx:to_messages(),
|
local ok, err = broker.chat_stream(model_cfg, msgs,
|
||||||
function(kind, payload)
|
function(kind, payload)
|
||||||
if kind == "text" then
|
if kind == "text" then
|
||||||
text_parts[#text_parts + 1] = payload
|
local emit = rehydrator and rehydrator:push(payload) or payload
|
||||||
helpers.render_assistant_delta(payload)
|
if emit ~= "" then
|
||||||
|
text_parts[#text_parts + 1] = emit
|
||||||
|
helpers.render_assistant_delta(emit)
|
||||||
|
end
|
||||||
elseif kind == "tool_call" then
|
elseif kind == "tool_call" then
|
||||||
tool_calls_seen[#tool_calls_seen + 1] = payload
|
tool_calls_seen[#tool_calls_seen + 1] = payload
|
||||||
end
|
end
|
||||||
end,
|
end,
|
||||||
{ tools = helpers.tools_schema() })
|
{ tools = helpers.tools_schema() })
|
||||||
|
if rehydrator then
|
||||||
|
local tail = rehydrator:flush()
|
||||||
|
if tail ~= "" then
|
||||||
|
text_parts[#text_parts + 1] = tail
|
||||||
|
helpers.render_assistant_delta(tail)
|
||||||
|
end
|
||||||
|
end
|
||||||
helpers.render_assistant_flush()
|
helpers.render_assistant_flush()
|
||||||
|
|
||||||
if not ok then
|
if not ok then
|
||||||
@@ -379,7 +422,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
|
|||||||
|
|
||||||
-- Probe destructive on the JSON-serialized call as a proxy.
|
-- Probe destructive on the JSON-serialized call as a proxy.
|
||||||
local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
|
local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
|
||||||
local destr, reason = M.is_destructive(call_repr, cfg)
|
local destr, reason = M.is_destructive(call_repr, cfg, probe_opts)
|
||||||
|
|
||||||
local verdict
|
local verdict
|
||||||
if destr then
|
if destr then
|
||||||
@@ -421,7 +464,7 @@ function M.norris_step(ctx, model_cfg, helpers, opts)
|
|||||||
|
|
||||||
-- (5) dispatch CMD: lines (legacy route)
|
-- (5) dispatch CMD: lines (legacy route)
|
||||||
for _, cmd in ipairs(cmd_lines) do
|
for _, cmd in ipairs(cmd_lines) do
|
||||||
local destr, reason = M.is_destructive(cmd, cfg)
|
local destr, reason = M.is_destructive(cmd, cfg, probe_opts)
|
||||||
local verdict
|
local verdict
|
||||||
if destr then
|
if destr then
|
||||||
verdict = helpers.halt(step_n, max_steps, reason or "destructive",
|
verdict = helpers.halt(step_n, max_steps, reason or "destructive",
|
||||||
|
|||||||
Reference in New Issue
Block a user