Files
aish/safety.lua
T
marfrit 11b1f566b3 safety: norris_step planner (Phase 3 commit #4)
Phase 3 commit #4 per docs/PHASE3.md §12. Single-iteration planner.
The driver loop in repl.lua (commit #5) calls this in a while loop,
advancing step_n on every "continue" return.

M.norris_step(ctx, model_cfg, helpers, opts):
  1. One broker.chat_stream round-trip — text + tool_calls collected,
     text streamed via helpers.render_assistant_delta.
  2. Parse actions from response: tool_calls (already collected),
     CMD: lines (via helpers.extract_cmd_lines), GOAL: complete
     sentinel (line-level exact match per R-C5).
  3. Record the assistant turn (with tool_calls if any) and log it.
     If no actions AND no goal_done → status="stalled".
  4. Dispatch tool_calls (structured route first):
       - is_destructive check on serialized call.
       - If destructive → halt_fn(proceed/skip/abort).
       - Else → auto_approve lookup; absent → halt for consent (R-C6:
         Norris is conservative; auto_approve is the only consent
         bypass).
       - On skip: synthesize role:tool turn "[aish] tool call
         skipped by user" — alternation preserved per C5/C7.
       - On abort: return status="aborted".
       - On proceed: dispatch via helpers.dispatch_tool, append
         role:tool turn with result content.
       - Argument JSON parse failure also synthesizes a tool turn
         (same alternation rationale).
  5. Dispatch CMD: lines (legacy route):
       - is_destructive check.
       - Destructive → halt_fn.
       - Non-destructive → run directly (Norris user accepted
         autonomy for non-destructive shell).
       - skip → ctx:append_exec_output "[aish] CMD skipped by user".
       - proceed → exec via helpers.exec_cmd, frame via
         render_exec_begin/end.
  6. Skip-budget escalation (R-C1): after dispatch, if
     ctx.norris_consecutive_skips >= 3 → escalation halt; abort exits,
     proceed resets counter.
  7. Goal-done check AFTER all dispatch (R-C2 / Q25 resolution).
  8. Budget check: step_n >= max_steps → status="budget_exhausted".
  9. Otherwise → status="continue", driver advances.

Helpers are passed in as injected functions rather than directly
requiring repl/renderer/executor — keeps safety.lua's coupling clean
and norris_step testable with a mocked helpers table.

State carried across iterations on the ctx:
  - ctx.norris_consecutive_skips (resets on any successful proceed)
  - ctx.norris_goal / ctx.norris_active (set/cleared by the driver)

Existing test_safety.lua corpus (87 cases) still passes — norris_step
addition doesn't touch is_destructive's behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 23:37:53 +00:00

448 lines
20 KiB
Lua

-- safety.lua — workflow safeguards for tool execution.
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
-- Norris autonomous mode) and M.norris_step (single-iteration
-- planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
local rl = require("ffi.readline")
local json = require("dkjson")
local broker = require("broker")
local M = {}
-- Format a tool invocation as `name({"path":"/tmp"})` for the one-line
-- confirm prompt.
--   * nil or empty `args`   → `name()`
--   * encoded body > 80 B   → first 77 bytes followed by "..."
--   * json.encode raises    → body is just "..."
local function pretty_call(name, args)
  if not (args and next(args)) then
    return name .. "()"
  end
  local encoded_ok, text = pcall(json.encode, args)
  local shown
  if not encoded_ok then
    shown = "..."
  elseif #text > 80 then
    shown = text:sub(1, 77) .. "..."
  else
    shown = text
  end
  return name .. "(" .. shown .. ")"
end
-- Decide whether tool `name` may run with `args`.
-- `cfg.mcp.auto_approve` is consulted first; a truthy entry under either
-- of these keys approves without prompting:
--   "<alias>__<tool>"  exact tool
--   "<alias>__*"       every tool of that server alias
-- Otherwise the user gets a [y/N] prompt; only an answer starting with
-- "y"/"Y" approves (empty input or EOF rejects).
-- The separator is "__" rather than "." (changed 2026-05-12) because
-- Anthropic via Bedrock rejects dots in tool names
-- (regex ^[a-zA-Z0-9_-]{1,128}$).
function M.confirm_tool_call(name, args, cfg)
  local policy = {}
  if cfg and cfg.mcp and cfg.mcp.auto_approve then
    policy = cfg.mcp.auto_approve
  end

  if policy[name] then
    return true
  end

  -- Everything before the first "__" is the server alias.
  local server = name:match("^(.-)__")
  if server and server ~= "" then
    if policy[server .. "__*"] then
      return true
    end
  end

  local answer = rl.readline(("call '%s'? [y/N] "):format(pretty_call(name, args)))
  if not answer then
    answer = ""
  end
  return answer:sub(1, 1):lower() == "y"
end
-- ---------------------------------------------------------------- is_destructive
-- Static destructive-command rules. Each entry is
--   { pat = <Lua pattern>, reason = <label reported on hit>, ci = <bool?> }
-- Patterns are Lua patterns (NOT regex). When `ci = true` is set on a
-- rule, the matcher lowercases the input before testing, so the rule
-- matches case-insensitively (`DROP TABLE`, `drop table`, etc.) — such
-- rules must therefore be written in lowercase.
-- Rules are tested in order and the first hit supplies the reason.
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the leading shell-wrapper entries guard against.
local DESTRUCTIVE_PATTERNS = {
  -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
  -- the inner content safely without parsing the inner shell.
  -- Norris HALTs on these unconditionally; the user reads the inner
  -- before proceeding.
  { pat = "^%s*bash%s+%-l?c%s", reason = "bash -c (wrapped shell)" },
  { pat = "^%s*sh%s+%-l?c%s", reason = "sh -c (wrapped shell)" },
  { pat = "^%s*zsh%s+%-l?c%s", reason = "zsh -c (wrapped shell)" },
  { pat = "^%s*eval%s", reason = "eval (dynamic shell)" },
  { pat = "^%s*python3?%s+%-c%s", reason = "python -c (inline script)" },
  { pat = "^%s*perl%s+%-e%s", reason = "perl -e (inline script)" },
  { pat = "|%s*sh%s", reason = "pipe-to-sh" },
  { pat = "|%s*sh%s*$", reason = "pipe-to-sh (eol)" },
  { pat = "|%s*bash%s", reason = "pipe-to-bash" },
  { pat = "|%s*bash%s*$", reason = "pipe-to-bash (eol)" },
  { pat = "xargs%s+.-rm", reason = "xargs ... rm" },
  -- ── Filesystem destructive
  { pat = "rm%s+.-%-rf?", reason = "rm -rf" },
  { pat = "rm%s+.-%-fr?", reason = "rm -fr" },
  { pat = "find%s+.-%-delete", reason = "find -delete" },
  { pat = "find%s+.-%-exec%s+rm", reason = "find -exec rm" },
  { pat = ">%s*/dev/sd[a-z]", reason = "write to raw disk" },
  { pat = "dd%s+.-of=/dev/", reason = "dd to device" },
  { pat = "mkfs%.", reason = "mkfs (format)" },
  { pat = "shred%s", reason = "shred" },
  { pat = "wipefs%s", reason = "wipefs" },
  { pat = "truncate%s+.-%-s%s*0", reason = "truncate to zero" },
  -- ── Version control destructive
  { pat = "git%s+push%s+.-%-%-force", reason = "git push --force" },
  -- `-f` must end the flag: the frontier requires the next character to
  -- be neither a word character, "_" nor "-". Unlike %f[%s] it also
  -- succeeds at end of input (the end is treated as "\0"), so a trailing
  -- `git push ... -f` is caught, while `--follow-tags` or a ref such as
  -- `my-f-branch` is not.
  { pat = "git%s+push%s+.-%-f%f[^%w_%-]", reason = "git push -f" },
  { pat = "git%s+reset%s+.-%-%-hard", reason = "git reset --hard" },
  { pat = "git%s+clean%s+.-%-fd?", reason = "git clean -fd" },
  { pat = "git%s+branch%s+.-%-D", reason = "git branch -D" },
  -- ── Database / process
  -- ci=true rules use lowercase patterns; the matcher lowercases the
  -- input before testing. Don't use uppercase patterns with ci=true.
  { pat = "drop%s+table", reason = "DROP TABLE", ci = true },
  { pat = "drop%s+database", reason = "DROP DATABASE", ci = true },
  { pat = "truncate%s+table", reason = "TRUNCATE TABLE", ci = true },
  -- pkill BEFORE kill so the more specific match wins (rules are walked
  -- in array order; the first hit reports the reason).
  { pat = "pkill%s+%-9", reason = "pkill -9" },
  -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
  -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
  -- transition from non-word to word characters.
  { pat = "%f[%w]kill%s+%-9", reason = "kill -9" },
  -- ── Permissions / ownership
  { pat = "chmod%s+.-777", reason = "chmod 777" },
  { pat = "chown%s+.-%s+/%s*$", reason = "chown on root path" },
}
-- Run `cmd` through DESTRUCTIVE_PATTERNS in order.
-- Returns (true, reason) for the first rule that matches and (false, nil)
-- when none does, or when `cmd` is not a non-empty string.
-- This is the static layer only — no LLM probe is made here.
local function match_static(cmd)
  if type(cmd) ~= "string" or cmd == "" then
    return false, nil
  end

  local folded -- lowercase copy of cmd, built the first time a ci rule needs it
  for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
    local subject
    if rule.ci then
      if not folded then
        folded = cmd:lower()
      end
      subject = folded
    else
      subject = cmd
    end
    if subject:find(rule.pat) then
      return true, rule.reason
    end
  end

  return false, nil
end
-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text. Mitigates Q23 latency
-- when the same command pattern recurs within a single Norris run.
local _llm_cache = {}

-- Build the cache key for `cmd`: lowercase it, collapse every whitespace
-- run to a single space, and strip leading/trailing whitespace.
-- Returns exactly one value. string.gsub also returns a substitution
-- count; the intermediate locals drop it so it cannot leak into a
-- caller's argument list or table constructor.
local function normalize(cmd)
  local collapsed = cmd:lower():gsub("%s+", " ")
  local trimmed = collapsed:gsub("^%s+", ""):gsub("%s+$", "")
  return trimmed
end
-- Per-probe timeout. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Send one YES/NO question about `cmd` to `model_cfg`, with `system` as
-- the system prompt. Returns a verdict string (not a bool — the caller
-- compares the answers of two differently-phrased probes):
--   "YES"              reply contains the word YES
--   "NO"               reply contains the word NO (and not YES)
--   "YES_FAILSAFE"     broker.chat returned no reply; 2nd value is its error
--   "YES_UNPARSEABLE"  neither word found; 2nd value is the raw reply
-- Matching is case-insensitive and whole-word: frontier patterns keep
-- replies such as "UNKNOWN", "CANNOT" or "NOT SURE" from being read as
-- "NO", so they fall through to the fail-safe instead.
local function llm_probe(model_cfg, system, cmd)
  local reply, err = broker.chat(model_cfg,
    { { role = "system", content = system },
      { role = "user", content = cmd } },
    { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
  if not reply then
    -- Broker failure → safe default: treat as YES (destructive)
    return "YES_FAILSAFE", err
  end
  local upper = reply:upper()
  -- %f[%a] / %f[%A] anchor the word on both sides; start and end of the
  -- reply count as non-letters.
  if upper:find("%f[%a]YES%f[%A]") then return "YES" end
  if upper:find("%f[%a]NO%f[%A]") then return "NO" end
  -- Unparseable response → treat as YES (safe default)
  return "YES_UNPARSEABLE", reply
end
-- LLM second-opinion gate. Returns (is_destructive, reason):
--   true,  "LLM probe unavailable (no model in config)"  no probe model found
--   true,  "LLM flagged as destructive"    probe 1 answered YES
--   true,  "LLM probe failed (fail-safe)"  probe 1 got no reply from the broker
--   true,  "LLM unparseable (fail-safe)"   probe 1 reply was neither YES nor NO
--   true,  "LLM probe disagreement"        probe 1 NO, but probe 2 ("is it
--                                          safe?") did not answer YES
--   false, nil                             probe 1 NO and probe 2 YES
-- Every verdict that involved a probe is cached per normalized command
-- for the session, fail-safe verdicts included.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — whatever the user runs by default
--      (the name "fast" is used when default_model is unset)
local function llm_second_opinion(cmd, cfg)
  local models = cfg and cfg.models
  local probe_cfg
  if models and cfg.safety and cfg.safety.llm_model then
    probe_cfg = models[cfg.safety.llm_model]
  end
  if not probe_cfg and models then
    probe_cfg = models.deep or models[cfg.default_model or "fast"]
  end
  if not probe_cfg then
    -- No probe model available → fail-safe YES
    return true, "LLM probe unavailable (no model in config)"
  end

  local key = normalize(cmd)
  local entry = _llm_cache[key]
  if entry ~= nil then
    if entry[1] then
      return entry[1], entry[2]
    end
    return false, nil
  end

  -- Store the verdict under `key` and hand it straight back.
  local function remember(verdict, why)
    _llm_cache[key] = { verdict, why }
    return verdict, why
  end

  -- Probe 1: "Is this destructive?" — any YES-flavoured answer halts.
  local first = llm_probe(probe_cfg,
    "Detect destructive shell commands. Reply YES if running this "
    .. "would delete, overwrite, or irreversibly modify data, "
    .. "processes, or system state. Otherwise reply NO. Respond "
    .. "with only YES or NO.", cmd)
  local halt_reasons = {
    YES = "LLM flagged as destructive",
    YES_FAILSAFE = "LLM probe failed (fail-safe)",
    YES_UNPARSEABLE = "LLM unparseable (fail-safe)",
  }
  local why = halt_reasons[first]
  if why then
    return remember(true, why)
  end

  -- Probe 2 (NO → re-roll with inverted phrasing per R-B2): only an
  -- explicit YES ("safe") clears the command.
  local second = llm_probe(probe_cfg,
    "Reply YES or NO only. Is the following shell command SAFE to "
    .. "run autonomously without user review?", cmd)
  if second == "YES" then
    return remember(false, nil)
  end
  -- Disagreement or fail-safe → HALT
  return remember(true, "LLM probe disagreement")
end
-- Main entry point, used by the Norris loop. Returns (true, reason) when
-- EITHER the static patterns OR the LLM second-opinion flags `cmd`, and
-- (false, nil) otherwise — including when `cmd` is not a non-empty string.
-- `cfg` is the full aish config (carries cfg.safety.llm_second_opinion
-- and cfg.models for the probe model lookup). The LLM layer runs by
-- default whenever cfg is given and is skipped when:
--   * cfg is nil (static-only mode: unit tests, tooling that wants the
--     fast deterministic gate without an LLM round-trip), or
--   * cfg.safety.llm_second_opinion is explicitly false.
function M.is_destructive(cmd, cfg)
  if type(cmd) ~= "string" or cmd == "" then
    return false, nil
  end

  -- Static patterns first (fast, deterministic).
  local static_hit, static_reason = match_static(cmd)
  if static_hit then
    return true, static_reason
  end

  local llm_enabled = cfg ~= nil
    and not (cfg.safety and cfg.safety.llm_second_opinion == false)
  if not llm_enabled then
    return false, nil
  end
  return llm_second_opinion(cmd, cfg)
end
-- Underscore-prefixed exports for the `:safety patterns` meta command and
-- for tests: the rule table itself, the static matcher (testable in
-- isolation), and a hook that swaps in a fresh, empty probe cache.
M._patterns = DESTRUCTIVE_PATTERNS
M._match_static = match_static
function M._reset_cache()
  _llm_cache = {}
end
-- ---------------------------------------------------------------- norris_step
-- One iteration of the Norris planning loop per PHASE3.md §4.
-- The driver in repl.lua calls this in a while loop, advancing on every
-- non-terminal status.
--
-- Inputs:
--   ctx        aish Context (read & written: turns + pending_exec_output,
--              plus the ctx.norris_consecutive_skips counter)
--   model_cfg  the active broker model config (model_cfg.endpoint/.model/etc.)
--   helpers    table of injected dispatch helpers:
--     .tools_schema()            → tools array for opts.tools
--     .exec_cmd(cmd)             → run shell cmd; returns (out, exit_code)
--     .dispatch_tool(name, args) → run an MCP tool with the decoded
--                                  argument table; returns (content, is_error)
--     .extract_cmd_lines(text)   → executor.extract_cmd_lines (passed in)
--     .halt(step_n, max_n, reason, action) → "proceed"|"skip"|"abort"
--     .render_step(n, max_n, descr)        (renderer.norris_step)
--     .render_tool_begin(name, args)       (renderer.tool_call_begin; `args`
--                                           is the raw call.arguments value)
--     .render_tool_end(content, is_error)  (renderer.tool_call_end)
--     .render_exec_begin()                 (renderer.exec_begin)
--     .render_exec_end(code)               (renderer.exec_end)
--     .render_assistant_delta(chunk)       (renderer.assistant_delta)
--     .render_assistant_flush()            (renderer.assistant_flush)
--     .log_turn(turn)                      (session log append)
--   opts (optional; nil is treated as {}):
--     .step_n     current step (1-based, default 1)
--     .max_steps  budget cap (default 8)
--     .cfg        full aish config (for is_destructive / auto_approve)
--
-- Returns: { status, reason } where status ∈ {
--   "continue"          — keep looping (driver bumps step_n); no reason set
--   "done"              — model emitted GOAL: complete
--   "aborted"           — user typed abort at a halt prompt
--   "stalled"           — model emitted nothing actionable
--   "budget_exhausted"  — step_n >= max_steps after this iteration
--   "broker_error"      — broker.chat_stream returned (nil, err)
-- }
function M.norris_step(ctx, model_cfg, helpers, opts)
  opts = opts or {}
  local step_n = opts.step_n or 1
  local max_steps = opts.max_steps or 8
  local cfg = opts.cfg
  helpers.render_step(step_n, max_steps)

  -- (1) one broker round-trip — stream text + collect tool_calls
  local text_parts = {}
  local tool_calls_seen = {}
  local ok, err = broker.chat_stream(model_cfg, ctx:to_messages(),
    function(kind, payload)
      if kind == "text" then
        text_parts[#text_parts + 1] = payload
        helpers.render_assistant_delta(payload)
      elseif kind == "tool_call" then
        tool_calls_seen[#tool_calls_seen + 1] = payload
      end
    end,
    { tools = helpers.tools_schema() })
  helpers.render_assistant_flush()
  if not ok then
    return { status = "broker_error", reason = tostring(err) }
  end
  local resp_text = table.concat(text_parts)

  -- (2) parse actions from response
  local cmd_lines = helpers.extract_cmd_lines(resp_text) or {}
  local goal_done = false
  -- The sentinel must be a whole line (after trimming), not a substring.
  for line in (resp_text .. "\n"):gmatch("([^\n]*)\n") do
    local trimmed = line:gsub("^%s+", ""):gsub("%s+$", "")
    if trimmed == "GOAL: complete" then
      goal_done = true; break
    end
  end
  local n_actions = #tool_calls_seen + #cmd_lines

  -- (3) record assistant turn (with optional tool_calls)
  if #tool_calls_seen > 0 then
    ctx:append({ role = "assistant", content = resp_text,
                 tool_calls = tool_calls_seen })
  else
    ctx:append({ role = "assistant", content = resp_text })
  end
  helpers.log_turn(ctx.turns[#ctx.turns])
  if n_actions == 0 and not goal_done then
    return { status = "stalled", reason = "no action emitted" }
  end

  -- (4) dispatch tool_calls first (structured route)
  for _, call in ipairs(tool_calls_seen) do
    local args_table = {}
    if call.arguments and call.arguments ~= "" then
      local decoded, _, derr = json.decode(call.arguments)
      if derr then
        -- Argument JSON parse failure: synthesize tool turn (alternation)
        ctx:append({ role = "tool", tool_call_id = call.id,
                     content = "[aish] tool arguments not "
                       .. "parseable as JSON: " .. tostring(derr) })
        helpers.log_turn(ctx.turns[#ctx.turns])
        goto continue_tool
      elseif type(decoded) == "table" then
        args_table = decoded
      end
      -- Valid JSON that is not an object/array (e.g. `null`, which
      -- decodes to nil without an error) is dispatched with empty args
      -- rather than being reported as a parse failure.
    end
    -- Probe destructive on the JSON-serialized call as a proxy.
    local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
    local destr, reason = M.is_destructive(call_repr, cfg)
    local verdict
    if destr then
      verdict = helpers.halt(step_n, max_steps, reason or "destructive",
                             call_repr)
    else
      -- Non-destructive tool_call: auto_approve OR halt for consent
      local policy = cfg and cfg.mcp and cfg.mcp.auto_approve or {}
      local alias = (call.name or ""):match("^(.-)__")
      local auto = policy[call.name]
        or (alias and alias ~= "" and policy[alias .. "__*"])
      if auto then
        verdict = "proceed"
      else
        verdict = helpers.halt(step_n, max_steps, "tool consent",
                               call_repr)
      end
    end
    if verdict == "abort" then
      -- NOTE(review): returning here leaves this call, and any later
      -- tool_calls of the same assistant turn, without a role:tool
      -- reply — confirm the driver copes before the context is reused.
      return { status = "aborted", reason = "user abort at halt" }
    elseif verdict == "skip" then
      ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
      -- `reason` is nil for a plain consent halt → "no reason".
      ctx:append({ role = "tool", tool_call_id = call.id,
                   content = "[aish] tool call skipped by user: "
                     .. (reason or "no reason") })
      helpers.log_turn(ctx.turns[#ctx.turns])
    else -- proceed
      ctx.norris_consecutive_skips = 0
      helpers.render_tool_begin(call.name, call.arguments)
      local content, is_error = helpers.dispatch_tool(call.name, args_table)
      helpers.render_tool_end(content, is_error)
      ctx:append({ role = "tool", tool_call_id = call.id,
                   content = content or "" })
      helpers.log_turn(ctx.turns[#ctx.turns])
    end
    ::continue_tool::
  end

  -- (5) dispatch CMD: lines (legacy route)
  for _, cmd in ipairs(cmd_lines) do
    local destr, reason = M.is_destructive(cmd, cfg)
    local verdict
    if destr then
      verdict = helpers.halt(step_n, max_steps, reason or "destructive",
                             cmd)
    else
      verdict = "proceed" -- non-destructive CMD: runs without consent
                          -- in Norris (Norris user accepted autonomy)
    end
    if verdict == "abort" then
      return { status = "aborted", reason = "user abort at halt" }
    elseif verdict == "skip" then
      ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
      -- CMD: skip → synthesize exec-output line so the model sees it
      ctx:append_exec_output("[aish] CMD skipped by user: "
                             .. (reason or "no reason"))
    else -- proceed
      ctx.norris_consecutive_skips = 0
      helpers.render_exec_begin()
      local out, code = helpers.exec_cmd(cmd)
      helpers.render_exec_end(code)
      -- Output is fed back to the model only when capture is enabled.
      if cfg and cfg.shell and cfg.shell.capture_output then
        ctx:append_exec_output(out)
      end
    end
  end

  -- (6) skip-budget escalation: R-C1
  if (ctx.norris_consecutive_skips or 0) >= 3 then
    local verdict = helpers.halt(step_n, max_steps,
      ("%d consecutive user skips"):format(ctx.norris_consecutive_skips),
      "(repeated similar destructive proposals)")
    if verdict == "abort" then
      return { status = "aborted", reason = "user abort on skip-escalation" }
    end
    -- Any other verdict: reset the counter and continue
    ctx.norris_consecutive_skips = 0
  end

  -- (7) goal_done is honoured only after all dispatch has run
  if goal_done then
    return { status = "done", reason = "GOAL: complete" }
  end

  -- (8) budget
  if step_n >= max_steps then
    return { status = "budget_exhausted",
             reason = ("%d step limit reached"):format(max_steps) }
  end

  -- (9) nothing terminal happened: the driver advances step_n
  return { status = "continue" }
end
return M