-- safety.lua — workflow safeguards for tool execution.
--   Phase 2:  M.confirm_tool_call (per-call confirm gate + auto-approve policy).
--   Phase 3:  M.is_destructive (static pattern + LLM second-opinion gate for
--             Norris autonomous mode) and M.norris_step (single-iteration
--             planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
--   Issue #9: M.classify_command (allow/confirm/deny rule list — interactive
--             CMD: gate, supersedes the confirm_cmd boolean when configured).

local rl     = require("ffi.readline")
local json   = require("dkjson")
local broker = require("broker")

local M = {}

-- ---------------------------------------------------------------- classify_command
-- Walk config.permissions (allow / confirm / deny rule lists) against `cmd`
-- in priority order: deny > confirm > allow. First match in the chosen
-- category wins. Returns the verdict string and the matching pattern (for
-- status messages); falls back to the legacy confirm_cmd boolean when no
-- permissions table is configured. Default verdict when permissions is set
-- but no rule matches is "confirm" — per the issue body.
--   verdict ∈ "allow" | "confirm" | "deny"

-- Return the first rule in `rules` that matches `cmd`, or nil.
-- Rules are Lua patterns (unanchored string.find). Because they come from
-- user config, a malformed pattern (e.g. an unclosed "[") must not crash
-- the gate: the match is run under pcall and, on a pattern error, retried
-- as a literal substring search. A rule that fails both ways is skipped.
local function _match_any(cmd, rules)
  if not rules then return nil end
  for _, p in ipairs(rules) do
    local ok, hit = pcall(string.find, cmd, p)
    if not ok then
      -- Malformed pattern: treat the rule text literally (plain find).
      ok, hit = pcall(string.find, cmd, p, 1, true)
    end
    if ok and hit then return p end
  end
  return nil
end

-- Classify `cmd` against `cfg.permissions`.
--   cmd  shell command text
--   cfg  aish config table (may be nil)
-- Returns (verdict, matched_pattern_or_nil).
function M.classify_command(cmd, cfg)
  local perms = cfg and cfg.permissions
  if perms then
    -- Priority order matters: a deny hit must win over confirm/allow.
    local mp = _match_any(cmd, perms.deny)
    if mp then return "deny", mp end
    mp = _match_any(cmd, perms.confirm)
    if mp then return "confirm", mp end
    mp = _match_any(cmd, perms.allow)
    if mp then return "allow", mp end
    -- Permissions configured but nothing matched → ask.
    return "confirm", nil
  end
  -- Legacy path: no permissions table, honour the confirm_cmd boolean.
  if cfg and cfg.shell and cfg.shell.confirm_cmd then
    return "confirm", nil
  end
  return "allow", nil
end

-- Render the call as `name({"path":"/tmp"})` for the confirm prompt.
-- Truncate to keep one-line prompts.
local function pretty_call(name, args)
  local body = ""
  -- Only encode when args is a non-empty table; an empty call renders as name().
  if args and next(args) then
    local ok, encoded = pcall(json.encode, args)
    if ok then
      -- Cap at 80 chars: 77 of payload plus a "..." marker.
      body = (#encoded <= 80) and encoded or (encoded:sub(1, 77) .. "...")
    else
      body = "..."
    end
  end
  return name .. "(" .. body .. ")"
end

-- Ask the user whether tool `name` may be called with `args`, consulting
-- `cfg.mcp.auto_approve` first. Policy keys:
--   "<alias>__<tool>"  → exact-match auto-approve (the full tool name)
--   "<alias>__*"       → whole-server auto-approve
-- Anything else falls back to a [y/N] prompt; empty / non-"y" answer rejects.
-- The separator switched from "." to "__" 2026-05-12 because Anthropic via
-- Bedrock rejects dots in tool names (regex ^[a-zA-Z0-9_-]{1,128}$).
function M.confirm_tool_call(name, args, cfg)
  local policy = (cfg and cfg.mcp and cfg.mcp.auto_approve) or {}
  if policy[name] then return true end
  -- Alias is everything before the first "__" in the tool name.
  local alias = name:match("^(.-)__")
  if alias and alias ~= "" and policy[alias .. "__*"] then return true end
  local prompt = ("call '%s'? [y/N] "):format(pretty_call(name, args))
  -- readline may return nil (e.g. EOF); treat that as an empty answer → reject.
  local ans = rl.readline(prompt) or ""
  return ans:lower():sub(1, 1) == "y"
end

-- ---------------------------------------------------------------- is_destructive
-- Phase 3 commit #1: static-pattern matcher only (no LLM second-opinion yet —
-- that lands in commit #2). Patterns are Lua patterns (NOT regex). When
-- `ci = true` is set on a rule, the input is lowercased before matching so
-- the rule matches case-insensitively (`DROP TABLE`, `drop table`, etc.).
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the shell-wrapper entries at the top of the table guard against.
local DESTRUCTIVE_PATTERNS = {
  -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
  --    the inner content safely without parsing the inner shell.
  --    Norris HALTs on these unconditionally; the user reads the inner
  --    before proceeding.
  { pat = "^%s*bash%s+%-l?c%s",       reason = "bash -c (wrapped shell)" },
  { pat = "^%s*sh%s+%-l?c%s",         reason = "sh -c (wrapped shell)" },
  { pat = "^%s*zsh%s+%-l?c%s",        reason = "zsh -c (wrapped shell)" },
  { pat = "^%s*eval%s",               reason = "eval (dynamic shell)" },
  { pat = "^%s*python3?%s+%-c%s",     reason = "python -c (inline script)" },
  { pat = "^%s*perl%s+%-e%s",         reason = "perl -e (inline script)" },
  { pat = "|%s*sh%s",                 reason = "pipe-to-sh" },
  { pat = "|%s*sh%s*$",               reason = "pipe-to-sh (eol)" },
  { pat = "|%s*bash%s",               reason = "pipe-to-bash" },
  { pat = "|%s*bash%s*$",             reason = "pipe-to-bash (eol)" },
  { pat = "xargs%s+.-rm",             reason = "xargs ... rm" },
  -- ── Filesystem destructive
  { pat = "rm%s+.-%-rf?",             reason = "rm -rf" },
  { pat = "rm%s+.-%-fr?",             reason = "rm -fr" },
  { pat = "find%s+.-%-delete",        reason = "find -delete" },
  { pat = "find%s+.-%-exec%s+rm",     reason = "find -exec rm" },
  { pat = ">%s*/dev/sd[a-z]",         reason = "write to raw disk" },
  { pat = "dd%s+.-of=/dev/",          reason = "dd to device" },
  { pat = "mkfs%.",                   reason = "mkfs (format)" },
  { pat = "shred%s",                  reason = "shred" },
  { pat = "wipefs%s",                 reason = "wipefs" },
  { pat = "truncate%s+.-%-s%s*0",     reason = "truncate to zero" },
  -- ── Version control destructive
  { pat = "git%s+push%s+.-%-%-force", reason = "git push --force" },
  -- %f[^%w] (frontier into a non-alphanumeric) instead of %f[%s]: the end of
  -- the subject counts as "\0", which is not whitespace, so %f[%s] never
  -- matched a trailing "-f" (e.g. `git push origin main -f`).
  { pat = "git%s+push%s+.-%-f%f[^%w]", reason = "git push -f" },
  { pat = "git%s+reset%s+.-%-%-hard", reason = "git reset --hard" },
  { pat = "git%s+clean%s+.-%-fd?",    reason = "git clean -fd" },
  { pat = "git%s+branch%s+.-%-D",     reason = "git branch -D" },
  -- ── Database / process
  --    ci=true rules use lowercase patterns; the matcher lowercases the
  --    input before testing. Don't use uppercase patterns with ci=true.
  { pat = "drop%s+table",             reason = "DROP TABLE",     ci = true },
  { pat = "drop%s+database",          reason = "DROP DATABASE",  ci = true },
  { pat = "truncate%s+table",         reason = "TRUNCATE TABLE", ci = true },
  -- pkill BEFORE kill so the more specific match wins (the list is walked
  -- in order with ipairs; first hit reports the reason).
  { pat = "pkill%s+%-9",              reason = "pkill -9" },
  -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
  -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
  -- transition from non-word to word characters.
  { pat = "%f[%w]kill%s+%-9",         reason = "kill -9" },
  -- ── Network/permission
  { pat = "chmod%s+.-777",            reason = "chmod 777" },
  { pat = "chown%s+.-%s+/%s*$",       reason = "chown on root path" },
}

-- Match each rule against `cmd`. Returns (true, reason) on first hit;
-- (false, nil) if no rule matches (or `cmd` is not a non-empty string).
-- Static-only — does NOT invoke the LLM probe (that's `is_destructive`
-- below, which calls this first).
local function match_static(cmd)
  if type(cmd) ~= "string" or cmd == "" then return false, nil end
  local lower = nil -- lazily computed, only if a ci-rule is reached
  for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
    local target = cmd
    if rule.ci then
      lower = lower or cmd:lower()
      target = lower
    end
    if target:match(rule.pat) then
      return true, rule.reason
    end
  end
  return false, nil
end

-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text. Mitigates Q23 latency
-- when the same command pattern recurs within a single Norris run.
local _llm_cache = {}

-- Lowercase `cmd`, collapse whitespace runs to one space, and trim both ends.
-- The result is parenthesized so that only the string is returned —
-- string.gsub also returns a replacement count, which would otherwise leak
-- out as a second return value.
local function normalize(cmd)
  local collapsed = cmd:lower():gsub("%s+", " ")
  return (collapsed:gsub("^%s+", ""):gsub("%s+$", ""))
end

-- Per-probe timeout. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Ask `model_cfg` whether `cmd` is destructive. Returns a verdict string
-- (not bool — caller cares about disagreement between probes), plus a
-- second value on the fail-safe paths:
--   "YES" / "NO"               parsed verdict
--   "YES_FAILSAFE",   err      broker.chat returned no reply
--   "YES_UNPARSEABLE", reply   reply contained neither YES nor NO as a word
--
-- Issue #52: when `opts.scrub_msgs` is provided, scrub the outbound
-- {system, user(cmd)} message pair using the caller's secrets-aware
-- scrubber. The probe model sees placeholders for any secrets the
-- CMD: line happens to contain. Verdict text ("YES"/"NO") is unlikely
-- to carry placeholders but we rehydrate defensively via opts.rehydrate
-- so any echoed value comes back clean.
local function llm_probe(model_cfg, system, cmd, opts)
  opts = opts or {}
  local msgs = {
    { role = "system", content = system },
    { role = "user",   content = cmd },
  }
  if opts.scrub_msgs then
    msgs = opts.scrub_msgs(msgs, model_cfg)
  end

  -- Phase 7: category = "probe" tags the usage in the accumulator so
  -- :cost detail surfaces probe spend separately.
  -- broker.chat returns (text, usage) on success; capture as
  -- (reply, second) and branch on reply nil-ness.
  local reply, second = broker.chat(model_cfg, msgs, {
    max_tokens = 4,
    timeout_ms = PROBE_TIMEOUT_MS,
    category   = "probe",
  })
  if not reply then
    -- Broker failure → safe default: treat as YES (destructive).
    return "YES_FAILSAFE", second
  end

  -- Phase 7 (N4): route the usage payload through opts.on_usage if
  -- the caller wired one (repl.lua's _record_usage when secrets/
  -- cost are configured).
  if second and opts.on_usage then
    opts.on_usage(second.model, second.category, second)
  end
  if opts.rehydrate then
    reply = opts.rehydrate(reply)
  end
  -- A non-string reply cannot be parsed; fail safe instead of raising.
  if type(reply) ~= "string" then
    return "YES_UNPARSEABLE", reply
  end

  -- Match YES / NO as whole words (letter-boundary frontiers). A plain
  -- substring test would read "UNKNOWN", "CANNOT" or "I don't know" as NO.
  local upper = reply:upper()
  if upper:find("%f[%a]YES%f[%A]") then return "YES" end
  if upper:find("%f[%a]NO%f[%A]") then return "NO" end
  -- Unparseable response → treat as YES (safe default).
  return "YES_UNPARSEABLE", reply
end

-- LLM second-opinion gate.
-- llm_second_opinion returns:
--   true,  "LLM flagged as destructive"                   first probe said YES
--   true,  "LLM probe failed (fail-safe)"                 first probe got no broker reply
--   true,  "LLM unparseable (fail-safe)"                  first probe reply was neither YES nor NO
--   true,  "LLM probe disagreement"                       first NO, second ("is it safe?") not YES
--   true,  "LLM probe unavailable (no model in config)"   no probe model could be resolved
--   false, nil                                            both probes agree command is safe
-- Every verdict except "unavailable" is cached per normalized command for
-- the session — including the fail-safe ones.
-- NOTE(review): a transient broker failure therefore stays cached as
-- destructive until M._reset_cache() — confirm that is intended.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model          — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep               — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by
--                                      default ("fast" when default_model is unset)

-- System prompt for probe 1: "is this destructive?"
local PROMPT_IS_DESTRUCTIVE =
  "Detect destructive shell commands. Reply YES if running this "
  .. "would delete, overwrite, or irreversibly modify data, "
  .. "processes, or system state. Otherwise reply NO. Respond "
  .. "with only YES or NO."

-- System prompt for probe 2: inverted phrasing per R-B2 ("is it safe?").
local PROMPT_IS_SAFE =
  "Reply YES or NO only. Is the following shell command SAFE to "
  .. "run autonomously without user review?"

-- First-probe verdicts that halt immediately, mapped to the reported reason.
-- Any verdict not listed here (i.e. "NO") moves on to the second probe.
local FIRST_PROBE_HALT_REASONS = {
  YES             = "LLM flagged as destructive",
  YES_FAILSAFE    = "LLM probe failed (fail-safe)",
  YES_UNPARSEABLE = "LLM unparseable (fail-safe)",
}

local function llm_second_opinion(cmd, cfg, opts)
  -- Resolve the probe model following the precedence documented above.
  local models = cfg and cfg.models
  local probe_cfg
  if models then
    local explicit = cfg.safety and cfg.safety.llm_model
    if explicit then
      probe_cfg = models[explicit]
    end
    if not probe_cfg then
      probe_cfg = models.deep or models[cfg.default_model or "fast"]
    end
  end
  if not probe_cfg then
    -- No probe model available → fail-safe YES (not cached).
    return true, "LLM probe unavailable (no model in config)"
  end

  -- Serve from the session cache when this command was already judged.
  local cache_key = normalize(cmd)
  local entry = _llm_cache[cache_key]
  if entry ~= nil then
    if entry[1] then
      return entry[1], entry[2]
    end
    return false, nil
  end

  -- Store the verdict under cache_key and hand it straight back.
  local function remember(flagged, why)
    _llm_cache[cache_key] = { flagged, why }
    return flagged, why
  end

  -- Probe 1: "Is this destructive?"
  local first = llm_probe(probe_cfg, PROMPT_IS_DESTRUCTIVE, cmd, opts)
  local halt_reason = FIRST_PROBE_HALT_REASONS[first]
  if halt_reason then
    return remember(true, halt_reason)
  end

  -- Probe 2 (NO → re-roll with inverted phrasing per R-B2). Only an
  -- explicit YES clears the command; disagreement or fail-safe → HALT.
  local second = llm_probe(probe_cfg, PROMPT_IS_SAFE, cmd, opts)
  if second ~= "YES" then
    return remember(true, "LLM probe disagreement")
  end
  return remember(false, nil)
end

-- Main entry point. Returns (true, reason) if EITHER the static patterns
-- OR the LLM second-opinion flag the command; (false, nil) otherwise,
-- including when `cmd` is not a non-empty string. Used by the Norris loop.
-- `cfg` is the full aish config (carries cfg.safety.llm_second_opinion
-- and cfg.models for the probe model lookup). When cfg is nil, only the
-- static layer runs (handy for unit tests and tooling that wants the
-- fast deterministic gate without an LLM round-trip).
-- Issue #52: opts.scrub_msgs(messages, model_cfg) + opts.rehydrate(text)
-- callbacks let the LLM probe scrub the outbound cmd before sending and
-- rehydrate the YES/NO verdict before parsing. Both optional; absent
-- opts = no-op (backwards-compatible). Caller (repl.lua / norris_step
-- helpers) provides them when secrets are configured.
function M.is_destructive(cmd, cfg, opts)
  if type(cmd) ~= "string" or cmd == "" then
    return false, nil
  end

  -- Layer 1: static patterns (fast, deterministic).
  local static_hit, static_reason = match_static(cmd)
  if static_hit then
    return true, static_reason
  end

  -- Layer 2: LLM second-opinion. Default ON when cfg is present; off when
  -- cfg is nil (test/static-only mode) or when explicitly opted out via
  -- cfg.safety.llm_second_opinion = false.
  local opted_out = cfg ~= nil
    and cfg.safety
    and cfg.safety.llm_second_opinion == false
  if cfg == nil or opted_out then
    return false, nil
  end
  return llm_second_opinion(cmd, cfg, opts)
end

-- Expose the pattern table for `:safety patterns` meta and for testing.
M._patterns     = DESTRUCTIVE_PATTERNS
M._match_static = match_static   -- testable in isolation
function M._reset_cache()
  _llm_cache = {}
end

-- ---------------------------------------------------------------- norris_step
-- One iteration of the Norris planning loop per PHASE3.md §4.
-- The driver in repl.lua calls this in a while loop, advancing on every
-- non-terminal status.
--
-- Inputs:
--   ctx        aish Context (read & written: turns + pending_exec_output)
--   model_cfg  the active broker model config (model_cfg.endpoint/.model/etc.)
--   helpers    table of injected dispatch helpers:
--     .tools_schema()                      → tools array for opts.tools
--     .exec_cmd(cmd)                       → run shell cmd; returns (out, exit_code)
--     .dispatch_tool(call,args)            → run an MCP tool; returns (content, is_error)
--     .extract_cmd_lines(text)             → executor.extract_cmd_lines (passed in)
--     .halt(step_n, max_n, reason, action) → "proceed"|"skip"|"abort"
--     .render_step(n, max_n, descr)        (renderer.norris_step; called here
--                                           with n and max_n only)
--     .render_tool_begin(name, args)       (renderer.tool_call_begin)
--     .render_tool_end(content, is_error)  (renderer.tool_call_end)
--     .render_exec_begin()                 (renderer.exec_begin)
--     .render_exec_end(code)               (renderer.exec_end)
--     .render_assistant_delta(chunk)       (renderer.assistant_delta)
--     .render_assistant_flush()            (renderer.assistant_flush)
--     .log_turn(turn)                      (session log append)
--     Optional: .scrub_msgs, .rehydrate, .streaming_rehydrator, .on_usage
--   opts (optional; nil is treated as {}):
--     .step_n     current step (1-based, default 1)
--     .max_steps  budget cap (default 8)
--     .cfg        full aish config (for is_destructive)
--
-- Returns: { status, reason } where status ∈ {
--   "continue"          — keep looping (driver bumps step_n)
--   "done"              — model emitted GOAL: complete
--   "aborted"           — user typed abort at a halt prompt
--   "stalled"           — model emitted nothing actionable
--   "budget_exhausted"  — step_n >= max_steps after this iteration
--   "broker_error"      — broker.chat_stream returned (nil, err)
-- }
function M.norris_step(ctx, model_cfg, helpers, opts)
  opts = opts or {}   -- every field has a default; tolerate a missing table
  local step_n    = opts.step_n or 1
  local max_steps = opts.max_steps or 8
  local cfg       = opts.cfg

  helpers.render_step(step_n, max_steps)

  -- (1) one broker round-trip — stream text + collect tool_calls.
  --
  -- Issue #52: when helpers.scrub_msgs is provided, scrub outbound
  -- per the active model's redact policy; when helpers.streaming_rehydrator
  -- is provided, wrap on_delta so the user sees rehydrated text AND
  -- text_parts accumulates rehydrated chunks (so any extracted CMD: /
  -- DELEGATE: lines downstream see plain values — matches ask_ai's
  -- contract in repl.lua).
  local msgs = ctx:to_messages()
  if helpers.scrub_msgs then
    msgs = helpers.scrub_msgs(msgs, model_cfg)
  end
  local rehydrator = nil
  if helpers.streaming_rehydrator then
    rehydrator = helpers.streaming_rehydrator()
  end

  -- Phase 7: thread on_usage callback into the LLM probe via
  -- probe_opts so destructive-check costs land in the accumulator
  -- under the "probe" category. helpers.on_usage is repl.lua's
  -- _record_usage (the central chokepoint with warn-threshold check).
  local probe_opts = nil
  if helpers.scrub_msgs or helpers.rehydrate or helpers.on_usage then
    probe_opts = {
      scrub_msgs = helpers.scrub_msgs,
      rehydrate  = helpers.rehydrate,
      on_usage   = helpers.on_usage,
    }
  end

  local text_parts = {}
  local tool_calls_seen = {}
  local ok, err = broker.chat_stream(model_cfg, msgs,
    function(kind, payload)
      if kind == "text" then
        -- Explicit branch rather than `a and b or c`: if push() ever
        -- returned nil/false the and/or form would emit the raw chunk.
        -- NOTE(review): assumes push() returns "" while buffering (the
        -- `emit ~= ""` guard suggests so) — confirm with the rehydrator.
        local emit = payload
        if rehydrator then
          emit = rehydrator:push(payload) or ""
        end
        if emit ~= "" then
          text_parts[#text_parts + 1] = emit
          helpers.render_assistant_delta(emit)
        end
      elseif kind == "tool_call" then
        tool_calls_seen[#tool_calls_seen + 1] = payload
      elseif kind == "usage" then
        -- Phase 7: route Norris's own broker usage to the
        -- accumulator via helpers.on_usage. R5 chokepoint
        -- (_record_usage) is what's wired in.
        if helpers.on_usage then
          helpers.on_usage(payload.model, payload.category, payload)
        end
      end
    end,
    { tools = helpers.tools_schema(), category = "norris" })

  -- Drain whatever the rehydrator is still holding back.
  if rehydrator then
    local tail = rehydrator:flush()
    if tail ~= "" then
      text_parts[#text_parts + 1] = tail
      helpers.render_assistant_delta(tail)
    end
  end
  helpers.render_assistant_flush()

  if not ok then
    return { status = "broker_error", reason = tostring(err) }
  end
  local resp_text = table.concat(text_parts)

  -- (2) parse actions from response
  local cmd_lines = helpers.extract_cmd_lines(resp_text) or {}
  local goal_done = false
  for line in (resp_text .. "\n"):gmatch("([^\n]*)\n") do
    -- Trim surrounding whitespace; the marker must be alone on its line.
    if line:match("^%s*(.-)%s*$") == "GOAL: complete" then
      goal_done = true
      break
    end
  end
  local n_actions = #tool_calls_seen + #cmd_lines

  -- (3) record assistant turn (with optional tool_calls)
  if #tool_calls_seen > 0 then
    ctx:append({ role = "assistant", content = resp_text,
                 tool_calls = tool_calls_seen })
  else
    ctx:append({ role = "assistant", content = resp_text })
  end
  helpers.log_turn(ctx.turns[#ctx.turns])

  if n_actions == 0 and not goal_done then
    return { status = "stalled", reason = "no action emitted" }
  end

  -- (4) dispatch tool_calls first (structured route)
  for _, call in ipairs(tool_calls_seen) do
    local args_table = {}
    local parse_failed, parse_err = false, nil
    if call.arguments and call.arguments ~= "" then
      local decoded, _, derr = json.decode(call.arguments)
      if decoded then
        args_table = decoded
      else
        parse_failed, parse_err = true, derr
      end
    end

    if parse_failed then
      -- Argument JSON parse failure: synthesize tool turn (alternation)
      -- and move on to the next call without dispatching this one.
      ctx:append({ role = "tool", tool_call_id = call.id,
                   content = "[aish] tool arguments not "
                             .. "parseable as JSON: " .. tostring(parse_err) })
      helpers.log_turn(ctx.turns[#ctx.turns])
    else
      -- Probe destructive on the JSON-serialized call as a proxy.
      local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
      local destr, reason = M.is_destructive(call_repr, cfg, probe_opts)

      -- halt_reason is what the user was shown at the halt prompt; it is
      -- reused in the skip turn so the model sees the same reason.
      local verdict, halt_reason
      if destr then
        halt_reason = reason or "destructive"
        verdict = helpers.halt(step_n, max_steps, halt_reason, call_repr)
      else
        -- Non-destructive tool_call: auto_approve OR halt for consent
        local policy = cfg and cfg.mcp and cfg.mcp.auto_approve or {}
        local alias = (call.name or ""):match("^(.-)__")
        local auto = policy[call.name]
          or (alias and alias ~= "" and policy[alias .. "__*"])
        if auto then
          verdict = "proceed"
        else
          halt_reason = "tool consent"
          verdict = helpers.halt(step_n, max_steps, halt_reason, call_repr)
        end
      end

      if verdict == "abort" then
        return { status = "aborted", reason = "user abort at halt" }
      elseif verdict == "skip" then
        ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
        ctx:append({ role = "tool", tool_call_id = call.id,
                     content = "[aish] tool call skipped by user: "
                               .. (halt_reason or "no reason") })
        helpers.log_turn(ctx.turns[#ctx.turns])
      else -- proceed
        ctx.norris_consecutive_skips = 0
        helpers.render_tool_begin(call.name, call.arguments)
        local content, is_error = helpers.dispatch_tool(call.name, args_table)
        helpers.render_tool_end(content, is_error)
        ctx:append({ role = "tool", tool_call_id = call.id,
                     content = content or "" })
        helpers.log_turn(ctx.turns[#ctx.turns])
      end
    end
  end

  -- (5) dispatch CMD: lines (legacy route)
  for _, cmd in ipairs(cmd_lines) do
    local destr, reason = M.is_destructive(cmd, cfg, probe_opts)
    -- Non-destructive CMD: runs without consent in Norris
    -- (Norris user accepted autonomy).
    local verdict = "proceed"
    if destr then
      verdict = helpers.halt(step_n, max_steps, reason or "destructive", cmd)
    end

    if verdict == "abort" then
      return { status = "aborted", reason = "user abort at halt" }
    elseif verdict == "skip" then
      ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
      -- CMD: skip → synthesize exec-output line so the model sees it
      ctx:append_exec_output("[aish] CMD skipped by user: "
                             .. (reason or "no reason"))
    else -- proceed
      ctx.norris_consecutive_skips = 0
      helpers.render_exec_begin()
      local out, code = helpers.exec_cmd(cmd)
      helpers.render_exec_end(code)
      if cfg and cfg.shell and cfg.shell.capture_output then
        ctx:append_exec_output(out)
      end
    end
  end

  -- Skip-budget escalation: R-C1
  if (ctx.norris_consecutive_skips or 0) >= 3 then
    local verdict = helpers.halt(step_n, max_steps,
      ("%d consecutive user skips"):format(ctx.norris_consecutive_skips),
      "(repeated similar destructive proposals)")
    if verdict == "abort" then
      return { status = "aborted", reason = "user abort on skip-escalation" }
    end
    -- Any other answer: reset the counter and continue.
    ctx.norris_consecutive_skips = 0
  end

  -- (6) goal_done after dispatch
  if goal_done then
    return { status = "done", reason = "GOAL: complete" }
  end

  -- (7) budget
  if step_n >= max_steps then
    return { status = "budget_exhausted",
             reason = ("%d step limit reached"):format(max_steps) }
  end

  return { status = "continue" }
end

return M