-- safety.lua — workflow safeguards for tool execution.
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
--          Norris autonomous mode) and M.norris_step (single-iteration
--          planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.

local rl = require("ffi.readline")
local json = require("dkjson")

local M = {}

-- Longest JSON argument rendering shown in the confirm prompt; anything
-- longer is cut and suffixed with "..." so the prompt stays on one line.
local MAX_ARGS_LEN = 80

-- Render the call as `name({"path":"/tmp"})` for the confirm prompt.
-- Truncate to keep one-line prompts.
--   name: tool name (string)
--   args: tool arguments; nil or an empty table renders as `name()`
-- Returns the one-line string shown to the user.
local function pretty_call(name, args)
  local body = ""
  local is_table = type(args) == "table"
  -- Guard the next() call: it raises on non-table values. A non-table,
  -- non-nil value is still rendered so the user sees what is being approved.
  if (is_table and next(args) ~= nil) or (not is_table and args ~= nil) then
    local ok, encoded = pcall(json.encode, args)
    if ok and type(encoded) == "string" then
      if #encoded > MAX_ARGS_LEN then
        encoded = encoded:sub(1, MAX_ARGS_LEN - 3) .. "..."
      end
      body = encoded
    else
      -- Encoding failed (or produced no string): show a placeholder rather
      -- than aborting the confirm prompt.
      body = "..."
    end
  end
  return name .. "(" .. body .. ")"
end

-- Ask the user whether tool `name` may be called with `args`, consulting
-- `cfg.mcp.auto_approve` first. Policy keys:
--   "ALIAS__TOOL"  → exact-match auto-approve (the full tool name)
--   "ALIAS__*"     → whole-server auto-approve (ALIAS is everything before
--                    the first "__" in the tool name)
-- Anything else falls back to a [y/N] prompt; empty / non-"y" answer rejects.
-- The separator switched from "." to "__" 2026-05-12 because Anthropic via
-- Bedrock rejects dots in tool names (regex ^[a-zA-Z0-9_-]{1,128}$).
-- Returns true when the call is approved, false otherwise.
function M.confirm_tool_call(name, args, cfg)
  local policy = (cfg and cfg.mcp and cfg.mcp.auto_approve) or {}
  if policy[name] then
    return true
  end

  local alias = name:match("^(.-)__")
  if alias and alias ~= "" and policy[alias .. "__*"] then
    return true
  end

  local prompt = ("call '%s'? [y/N] "):format(pretty_call(name, args))
  -- A nil result from readline is treated as an empty answer (reject).
  local ans = rl.readline(prompt) or ""
  return ans:lower():sub(1, 1) == "y"
end

-- ---------------------------------------------------------------- is_destructive
-- Phase 3 commit #1: static-pattern matcher only (no LLM second-opinion yet —
-- that lands in commit #2). Patterns are Lua patterns (NOT regex). When
-- `ci = true` is set on a rule, the input is lowercased before matching so
-- the rule matches case-insensitively (`DROP TABLE`, `drop table`, etc.).
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) the shell-wrapper entries at the top of the table guard against.
local DESTRUCTIVE_PATTERNS = {
  -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
  --    the inner content safely without parsing the inner shell.
  --    Norris HALTs on these unconditionally; the user reads the inner
  --    before proceeding.
  { pat = "^%s*bash%s+%-l?c%s", reason = "bash -c (wrapped shell)" },
  { pat = "^%s*sh%s+%-l?c%s", reason = "sh -c (wrapped shell)" },
  { pat = "^%s*zsh%s+%-l?c%s", reason = "zsh -c (wrapped shell)" },
  { pat = "^%s*eval%s", reason = "eval (dynamic shell)" },
  { pat = "^%s*python3?%s+%-c%s", reason = "python -c (inline script)" },
  { pat = "^%s*perl%s+%-e%s", reason = "perl -e (inline script)" },
  { pat = "|%s*sh%s", reason = "pipe-to-sh" },
  { pat = "|%s*sh%s*$", reason = "pipe-to-sh (eol)" },
  { pat = "|%s*bash%s", reason = "pipe-to-bash" },
  { pat = "|%s*bash%s*$", reason = "pipe-to-bash (eol)" },
  { pat = "xargs%s+.-rm", reason = "xargs ... rm" },

  -- ── Filesystem destructive
  -- [rR]: rm accepts -R as a synonym for -r, so `rm -Rf` must be caught too.
  { pat = "rm%s+.-%-[rR]f?", reason = "rm -rf" },
  { pat = "rm%s+.-%-fr?", reason = "rm -fr" },
  { pat = "find%s+.-%-delete", reason = "find -delete" },
  { pat = "find%s+.-%-exec%s+rm", reason = "find -exec rm" },
  { pat = ">%s*/dev/sd[a-z]", reason = "write to raw disk" },
  { pat = "dd%s+.-of=/dev/", reason = "dd to device" },
  { pat = "mkfs%.", reason = "mkfs (format)" },
  { pat = "shred%s", reason = "shred" },
  { pat = "wipefs%s", reason = "wipefs" },
  { pat = "truncate%s+.-%-s%s*0", reason = "truncate to zero" },

  -- ── Version control destructive
  { pat = "git%s+push%s+.-%-%-force", reason = "git push --force" },
  -- %f[^%w] = "-f" must not be followed by a word character. Unlike
  -- %f[%s], this also matches when "-f" is the last thing in the string
  -- (the frontier sees end-of-subject as "\0", which is not whitespace)
  -- or is followed by a closing quote in JSON-serialized arguments.
  { pat = "git%s+push%s+.-%-f%f[^%w]", reason = "git push -f" },
  { pat = "git%s+reset%s+.-%-%-hard", reason = "git reset --hard" },
  { pat = "git%s+clean%s+.-%-fd?", reason = "git clean -fd" },
  { pat = "git%s+branch%s+.-%-D", reason = "git branch -D" },

  -- ── Database / process
  -- ci=true rules use lowercase patterns; the matcher lowercases the
  -- input before testing. Don't use uppercase patterns with ci=true.
  { pat = "drop%s+table", reason = "DROP TABLE", ci = true },
  { pat = "drop%s+database", reason = "DROP DATABASE", ci = true },
  { pat = "truncate%s+table", reason = "TRUNCATE TABLE", ci = true },
  -- pkill BEFORE kill so the more specific match wins (rules are tried in
  -- array order via ipairs; first hit reports the reason).
  { pat = "pkill%s+%-9", reason = "pkill -9" },
  -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
  -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
  -- transition from non-word to word characters.
  { pat = "%f[%w]kill%s+%-9", reason = "kill -9" },

  -- ── Network/permission
  { pat = "chmod%s+.-777", reason = "chmod 777" },
  { pat = "chown%s+.-%s+/%s*$", reason = "chown on root path" },
}

-- Match each rule against `cmd`. Returns (true, reason) on first hit;
-- (false, nil) if no rule matches. Used by the Norris loop to gate
-- shell commands; ALSO called on tool-call args by Norris's tool path
-- (the JSON-serialized arguments are passed in as cmd).
-- Non-string or empty input is never considered destructive.
function M.is_destructive(cmd)
  if type(cmd) ~= "string" or cmd == "" then
    return false, nil
  end

  local lower = nil -- lazily computed, only if a ci-rule is reached
  for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
    local target = cmd
    if rule.ci then
      lower = lower or cmd:lower()
      target = lower
    end
    if target:find(rule.pat) then
      return true, rule.reason
    end
  end
  return false, nil
end

-- Expose the pattern table for `:safety patterns` meta and for testing.
M._patterns = DESTRUCTIVE_PATTERNS

-- ---------------------------------------------------------------- norris_step
-- Phase 3 commit #4 lands the planner. Stub stays for now: calling it
-- always raises. The parameters are part of the planned signature and are
-- not read yet.
function M.norris_step(plan, broker, executor)
  error("safety.norris_step: not implemented yet (lands in Phase 3 commit #4)")
end

return M