-- safety.lua — workflow safeguards for tool execution.
--   Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
--   Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
--            Norris autonomous mode) and M.norris_step (single-iteration
--            planning loop).
-- See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.

local rl = require("ffi.readline")
local json = require("dkjson")
local broker = require("broker")

local M = {}

-- Render the call as `name({"path":"/tmp"})` for the confirm prompt.
-- The encoded arguments are truncated to 80 bytes (77 + "...") to keep the
-- prompt on one line.
--
--   name  string  tool name, printed verbatim
--   args  any     tool arguments; nil, false and empty tables render as `name()`
--
-- Never raises on odd `args`: non-table values are encoded as-is instead of
-- being handed to next() (which only accepts tables), and any encoder failure
-- degrades to "...".
local function pretty_call(name, args)
    local has_payload
    if type(args) == "table" then
        has_payload = next(args) ~= nil
    else
        has_payload = args ~= nil and args ~= false
    end

    local body = ""
    if has_payload then
        local ok, encoded = pcall(json.encode, args)
        if ok and type(encoded) == "string" then
            if #encoded <= 80 then
                body = encoded
            else
                body = encoded:sub(1, 77) .. "..."
            end
        else
            body = "..."
        end
    end
    return name .. "(" .. body .. ")"
end

-- Ask the user whether tool `name` may be called with `args`, consulting
-- `cfg.mcp.auto_approve` first. Policy keys:
--   "<alias>__<tool>"  (the full tool name) → exact-match auto-approve
--   "<alias>__*"                            → whole-server auto-approve
-- where <alias> is everything before the first "__" in `name`.
-- Anything else falls back to a [y/N] prompt; empty / non-"y" answer rejects.
-- The separator switched from "." to "__" 2026-05-12 because Anthropic via
-- Bedrock rejects dots in tool names (regex ^[a-zA-Z0-9_-]{1,128}$).
--
-- Returns true when the call is approved, false otherwise.
function M.confirm_tool_call(name, args, cfg)
    local policy = (cfg and cfg.mcp and cfg.mcp.auto_approve) or {}

    -- Exact-name policy entry.
    if policy[name] then
        return true
    end

    -- Whole-server wildcard: lazy match so the alias stops at the first "__".
    local alias = name:match("^(.-)__")
    if alias and alias ~= "" and policy[alias .. "__*"] then
        return true
    end

    -- Interactive fallback; a nil answer from readline counts as a rejection.
    local prompt = ("call '%s'? [y/N] "):format(pretty_call(name, args))
    local ans = rl.readline(prompt) or ""
    return ans:lower():sub(1, 1) == "y"
end

-- ---------------------------------------------------------------- is_destructive
-- Static-pattern layer of the destructive-command gate (the LLM second-opinion
-- layer is implemented further down in this file).
-- Patterns are Lua patterns (NOT regex).
-- When `ci = true` is set on a rule, the input is lowercased before matching so
-- the rule matches case-insensitively (`DROP TABLE`, `drop table`, etc.).
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the shell-wrapper entries at the top of the table guard against.
--
-- Each entry is { pat = <Lua pattern>, reason = <label>, ci = <optional bool> }.
local DESTRUCTIVE_PATTERNS = {
    -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
    --    the inner content safely without parsing the inner shell.
    --    Norris HALTs on these unconditionally; the user reads the inner
    --    before proceeding.
    --    NOTE(review): the `^`-anchored rules only fire when the wrapper is
    --    the first word of the command; e.g. `sudo bash -c ...` is not
    --    matched by them.
    { pat = "^%s*bash%s+%-l?c%s", reason = "bash -c (wrapped shell)" },
    { pat = "^%s*sh%s+%-l?c%s", reason = "sh -c (wrapped shell)" },
    { pat = "^%s*zsh%s+%-l?c%s", reason = "zsh -c (wrapped shell)" },
    { pat = "^%s*eval%s", reason = "eval (dynamic shell)" },
    { pat = "^%s*python3?%s+%-c%s", reason = "python -c (inline script)" },
    { pat = "^%s*perl%s+%-e%s", reason = "perl -e (inline script)" },
    { pat = "|%s*sh%s", reason = "pipe-to-sh" },
    { pat = "|%s*sh%s*$", reason = "pipe-to-sh (eol)" },
    { pat = "|%s*bash%s", reason = "pipe-to-bash" },
    { pat = "|%s*bash%s*$", reason = "pipe-to-bash (eol)" },
    { pat = "xargs%s+.-rm", reason = "xargs ... rm" },

    -- ── Filesystem destructive
    -- NOTE(review): the two rm rules need a literal "-r" or "-f"; uppercase
    -- `-R` and bundled flags such as `-vrf` are not matched by them.
    { pat = "rm%s+.-%-rf?", reason = "rm -rf" },
    { pat = "rm%s+.-%-fr?", reason = "rm -fr" },
    { pat = "find%s+.-%-delete", reason = "find -delete" },
    { pat = "find%s+.-%-exec%s+rm", reason = "find -exec rm" },
    { pat = ">%s*/dev/sd[a-z]", reason = "write to raw disk" },
    { pat = "dd%s+.-of=/dev/", reason = "dd to device" },
    { pat = "mkfs%.", reason = "mkfs (format)" },
    { pat = "shred%s", reason = "shred" },
    { pat = "wipefs%s", reason = "wipefs" },
    { pat = "truncate%s+.-%-s%s*0", reason = "truncate to zero" },

    -- ── Version control destructive
    { pat = "git%s+push%s+.-%-%-force", reason = "git push --force" },
    -- `-f` must be a complete flag (not the start of a longer word). The
    -- frontier is `%f[^%w]` rather than `%f[%s]` because a frontier treats
    -- the end of the subject as '\0', which is not whitespace: with
    -- `%f[%s]` a trailing `-f` (`git push origin main -f`) never matched.
    { pat = "git%s+push%s+.-%-f%f[^%w]", reason = "git push -f" },
    { pat = "git%s+reset%s+.-%-%-hard", reason = "git reset --hard" },
    { pat = "git%s+clean%s+.-%-fd?", reason = "git clean -fd" },
    { pat = "git%s+branch%s+.-%-D", reason = "git branch -D" },

    -- ── Database / process
    -- ci=true rules use lowercase patterns; the matcher lowercases the
    -- input before testing. Don't use uppercase patterns with ci=true.
    { pat = "drop%s+table", reason = "DROP TABLE", ci = true },
    { pat = "drop%s+database", reason = "DROP DATABASE", ci = true },
    { pat = "truncate%s+table", reason = "TRUNCATE TABLE", ci = true },
    -- pkill BEFORE kill so the more specific match wins (the array part of
    -- the table is walked in order; first hit reports the reason).
    { pat = "pkill%s+%-9", reason = "pkill -9" },
    -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
    -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
    -- transition from non-word to word characters.
    { pat = "%f[%w]kill%s+%-9", reason = "kill -9" },

    -- ── Network/permission
    { pat = "chmod%s+.-777", reason = "chmod 777" },
    { pat = "chown%s+.-%s+/%s*$", reason = "chown on root path" },
}

-- Match each rule against `cmd`. Returns (true, reason) on first hit;
-- (false, nil) if no rule matches.
-- match_static is static-only — it does NOT invoke the LLM probe (that's
-- `is_destructive` below, which calls this first).
--
--   cmd  string  command text; non-strings and "" never match
--
-- Walks DESTRUCTIVE_PATTERNS in order and returns (true, rule.reason) for the
-- first rule whose pattern matches, else (false, nil).
local function match_static(cmd)
    if type(cmd) ~= "string" or cmd == "" then
        return false, nil
    end

    local lowered -- computed lazily, only once a ci-rule is reached
    for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
        local subject = cmd
        if rule.ci then
            lowered = lowered or cmd:lower()
            subject = lowered
        end
        if subject:match(rule.pat) then
            return true, rule.reason
        end
    end
    return false, nil
end

-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text. Mitigates Q23 latency
-- when the same command pattern recurs within a single Norris run.
local _llm_cache = {}

-- Build the cache key for `cmd`: lowercase, collapse every whitespace run to
-- a single space, then trim both ends. Returns exactly one value (the last
-- gsub's substitution count is deliberately dropped).
-- NOTE(review): lowercasing means commands that differ only in case (e.g.
-- `-r` vs `-R`) share one cached verdict — confirm this is acceptable.
local function normalize(cmd)
    local key = cmd:lower():gsub("%s+", " "):gsub("^%s+", ""):gsub("%s+$", "")
    return key
end

-- Per-probe timeout. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Ask `model_cfg` whether `cmd` is destructive (or whatever yes/no question
-- `system` poses). Returns a verdict string (not bool — caller cares about
-- disagreement between probes) plus an optional detail value:
--   "YES"                      reply contains the word YES and not the word NO
--   "NO"                       reply contains the word NO and not the word YES
--   "YES_FAILSAFE", err        broker.chat returned no reply
--   "YES_UNPARSEABLE", reply   anything else (neither word, both words, or a
--                              non-string reply)
-- YES/NO are matched as whole words, case-insensitively, so "CANNOT" or
-- "UNKNOWN" no longer parse as NO and an ambiguous "YES or NO" no longer
-- parses as YES.
local function llm_probe(model_cfg, system, cmd)
    local reply, err = broker.chat(model_cfg, {
        { role = "system", content = system },
        { role = "user", content = cmd },
    }, { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })

    if not reply then
        -- Broker failure → safe default: treat as YES (destructive)
        return "YES_FAILSAFE", err
    end
    if type(reply) ~= "string" then
        -- Can't inspect a non-string reply → safe default
        return "YES_UNPARSEABLE", reply
    end

    local upper = reply:upper()
    -- %f[%a] / %f[%A] are frontiers: word must start and end on a
    -- letter/non-letter transition (start and end of string count).
    local says_yes = upper:find("%f[%a]YES%f[%A]") ~= nil
    local says_no = upper:find("%f[%a]NO%f[%A]") ~= nil

    if says_yes and not says_no then
        return "YES"
    end
    if says_no and not says_yes then
        return "NO"
    end
    -- Unparseable response → treat as YES (safe default)
    return "YES_UNPARSEABLE", reply
end

-- LLM second-opinion gate.
-- Returns:
--   true,  "LLM flagged as destructive"                 first probe said YES
--   true,  "LLM unparseable (fail-safe)"                first probe reply could not be parsed
--   true,  "LLM probe failed (fail-safe)"               broker error on the first probe
--   true,  "LLM probe disagreement"                     first probe said NO but the inverted
--                                                       "is it safe?" probe did not say YES
--   true,  "LLM probe unavailable (no model in config)" no probe model could be resolved
--   false, nil                                          both probes agree command is safe
-- Verdicts are cached per normalized command for the session, except results
-- caused by a broker failure: those are transient, so the next occurrence of
-- the command is probed again instead of staying flagged for the whole run.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by
--      default ("fast" when cfg.default_model is unset)
local function llm_second_opinion(cmd, cfg)
    local probe_cfg
    if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
        probe_cfg = cfg.models[cfg.safety.llm_model]
    end
    if not probe_cfg and cfg and cfg.models then
        probe_cfg = cfg.models.deep or cfg.models[cfg.default_model or "fast"]
    end
    if not probe_cfg then
        -- No probe model available → fail-safe YES
        return true, "LLM probe unavailable (no model in config)"
    end

    local key = normalize(cmd)
    local cached = _llm_cache[key]
    if cached ~= nil then
        if cached[1] then
            return cached[1], cached[2]
        end
        return false, nil
    end

    -- Store a verdict under this command's key and pass it through.
    local function memo(is_d, reason)
        _llm_cache[key] = { is_d, reason }
        return is_d, reason
    end

    -- Probe 1: "Is this destructive?"
    local p1 = llm_probe(probe_cfg,
        "Detect destructive shell commands. Reply YES if running this " ..
        "would delete, overwrite, or irreversibly modify data, " ..
        "processes, or system state. Otherwise reply NO. Respond " ..
        "with only YES or NO.",
        cmd)
    if p1 == "YES" then
        return memo(true, "LLM flagged as destructive")
    end
    if p1 == "YES_FAILSAFE" then
        -- Broker failure: flag the command now, but do not memoize it.
        return true, "LLM probe failed (fail-safe)"
    end
    if p1 == "YES_UNPARSEABLE" then
        return memo(true, "LLM unparseable (fail-safe)")
    end

    -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
    local p2 = llm_probe(probe_cfg,
        "Reply YES or NO only. Is the following shell command SAFE to " ..
        "run autonomously without user review?",
        cmd)
    if p2 == "YES" then
        return memo(false, nil)
    end
    if p2 == "YES_FAILSAFE" then
        -- Broker failure on the re-roll: HALT, but leave the cache untouched.
        return true, "LLM probe disagreement"
    end
    -- Disagreement or unparseable → HALT
    return memo(true, "LLM probe disagreement")
end

-- Main entry point. Returns (true, reason) if EITHER the static patterns
-- OR the LLM second-opinion flag the command. Used by the Norris loop.
-- `cfg` is the full aish config (carries cfg.safety.llm_second_opinion
-- and cfg.models for the probe model lookup). When cfg is nil, only the
-- static layer runs (handy for unit tests and tooling that wants the
-- fast deterministic gate without an LLM round-trip).
-- Non-string or empty `cmd` returns (false, nil).
function M.is_destructive(cmd, cfg)
    if type(cmd) ~= "string" or cmd == "" then
        return false, nil
    end

    -- Static patterns first (fast, deterministic).
    local hit, reason = match_static(cmd)
    if hit then
        return true, reason
    end

    -- LLM second-opinion. Default ON when cfg is present; off when cfg
    -- is nil (test/static-only mode). Explicit opt-out via
    -- cfg.safety.llm_second_opinion = false.
    if cfg == nil then
        return false, nil
    end
    if cfg.safety and cfg.safety.llm_second_opinion == false then
        return false, nil
    end
    return llm_second_opinion(cmd, cfg)
end

-- Expose the pattern table for `:safety patterns` meta and for testing.
M._patterns = DESTRUCTIVE_PATTERNS
M._match_static = match_static -- testable in isolation
M._reset_cache = function() _llm_cache = {} end

-- ---------------------------------------------------------------- norris_step
-- Phase 3 commit #4 lands the planner. Stub stays for now: calling it always
-- raises. Note the `broker` parameter shadows the module-level `broker`.
function M.norris_step(plan, broker, executor)
    error("safety.norris_step: not implemented yet (lands in Phase 3 commit #4)")
end

return M