2abd5da3a6
Phase 3 commit #2 per docs/PHASE3.md §12. Adds the LLM-probe gate on top of commit #1's static patterns. Together they form is_destructive. broker.lua extension: - opts.max_tokens (A2) — passed through to the request body. Phase 3 probes cap at 4 tokens for YES/NO replies. - opts.timeout_ms — overrides model_cfg.timeout_ms per-call. Probe uses 15000ms cap regardless of the model's normal timeout (the user's deep model has 1800000ms for long generations; the probe must stay snappy). - M.chat now accepts an opts table (same shape as chat_stream's). Backwards compatible — existing callers passing (cfg, msgs) unaffected. safety.lua additions: - llm_probe(cfg, system, cmd): single broker.chat call returning "YES"/"NO"/"YES_FAILSAFE"/"YES_UNPARSEABLE" — fail-safe defaults. - llm_second_opinion(cmd, cfg): two-probe protocol per R-B2. Probe 1: "Is this destructive?" — YES → flag. Probe 2 (only if probe 1 said NO): "Is this safe?" inverted question — NO → flag (disagreement = HALT). Both NO → safe. - Session-scoped cache _llm_cache keyed by normalized command (lowercased + whitespace-collapsed). Mitigates Q23 latency for repeated commands within a Norris run. - Model-selection precedence: cfg.safety.llm_model (explicit) → cfg.models.deep (independent local class) → cfg.models[default]. Fail-safe YES if none configured. - is_destructive(cmd, cfg): runs static patterns first (always), then LLM if cfg present + not explicitly opted-out. cfg=nil yields static-only mode (handy for tests). 
End-to-end verified against hossenfelder using qwen-coder-7b-32k as the deep probe (qwen3-30b-a3b-instruct in repo's config.lua isn't currently loaded on the local backend): cat /etc/hostname → hit=false (LLM: NO, NO inverted = safe) rm /tmp/x.log → hit=true (LLM flagged; static missed because no -r/-f flags) cp /etc/passwd /tmp/passwd.bak → hit=false (safe copy) cache: second probe on same cmd → 0s wall time static-only (cfg=nil): rm -rf /tmp/x → static hit, no LLM call opt-out (llm_second_opinion=false): cp x y → hit=false, no probe Test corpus (test_safety.lua, 87 cases) still all pass — cfg=nil preserves the static-only behavior. Note: production config.lua currently has `deep = qwen3-30b-a3b-instruct` which isn't loaded on the proxy backend right now; Norris users will hit the fail-safe (everything flagged destructive) until either the deep model is brought up OR cfg.safety.llm_model = "cloud" is set to route the probe through anthropic/claude-haiku-4.5. Update the config or model deployment for production use — covered by Phase 3 verify test case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
252 lines
12 KiB
Lua
252 lines
12 KiB
Lua
-- safety.lua — workflow safeguards for tool execution.
|
|
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
|
|
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
|
|
-- Norris autonomous mode) and M.norris_step (single-iteration
|
|
-- planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
|
|
|
|
local rl = require("ffi.readline")
|
|
local json = require("dkjson")
|
|
local broker = require("broker")
|
|
|
|
local M = {}
|
|
|
|
-- Format a tool invocation as `name(<json-args>)` for the one-line confirm
-- prompt, e.g. `fs__ls({"path":"/tmp"})`.
--   * nil / empty `args`   → `name()`
--   * encoder raises       → `name(...)`
--   * encoded text > 80 B  → first 77 bytes followed by "..."
local function pretty_call(name, args)
  local rendered = ""
  local has_args = args and next(args)
  if has_args then
    -- pcall: a value the encoder cannot serialise must not break the prompt.
    local encoded_ok, text = pcall(json.encode, args)
    if not encoded_ok then
      rendered = "..."
    elseif #text > 80 then
      rendered = text:sub(1, 77) .. "..."
    else
      rendered = text
    end
  end
  return name .. "(" .. rendered .. ")"
end
|
|
|
|
-- Decide whether tool `name` may run with `args`.
--
-- `cfg.mcp.auto_approve` is consulted before the user is asked:
--   "<alias>__<tool>"  exact tool name       → approved without a prompt
--   "<alias>__*"       whole-server wildcard → approved without a prompt
-- Otherwise a `[y/N]` prompt is shown; only an answer whose first character
-- is "y"/"Y" approves. An empty answer or a nil return from readline rejects.
--
-- The separator switched from "." to "__" 2026-05-12 because Anthropic via
-- Bedrock rejects dots in tool names (regex ^[a-zA-Z0-9_-]{1,128}$).
--
-- Returns a boolean.
function M.confirm_tool_call(name, args, cfg)
  local policy = {}
  if cfg and cfg.mcp and cfg.mcp.auto_approve then
    policy = cfg.mcp.auto_approve
  end

  -- Exact-match rule.
  if policy[name] then
    return true
  end

  -- Server-wide rule: alias is everything before the first "__".
  local alias = name:match("^(.-)__")
  local server_wide = alias and alias ~= "" and policy[alias .. "__*"]
  if server_wide then
    return true
  end

  local answer = rl.readline(("call '%s'? [y/N] "):format(pretty_call(name, args)))
  if not answer then
    answer = ""
  end
  return answer:sub(1, 1):lower() == "y"
end
|
|
|
|
-- ---------------------------------------------------------------- is_destructive
-- Static layer of the destructive-command gate. Each rule is
--   { pat = <Lua pattern>, reason = <label reported on a hit>, ci = <bool?> }
-- Patterns are Lua patterns (NOT regex). When `ci = true` is set on a rule,
-- the input is lowercased before matching so the rule matches
-- case-insensitively (`DROP TABLE`, `drop table`, etc.).
-- Rules are tried in array order and the first hit supplies the reason, so
-- more specific rules must come before more general ones.
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the "Shell wrappers" section below guards against.

local DESTRUCTIVE_PATTERNS = {
  -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
  --    the inner content safely without parsing the inner shell.
  --    Norris HALTs on these unconditionally; the user reads the inner
  --    before proceeding.
  { pat = "^%s*bash%s+%-l?c%s",    reason = "bash -c (wrapped shell)" },
  { pat = "^%s*sh%s+%-l?c%s",      reason = "sh -c (wrapped shell)" },
  { pat = "^%s*zsh%s+%-l?c%s",     reason = "zsh -c (wrapped shell)" },
  { pat = "^%s*eval%s",            reason = "eval (dynamic shell)" },
  { pat = "^%s*python3?%s+%-c%s",  reason = "python -c (inline script)" },
  { pat = "^%s*perl%s+%-e%s",      reason = "perl -e (inline script)" },
  { pat = "|%s*sh%s",              reason = "pipe-to-sh" },
  { pat = "|%s*sh%s*$",            reason = "pipe-to-sh (eol)" },
  { pat = "|%s*bash%s",            reason = "pipe-to-bash" },
  { pat = "|%s*bash%s*$",          reason = "pipe-to-bash (eol)" },
  { pat = "xargs%s+.-rm",          reason = "xargs ... rm" },

  -- ── Filesystem destructive
  -- [rR]: `rm -R` is the same recursive flag as `rm -r`.
  { pat = "rm%s+.-%-[rR]f?",       reason = "rm -rf" },
  { pat = "rm%s+.-%-fr?",          reason = "rm -fr" },
  { pat = "find%s+.-%-delete",     reason = "find -delete" },
  { pat = "find%s+.-%-exec%s+rm",  reason = "find -exec rm" },
  { pat = ">%s*/dev/sd[a-z]",      reason = "write to raw disk" },
  { pat = "dd%s+.-of=/dev/",       reason = "dd to device" },
  { pat = "mkfs%.",                reason = "mkfs (format)" },
  { pat = "shred%s",               reason = "shred" },
  { pat = "wipefs%s",              reason = "wipefs" },
  { pat = "truncate%s+.-%-s%s*0",  reason = "truncate to zero" },

  -- ── Version control destructive
  { pat = "git%s+push%s+.-%-%-force", reason = "git push --force" },
  -- %f[^%w] = "-f" followed by a non-alphanumeric character OR end of
  -- input. A whitespace-only frontier (%f[%s]) never matches at end of
  -- string, which let `git push origin main -f` slip through.
  { pat = "git%s+push%s+.-%-f%f[^%w]", reason = "git push -f" },
  { pat = "git%s+reset%s+.-%-%-hard",  reason = "git reset --hard" },
  { pat = "git%s+clean%s+.-%-fd?",     reason = "git clean -fd" },
  { pat = "git%s+branch%s+.-%-D",      reason = "git branch -D" },

  -- ── Database / process
  -- ci=true rules use lowercase patterns; the matcher lowercases the
  -- input before testing. Don't use uppercase patterns with ci=true.
  { pat = "drop%s+table",     reason = "DROP TABLE",     ci = true },
  { pat = "drop%s+database",  reason = "DROP DATABASE",  ci = true },
  { pat = "truncate%s+table", reason = "TRUNCATE TABLE", ci = true },
  -- pkill BEFORE kill so the more specific match wins (rules are tried in
  -- array order; first hit reports the reason).
  { pat = "pkill%s+%-9",      reason = "pkill -9" },
  -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
  -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
  -- transition from non-word to word characters.
  { pat = "%f[%w]kill%s+%-9", reason = "kill -9" },

  -- ── Network/permission
  { pat = "chmod%s+.-777",       reason = "chmod 777" },
  { pat = "chown%s+.-%s+/%s*$",  reason = "chown on root path" },
}
|
|
|
|
-- Run `cmd` through every rule in DESTRUCTIVE_PATTERNS, in table order.
-- Returns (true, reason) for the first rule that matches and (false, nil)
-- when nothing matches or when `cmd` is not a non-empty string.
-- Static-only: this never contacts the LLM probe.
local function match_static(cmd)
  local usable = type(cmd) == "string" and cmd ~= ""
  if not usable then
    return false, nil
  end

  local folded -- lowercase copy of cmd; built only if a ci rule is reached
  for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
    local subject
    if rule.ci then
      if folded == nil then
        folded = cmd:lower()
      end
      subject = folded
    else
      subject = cmd
    end

    if string.find(subject, rule.pat) then
      return true, rule.reason
    end
  end

  return false, nil
end
|
|
|
|
-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text. Mitigates Q23 latency
-- when the same command pattern recurs within a single Norris run.
local _llm_cache = {}

-- Build the cache key for `cmd`: lowercase it, collapse every whitespace
-- run to a single space, and strip leading/trailing whitespace.
-- Returns exactly one value (the key string); string.gsub's substitution
-- count is deliberately not passed through to the caller.
--
-- NOTE(review): shell commands are case-sensitive (e.g. `wget -o` vs
-- `wget -O`), so case-folding the key lets a verdict cached for one command
-- be reused for a differently-cased one. Lowercasing is kept because it is
-- the documented cache contract — confirm this trade-off is intended.
local function normalize(cmd)
  local collapsed = cmd:lower():gsub("%s+", " ")
  -- `.-` is lazy, so the trailing %s* claims any remaining edge whitespace.
  local trimmed = collapsed:match("^%s*(.-)%s*$")
  return trimmed
end
|
|
|
|
-- Per-probe timeout. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Ask `model_cfg` one YES/NO question about `cmd`.
--
--   model_cfg  model entry handed to broker.chat
--   system     system prompt that poses the question
--   cmd        command text, sent as the user message
--
-- Returns a verdict string (not a bool — the caller cares about
-- disagreement between probes) plus an optional detail value:
--   "YES"                      reply contains the word YES and not NO
--   "NO"                       reply contains the word NO and not YES
--   "YES_FAILSAFE", err        broker returned no usable (string) reply
--   "YES_UNPARSEABLE", reply   reply has neither word, or both
--
-- YES/NO are matched as whole words, case-insensitively. Plain substring
-- matching would read "Unknown" or "I cannot" as NO. A reply carrying both
-- words is ambiguous and is never resolved in either direction, because
-- "YES" means "safe" for an inverted probe.
local function llm_probe(model_cfg, system, cmd)
  local reply, err = broker.chat(model_cfg, {
    { role = "system", content = system },
    { role = "user", content = cmd },
  }, { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })

  if type(reply) ~= "string" then
    -- Broker failure (or a non-text reply) → safe default: destructive.
    return "YES_FAILSAFE", err
  end

  local upper = reply:upper()
  -- %f[%a] / %f[%A] are frontiers: the token must start and end on a
  -- letter boundary (string start/end count as boundaries).
  local said_yes = upper:find("%f[%a]YES%f[%A]") ~= nil
  local said_no = upper:find("%f[%a]NO%f[%A]") ~= nil

  if said_yes and not said_no then return "YES" end
  if said_no and not said_yes then return "NO" end

  -- Neither token, or both → unparseable; callers treat this as fail-safe.
  return "YES_UNPARSEABLE", reply
end
|
|
|
|
-- LLM second-opinion gate (two-probe protocol per R-B2). Returns:
--   true,  "LLM flagged as destructive"       probe 1 ("destructive?") said YES
--   true,  "LLM unparseable (fail-safe)"       probe 1 reply was not YES/NO
--   true,  "LLM probe disagreement"            probe 1 said NO but probe 2
--                                              ("safe?") did not say YES
--   true,  "LLM probe failed (fail-safe)"      either probe got no reply
--   true,  "LLM probe unavailable (no model in config)"
--   false, nil                                 probe 1 NO and probe 2 YES
--
-- Verdicts are cached per normalized command for the session. Broker
-- failures are NOT cached: they describe the backend, not the command, so
-- the same command is probed again once the backend is reachable.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep      — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs
--      by default ("fast" when default_model is unset)
local function llm_second_opinion(cmd, cfg)
  local probe_cfg
  local models = cfg and cfg.models
  if models then
    local explicit = cfg.safety and cfg.safety.llm_model
    if explicit then
      probe_cfg = models[explicit]
    end
    if not probe_cfg then
      probe_cfg = models.deep or models[cfg.default_model or "fast"]
    end
  end
  if not probe_cfg then
    -- No probe model available → fail-safe YES
    return true, "LLM probe unavailable (no model in config)"
  end

  local key = normalize(cmd)
  local cached = _llm_cache[key]
  if cached ~= nil then
    if cached[1] then return cached[1], cached[2] end
    return false, nil
  end

  -- Record a verdict for this command and hand it back.
  local function memo(is_d, reason)
    _llm_cache[key] = { is_d, reason }
    return is_d, reason
  end

  -- Probe 1: "Is this destructive?"
  local p1 = llm_probe(probe_cfg,
    "Detect destructive shell commands. Reply YES if running this "
      .. "would delete, overwrite, or irreversibly modify data, "
      .. "processes, or system state. Otherwise reply NO. Respond "
      .. "with only YES or NO.", cmd)
  if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
  -- Transient failure: flag the command, but do not memoise the outcome.
  if p1 == "YES_FAILSAFE" then return true, "LLM probe failed (fail-safe)" end
  if p1 ~= "NO" then return memo(true, "LLM unparseable (fail-safe)") end

  -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
  local p2 = llm_probe(probe_cfg,
    "Reply YES or NO only. Is the following shell command SAFE to "
      .. "run autonomously without user review?", cmd)
  if p2 == "YES" then return memo(false, nil) end
  if p2 == "YES_FAILSAFE" then return true, "LLM probe failed (fail-safe)" end

  -- Probe 2 said NO (or was unparseable) after probe 1 said NO → HALT
  return memo(true, "LLM probe disagreement")
end
|
|
|
|
-- Public gate used by the Norris loop. Returns (true, reason) when EITHER
-- the static patterns OR the LLM second-opinion flags `cmd`, otherwise
-- (false, nil). A non-string or empty `cmd` is reported as (false, nil).
--
-- `cfg` is the full aish config (carries cfg.safety.llm_second_opinion and
-- cfg.models for the probe model lookup). The LLM layer is skipped when:
--   * cfg is nil — static-only mode, handy for unit tests and tooling that
--     wants the fast deterministic gate without an LLM round-trip;
--   * cfg.safety.llm_second_opinion is exactly `false` (explicit opt-out).
function M.is_destructive(cmd, cfg)
  if type(cmd) ~= "string" or cmd == "" then
    return false, nil
  end

  -- Layer 1: static patterns (fast, deterministic) always run first.
  local flagged, why = match_static(cmd)
  if flagged then
    return true, why
  end

  -- Layer 2: LLM second-opinion, on by default whenever a config exists.
  local llm_enabled = cfg ~= nil
    and not (cfg.safety and cfg.safety.llm_second_opinion == false)
  if not llm_enabled then
    return false, nil
  end

  return llm_second_opinion(cmd, cfg)
end
|
|
|
|
-- Internal hooks (underscore-prefixed) for the `:safety patterns` meta
-- command and for tests.
M._patterns = DESTRUCTIVE_PATTERNS -- the live rule table, not a copy
M._match_static = match_static     -- static layer, testable in isolation

-- Drop every cached LLM verdict by swapping in a fresh cache table.
function M._reset_cache()
  _llm_cache = {}
end
|
|
|
|
-- ---------------------------------------------------------------- norris_step
-- Placeholder for the single-iteration planner that Phase 3 commit #4 is
-- slated to add. Calling it always raises; none of the arguments are read.
-- Note that the `broker` parameter shadows the module-level `broker`
-- inside this function.
function M.norris_step(plan, broker, executor)
  local reason = "safety.norris_step: not implemented yet (lands in Phase 3 commit #4)"
  error(reason)
end
|
|
|
|
return M
|