safety: is_destructive static pattern matcher (Phase 3 commit #1)
Phase 3 commit #1 per docs/PHASE3.md §12. Static-pattern destructive-op heuristic; no LLM second-opinion yet (lands in commit #2). Implementation: - 33 patterns in DESTRUCTIVE_PATTERNS table, grouped: 11 shell-wrapper patterns (R-B1 — bash -c / sh -c / zsh -c / eval / python -c / perl -e / pipe-to-sh both forms / pipe-to-bash both forms / xargs ... rm). HALT on the wrapper itself; user reads the inner before proceeding. 10 filesystem destructive (rm -rf, find -delete, dd to device, mkfs, shred, wipefs, truncate -s 0, ...). 5 version-control destructive (git push --force/-f, git reset --hard, git clean -fd, git branch -D). 5 database/process (DROP TABLE/DATABASE, TRUNCATE TABLE, kill/pkill -9). 2 permission (chmod 777, chown on root path). - ci=true flag for case-insensitive SQL patterns; rule patterns must be lowercase when ci is set (matcher lowercases input). - pkill -9 ordered BEFORE kill -9; kill rule uses %f[%w] frontier so "pkill -9 nginx" reports "pkill -9" not "kill -9" substring match. - M._patterns exposes the rule table for :safety patterns meta (Phase 3 commit #5) and for the test corpus. - M.norris_step stub stays — lands in commit #4. Test corpus (test_safety.lua, 87 cases): - 49 destructive cases across all categories (incl. all 11 wrapper forms, the canonical curl|sh end-of-string bypass, sudo-prefixed rm -rf, etc.). - 38 safe cases (read-only commands, non-destructive variants of risky verbs like "git push" without --force, "find" without -delete, "chmod 644", "kill 1234" without -9, etc.). - Documented one accepted false positive: echo "rm -rf /" matches the rm pattern by substring — Norris user can proceed after reading; tradeoff between false positives and false negatives, biased toward false positives per §5. - Run from repo root: `luajit test_safety.lua`. Exit 0 on pass. - Verified all 87 pass at commit time. 
R-C4 / readline rebind, broker opts.max_tokens, LLM second-opinion, norris_step planner, repl driver, and the wider Norris UX land in subsequent commits per §12. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+90
-8
@@ -1,7 +1,8 @@
|
||||
-- safety.lua — workflow safeguards for tool execution.
|
||||
-- Phase 2: M.confirm_tool_call only (per-call confirm gate, with config-driven
|
||||
-- auto-approve policy). See docs/PHASE2.md §6.
|
||||
-- Phase 3 (deferred): destructive-op heuristic + Norris autonomous gate.
|
||||
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
|
||||
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
|
||||
-- Norris autonomous mode) and M.norris_step (single-iteration
|
||||
-- planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
|
||||
|
||||
local rl = require("ffi.readline")
|
||||
local json = require("dkjson")
|
||||
@@ -41,15 +42,96 @@ function M.confirm_tool_call(name, args, cfg)
|
||||
return ans:lower():sub(1, 1) == "y"
|
||||
end
|
||||
|
||||
-- ---------------------------------------------------------------- Phase 3 stubs
|
||||
-- Destructive-op heuristic for Norris autonomous mode. Not part of the
|
||||
-- Phase 2 surface (see docs/PHASE2.md §10 / PHASE0.md §11 row 3).
|
||||
-- ---------------------------------------------------------------- is_destructive
|
||||
-- Phase 3 commit #1: static-pattern matcher only (no LLM second-opinion yet —
|
||||
-- that lands in commit #2). Patterns are Lua patterns (NOT regex). When
|
||||
-- `ci = true` is set on a rule, the input is lowercased before matching so
|
||||
-- the rule matches case-insensitively (`DROP TABLE`, `drop table`, etc.).
|
||||
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
|
||||
-- (R-B1) that the first eleven entries below guard against.
|
||||
|
||||
-- Ordered list of destructive-command rules. Each rule is a table:
--   pat    : Lua pattern (NOT a regex) tested against the command text
--   reason : label reported to the caller when the rule hits
--   ci     : optional; when true the rule is tested against a lowercased
--            copy of the input, so `pat` itself must be written in lowercase
-- Rules are tried top to bottom and the first hit wins, so a more specific
-- rule must be listed before any broader rule that would also match.
local DESTRUCTIVE_PATTERNS = {
  -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
  --    the inner content safely without parsing the inner shell.
  --    Norris HALTs on these unconditionally; the user reads the inner
  --    before proceeding.
  { pat = "^%s*bash%s+%-l?c%s",          reason = "bash -c (wrapped shell)" },
  { pat = "^%s*sh%s+%-l?c%s",            reason = "sh -c (wrapped shell)" },
  { pat = "^%s*zsh%s+%-l?c%s",           reason = "zsh -c (wrapped shell)" },
  { pat = "^%s*eval%s",                  reason = "eval (dynamic shell)" },
  { pat = "^%s*python3?%s+%-c%s",        reason = "python -c (inline script)" },
  { pat = "^%s*perl%s+%-e%s",            reason = "perl -e (inline script)" },
  -- Each pipe target has two rules: one followed by whitespace, one at
  -- end of string (Lua patterns have no alternation to merge them).
  { pat = "|%s*sh%s",                    reason = "pipe-to-sh" },
  { pat = "|%s*sh%s*$",                  reason = "pipe-to-sh (eol)" },
  { pat = "|%s*bash%s",                  reason = "pipe-to-bash" },
  { pat = "|%s*bash%s*$",                reason = "pipe-to-bash (eol)" },
  { pat = "xargs%s+.-rm",                reason = "xargs ... rm" },

  -- ── Filesystem destructive
  -- [rR]: rm treats -R as a synonym for -r, so "rm -Rf" must hit too.
  { pat = "rm%s+.-%-[rR]f?",             reason = "rm -rf" },
  { pat = "rm%s+.-%-fr?",                reason = "rm -fr" },
  { pat = "find%s+.-%-delete",           reason = "find -delete" },
  { pat = "find%s+.-%-exec%s+rm",        reason = "find -exec rm" },
  { pat = ">%s*/dev/sd[a-z]",            reason = "write to raw disk" },
  { pat = "dd%s+.-of=/dev/",             reason = "dd to device" },
  { pat = "mkfs%.",                      reason = "mkfs (format)" },
  { pat = "shred%s",                     reason = "shred" },
  { pat = "wipefs%s",                    reason = "wipefs" },
  { pat = "truncate%s+.-%-s%s*0",        reason = "truncate to zero" },

  -- ── Version control destructive
  { pat = "git%s+push%s+.-%-%-force",    reason = "git push --force" },
  -- %f[%W] is a frontier into a non-word character. Unlike %f[%s] it also
  -- succeeds at end of string, so "git push origin main -f" is caught,
  -- while "-f" as the prefix of a longer flag (--follow-tags) is not.
  { pat = "git%s+push%s+.-%-f%f[%W]",    reason = "git push -f" },
  { pat = "git%s+reset%s+.-%-%-hard",    reason = "git reset --hard" },
  { pat = "git%s+clean%s+.-%-fd?",       reason = "git clean -fd" },
  { pat = "git%s+branch%s+.-%-D",        reason = "git branch -D" },

  -- ── Database / process
  -- ci=true rules use lowercase patterns; the matcher lowercases the
  -- input before testing. Don't use uppercase patterns with ci=true.
  { pat = "drop%s+table",                reason = "DROP TABLE",     ci = true },
  { pat = "drop%s+database",             reason = "DROP DATABASE",  ci = true },
  { pat = "truncate%s+table",            reason = "TRUNCATE TABLE", ci = true },
  -- pkill BEFORE kill so the more specific match wins (the rules form a
  -- sequence walked in order; the first hit reports the reason).
  { pat = "pkill%s+%-9",                 reason = "pkill -9" },
  -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
  -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
  -- transition from non-word to word characters.
  { pat = "%f[%w]kill%s+%-9",            reason = "kill -9" },

  -- ── Permissions / ownership
  { pat = "chmod%s+.-777",               reason = "chmod 777" },
  { pat = "chown%s+.-%s+/%s*$",          reason = "chown on root path" },
}
|
||||
|
||||
-- Match each rule in DESTRUCTIVE_PATTERNS against `cmd`, in table order.
-- Used by the Norris loop to gate shell commands; ALSO called on tool-call
-- args by Norris's tool path (the JSON-serialized arguments are passed in
-- as cmd).
--
-- cmd     : text to inspect
-- returns : true, reason  — `reason` is the label of the first rule that hits
--           false, nil    — no rule matched, or `cmd` is not a non-empty string
function M.is_destructive(cmd)
  -- Non-strings and the empty string are treated as "not destructive".
  if type(cmd) ~= "string" or cmd == "" then return false, nil end

  -- Lowercased copy of cmd; built at most once, and only if a ci rule is reached.
  local lower = nil
  for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
    local target = cmd
    if rule.ci then
      lower = lower or cmd:lower()
      target = lower
    end
    if target:match(rule.pat) then
      return true, rule.reason
    end
  end
  return false, nil
end

-- Expose the pattern table for `:safety patterns` meta and for testing.
M._patterns = DESTRUCTIVE_PATTERNS
|
||||
|
||||
-- ---------------------------------------------------------------- norris_step
-- Phase 3 commit #4 lands the planner. Stub stays for now.
-- Until then this always raises; `plan`, `broker` and `executor` are accepted
-- but not used.
function M.norris_step(plan, broker, executor)
  error("safety.norris_step: not implemented yet (lands in Phase 3 commit #4)")
end
|
||||
|
||||
return M
|
||||
|
||||
Reference in New Issue
Block a user