-- aish/safety.lua

-- safety.lua — workflow safeguards for tool execution.
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
--          Norris autonomous mode) and M.norris_step (single-iteration
--          planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.

local rl     = require("ffi.readline")
local json   = require("dkjson")
local broker = require("broker")

local M = {}

-- Build the one-line `name(<json-args>)` label used in the confirm prompt,
-- e.g. `name({"path":"/tmp"})`.
--   name  tool name; concatenated as-is
--   args  argument table; nil or an empty table renders as `name()`
-- The JSON body is capped so the prompt stays on one line: encodings longer
-- than 80 bytes keep their first 77 bytes followed by "...". If the encoder
-- raises, the body is just "...".
local function pretty_call(name, args)
    local has_args = args and next(args)
    if not has_args then
        return name .. "()"
    end

    local encoded_ok, text = pcall(json.encode, args)
    local rendered
    if not encoded_ok then
        rendered = "..."
    elseif #text > 80 then
        rendered = text:sub(1, 77) .. "..."
    else
        rendered = text
    end
    return name .. "(" .. rendered .. ")"
end

-- Gate a tool call behind the auto-approve policy and, failing that, an
-- interactive prompt.
--   name  tool name, "<alias>__<tool>"
--   args  tool arguments (only used to render the prompt)
--   cfg   aish config; the policy is read from `cfg.mcp.auto_approve` and
--         treated as empty when any level of that path is missing
-- Policy keys:
--   "<alias>__<tool>"  → exact-match auto-approve
--   "<alias>__*"       → whole-server auto-approve
-- Anything else falls back to a [y/N] prompt. Only an answer whose first
-- character is "y" or "Y" approves; an empty answer or a nil result from
-- readline rejects.
-- The separator switched from "." to "__" 2026-05-12 because Anthropic via
-- Bedrock rejects dots in tool names (regex ^[a-zA-Z0-9_-]{1,128}$).
function M.confirm_tool_call(name, args, cfg)
    local mcp = cfg and cfg.mcp
    local policy = (mcp and mcp.auto_approve) or {}

    -- Exact tool match.
    if policy[name] then
        return true
    end

    -- Whole-server wildcard: the alias is everything before the first "__".
    local alias = name:match("^(.-)__")
    if alias and alias ~= "" then
        if policy[alias .. "__*"] then
            return true
        end
    end

    -- No policy hit: ask interactively.
    local question = ("call '%s'? [y/N] "):format(pretty_call(name, args))
    local answer = rl.readline(question) or ""
    local first = answer:lower():sub(1, 1)
    return first == "y"
end

-- ---------------------------------------------------------------- is_destructive
-- Static-pattern matcher (Phase 3 commit #1). The LLM second-opinion layer
-- is implemented separately further down in this file. Patterns are Lua
-- patterns (NOT regex). When `ci = true` is set on a rule, the input is
-- lowercased before matching so the rule matches case-insensitively
-- (`DROP TABLE`, `drop table`, etc.).
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the shell-wrapper entries at the top of the table guard against.

-- Ordered list of static rules. Each rule is a record:
--   pat     Lua pattern tested against the command text
--   reason  label reported for the first rule that matches
--   ci      optional; when true the command is lowercased before matching,
--           so `pat` must itself be lowercase
-- Order matters: the matcher reports the first hit.
local DESTRUCTIVE_PATTERNS = {
    -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
    --    the inner content safely without parsing the inner shell.
    --    Norris HALTs on these unconditionally; the user reads the inner
    --    before proceeding.
    { pat = "^%s*bash%s+%-l?c%s",              reason = "bash -c (wrapped shell)" },
    { pat = "^%s*sh%s+%-l?c%s",                reason = "sh -c (wrapped shell)" },
    { pat = "^%s*zsh%s+%-l?c%s",               reason = "zsh -c (wrapped shell)" },
    { pat = "^%s*eval%s",                      reason = "eval (dynamic shell)" },
    { pat = "^%s*python3?%s+%-c%s",            reason = "python -c (inline script)" },
    { pat = "^%s*perl%s+%-e%s",                reason = "perl -e (inline script)" },
    { pat = "|%s*sh%s",                        reason = "pipe-to-sh" },
    { pat = "|%s*sh%s*$",                      reason = "pipe-to-sh (eol)" },
    { pat = "|%s*bash%s",                      reason = "pipe-to-bash" },
    { pat = "|%s*bash%s*$",                    reason = "pipe-to-bash (eol)" },
    { pat = "xargs%s+.-rm",                    reason = "xargs ... rm" },

    -- ── Filesystem destructive
    { pat = "rm%s+.-%-rf?",                    reason = "rm -rf" },
    { pat = "rm%s+.-%-fr?",                    reason = "rm -fr" },
    { pat = "find%s+.-%-delete",               reason = "find -delete" },
    { pat = "find%s+.-%-exec%s+rm",            reason = "find -exec rm" },
    { pat = ">%s*/dev/sd[a-z]",                reason = "write to raw disk" },
    { pat = "dd%s+.-of=/dev/",                 reason = "dd to device" },
    { pat = "mkfs%.",                          reason = "mkfs (format)" },
    { pat = "shred%s",                         reason = "shred" },
    { pat = "wipefs%s",                        reason = "wipefs" },
    { pat = "truncate%s+.-%-s%s*0",            reason = "truncate to zero" },

    -- ── Version control destructive
    { pat = "git%s+push%s+.-%-%-force",        reason = "git push --force" },
    -- `-f` must end the flag: %f[%W] is a frontier from an alphanumeric to a
    -- non-alphanumeric character. Unlike %f[%s] it also matches at the end
    -- of the command (the end of the subject is treated as "\0", which is
    -- not whitespace), so a trailing `-f` is caught while `--follow-tags`
    -- is not.
    { pat = "git%s+push%s+.-%-f%f[%W]",        reason = "git push -f" },
    { pat = "git%s+reset%s+.-%-%-hard",        reason = "git reset --hard" },
    { pat = "git%s+clean%s+.-%-fd?",           reason = "git clean -fd" },
    { pat = "git%s+branch%s+.-%-D",            reason = "git branch -D" },

    -- ── Database / process
    -- ci=true rules use lowercase patterns; the matcher lowercases the
    -- input before testing. Don't use uppercase patterns with ci=true.
    { pat = "drop%s+table",                    reason = "DROP TABLE",        ci = true },
    { pat = "drop%s+database",                 reason = "DROP DATABASE",     ci = true },
    { pat = "truncate%s+table",                reason = "TRUNCATE TABLE",    ci = true },
    -- pkill BEFORE kill so the more specific match wins (Lua tables are
    -- order-preserving; first hit reports the reason).
    { pat = "pkill%s+%-9",                     reason = "pkill -9" },
    -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
    -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
    -- transition from non-word to word characters.
    { pat = "%f[%w]kill%s+%-9",                reason = "kill -9" },

    -- ── Network/permission
    { pat = "chmod%s+.-777",                   reason = "chmod 777" },
    { pat = "chown%s+.-%s+/%s*$",              reason = "chown on root path" },
}

-- Run `cmd` through DESTRUCTIVE_PATTERNS in table order.
-- Returns `true, reason` for the first rule whose pattern matches, and
-- `false, nil` when no rule matches or `cmd` is not a non-empty string.
-- Purely static: the LLM probe is never consulted here (`is_destructive`
-- calls this first and only then falls through to the probe).
local function match_static(cmd)
    if type(cmd) ~= "string" or cmd == "" then
        return false, nil
    end

    -- Lowercased copy of cmd; built at most once, and only if a ci rule
    -- is actually reached.
    local lowered
    for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
        local subject
        if rule.ci then
            if lowered == nil then
                lowered = cmd:lower()
            end
            subject = lowered
        else
            subject = cmd
        end
        if subject:match(rule.pat) then
            return true, rule.reason
        end
    end
    return false, nil
end

-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text; each value is the
-- `{ is_destructive, reason }` pair stored by `llm_second_opinion`.
-- Mitigates Q23 latency when the same command pattern recurs within a
-- single Norris run. `M._reset_cache` replaces it with a fresh table.
local _llm_cache = {}

-- Canonicalize `cmd` for use as a cache key: lowercase it, collapse every
-- whitespace run to a single space, then strip leading and trailing
-- whitespace.
-- Returns exactly one value. `string.gsub` also returns a substitution
-- count; the outer parentheses on the return drop it so it cannot leak
-- into a caller's argument list or table constructor.
local function normalize(cmd)
    local collapsed = cmd:lower():gsub("%s+", " ")
    return (collapsed:gsub("^%s+", ""):gsub("%s+$", ""))
end

-- Per-probe timeout in milliseconds; `llm_probe` passes it to broker.chat
-- as `timeout_ms`. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Ask `model_cfg` a YES/NO question about `cmd`.
--   model_cfg  model entry handed straight to broker.chat
--   system     system prompt that phrases the question
--   cmd        command text, sent as the user message
-- Returns a verdict string (not a bool — the caller cares about
-- disagreement between probes), plus a detail value for the fail-safe cases:
--   "YES"                     reply contains the word YES and not NO
--   "NO"                      reply contains the word NO and not YES
--   "YES_FAILSAFE", err       broker returned no reply
--   "YES_UNPARSEABLE", reply  reply is not a string, contains neither word,
--                             or contains both
-- Verdicts are matched case-insensitively as whole words, so replies such
-- as "UNKNOWN", "CANNOT" or "NOT SURE" are not misread as "NO".
local function llm_probe(model_cfg, system, cmd)
    local reply, err = broker.chat(model_cfg,
        { { role = "system", content = system },
          { role = "user",   content = cmd } },
        { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
    if not reply then
        -- Broker failure → safe default: treat as YES (destructive)
        return "YES_FAILSAFE", err
    end
    if type(reply) ~= "string" then
        -- Unexpected reply shape → cannot be parsed; safe default
        return "YES_UNPARSEABLE", reply
    end

    local upper = reply:upper()
    -- %f[%a] / %f[%A] are frontiers at the start / end of a run of letters,
    -- i.e. word boundaries (they also match at the ends of the string).
    local says_yes = upper:find("%f[%a]YES%f[%A]") ~= nil
    local says_no  = upper:find("%f[%a]NO%f[%A]") ~= nil
    if says_yes and not says_no then return "YES" end
    if says_no and not says_yes then return "NO" end
    -- Neither word, or both → treat as YES (safe default)
    return "YES_UNPARSEABLE", reply
end

-- LLM second-opinion gate. Returns (is_destructive, reason):
--   true,  "LLM flagged as destructive"      probe 1 answered YES
--   true,  "LLM probe failed (fail-safe)"    probe 1 got no reply from the broker
--   true,  "LLM unparseable (fail-safe)"     probe 1 reply was not a clear YES/NO
--   true,  "LLM probe disagreement"          probe 1 said NO, but probe 2
--                                            ("is it safe?") did not answer YES
--   true,  "LLM probe unavailable (no model in config)"
--                                            no probe model could be resolved
--   false, nil                               probe 1 said NO and probe 2 said YES
-- Every result except "probe unavailable" is cached per normalized command
-- for the session.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model         — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep              — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by
--                                      default ("fast" when default_model is unset)
local function llm_second_opinion(cmd, cfg)
    -- Resolve the probe model following the precedence above.
    local wanted = cfg and cfg.safety and cfg.safety.llm_model
    local models = cfg and cfg.models
    local probe_cfg
    if wanted and models then
        probe_cfg = models[wanted]
    end
    if not probe_cfg and models then
        probe_cfg = models.deep or models[cfg.default_model or "fast"]
    end
    if not probe_cfg then
        -- No probe model available → fail-safe YES
        return true, "LLM probe unavailable (no model in config)"
    end

    -- Serve a previous verdict for the same normalized command, if any.
    local key = normalize(cmd)
    local entry = _llm_cache[key]
    if entry ~= nil then
        if entry[1] then
            return entry[1], entry[2]
        end
        return false, nil
    end

    -- Store the verdict under `key` and hand it back to the caller.
    local function remember(flagged, why)
        _llm_cache[key] = { flagged, why }
        return flagged, why
    end

    -- Probe 1: "Is this destructive?"
    local destructive_prompt = "Detect destructive shell commands. Reply YES if running this "
        .. "would delete, overwrite, or irreversibly modify data, "
        .. "processes, or system state. Otherwise reply NO. Respond "
        .. "with only YES or NO."
    local first = llm_probe(probe_cfg, destructive_prompt, cmd)
    local halt_reason = ({
        YES             = "LLM flagged as destructive",
        YES_FAILSAFE    = "LLM probe failed (fail-safe)",
        YES_UNPARSEABLE = "LLM unparseable (fail-safe)",
    })[first]
    if halt_reason then
        return remember(true, halt_reason)
    end

    -- Probe 2 (NO → re-roll with inverted phrasing per R-B2): only an
    -- explicit YES ("safe") clears the command; anything else, including a
    -- fail-safe verdict, counts as disagreement → HALT.
    local safe_prompt = "Reply YES or NO only. Is the following shell command SAFE to "
        .. "run autonomously without user review?"
    local second = llm_probe(probe_cfg, safe_prompt, cmd)
    if second ~= "YES" then
        return remember(true, "LLM probe disagreement")
    end
    return remember(false, nil)
end

-- Main entry point, used by the Norris loop. Returns `true, reason` if
-- EITHER the static patterns OR the LLM second-opinion flag the command,
-- and `false, nil` otherwise (including when `cmd` is not a non-empty
-- string).
--   cmd  command text to vet
--   cfg  full aish config (carries cfg.safety.llm_second_opinion and
--        cfg.models for the probe model lookup). When cfg is nil, only the
--        static layer runs — handy for unit tests and tooling that wants
--        the fast deterministic gate without an LLM round-trip.
function M.is_destructive(cmd, cfg)
    if type(cmd) ~= "string" or cmd == "" then
        return false, nil
    end

    -- Static patterns first (fast, deterministic).
    local flagged, why = match_static(cmd)
    if flagged then
        return true, why
    end

    -- LLM second-opinion: on by default whenever a cfg is supplied; skipped
    -- for a nil cfg (test/static-only mode) or when explicitly disabled via
    -- cfg.safety.llm_second_opinion = false.
    if cfg ~= nil then
        local safety = cfg.safety
        local opted_out = safety and safety.llm_second_opinion == false
        if not opted_out then
            return llm_second_opinion(cmd, cfg)
        end
    end
    return false, nil
end

-- Expose the pattern table for `:safety patterns` meta and for testing.
-- Underscore-prefixed internal hooks:
--   _patterns      the DESTRUCTIVE_PATTERNS table itself (same reference, not a copy)
--   _match_static  the static matcher on its own, without the LLM layer
--   _reset_cache   replaces the LLM verdict cache with a fresh empty table
M._patterns       = DESTRUCTIVE_PATTERNS
M._match_static   = match_static       -- testable in isolation
M._reset_cache    = function() _llm_cache = {} end

-- ---------------------------------------------------------------- norris_step
-- Placeholder for the single-iteration planner that Phase 3 commit #4 is
-- due to land. Until then every call raises; `plan`, `broker` and
-- `executor` are accepted but unused. Note that the `broker` parameter
-- shadows the module-level `broker` require inside this function.
function M.norris_step(plan, broker, executor)
    local message = "safety.norris_step: not implemented yet (lands in Phase 3 commit #4)"
    error(message)
end

return M