aish/safety.lua

-- safety.lua — workflow safeguards for tool execution.
-- Phase 2: M.confirm_tool_call (per-call confirm gate + auto-approve policy).
-- Phase 3: M.is_destructive (static pattern + LLM second-opinion gate for
--          Norris autonomous mode) and M.norris_step (single-iteration
--          planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.

local rl     = require("ffi.readline")
local json   = require("dkjson")
local broker = require("broker")

local M = {}

-- Format a tool invocation for the one-line confirm prompt, e.g.
-- `name({"path":"/tmp"})`.
--   name  tool name, used verbatim
--   args  argument table (nil or empty renders as `name()`)
-- The JSON-encoded arguments are capped at 80 characters: anything longer is
-- cut to 77 characters plus "...". If encoding raises, the arguments are
-- shown as "...".
local function pretty_call(name, args)
    -- Nothing to show: nil/false args or an empty table.
    if not (args and next(args)) then
        return name .. "()"
    end

    local encoded_ok, text = pcall(json.encode, args)
    if not encoded_ok then
        return name .. "(...)"
    end

    if #text > 80 then
        text = text:sub(1, 77) .. "..."
    end
    return name .. "(" .. text .. ")"
end

-- Per-call consent gate for tool `name` invoked with `args`.
--
-- `cfg.mcp.auto_approve` is consulted before the user is asked. Keys:
--   "<alias>__<tool>"  → auto-approve exactly this tool
--   "<alias>__*"       → auto-approve every tool of that server
-- With no matching policy entry the user gets a [y/N] prompt; only an answer
-- starting with "y"/"Y" approves. Empty input or a nil read rejects.
--
-- The alias/tool separator switched from "." to "__" 2026-05-12 because
-- Anthropic via Bedrock rejects dots in tool names
-- (regex ^[a-zA-Z0-9_-]{1,128}$).
--
-- Returns true when the call may proceed, false otherwise.
function M.confirm_tool_call(name, args, cfg)
    local mcp = cfg and cfg.mcp
    local allow = (mcp and mcp.auto_approve) or {}

    -- Exact-match entry.
    if allow[name] then
        return true
    end

    -- Whole-server entry; the alias is everything before the first "__".
    local server = name:match("^(.-)__")
    if server and server ~= "" and allow[server .. "__*"] then
        return true
    end

    -- No policy hit: fall back to an interactive prompt.
    local question = ("call '%s'? [y/N] "):format(pretty_call(name, args))
    local answer = rl.readline(question) or ""
    return answer:sub(1, 1):lower() == "y"
end

-- ---------------------------------------------------------------- is_destructive
-- Phase 3 commit #1 introduced this static-pattern matcher; the LLM
-- second-opinion layer (commit #2) lives further down in this file, in
-- `llm_second_opinion`. Patterns are Lua patterns (NOT regex). When
-- `ci = true` is set on a rule, the input is lowercased before matching so
-- the rule matches case-insensitively (`DROP TABLE`, `drop table`, etc.).
-- See docs/PHASE3.md §5 for the rationale and the wrapper-bypass class
-- (R-B1) that the shell-wrapper entries at the top of the table guard against.

-- Ordered list of static rules. Each entry is a table with:
--   pat     Lua pattern tested against the command text
--   reason  short label reported when the rule hits
--   ci      optional; when true the command is lowercased before matching,
--           so `pat` itself must be lowercase
-- Rules are tried top to bottom and the first hit wins, so more specific
-- rules must precede more general ones.
local DESTRUCTIVE_PATTERNS = {
    -- ── Shell wrappers (R-B1) — flag the wrapper itself; can't inspect
    --    the inner content safely without parsing the inner shell.
    --    Norris HALTs on these unconditionally; the user reads the inner
    --    before proceeding.
    { pat = "^%s*bash%s+%-l?c%s",              reason = "bash -c (wrapped shell)" },
    { pat = "^%s*sh%s+%-l?c%s",                reason = "sh -c (wrapped shell)" },
    { pat = "^%s*zsh%s+%-l?c%s",               reason = "zsh -c (wrapped shell)" },
    { pat = "^%s*eval%s",                      reason = "eval (dynamic shell)" },
    { pat = "^%s*python3?%s+%-c%s",            reason = "python -c (inline script)" },
    { pat = "^%s*perl%s+%-e%s",                reason = "perl -e (inline script)" },
    { pat = "|%s*sh%s",                        reason = "pipe-to-sh" },
    { pat = "|%s*sh%s*$",                      reason = "pipe-to-sh (eol)" },
    { pat = "|%s*bash%s",                      reason = "pipe-to-bash" },
    { pat = "|%s*bash%s*$",                    reason = "pipe-to-bash (eol)" },
    { pat = "xargs%s+.-rm",                    reason = "xargs ... rm" },

    -- ── Filesystem destructive
    { pat = "rm%s+.-%-rf?",                    reason = "rm -rf" },
    { pat = "rm%s+.-%-fr?",                    reason = "rm -fr" },
    -- Recursive flag in the spellings the two rules above miss: capital
    -- `-R`, or `r`/`R` inside a flag cluster (`-Rf`, `-vrf`). %f[%S] pins
    -- the hyphen to the start of a whitespace-separated token so a hyphen
    -- inside a file name is not read as a flag by this rule.
    { pat = "rm%s+.-%f[%S]%-%a*[rR]",          reason = "rm -R (recursive)" },
    { pat = "find%s+.-%-delete",               reason = "find -delete" },
    { pat = "find%s+.-%-exec%s+rm",            reason = "find -exec rm" },
    { pat = ">%s*/dev/sd[a-z]",                reason = "write to raw disk" },
    { pat = "dd%s+.-of=/dev/",                 reason = "dd to device" },
    { pat = "mkfs%.",                          reason = "mkfs (format)" },
    { pat = "shred%s",                         reason = "shred" },
    { pat = "wipefs%s",                        reason = "wipefs" },
    { pat = "truncate%s+.-%-s%s*0",            reason = "truncate to zero" },

    -- ── Version control destructive
    { pat = "git%s+push%s+.-%-%-force",        reason = "git push --force" },
    -- `-f` must be followed by a non-alphanumeric character OR the end of
    -- the string. %f[^%w] covers both (the frontier treats end-of-string as
    -- "\0"); the previous %f[%s] never matched at end-of-string, so a
    -- trailing `-f` (including plain `git push -f`) was missed.
    { pat = "git%s+push%s+.-%-f%f[^%w]",       reason = "git push -f" },
    { pat = "git%s+reset%s+.-%-%-hard",        reason = "git reset --hard" },
    { pat = "git%s+clean%s+.-%-fd?",           reason = "git clean -fd" },
    { pat = "git%s+branch%s+.-%-D",            reason = "git branch -D" },

    -- ── Database / process
    -- ci=true rules use lowercase patterns; the matcher lowercases the
    -- input before testing. Don't use uppercase patterns with ci=true.
    { pat = "drop%s+table",                    reason = "DROP TABLE",        ci = true },
    { pat = "drop%s+database",                 reason = "DROP DATABASE",     ci = true },
    { pat = "truncate%s+table",                reason = "TRUNCATE TABLE",    ci = true },
    -- pkill BEFORE kill so the more specific match wins (the list is
    -- walked in array order; first hit reports the reason).
    { pat = "pkill%s+%-9",                     reason = "pkill -9" },
    -- kill -9 needs a word boundary so "pkill -9" doesn't match this rule's
    -- "kill" substring. %f[%w] is Lua's frontier pattern — matches a
    -- transition from non-word to word characters.
    { pat = "%f[%w]kill%s+%-9",                reason = "kill -9" },

    -- ── Network/permission
    { pat = "chmod%s+.-777",                   reason = "chmod 777" },
    { pat = "chown%s+.-%s+/%s*$",              reason = "chown on root path" },
}

-- Run `cmd` through every rule in DESTRUCTIVE_PATTERNS, in table order.
-- Returns (true, reason) for the first rule that matches, (false, nil) when
-- none does or when `cmd` is not a non-empty string. This is the static
-- layer only: no LLM probe is issued here (`is_destructive` below calls
-- this first and adds the probe on top).
local function match_static(cmd)
    if type(cmd) ~= "string" or #cmd == 0 then
        return false, nil
    end

    local folded  -- lowercase copy of cmd, built the first time a ci rule needs it
    for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
        local subject
        if rule.ci then
            if folded == nil then
                folded = cmd:lower()
            end
            subject = folded
        else
            subject = cmd
        end

        if subject:find(rule.pat) then
            return true, rule.reason
        end
    end

    return false, nil
end

-- ---------------------------------------------------------------- LLM probe
-- Session-scoped cache for the LLM second-opinion. Keyed by the normalized
-- (lowercased, whitespace-collapsed) command text. Mitigates Q23 latency
-- when the same command pattern recurs within a single Norris run.
local _llm_cache = {}

-- Build the cache key for `cmd`: lowercase it, collapse every whitespace run
-- to a single space, and strip leading/trailing whitespace.
-- Returns exactly one value (the key string).
-- NOTE(review): because the key is lowercased, commands that differ only in
-- letter case (e.g. a `-d` vs `-D` flag) share one cached verdict — confirm
-- this is intended.
local function normalize(cmd)
    -- string.gsub returns (result, count). Binding to a local keeps only the
    -- string, so the substitution count does not leak out as a second
    -- return value.
    local key = cmd:lower():gsub("%s+", " "):gsub("^%s+", ""):gsub("%s+$", "")
    return key
end

-- Per-probe timeout. The probe must be quick — destructive detection has
-- to keep up with Norris's pace. We override the model's default timeout
-- (which can be 30+ min for deep/slow local models) with a tight cap.
local PROBE_TIMEOUT_MS = 15000

-- Send one YES/NO question about `cmd` to the model described by `model_cfg`.
--   model_cfg  broker model config, passed through to broker.chat
--   system     system prompt that phrases the question
--   cmd        command text, sent as the user message
-- Returns a verdict string (not a bool — the caller compares two probes),
-- plus a detail value for the fail-safe cases:
--   "YES"                     reply contains the word YES and not NO
--   "NO"                      reply contains the word NO and not YES
--   "YES_FAILSAFE",    err    broker.chat returned no reply
--   "YES_UNPARSEABLE", reply  anything else (neither word, both words, or a
--                             non-string reply)
-- Matching is case-insensitive and whole-word, so replies such as
-- "NOT SURE" or "UNKNOWN" are not mistaken for "NO".
local function llm_probe(model_cfg, system, cmd)
    local reply, err = broker.chat(model_cfg,
        { { role = "system", content = system },
          { role = "user",   content = cmd } },
        { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
    if not reply then
        -- Broker failure → safe default: treat as YES (destructive)
        return "YES_FAILSAFE", err
    end
    if type(reply) ~= "string" then
        -- NOTE(review): assumes broker.chat yields the reply text as a
        -- string; any other shape is treated as unparseable rather than
        -- raising on :upper().
        return "YES_UNPARSEABLE", reply
    end

    local upper = reply:upper()
    -- %f[%a] / %f[%A] are frontiers: the word must not be embedded in a
    -- longer run of letters. End-of-string counts as a non-letter.
    local says_yes = upper:find("%f[%a]YES%f[%A]") ~= nil
    local says_no  = upper:find("%f[%a]NO%f[%A]") ~= nil

    if says_yes and not says_no then return "YES" end
    if says_no and not says_yes then return "NO" end
    -- Neither or both → ambiguous; callers treat this as the fail-safe side.
    return "YES_UNPARSEABLE", reply
end

-- LLM second-opinion gate. Asks the probe model twice with opposite
-- phrasing and only clears the command when both answers agree it is safe.
-- Returns:
--   true,  "LLM probe unavailable (no model in config)"  no probe model resolvable
--   true,  "LLM flagged as destructive"    probe 1 ("is it destructive?") said YES
--   true,  "LLM probe failed (fail-safe)"  probe 1 got no reply from the broker
--   true,  "LLM unparseable (fail-safe)"   probe 1 reply was neither YES nor NO
--   true,  "LLM probe disagreement"        probe 1 said NO but probe 2
--                                          ("is it safe?") did not say YES
--   false, nil                             both probes agree command is safe
--
-- Verdicts are cached per normalized command for the session, EXCEPT when a
-- probe got no reply from the broker: that is a transient condition, and
-- caching it would pin the command to a fail-safe verdict for the rest of
-- the session even after the broker recovers.
--
-- Model-selection precedence (per PHASE3.md §5):
--   1. cfg.safety.llm_model         — explicit model-name from config (e.g. "cloud")
--   2. cfg.models.deep              — local independent-class model (qwen3-30b)
--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default
--                                      (key "fast" when default_model is unset)
local function llm_second_opinion(cmd, cfg)
    local probe_cfg
    if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
        probe_cfg = cfg.models[cfg.safety.llm_model]
    end
    if not probe_cfg and cfg and cfg.models then
        probe_cfg = cfg.models.deep or cfg.models[cfg.default_model or "fast"]
    end
    if not probe_cfg then
        -- No probe model available → fail-safe YES
        return true, "LLM probe unavailable (no model in config)"
    end

    local key = normalize(cmd)
    local cached = _llm_cache[key]
    if cached ~= nil then
        if cached[1] then return cached[1], cached[2] end
        return false, nil
    end

    -- Store the verdict under `key` and pass it through to the caller.
    local function memo(is_d, reason)
        _llm_cache[key] = { is_d, reason }
        return is_d, reason
    end

    -- Probe 1: "Is this destructive?"
    local p1 = llm_probe(probe_cfg,
        "Detect destructive shell commands. Reply YES if running this "
        .. "would delete, overwrite, or irreversibly modify data, "
        .. "processes, or system state. Otherwise reply NO. Respond "
        .. "with only YES or NO.", cmd)
    if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
    -- Broker failure: fail safe for this call, but do not memoize it.
    if p1 == "YES_FAILSAFE"    then return true, "LLM probe failed (fail-safe)" end
    if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end

    -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
    local p2 = llm_probe(probe_cfg,
        "Reply YES or NO only. Is the following shell command SAFE to "
        .. "run autonomously without user review?", cmd)
    if p2 == "YES" then return memo(false, nil) end
    -- Broker failure on the re-roll: HALT, but leave the cache untouched so
    -- the next occurrence is probed again.
    if p2 == "YES_FAILSAFE" then return true, "LLM probe disagreement" end
    -- Disagreement or unparseable reply → HALT
    return memo(true, "LLM probe disagreement")
end

-- Destructive-command gate used by the Norris loop.
--
-- Two layers, evaluated in order:
--   1. static patterns (`match_static`) — fast and deterministic;
--   2. LLM second opinion (`llm_second_opinion`) — only when `cfg` is given
--      and `cfg.safety.llm_second_opinion` is not explicitly `false`.
--
--   cmd  command text; anything but a non-empty string yields (false, nil)
--   cfg  full aish config (cfg.safety.*, cfg.models for the probe model
--        lookup). Pass nil to run the static layer alone — handy for unit
--        tests and tooling that wants no LLM round-trip.
--
-- Returns (true, reason) when either layer flags the command, otherwise
-- (false, nil).
function M.is_destructive(cmd, cfg)
    local usable = type(cmd) == "string" and cmd ~= ""
    if not usable then
        return false, nil
    end

    -- Layer 1: static patterns.
    local static_hit, static_reason = match_static(cmd)
    if static_hit then
        return true, static_reason
    end

    -- Layer 2: LLM probe, on by default whenever a config is present.
    if cfg ~= nil then
        local safety = cfg.safety
        local opted_out = safety and safety.llm_second_opinion == false
        if not opted_out then
            return llm_second_opinion(cmd, cfg)
        end
    end

    return false, nil
end

-- Internal hooks, exported for the `:safety patterns` meta command and for
-- tests:
--   M._patterns      the static rule table itself (not a copy)
--   M._match_static  the static matcher, testable in isolation
--   M._reset_cache   replaces the LLM verdict cache with an empty table
M._patterns = DESTRUCTIVE_PATTERNS
M._match_static = match_static

function M._reset_cache()
    _llm_cache = {}
end

-- ---------------------------------------------------------------- norris_step
-- One iteration of the Norris planning loop per PHASE3.md §4.
-- The driver in repl.lua calls this in a while loop, advancing on every
-- non-terminal status.
--
-- Inputs:
--   ctx          aish Context (read & written: turns + pending_exec_output)
--   model_cfg    the active broker model config (model_cfg.endpoint/.model/etc.)
--   helpers      table of injected dispatch helpers:
--                  .tools_schema()         → tools array for opts.tools
--                  .exec_cmd(cmd)          → run shell cmd; returns (out, exit_code)
--                  .dispatch_tool(call,args)→ run an MCP tool; returns (content, is_error)
--                  .extract_cmd_lines(text)→ executor.extract_cmd_lines (passed in)
--                  .halt(step_n, max_n, reason, action) → "proceed"|"skip"|"abort"
--                  .render_step(n, max_n, descr)        (renderer.norris_step)
--                  .render_tool_begin(name, args)       (renderer.tool_call_begin)
--                  .render_tool_end(content, is_error)  (renderer.tool_call_end)
--                  .render_exec_begin()                 (renderer.exec_begin)
--                  .render_exec_end(code)               (renderer.exec_end)
--                  .render_assistant_delta(chunk)       (renderer.assistant_delta)
--                  .render_assistant_flush()            (renderer.assistant_flush)
--                  .log_turn(turn)                      (session log append)
--   opts:
--                  .step_n             current step (1-based)
--                  .max_steps          budget cap (default 8)
--                  .cfg                full aish config (for is_destructive)
--
-- Returns: { status, reason } where status ∈ {
--    "continue"          — keep looping (driver bumps step_n)
--    "done"              — model emitted GOAL: complete
--    "aborted"           — user typed abort at a halt prompt
--    "stalled"           — model emitted nothing actionable
--    "budget_exhausted"  — step_n >= max_steps after this iteration
--    "broker_error"      — broker.chat_stream returned (nil, err)
-- }
function M.norris_step(ctx, model_cfg, helpers, opts)
    -- Runs one planning iteration: one streamed broker round-trip, then
    -- dispatch of every tool call and CMD: line the model emitted, then the
    -- terminal-status checks. Returns a { status, reason } table.
    local step_n    = opts.step_n or 1
    local max_steps = opts.max_steps or 8
    local cfg       = opts.cfg

    helpers.render_step(step_n, max_steps)

    -- (1) one broker round-trip — stream text + collect tool_calls
    local text_parts      = {}
    local tool_calls_seen = {}
    local ok, err = broker.chat_stream(model_cfg, ctx:to_messages(),
        function(kind, payload)
            -- "text" chunks are buffered and rendered as they arrive;
            -- "tool_call" payloads are only collected here and dispatched
            -- in step (4). Any other kind is ignored.
            if kind == "text" then
                text_parts[#text_parts + 1] = payload
                helpers.render_assistant_delta(payload)
            elseif kind == "tool_call" then
                tool_calls_seen[#tool_calls_seen + 1] = payload
            end
        end,
        { tools = helpers.tools_schema() })
    -- Flushed whether or not the stream succeeded.
    helpers.render_assistant_flush()

    if not ok then
        -- Nothing is appended to ctx on a broker error.
        return { status = "broker_error", reason = tostring(err) }
    end

    local resp_text = table.concat(text_parts)

    -- (2) parse actions from response
    local cmd_lines = helpers.extract_cmd_lines(resp_text) or {}
    local goal_done = false
    -- Walk the response line by line (a trailing "\n" is added so the last
    -- line is visited too). The goal marker must be the whole line once
    -- surrounding whitespace is stripped.
    for line in (resp_text .. "\n"):gmatch("([^\n]*)\n") do
        local trimmed = line:gsub("^%s+", ""):gsub("%s+$", "")
        if trimmed == "GOAL: complete" then
            goal_done = true; break
        end
    end

    local n_actions = #tool_calls_seen + #cmd_lines

    -- (3) record assistant turn (with optional tool_calls)
    -- The turn is recorded before the stall check, so a stalled response
    -- still lands in ctx and in the session log.
    if #tool_calls_seen > 0 then
        ctx:append({ role = "assistant", content = resp_text,
                     tool_calls = tool_calls_seen })
    else
        ctx:append({ role = "assistant", content = resp_text })
    end
    helpers.log_turn(ctx.turns[#ctx.turns])

    if n_actions == 0 and not goal_done then
        return { status = "stalled", reason = "no action emitted" }
    end

    -- (4) dispatch tool_calls first (structured route)
    for _, call in ipairs(tool_calls_seen) do
        local args_table = {}
        if call.arguments and call.arguments ~= "" then
            -- The third value returned by json.decode is used as the error
            -- text. A falsy decoded value also takes the failure branch.
            local d, _, derr = json.decode(call.arguments)
            if d then args_table = d
            else
                -- Argument JSON parse failure: synthesize tool turn (alternation)
                ctx:append({ role = "tool", tool_call_id = call.id,
                             content = "[aish] tool arguments not "
                                       .. "parseable as JSON: " .. tostring(derr) })
                helpers.log_turn(ctx.turns[#ctx.turns])
                -- Jump to the end of this iteration; the call is neither
                -- probed nor dispatched.
                goto continue_tool
            end
        end

        -- Probe destructive on the JSON-serialized call as a proxy.
        -- The probed text is "<name> <raw arguments string>".
        local call_repr = (call.name or "?") .. " " .. (call.arguments or "")
        local destr, reason = M.is_destructive(call_repr, cfg)

        local verdict
        if destr then
            verdict = helpers.halt(step_n, max_steps, reason or "destructive",
                                   call_repr)
        else
            -- Non-destructive tool_call: auto_approve OR halt for consent
            -- Same policy keys as M.confirm_tool_call: exact tool name, or
            -- "<alias>__*" where alias is the text before the first "__".
            local policy = cfg and cfg.mcp and cfg.mcp.auto_approve or {}
            local alias = (call.name or ""):match("^(.-)__")
            local auto = policy[call.name]
                         or (alias and alias ~= "" and policy[alias .. "__*"])
            if auto then
                verdict = "proceed"
            else
                verdict = helpers.halt(step_n, max_steps, "tool consent",
                                       call_repr)
            end
        end

        if verdict == "abort" then
            -- NOTE(review): returning here leaves this call, and any calls
            -- after it in tool_calls_seen, without a matching "tool" turn in
            -- ctx — confirm the driver tolerates that if ctx is reused.
            return { status = "aborted", reason = "user abort at halt" }
        elseif verdict == "skip" then
            ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
            -- `reason` is nil when the halt was a plain consent prompt, so
            -- the message then reads "no reason".
            ctx:append({ role = "tool", tool_call_id = call.id,
                         content = "[aish] tool call skipped by user: "
                                   .. (reason or "no reason") })
            helpers.log_turn(ctx.turns[#ctx.turns])
        else  -- proceed
            -- Any verdict other than "abort"/"skip" is treated as proceed.
            ctx.norris_consecutive_skips = 0
            helpers.render_tool_begin(call.name, call.arguments)
            local content, is_error = helpers.dispatch_tool(call.name, args_table)
            helpers.render_tool_end(content, is_error)
            -- Only `content` is recorded; `is_error` reaches the renderer
            -- but is not stored in the tool turn.
            ctx:append({ role = "tool", tool_call_id = call.id,
                         content = content or "" })
            helpers.log_turn(ctx.turns[#ctx.turns])
        end
        ::continue_tool::
    end

    -- (5) dispatch CMD: lines (legacy route)
    for _, cmd in ipairs(cmd_lines) do
        local destr, reason = M.is_destructive(cmd, cfg)
        local verdict
        if destr then
            verdict = helpers.halt(step_n, max_steps, reason or "destructive",
                                   cmd)
        else
            verdict = "proceed"  -- non-destructive CMD: runs without consent
                                 -- in Norris (Norris user accepted autonomy)
        end

        if verdict == "abort" then
            return { status = "aborted", reason = "user abort at halt" }
        elseif verdict == "skip" then
            ctx.norris_consecutive_skips = (ctx.norris_consecutive_skips or 0) + 1
            -- CMD: skip → synthesize exec-output line so the model sees it
            ctx:append_exec_output("[aish] CMD skipped by user: "
                                   .. (reason or "no reason"))
        else  -- proceed
            ctx.norris_consecutive_skips = 0
            helpers.render_exec_begin()
            local out, code = helpers.exec_cmd(cmd)
            helpers.render_exec_end(code)
            -- Output is fed back to the model only when
            -- cfg.shell.capture_output is set; the exit code is rendered
            -- but not appended to ctx here.
            if cfg and cfg.shell and cfg.shell.capture_output then
                ctx:append_exec_output(out)
            end
        end
    end

    -- Skip-budget escalation: R-C1
    -- The counter lives on ctx, so it carries across iterations; it is
    -- zeroed by every action that proceeds.
    if (ctx.norris_consecutive_skips or 0) >= 3 then
        local verdict = helpers.halt(step_n, max_steps,
            ("%d consecutive user skips"):format(ctx.norris_consecutive_skips),
            "(repeated similar destructive proposals)")
        if verdict == "abort" then
            return { status = "aborted", reason = "user abort on skip-escalation" }
        end
        -- Else: reset the counter and continue (user said proceed)
        -- (any non-"abort" verdict, including "skip", takes this path)
        ctx.norris_consecutive_skips = 0
    end

    -- (6) goal_done after dispatch
    -- Checked last so that actions emitted alongside the goal marker in the
    -- same response are still dispatched above.
    if goal_done then
        return { status = "done", reason = "GOAL: complete" }
    end

    -- (7) budget
    if step_n >= max_steps then
        return { status = "budget_exhausted",
                 reason = ("%d step limit reached"):format(max_steps) }
    end

    -- No reason field on a plain continue.
    return { status = "continue" }
end

return M