diff --git a/broker.lua b/broker.lua
index 0e2602d..4e5a47b 100644
--- a/broker.lua
+++ b/broker.lua
@@ -27,7 +27,7 @@ local function build_headers(model_cfg)
   return h
 end
 
-local function build_request(model_cfg, messages, stream, tools)
+local function build_request(model_cfg, messages, stream, tools, max_tokens)
   if not (model_cfg and model_cfg.endpoint and model_cfg.model) then
     return nil, "broker: model_cfg.endpoint and .model are required"
   end
@@ -41,6 +41,10 @@ local function build_request(model_cfg, messages, stream, tools)
   -- Per PHASE2.md §12 risk row "Empty tools array": some servers reject
   -- "tools": []. Only set the field when the list has entries.
   if tools and #tools > 0 then req.tools = tools end
+  -- Phase 3 (A2): max_tokens passthrough — safety.lua's LLM probe passes
+  -- max_tokens = 4 to cap its YES/NO replies. The field is omitted when the
+  -- argument is nil, so Phase 1/2 callers keep the model's own default.
+  if max_tokens then req.max_tokens = max_tokens end
   return url, json.encode(req), build_headers(model_cfg),
          (model_cfg.timeout_ms or 60000)
 end
@@ -59,8 +63,13 @@ end
 function M.chat_stream(model_cfg, messages, on_delta, opts)
   opts = opts or {}
   local url, body, headers, timeout_ms =
-    build_request(model_cfg, messages, true, opts.tools)
+    build_request(model_cfg, messages, true, opts.tools, opts.max_tokens)
   if not url then return nil, body end -- url slot carries err on bad cfg
+  -- Phase 3: opts.timeout_ms, when set, replaces the timeout derived from
+  -- model_cfg. safety.lua's LLM probe passes 15000 ms so a YES/NO check
+  -- stays short even when the model's normal timeout is much higher (e.g.
+  -- user's deep model has 1800000ms for long generations).
+  if opts.timeout_ms then timeout_ms = opts.timeout_ms end
 
   local done = false
   local api_err
@@ -152,11 +161,11 @@ end
 -- Returns:
 --   assistant_content_string on success
 --   nil, errmsg on transport / decode / API failure
-function M.chat(model_cfg, messages)
+function M.chat(model_cfg, messages, opts) -- opts is forwarded to chat_stream
   local parts = {}
   local ok, err = M.chat_stream(model_cfg, messages, function(kind, payload)
     if kind == "text" then parts[#parts + 1] = payload end
-  end)
+  end, opts)
   if not ok then return nil, err end
   return table.concat(parts)
 end
diff --git a/safety.lua b/safety.lua
index a4c0a22..b76ba8c 100644
--- a/safety.lua
+++ b/safety.lua
@@ -4,8 +4,9 @@
 -- Norris autonomous mode) and M.norris_step (single-iteration
 -- planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
 
-local rl   = require("ffi.readline")
-local json = require("dkjson")
+local rl     = require("ffi.readline")
+local json   = require("dkjson")
+local broker = require("broker")
 
 local M = {}
 
@@ -106,10 +107,9 @@ local DESTRUCTIVE_PATTERNS = {
 }
 
 -- Match each rule against `cmd`. Returns (true, reason) on first hit;
--- (false, nil) if no rule matches. Used by the Norris loop to gate
--- shell commands; ALSO called on tool-call args by Norris's tool path
--- (the JSON-serialized arguments are passed in as cmd).
-function M.is_destructive(cmd)
+-- (false, nil) if no rule matches. Static-only — does NOT invoke the
+-- LLM probe (that's `is_destructive` below, which calls this first).
+local function match_static(cmd)
   if type(cmd) ~= "string" or cmd == "" then return false, nil end
   local lower = nil -- lazily computed for ci-rules
   for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
@@ -125,8 +125,122 @@ function M.is_destructive(cmd)
   return false, nil
 end
 
+-- ---------------------------------------------------------------- LLM probe
+-- Session-scoped cache for the LLM second-opinion, keyed by the command with
+-- whitespace collapsed. Case is preserved: shell flags and paths are case-
+-- sensitive (`-d` vs `-D`). Mitigates Q23 latency when a command recurs.
+local _llm_cache = {}
+
+local function normalize(cmd)
+  return (cmd:gsub("%s+", " "):gsub("^%s+", ""):gsub("%s+$", "")) -- parens drop gsub's count
+end
+
+-- Per-probe timeout. The probe must be quick — destructive detection has
+-- to keep up with Norris's pace. We override the model's default timeout
+-- (which can be 30+ min for deep/slow local models) with a tight cap.
+local PROBE_TIMEOUT_MS = 15000
+
+-- Ask `model_cfg` about `cmd` under prompt `system`. Returns a verdict string
+-- (not bool): "YES", "NO", "YES_FAILSAFE", err or "YES_UNPARSEABLE", reply.
+local function llm_probe(model_cfg, system, cmd)
+  local reply, err = broker.chat(model_cfg,
+    { { role = "system", content = system },
+      { role = "user", content = cmd } },
+    { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
+  if not reply then
+    -- Broker failure → safe default: treat as YES (destructive)
+    return "YES_FAILSAFE", err
+  end
+  local upper = reply:upper()
+  if upper:match("%f[%a]YES%f[%A]") then return "YES" end -- whole word only
+  if upper:match("%f[%a]NO%f[%A]") then return "NO" end -- "UNKNOWN"/"NOT" must not count
+  -- Unparseable response → treat as YES (safe default)
+  return "YES_UNPARSEABLE", reply
+end
+
+-- LLM second-opinion gate. Returns:
+--   true, "LLM flagged as destructive"    first probe said YES
+--   true, "LLM probe disagreement"        first NO, second ("is it safe?") not YES
+--   true, "LLM probe failed (fail-safe)"  broker error on the first probe
+--   true, "LLM unparseable (fail-safe)"   first reply contained no YES/NO
+--   false, nil                            both probes agree command is safe
+-- Cached per normalized command for the session.
+-- Model-selection precedence (per PHASE3.md §5):
+--   1. cfg.safety.llm_model — explicit model-name from config (e.g. "cloud")
+--   2. cfg.models.deep — local independent-class model (qwen3-30b)
+--   3.
cfg.models[cfg.default_model] — fallback to whatever the user runs by default
+local function llm_second_opinion(cmd, cfg)
+  local probe_cfg
+  if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
+    probe_cfg = cfg.models[cfg.safety.llm_model]
+  end
+  if not probe_cfg and cfg and cfg.models then
+    probe_cfg = cfg.models.deep or cfg.models[cfg.default_model or "fast"]
+  end
+  if not probe_cfg then
+    -- No probe model available → fail-safe YES
+    return true, "LLM probe unavailable (no model in config)"
+  end
+
+  local key = normalize(cmd)
+  local cached = _llm_cache[key]
+  if cached ~= nil then
+    if cached[1] then return cached[1], cached[2] end
+    return false, nil
+  end
+
+  local function memo(is_d, reason) -- store a definite verdict, then return it
+    _llm_cache[key] = { is_d, reason }
+    return is_d, reason
+  end
+
+  -- Probe 1: "Is this destructive?" Fail-safe outcomes are returned uncached.
+  local p1 = llm_probe(probe_cfg,
+    "Detect destructive shell commands. Reply YES if running this "
+    .. "would delete, overwrite, or irreversibly modify data, "
+    .. "processes, or system state. Otherwise reply NO. Respond "
+    .. "with only YES or NO.", cmd)
+  if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
+  if p1 == "YES_FAILSAFE" then return true, "LLM probe failed (fail-safe)" end -- transient: re-probe next time
+  if p1 == "YES_UNPARSEABLE" then return true, "LLM unparseable (fail-safe)" end -- not a verdict: re-probe
+
+  -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
+  local p2 = llm_probe(probe_cfg,
+    "Reply YES or NO only. Is the following shell command SAFE to "
+    .. "run autonomously without user review?", cmd)
+  if p2 == "YES" then return memo(false, nil) end
+  if p2 ~= "NO" then return true, "LLM probe disagreement" end -- probe 2 failed: HALT, uncached
+  return memo(true, "LLM probe disagreement")
+end
+
+-- Main entry point. Returns (true, reason) if EITHER the static patterns
+-- OR the LLM second-opinion flag the command. Used by the Norris loop.
+-- `cfg` is the full aish config (carries cfg.safety.llm_second_opinion
+-- and cfg.models for the probe model lookup).
When cfg is nil, only the
+-- static layer runs (handy for unit tests and tooling that wants the
+-- fast deterministic gate without an LLM round-trip).
+function M.is_destructive(cmd, cfg)
+  if type(cmd) ~= "string" or cmd == "" then return false, nil end -- nothing to inspect
+
+  -- Static patterns first (fast, deterministic).
+  local hit, reason = match_static(cmd)
+  if hit then return true, reason end -- a static hit skips the LLM probe entirely
+
+  -- LLM second-opinion. Default ON when cfg is present; off when cfg
+  -- is nil (test/static-only mode). Explicit opt-out via
+  -- cfg.safety.llm_second_opinion = false.
+  if cfg == nil then return false, nil end
+  if cfg.safety and cfg.safety.llm_second_opinion == false then
+    return false, nil
+  end
+
+  return llm_second_opinion(cmd, cfg) -- yields (true, reason) or (false, nil)
+end
+
 -- Expose the pattern table for `:safety patterns` meta and for testing.
-M._patterns = DESTRUCTIVE_PATTERNS
+M._patterns     = DESTRUCTIVE_PATTERNS
+M._match_static = match_static -- testable in isolation
+M._reset_cache  = function() _llm_cache = {} end -- swaps in a fresh, empty verdict cache
 
 -- ---------------------------------------------------------------- norris_step
 -- Phase 3 commit #4 lands the planner. Stub stays for now.