From 2abd5da3a636498b97c13b7b47f3cf774c9eafde Mon Sep 17 00:00:00 2001
From: Markus Fritsche <mfritsche@reauktion.de>
Date: Tue, 12 May 2026 23:36:06 +0000
Subject: [PATCH] safety: LLM second-opinion + session cache (Phase 3 commit
 #2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 3 commit #2 per docs/PHASE3.md §12. Adds the LLM-probe gate on
top of commit #1's static patterns. Together they form is_destructive.

broker.lua extension:
  - opts.max_tokens (A2) — passed through to the request body. Phase 3
    probes cap at 4 tokens for YES/NO replies.
  - opts.timeout_ms — overrides model_cfg.timeout_ms per-call. Probe
    uses 15000ms cap regardless of the model's normal timeout
    (the user's deep model has 1800000ms for long generations; the
    probe must stay snappy).
  - M.chat now accepts an opts table (same shape as chat_stream's).
    Backwards compatible — existing callers passing (cfg, msgs)
    unaffected.

safety.lua additions:
  - llm_probe(cfg, system, cmd): single broker.chat call returning
    "YES"/"NO"/"YES_FAILSAFE"/"YES_UNPARSEABLE" — fail-safe defaults.
  - llm_second_opinion(cmd, cfg): two-probe protocol per R-B2.
    Probe 1: "Is this destructive?" — YES → flag.
    Probe 2 (only if probe 1 said NO): "Is this safe?" inverted
    question — anything other than YES → flag (disagreement = HALT).
    Probe 1 NO + probe 2 YES → safe.
  - Session-scoped cache _llm_cache keyed by normalized command
    (lowercased + whitespace-collapsed). Mitigates Q23 latency for
    repeated commands within a Norris run.
  - Model-selection precedence: cfg.safety.llm_model (explicit)
    → cfg.models.deep (independent local class) → cfg.models[default].
    Fail-safe YES if none configured.
  - is_destructive(cmd, cfg): runs static patterns first (always),
    then LLM if cfg present + not explicitly opted-out. cfg=nil
    yields static-only mode (handy for tests).

End-to-end verified against hossenfelder using qwen-coder-7b-32k as
the deep probe (qwen3-30b-a3b-instruct in repo's config.lua isn't
currently loaded on the local backend):
  cat /etc/hostname              → hit=false (LLM: probe 1 NO, inverted probe 2 YES = safe)
  rm /tmp/x.log                  → hit=true  (LLM flagged; static missed
                                              because no -r/-f flags)
  cp /etc/passwd /tmp/passwd.bak → hit=false (safe copy)
  cache: second probe on same cmd → 0s wall time
  static-only (cfg=nil): rm -rf /tmp/x → static hit, no LLM call
  opt-out (llm_second_opinion=false): cp x y → hit=false, no probe

Test corpus (test_safety.lua, 87 cases) still all pass — cfg=nil
preserves the static-only behavior.

Note: production config.lua currently has `deep = qwen3-30b-a3b-instruct`
which isn't loaded on the proxy backend right now; Norris users will
hit the fail-safe (everything flagged destructive) until either the
deep model is brought up OR cfg.safety.llm_model = "cloud" is set
to route the probe through anthropic/claude-haiku-4.5. Update the
config or model deployment for production use — covered by Phase 3
verify test case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 broker.lua |  17 +++++--
 safety.lua | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 134 insertions(+), 11 deletions(-)

diff --git a/broker.lua b/broker.lua
index 0e2602d..4e5a47b 100644
--- a/broker.lua
+++ b/broker.lua
@@ -27,7 +27,7 @@ local function build_headers(model_cfg)
     return h
 end
 
-local function build_request(model_cfg, messages, stream, tools)
+local function build_request(model_cfg, messages, stream, tools, max_tokens)
     if not (model_cfg and model_cfg.endpoint and model_cfg.model) then
         return nil, "broker: model_cfg.endpoint and .model are required"
     end
@@ -41,6 +41,10 @@ local function build_request(model_cfg, messages, stream, tools)
     -- Per PHASE2.md §12 risk row "Empty tools array": some servers reject
     -- "tools": []. Only set the field when the list has entries.
     if tools and #tools > 0 then req.tools = tools end
+    -- Phase 3 (A2): max_tokens passthrough — used by safety.is_destructive
+    -- to cap YES/NO probes at ~4 tokens. Omitted when nil (Phase 1/2
+    -- callers unaffected — model defaults still apply).
+    if max_tokens then req.max_tokens = max_tokens end
     return url, json.encode(req), build_headers(model_cfg),
            (model_cfg.timeout_ms or 60000)
 end
@@ -59,8 +63,13 @@ end
 function M.chat_stream(model_cfg, messages, on_delta, opts)
     opts = opts or {}
     local url, body, headers, timeout_ms =
-        build_request(model_cfg, messages, true, opts.tools)
+        build_request(model_cfg, messages, true, opts.tools, opts.max_tokens)
     if not url then return nil, body end  -- url slot carries err on bad cfg
+    -- Phase 3: opts.timeout_ms overrides the model's default. Used by
+    -- safety.is_destructive's LLM probe to cap YES/NO checks at ~15s even
+    -- when the model's normal timeout is much higher (e.g. user's deep
+    -- model has 1800000ms for long generations).
+    if opts.timeout_ms then timeout_ms = opts.timeout_ms end
 
     local done = false
     local api_err
@@ -152,11 +161,11 @@ end
 -- Returns:
 --   assistant_content_string         on success
 --   nil, errmsg                       on transport / decode / API failure
-function M.chat(model_cfg, messages)
+function M.chat(model_cfg, messages, opts)  -- opts: optional; forwarded as-is to chat_stream
     local parts = {}
     local ok, err = M.chat_stream(model_cfg, messages, function(kind, payload)
         if kind == "text" then parts[#parts + 1] = payload end
-    end)
+    end, opts)  -- a nil opts is fine: chat_stream defaults it to {}
     if not ok then return nil, err end
     return table.concat(parts)
 end
diff --git a/safety.lua b/safety.lua
index a4c0a22..b76ba8c 100644
--- a/safety.lua
+++ b/safety.lua
@@ -4,8 +4,9 @@
 --          Norris autonomous mode) and M.norris_step (single-iteration
 --          planning loop). See docs/PHASE2.md §6 and docs/PHASE3.md §4 / §5.
 
-local rl   = require("ffi.readline")
-local json = require("dkjson")
+local rl     = require("ffi.readline")
+local json   = require("dkjson")
+local broker = require("broker")
 
 local M = {}
 
@@ -106,10 +107,9 @@ local DESTRUCTIVE_PATTERNS = {
 }
 
 -- Match each rule against `cmd`. Returns (true, reason) on first hit;
--- (false, nil) if no rule matches. Used by the Norris loop to gate
--- shell commands; ALSO called on tool-call args by Norris's tool path
--- (the JSON-serialized arguments are passed in as cmd).
-function M.is_destructive(cmd)
+-- (false, nil) if no rule matches. Static-only — does NOT invoke the
+-- LLM probe (that's `is_destructive` below, which calls this first).
+local function match_static(cmd)
     if type(cmd) ~= "string" or cmd == "" then return false, nil end
     local lower = nil  -- lazily computed for ci-rules
     for _, rule in ipairs(DESTRUCTIVE_PATTERNS) do
@@ -125,8 +125,122 @@ function M.is_destructive(cmd)
     return false, nil
 end
 
+-- ---------------------------------------------------------------- LLM probe
+-- Session-scoped LLM verdict cache keyed by normalize(cmd); cuts Q23 latency on repeats.
+local _llm_cache = {}
+
+-- Cache key: lowercased, whitespace-collapsed, trimmed; the parentheses drop
+-- gsub's match count. NOTE(review): lowercasing merges e.g. -d and -D — confirm.
+local function normalize(cmd)
+    return (cmd:lower():gsub("%s+", " "):gsub("^%s+", ""):gsub("%s+$", ""))
+end
+
+-- Per-probe timeout. The probe must be quick — destructive detection has
+-- to keep up with Norris's pace. We override the model's default timeout
+-- (which can be 30+ min for deep/slow local models) with a tight cap.
+local PROBE_TIMEOUT_MS = 15000
+
+-- Ask `model_cfg` one YES/NO question (`system`) about `cmd`. Returns "YES" or
+-- "NO", else "YES_FAILSAFE", err (broker error) / "YES_UNPARSEABLE", reply.
+local function llm_probe(model_cfg, system, cmd)
+    local reply, err = broker.chat(model_cfg,
+        { { role = "system", content = system },
+          { role = "user",   content = cmd } },
+        { max_tokens = 4, timeout_ms = PROBE_TIMEOUT_MS })
+    -- Broker failure → safe default: treat as YES (destructive).
+    if not reply then return "YES_FAILSAFE", err end
+    -- Whole-word match only: a bare substring test would read "UNKNOWN" or
+    -- "CANNOT" as NO. A reply with both words is ambiguous → unparseable.
+    local upper = reply:upper()
+    local yes = upper:find("%f[%a]YES%f[%A]") ~= nil
+    local no  = upper:find("%f[%a]NO%f[%A]") ~= nil
+    if yes ~= no then return yes and "YES" or "NO" end
+    return "YES_UNPARSEABLE", reply
+end
+
+-- LLM second-opinion gate. Returns:
+--   true,  "LLM flagged as destructive"            probe 1 said YES
+--   true,  "LLM unparseable (fail-safe)"           probe 1 reply could not be parsed
+--   true,  "LLM probe disagreement"                probe 1 NO, probe 2 ("is it safe?") not YES
+--   true,  "LLM probe failed (fail-safe): <err>"   broker error on either probe
+--   false, nil                                     probe 1 NO and probe 2 YES
+-- Verdicts are cached per normalized command for the session. Broker failures
+-- are NOT cached: they are transient, so the next call probes again.
+--
+-- Model-selection precedence (per PHASE3.md §5):
+--   1. cfg.safety.llm_model         — explicit model-name from config (e.g. "cloud")
+--   2. cfg.models.deep              — local independent-class model (qwen3-30b)
+--   3. cfg.models[cfg.default_model] — fallback to whatever the user runs by default
+local function llm_second_opinion(cmd, cfg)
+    local probe_cfg
+    if cfg and cfg.safety and cfg.safety.llm_model and cfg.models then
+        probe_cfg = cfg.models[cfg.safety.llm_model]
+    end
+    if not probe_cfg and cfg and cfg.models then
+        probe_cfg = cfg.models.deep or cfg.models[cfg.default_model or "fast"]
+    end
+    if not probe_cfg then
+        -- No probe model available → fail-safe YES
+        return true, "LLM probe unavailable (no model in config)"
+    end
+
+    local key = normalize(cmd)
+    local cached = _llm_cache[key]
+    if cached ~= nil then return cached[1], cached[2] end
+
+    local function memo(is_d, reason)
+        _llm_cache[key] = { is_d, reason }
+        return is_d, reason
+    end
+
+    -- Probe 1: "Is this destructive?"
+    local p1, d1 = llm_probe(probe_cfg,
+        "Detect destructive shell commands. Reply YES if running this "
+        .. "would delete, overwrite, or irreversibly modify data, "
+        .. "processes, or system state. Otherwise reply NO. Respond "
+        .. "with only YES or NO.", cmd)
+    if p1 == "YES_FAILSAFE" then return true, "LLM probe failed (fail-safe): " .. tostring(d1) end
+    if p1 == "YES" then return memo(true, "LLM flagged as destructive") end
+    if p1 == "YES_UNPARSEABLE" then return memo(true, "LLM unparseable (fail-safe)") end
+
+    -- Probe 2 (NO → re-roll with inverted phrasing per R-B2):
+    local p2, d2 = llm_probe(probe_cfg,
+        "Reply YES or NO only. Is the following shell command SAFE to "
+        .. "run autonomously without user review?", cmd)
+    if p2 == "YES_FAILSAFE" then return true, "LLM probe failed (fail-safe): " .. tostring(d2) end
+    if p2 == "YES" then return memo(false, nil) end
+    -- Probe 2 did not confirm safety (NO or unparseable) → HALT
+    return memo(true, "LLM probe disagreement")
+end
+
+-- Public gate used by the Norris loop. A command counts as destructive when
+-- the static patterns match it or, failing that, when the LLM second opinion
+-- flags it; the result is (true, reason) or (false, nil).
+-- `cfg` is the full aish config (cfg.safety.llm_second_opinion toggles the
+-- probe, cfg.models supplies the probe model). With cfg == nil only the
+-- static layer runs — no LLM round-trip, which suits unit tests and tooling.
+function M.is_destructive(cmd, cfg)
+    local usable = type(cmd) == "string" and cmd ~= ""
+    if not usable then return false, nil end
+
+    -- Layer 1: static patterns (fast, deterministic); a hit ends the check.
+    local flagged, why = match_static(cmd)
+    if flagged then return true, why end
+
+    -- Layer 2: LLM second opinion. Skipped when there is no cfg at all
+    -- (static-only mode) or when the config explicitly opts out with
+    -- cfg.safety.llm_second_opinion = false; otherwise it is on by default.
+    local opted_out = cfg == nil
+        or (cfg.safety and cfg.safety.llm_second_opinion == false)
+    if opted_out then return false, nil end
+
+    return llm_second_opinion(cmd, cfg)
+end
+
 -- Expose the pattern table for `:safety patterns` meta and for testing.
-M._patterns = DESTRUCTIVE_PATTERNS
+M._patterns       = DESTRUCTIVE_PATTERNS
+M._match_static   = match_static       -- testable in isolation
+M._reset_cache    = function() _llm_cache = {} end  -- drops every cached LLM verdict
 
 -- ---------------------------------------------------------------- norris_step
 -- Phase 3 commit #4 lands the planner. Stub stays for now.