-- aish/config.lua

-- config.lua — model registry, routing rules, user preferences.
-- Loaded with dofile() at startup; returns a plain Lua table.
-- See docs/PHASE0.md §10 for resolution order and full schema.
--
-- Per issue #12: hossenfelder is the canonical single-URL broker. It does
-- model-aware routing server-side (local models on boltzmann; cloud routes
-- through OpenRouter using its own bearer auth — no client-side key here).
-- Discovery: GET http://hossenfelder.fritz.box:8082/v1/models.
--
-- Phase 9 (docs/PHASE9.md): a `.aish.lua` in/above your cwd (walking up
-- to $HOME) overlays this user config. First encounter prompts to trust;
-- sha256-pinned in ~/.aish/trusted-projects. Use it for repo-specific
-- model presets, permissions, hooks, etc.
--
-- IMPORTANT: shallow merge. If your `.aish.lua` sets a top-level block
-- (models, permissions, cost, shell, ...), it REPLACES the user's
-- entire block — list every entry you want available OR omit the block
-- to keep the user's. Inspect the merge via `:config show` at runtime.

-- Base URL of the hossenfelder broker. All model presets in this file use
-- it as their endpoint; model discovery is GET <this URL>/v1/models.
local HOSSENFELDER = ("http://%s:%d"):format("hossenfelder.fritz.box", 8082)

return {
    -- Name of the preset in `models` that is used by default.
    default_model = "fast",

    -- Model presets, keyed by preset name. Every preset goes through the
    -- hossenfelder broker, which does the local-vs-cloud routing server-side.
    models = {
        fast = {
            model       = "qwen2.5-coder-1.5b-q4_k_m.gguf",
            endpoint    = HOSSENFELDER,
            temperature = 0.2,
        },

        deep = {
            -- 2026-05-13: the qwen3-30b slot is not loaded on hossenfelder
            -- at the moment, so this preset temporarily points at
            -- deepseek-coder-v2-lite (16B MoE, ~2.4B active). Switch back
            -- to qwen3-30b once that slot is up again.
            model       = "deepseek-coder-v2-lite",
            endpoint    = HOSSENFELDER,
            temperature = 0.1,
            -- 5 minutes, in milliseconds. MoE inference is faster than
            -- the dense 30B model.
            timeout_ms  = 5 * 60 * 1000,
        },

        cloud = {
            model       = "anthropic/claude-haiku-4.5",
            endpoint    = HOSSENFELDER,
            temperature = 0.2,
        },
    },

    shell = {
        -- Feed the output of executed commands back into the context.
        capture_output = true,
        -- Ask before running a command the model suggested via "CMD:".
        confirm_cmd    = true,

        known_commands = {
            -- files and text
            "ls", "cat", "cd", "grep", "find",
            "cp", "mv", "rm", "mkdir", "rmdir",
            -- development tools
            "git", "make", "cmake", "gcc", "clang", "python3", "luajit",
            -- remote and network
            "ssh", "scp", "curl", "wget",
        },

        -- Issue #10: optional prompt template. Setting `prompt` replaces the
        -- default "[aish:<model>]> " prompt. Placeholders use {name} syntax:
        --   {model}        {ctx_used}   {ctx_max}   {turn}
        --   {cwd}          {cwd_short}  (cwd with $HOME -> ~)
        --   {last_status}  (last exec exit code, empty if none yet)
        --   {mode}         (norris / plan / normal)
        -- prompt = "[{model} {ctx_used}/{ctx_max}t T{turn} {mode}] {cwd_short} > ",
    },

    -- Conversation-context limits.
    context = {
        -- Turn cap for the sliding window; older turns are evicted.
        max_turns    = 40,
        -- Token budget for the context. Per the Phase 8 notes at the end of
        -- this file, it is only enforced when tokenize.use_endpoint = true;
        -- otherwise only max_turns is checked.
        token_budget = 4096,
    },

    history = {
        -- Root directory for aish's on-disk state (background-CMD logs and
        -- memory.jsonl are written beneath it). Resolves under $HOME, or
        -- under the current directory when HOME is not set.
        dir = string.format("%s/.local/share/aish", os.getenv("HOME") or "."),
    },

    -- Issue #3: pre/post CMD hooks. Optional shell scripts triggered around
    -- every CMD: execution. Each hook receives the command on stdin and
    -- AISH_CMD / AISH_TURN / AISH_CWD as env vars. Non-zero exit on pre_cmd
    -- aborts execution; post_cmd's exit code is ignored but its stdout is
    -- logged. Default off (no hooks). Uncomment to enable.
    -- hooks = {
    --     pre_cmd  = (os.getenv("HOME") or ".") .. "/.aish/hooks/pre-cmd",
    --     post_cmd = (os.getenv("HOME") or ".") .. "/.aish/hooks/post-cmd",
    -- },

    -- Issue #13: secret redaction. Vault is a separate file at ~/.aish/
    -- secrets.lua (mode 0600 enforced). When set, outbound broker messages
    -- are scrubbed: vault literals + autodetect heuristics (OpenAI sk-,
    -- OpenRouter sk-or-v1-, GitHub ghp_/gho_/ghs_, AWS AKIA, JWT eyJ...,
    -- SSH/GPG PRIVATE KEY headers) become $AISH_SECRET_NNN placeholders.
    -- The streamed reply is rehydrated before display so the user sees
    -- real values. Per-broker override via models[*].redact:
    --   "off"               -- no scrubbing (trusted local)
    --   "vault"             -- vault literals only
    --   "vault+autodetect"  -- + heuristics (default when vault loaded)
    --   "stealth"           -- + heuristics, opaque decoys, no rehydrate
    -- Default per-broker is the global config.secrets.default, falling
    -- back to "vault+autodetect" when vault loaded, else "off".
    -- secrets = {
    --     vault   = "~/.aish/secrets.lua",
    --     default = "vault+autodetect",  -- applies when models[*].redact is nil
    -- },

    -- Issue #8: background CMD (CMD&: marker). Requires history.dir set
    -- (logs land at <history.dir>/bg/<id>.log + .status sidecar). The
    -- feature is always-on once history.dir exists — no config flag — but
    -- only fires when the model emits "CMD&: " or the user runs :bg-spawn.

    -- Issue #9: permission policy DSL for AI-suggested CMD: lines. When set,
    -- supersedes shell.confirm_cmd. Patterns are Lua patterns (NOT regex)
    -- per substrate invariant §3 (no compiled extensions). Priority order:
    -- deny > confirm > allow; first match in the chosen category wins.
    -- Unmatched commands default to "confirm". Probe with :perms check <cmd>.
    -- permissions = {
    --     allow   = { "^ls%s", "^cat%s", "^git status", "^git diff" },
    --     confirm = { "^rm%s", "^git push", "^docker%s", "^sudo%s" },
    --     deny    = { "^ssh%s+root@", "^curl%s+http[^s]" },
    -- },

    -- Phase 2 (docs/PHASE2.md): MCP server registry + tool-call policy.
    -- The block is OFF by default — connect-at-startup happens only when
    -- `servers` is non-empty. Uncomment + adjust per your fleet.
    --
    -- mcp = {
    --     servers = {
    --         -- Each entry: alias = { url = "...", auth_token = "..." | auth_env = "..." }
    --         -- auth_token literal > auth_env env-var indirection > nil (no auth).
    --         -- Aliases become the namespace prefix on tool names sent to the model
    --         -- ("<alias>__<tool>" — e.g. "boltzmann__list_dir"). The separator is
    --         -- "__" (two underscores) because Anthropic via Bedrock validates tool
    --         -- names against ^[a-zA-Z0-9_-]{1,128}$ — dots are rejected.
    --         -- Aliases themselves must not contain "__".
    --         boltzmann = {
    --             url      = "http://boltzmann.fritz.box:8080/mcp",
    --             auth_env = "BOLTZMANN_MCP_TOKEN",
    --         },
    --         hertz = {
    --             url      = "http://hertz.fritz.box:8080/mcp",
    --             auth_env = "HERTZ_MCP_TOKEN",
    --         },
    --         broglie = {
    --             url = "http://broglie.fritz.box:8080/mcp",  -- LAN-only, no auth
    --         },
    --     },
    --
    --     -- Per-call confirm gate auto-approve policy.
    --     -- Key forms:
    --     --   "<alias>__<tool>" — auto-approve one specific tool
    --     --   "<alias>__*"      — auto-approve every tool on that server
    --     -- Anything not matched falls back to the [y/N] prompt.
    --     auto_approve = {
    --         ["boltzmann__read_file"]    = true,
    --         ["boltzmann__list_dir"]     = true,
    --         ["boltzmann__search_files"] = true,
    --         ["hertz__*"]                = true,   -- trust the hub fully
    --     },
    --
    --     -- Tool-call sub-loop budget per ask_ai turn. Hitting the cap surfaces
    --     -- a status and breaks; default 8 if absent.
    --     max_tool_depth = 8,
    -- },

    -- Phase 3 (docs/PHASE3.md): Chuck Norris autonomous mode + destructive-op
    -- heuristic. The block is OFF by default (sane defaults kick in when
    -- absent); uncomment to tune.
    --
    -- safety = {
    --     -- LLM second-opinion on commands the static patterns don't flag.
    --     -- Default true. Set false for static-only operation (faster, but
    --     -- misses novel destructive patterns the static list doesn't know
    --     -- about — bash -c content, custom destructive idioms, etc.).
    --     llm_second_opinion = true,
    --
    --     -- Which configured model to use for the YES/NO destructive probe.
    --     -- Precedence: this field → models.deep → models[default_model].
    --     -- R-B2: prefer an INDEPENDENT model class from the action-emitting
    --     -- model (avoids self-policing). Recommended values:
    --     --   "cloud"  — anthropic/claude-haiku-4.5 via openrouter. Fast and
    --     --              reliable. Costs money per probe (typical Norris
    --     --              session = 16 probes max, often cached).
    --     --   "deep"   — local large model (normally qwen3-30b on this
    --     --              fleet; see models.deep for what is loaded now).
    --     --              Free but slow on RK3588 hardware (~1-3s per
    --     --              probe). Used automatically when llm_model is
    --     --              not set.
    --     --   "fast"   — same model as the action-emitter. NOT RECOMMENDED
    --     --              (circular trust); use only when no other option.
    --     llm_model = "cloud",
    --
    --     -- Norris planning-loop budget. Iterations of safety.norris_step.
    --     -- Each iteration is one broker round-trip + dispatch of actions.
    --     -- Default 8. Bump for long-running goals; cap low for testing.
    --     max_norris_steps = 8,
    -- },

    -- Phase 4 (docs/PHASE4.md): cross-session memory.jsonl + startup
    -- injection + :memory management surface. The block is OFF by
    -- default (no startup injection); uncomment to tune. Note that
    -- :remember / :memory list / :memory forget / :memory summarize
    -- all work without this block — they store to <history.dir>/
    -- memory.jsonl regardless. The block only configures the
    -- injection-into-system-prompt behavior at startup.
    --
    -- memory = {
    --     -- Cap on total characters injected at startup. ~2000 chars ≈
    --     -- 500 tokens. LRU-by-ts selection if your memory.jsonl has
    --     -- more recent items than fit. Older items remain in the
    --     -- file; only injection is bounded. Suppressed entirely in
    --     -- Norris mode (R-C1).
    --     inject_max_chars = 2000,
    --
    --     -- Which configured model to use for :memory summarize.
    --     -- Defaults to the active model when nil. Use "fast" for
    --     -- speed; "deep" or "cloud" for better extraction quality
    --     -- (cloud may have variable cost per session).
    --     summarizer_model = "fast",
    -- },

    -- Phase 5 (docs/PHASE5.md): multi-model routing + cloud fallback +
    -- summarize-on-evict. OFF by default — auto-routing can spend money
    -- silently on the cloud preset; require explicit opt-in.
    --
    -- routing = {
    --     -- Enable auto-routing per request. When true, router.classify_model
    --     -- inspects each prompt and may switch the model for THAT request
    --     -- only (the :model selection is preserved across requests).
    --     -- Default false. Toggle at runtime with :route on / :route off.
    --     auto = true,
    --
    --     -- Class → model mapping. nil = "keep current" (heuristic fires
    --     -- but no override). Ships with reasoning = nil because mapping
    --     -- "explain ..." prompts to a paid cloud model would spend money
    --     -- silently — opt in by uncommenting the reasoning line below.
    --     classes = {
    --         code      = "deep",      -- code-like prompts to local deep
    --         -- reasoning = "cloud",  -- OPT-IN: "explain"/"why"/"how does" → paid
    --         -- default   = nil,      -- keep active model
    --     },
    --
    --     -- Single-hop retry on transport failure (HTTP 5xx, 408,
    --     -- 404 model_not_found, DNS, connection refused, timeouts).
    --     -- Retries against fallback_model once. Skipped if any text
    --     -- has already streamed (no partial-output duplication).
    --     -- Toggle at runtime with :fallback on / :fallback off.
    --     fallback       = false,           -- default off (cost-safety)
    --     fallback_model = "cloud",
    --
    --     -- Issue #86: per-class system_prompt override. When the
    --     -- classified request falls into a class with an entry here,
    --     -- the BASE system_prompt is REPLACED for that one request
    --     -- (dynamic blocks — [background], [project], [earlier
    --     -- summary], NORRIS suffix — still compose on top). Mostly
    --     -- useful for tightening small local models' instruction
    --     -- adherence. Default {} (no override).
    --     system_prompts = {
    --         code = [[You are a code assistant. Rules:
    -- 1. Output ONLY the requested code or command.
    -- 2. No prose explanation unless explicitly asked.
    -- 3. Wrap shell commands in CMD: prefix.
    -- 4. Max response: 200 tokens.]],
    --         default = [[You are a shell assistant.
    -- Output shell commands as: CMD: <command>
    -- Output answers as single short sentences.
    -- Do not ask clarifying questions.]],
    --         -- reasoning routes to cloud; no override usually needed
    --     },
    --
    --     -- Issue #88: per-class GBNF grammar passthrough. llama.cpp
    --     -- constrains the sampler to ONLY emit tokens matching the
    --     -- grammar — eliminates format drift on small models. Cloud
    --     -- (Anthropic/Bedrock) silently ignores the field, so default
    --     -- passthrough is safe; no per-model opt-out needed. Misformed
    --     -- grammar surfaces as a broker error at request time.
    --     grammars = {
    --         code    = [[root ::= "CMD: " [^\n]+ "\n"]],
    --         default = [[root ::= ("CMD: " [^\n]+ "\n") | [^\n]+ "\n"]],
    --     },
    -- },
    --
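    -- Issue #88 (continued): for the safety LLM probe (YES/NO
    -- destructive classification), set safety.probe_grammar to force
    -- the probe model to emit exactly YES or NO. Eliminates the
    -- regex-match fallback for unparseable verdicts; small models
    -- become reliable enough to use as the probe.
    --
    -- NOTE: if you have also enabled the Phase 3 `safety` block, add
    -- probe_grammar to THAT block instead of uncommenting a second
    -- `safety = { ... }`. A Lua table constructor keeps only one value
    -- per key, so one of the two blocks would be silently dropped.
    --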
    -- safety = {
    --     llm_second_opinion = true,
    --     llm_model          = "fast",
    --     probe_grammar      = [[root ::= ("YES" | "NO")]],
    -- },

    -- ── Issue #87 (route-aware context compression).
    -- When a routed model preset has `local_compress = true`, each
    -- broker call against THAT preset gets a compressed view of
    -- ctx.turns: only the last `keep_turns` turns; any turn whose
    -- content exceeds `max_turn_chars` is tail-truncated. The full
    -- context lives on (visible via :history); compression is purely
    -- per-request for small models that effectively use a fraction
    -- of their advertised context window.
    --
    -- Set the per-model opt-in on models[<name>]:
    --     models.fast = { ..., local_compress = true }
    -- Defaults live under context.compress:
    --     context = {
    --         ...
    --         compress = { keep_turns = 2, max_turn_chars = 800 },
    --     }
    --
    -- Trade-off documented in the FR: tool turns lose information
    -- when tail-truncated. Acceptable for shell-output blocks (the
    -- tail is usually the relevant bit); known limitation for
    -- structured tool results. Disable per-model if it bites.

    -- ── Issue #89 / Phase 10: cloud preplanner → local executor split.
    -- When cfg.norris.preplanner names a model preset, :norris launch
    -- fires ONE broker.chat against that preset asking for a sequence
    -- of TASK: <imperative> lines. Parsed list (capped at tasks_max)
    -- becomes ctx.norris_tasks; the executor model (cfg.norris.executor,
    -- defaulting to the active :model selection) runs each task with
    -- the current task shown in the per-step header.
    --
    -- Goal: small fast local models are cheap per step but easily
    -- distracted on multi-step plans; cloud is capable at planning
    -- but expensive per step. Use cloud ONCE for the plan, local for
    -- every step. Falls back to single-model Norris (existing
    -- behavior) when preplanner unset / fails / produces no TASKs.
    --
    -- norris = {
    --     preplanner = "cloud",       -- model name in cfg.models;
    --                                 -- this preset is called ONCE per
    --                                 -- :norris launch. Omit to run
    --                                 -- single-model (Phase 6 behavior).
    --     executor   = "fast",        -- model that runs each step.
    --                                 -- Omit to use the active :model.
    --     tasks_max  = 16,            -- cap on preplan list size.
    --     -- preplan_system = "...",  -- override the built-in prompt
    -- },
    --
    -- :cost detail separates norris-preplan and norris rows so you
    -- can see cloud planning cost vs local execution cost. The
    -- preplan call does NOT retry via fallback_model (a different
    -- model = a different decomposition; clean hard-fail to single-
    -- model is safer).

    -- ── Phase 5 context summarization on sliding-window eviction.
    -- Set INSIDE the context = { ... } block above to enable:
    --     context = {
    --         max_turns          = 40,
    --         token_budget       = 4096,
    --         summarize_on_evict = true,
    --         summarizer_model   = "fast",   -- model name in models{}
    --         max_summary_chars  = 2000,
    --     },
    -- When summarize_on_evict is true, evicted turn pairs are fed to
    -- summarizer_model and the result lives on ctx.summary, appended to
    -- the system prompt as [earlier conversation summary]. Suppressed
    -- in Norris mode (R-C4 — planner stays on its goal). If broker
    -- fails, falls back to Phase 0 silent eviction (no crash).

    -- Phase 6 (docs/PHASE6.md): project file-tree context + :diff /
    -- :tree / :highlight metas. The :diff and :tree metas work without
    -- any config. The `project` block below only controls the
    -- AUTO-injection-at-startup behavior; manual `:tree` always works
    -- regardless. Uncomment to enable startup auto-inject.
    --
    -- project = {
    --     auto_tree      = true,   -- run `:tree` once at startup
    --     tree_depth     = 3,      -- depth filter for the scan (find fallback only;
    --                              -- git ls-files emits full repo-relative paths)
    --     tree_max_chars = 4096,   -- truncate the injected block above this
    -- },
    --
    -- :highlight has no config flag in v1 — toggled at runtime only.
    -- Requires the external `tree-sitter` CLI plus configured parser-
    -- directories with cloned + built `tree-sitter-<lang>` grammars
    -- (see `:highlight on` for the install hints).

    -- Phase 7 (docs/PHASE7.md): cost / usage observability. broker.lua
    -- captures `usage` (+ `cost` for cloud) from every chat/chat_stream
    -- call and routes via ctx:add_usage to a per-session accumulator.
    -- `:cost` / `:cost detail` / `:cost reset` surface the totals.
    -- The `cost` block below configures OPTIONAL warn thresholds —
    -- a single status line fires the first time the cumulative
    -- total crosses each threshold. Default off. Useful when paid
    -- cloud presets are in play so runaway-cost sessions get a nudge.
    --
    -- cost = {
    --     warn_at_dollars = 0.50,    -- one-shot warn when cumulative cost crosses this
    --     warn_at_tokens  = 100000,  -- one-shot warn when cumulative tokens cross this
    -- },
    --
    -- Both flags are independent (R4 — first-to-fire doesn't suppress
    -- the other); `:cost reset` re-arms both. Per-turn usage is also
    -- written to session/*.jsonl (assistant-turn `usage` field) for
    -- after-the-fact scripting; cross-session aggregation deferred
    -- to a future phase (Q-C2).

    -- Phase 8 (docs/PHASE8.md): accurate tokenization via the broker's
    -- /tokenize endpoint, replacing the Phase 0 §8 char/4 heuristic.
    -- Two consequences when use_endpoint=true:
    --   (1) Context:estimate_tokens hits <endpoint>/tokenize once per
    --       new turn (cached on the turn dict thereafter). Network
    --       cost is one round-trip (~30ms) per fresh turn; subsequent
    --       calls reuse the cache.
    --   (2) Context:enforce_budget actually ENFORCES token_budget now
    --       (previously only max_turns was checked). Sessions that
    --       fit under char/4 may evict earlier — raise token_budget
    --       to match your model's real context window if needed.
    -- Cloud endpoints (OpenRouter) don't expose /tokenize; capability
    -- cached as unsupported on first probe -> silent char/4 fallback.
    --
    -- tokenize = {
    --     use_endpoint = true,
    -- },
}