diff --git a/config.lua b/config.lua
index 58bc39a..621ddc6 100644
--- a/config.lua
+++ b/config.lua
@@ -285,6 +285,28 @@ return {
     -- probe_grammar = [[root ::= ("YES" | "NO")]],
     -- },
 
+    -- ── Issue #87 (route-aware context compression).
+    -- When a routed model preset has `local_compress = true`, each
+    -- broker call against THAT preset gets a compressed view of
+    -- ctx.turns: only the last `keep_turns` turns; any turn whose
+    -- content exceeds `max_turn_chars` is tail-truncated. The full
+    -- context lives on (visible via :history); compression is purely
+    -- per-request for small models that effectively use a fraction
+    -- of their advertised context window.
+    --
+    -- Set the per-model opt-in on models[]:
+    --   models.fast = { ..., local_compress = true }
+    -- Defaults live under context.compress:
+    --   context = {
+    --     ...
+    --     compress = { keep_turns = 2, max_turn_chars = 800 },
+    --   }
+    --
+    -- Trade-off documented in the FR: tool turns lose information
+    -- when tail-truncated. Acceptable for shell-output blocks (the
+    -- tail is usually the relevant bit); known limitation for
+    -- structured tool results. Disable per-model if it bites.
+
     -- ── Phase 5 context summarization on sliding-window eviction.
     -- Set INSIDE the context = { ... } block above to enable:
     -- context = {
diff --git a/context.lua b/context.lua
index 6acbe36..895dff7 100644
--- a/context.lua
+++ b/context.lua
@@ -228,6 +228,60 @@ The user will be prompted to confirm destructive actions; expect their
 verdict in the next turn as a synthesized "[aish] ... skipped by user"
 message if they declined.]]
 
+-- #87: route-aware context compression. Keeps the LAST keep_turns
+-- turns; tail-truncates any turn whose content exceeds max_chars.
+-- Drops tool turns at the slice head (they'd be orphaned without
+-- their assistant-with-tool_calls anchor; strict chat templates
+-- reject the resulting tool-without-anchor shape). If that would
+-- drop the WHOLE window (e.g. keep_turns = 2 right after an assistant
+-- turn with two tool calls), the window is widened back to the
+-- anchoring assistant turn instead of rendering no turns at all.
+-- Returns a new list of turn-shaped tables; `turns` is NEVER mutated.
+--
+--   turns      sequence of turn tables (role, content, tool_calls,
+--              tool_call_id)
+--   keep_turns window size; nil falls back to 2
+--   max_chars  per-turn content cap; nil disables truncation
+local function _compress_turns(turns, keep_turns, max_chars)
+    local n = #turns
+    local start = math.max(1, n - (keep_turns or 2) + 1)
+    -- Drop orphan tool turns at the head.
+    local first = start
+    while first <= n and turns[first].role == "tool" do
+        first = first + 1
+    end
+    if first > n and start <= n then
+        -- Every kept turn is a tool result: walk back to the turn
+        -- that precedes this run of tool turns.
+        first = start
+        while first > 1 and turns[first].role == "tool" do
+            first = first - 1
+        end
+        local anchor = turns[first]
+        if not (anchor.role == "assistant" and anchor.tool_calls) then
+            first = n + 1 -- no usable anchor; keep dropping them all
+        end
+    end
+    local out = {}
+    for i = first, n do
+        local t = turns[i]
+        local c = t.content or ""
+        if max_chars and #c > max_chars then
+            out[#out + 1] = {
+                role = t.role,
+                -- Not c:sub(-max_chars): sub(-0) is sub(0), i.e. the
+                -- whole string, so max_chars = 0 would not truncate.
+                content = c:sub(#c - max_chars + 1),
+                tool_calls = t.tool_calls,
+                tool_call_id = t.tool_call_id,
+            }
+        else
+            out[#out + 1] = t -- ref the existing turn; no copy needed
+        end
+    end
+    return out
+end
+
 function Context:to_messages(opts)
     -- Phase 10 (#86): per-call system_prompt_override. Replaces the
     -- BASE system_prompt for THIS render only (state unchanged); the
@@ -254,8 +308,19 @@ function Context:to_messages(opts)
     end
     local msgs = { { role = "system", content = sys_content } }
 
+    -- #87: route-aware compression. When opts.compress is set, swap
+    -- the turn iterable for a truncated copy. self.turns unchanged
+    -- (this is a per-render transformation; persistence + display
+    -- via :history see the full context).
+    local turns = self.turns
+    if opts and opts.compress then
+        turns = _compress_turns(self.turns,
+            opts.compress.keep_turns or 2,
+            opts.compress.max_turn_chars or 800)
+    end
+
     if self.use_tool_role then
-        for _, t in ipairs(self.turns) do
+        for _, t in ipairs(turns) do
             local m = { role = t.role, content = t.content }
             if t.role == "assistant" and t.tool_calls then
                 -- OpenAI shape wraps each call as
@@ -292,9 +357,10 @@ function Context:to_messages(opts)
         end
     end
 
+    -- #87: same compressed `turns` view used by the fallback path.
     local i = 1
-    while i <= #self.turns do
-        local t = self.turns[i]
+    while i <= #turns do
+        local t = turns[i]
         if t.role == "assistant" and t.tool_calls then
             local parts = {}
             if t.content and t.content ~= "" then
@@ -302,7 +368,7 @@ function Context:to_messages(opts)
             end
             for ci, call in ipairs(t.tool_calls) do
                 local result_text = ""
-                local next_t = self.turns[i + ci]
+                local next_t = turns[i + ci]
                 if next_t and next_t.role == "tool"
                     and next_t.tool_call_id == call.id then
                     result_text = next_t.content
diff --git a/repl.lua b/repl.lua
index fdbdd1a..657919f 100644
--- a/repl.lua
+++ b/repl.lua
@@ -1007,6 +1007,20 @@ function M.run(config)
             and config.routing.grammars
             and req_class
             and config.routing.grammars[req_class]
+        -- #87: route-aware context compression. When the routed model
+        -- preset has `local_compress = true`, ctx:to_messages keeps only
+        -- the last N turns and tail-truncates oversized content for
+        -- THIS request. Cloud routes (model_cfg.local_compress nil/false)
+        -- get the full context unchanged. Defaults from cfg.context.compress;
+        -- per-model opt-in keeps the design surface predictable.
+        local compress_opts
+        if req_cfg and req_cfg.local_compress then
+            local cc = (config.context and config.context.compress) or {}
+            compress_opts = {
+                keep_turns = cc.keep_turns or 2,
+                max_turn_chars = cc.max_turn_chars or 800,
+            }
+        end
 
         local depth = 0
         local final_resp = ""
@@ -1017,7 +1031,10 @@ function M.run(config)
         local tool_calls_seen = {}
         local redact_mode = secrets_mode_for(req_cfg)
         local scrubbed_msgs = scrub_messages(
-            ctx:to_messages({ system_prompt_override = sys_override }),
+            ctx:to_messages({
+                system_prompt_override = sys_override,
+                compress = compress_opts,
+            }),
             redact_mode)
         -- Streaming rehydrator wraps the on_delta so the user sees real
         -- values; text_parts accumulates the REHYDRATED chunks so