From c55077bc07e45fdb5af0b1c143279372abf84b5d Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Sun, 17 May 2026 07:50:07 +0000 Subject: [PATCH] context + repl + config: route-aware context compression (closes #87) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Small local models effectively use a fraction of their advertised context window. Per-request compression for routes that hit a local-compress-flagged model preset: keeps only the last N turns and tail-truncates oversized content. Cloud routes get the full context unchanged. Changes: - context.lua _compress_turns(turns, keep, max_chars): returns a new list (self.turns NEVER mutated) with the last `keep` turns preserved + content tail-truncated to `max_chars`. Defensive: drops tool turns at the slice head (orphaned without their assistant-with-tool_calls anchor — strict chat templates would reject them; same gotcha PHASE0 §6 warned about for user/user). - Context:to_messages(opts) — opts.compress = { keep_turns, max_turn_chars } swaps the turn iterable for the compressed view. Affects BOTH the use_tool_role=true path and the use_tool_role=false fallback (PHASE2.md Q18 strict-template workaround). Persistence + display via :history see the full uncompressed ctx.turns. - repl.lua ask_ai: when req_cfg (the routed model's cfg) has `local_compress = true`, build compress_opts from config.context.compress (defaults keep_turns=2, max_turn_chars=800). Pass through ctx:to_messages alongside the existing system_prompt_override (#86) — orthogonal opts that compose. - Norris unaffected: safety.norris_step builds its own messages array; the planner needs full history per PHASE3 design. - config.lua gains a header comment explaining the per-model opt-in + the context.compress defaults block + the documented tool-turn truncation trade-off. 
13 unit cases verified: - no opts -> full turn list (no regression) - keep_turns=2 -> exactly last 2 emitted - long content tail-truncated to max_chars - self.turns unchanged after render - orphan tool-turn at slice head dropped (no chat-template violation) - tool turn included WITH its assistant anchor when keep_turns >= 3 E2E against live local broker: - models.fast.local_compress = true; keep_turns=1; max=200 - 4-turn session: each broker call sees ONLY the current turn (verified by short coherent CMD replies despite no cross-turn memory available to the model). FR-promised small-model friendliness in action; conversation continuity is the documented trade-off. Regression: test_safety 87/87, test_router_model 31/31, repl loads. Co-Authored-By: Claude Opus 4.7 (1M context) --- config.lua | 22 ++++++++++++++++++++++ context.lua | 54 +++++++++++++++++++++++++++++++++++++++++++++++++---- repl.lua | 19 ++++++++++++++++++- 3 files changed, 90 insertions(+), 5 deletions(-) diff --git a/config.lua b/config.lua index 58bc39a..621ddc6 100644 --- a/config.lua +++ b/config.lua @@ -285,6 +285,28 @@ return { -- probe_grammar = [[root ::= ("YES" | "NO")]], -- }, + -- ── Issue #87 (route-aware context compression). + -- When a routed model preset has `local_compress = true`, each + -- broker call against THAT preset gets a compressed view of + -- ctx.turns: only the last `keep_turns` turns; any turn whose + -- content exceeds `max_turn_chars` is tail-truncated. The full + -- context lives on (visible via :history); compression is purely + -- per-request for small models that effectively use a fraction + -- of their advertised context window. + -- + -- Set the per-model opt-in on models[]: + -- models.fast = { ..., local_compress = true } + -- Defaults live under context.compress: + -- context = { + -- ... + -- compress = { keep_turns = 2, max_turn_chars = 800 }, + -- } + -- + -- Trade-off documented in the FR: tool turns lose information + -- when tail-truncated. 
Acceptable for shell-output blocks (the + -- tail is usually the relevant bit); known limitation for + -- structured tool results. Disable per-model if it bites. + -- ── Phase 5 context summarization on sliding-window eviction. -- Set INSIDE the context = { ... } block above to enable: -- context = { diff --git a/context.lua b/context.lua index 6acbe36..895dff7 100644 --- a/context.lua +++ b/context.lua @@ -228,6 +228,40 @@ The user will be prompted to confirm destructive actions; expect their verdict in the next turn as a synthesized "[aish] ... skipped by user" message if they declined.]] +-- #87: route-aware context compression. Keeps the LAST keep_turns +-- turns; tail-truncates any turn whose content exceeds max_turn_chars. +-- Drops tool turns at the slice head (they'd be orphaned without +-- their assistant-with-tool_calls anchor; strict chat templates +-- reject the resulting tool-without-anchor shape). Returns a new +-- list of turn-shaped tables; self.turns is NEVER mutated. +local function _compress_turns(turns, keep_turns, max_chars) + local n = #turns + if keep_turns and n > keep_turns then + -- start index is the first turn we keep + end + local start = math.max(1, n - (keep_turns or 2) + 1) + -- Drop orphan tool turns at the head. + while start <= n and turns[start].role == "tool" do + start = start + 1 + end + local out = {} + for i = start, n do + local t = turns[i] + local c = t.content or "" + if max_chars and #c > max_chars then + out[#out + 1] = { + role = t.role, + content = c:sub(-max_chars), + tool_calls = t.tool_calls, + tool_call_id = t.tool_call_id, + } + else + out[#out + 1] = t -- ref the existing turn; no copy needed + end + end + return out +end + function Context:to_messages(opts) -- Phase 10 (#86): per-call system_prompt_override. 
Replaces the -- BASE system_prompt for THIS render only (state unchanged); the @@ -254,8 +288,19 @@ function Context:to_messages(opts) end local msgs = { { role = "system", content = sys_content } } + -- #87: route-aware compression. When opts.compress is set, swap + -- the turn iterable for a truncated copy. self.turns unchanged + -- (this is a per-render transformation; persistence + display + -- via :history see the full context). + local turns = self.turns + if opts and opts.compress then + turns = _compress_turns(self.turns, + opts.compress.keep_turns or 2, + opts.compress.max_turn_chars or 800) + end + if self.use_tool_role then - for _, t in ipairs(self.turns) do + for _, t in ipairs(turns) do local m = { role = t.role, content = t.content } if t.role == "assistant" and t.tool_calls then -- OpenAI shape wraps each call as @@ -292,9 +337,10 @@ function Context:to_messages(opts) end end + -- #87: same compressed `turns` view used by the fallback path. local i = 1 - while i <= #self.turns do - local t = self.turns[i] + while i <= #turns do + local t = turns[i] if t.role == "assistant" and t.tool_calls then local parts = {} if t.content and t.content ~= "" then @@ -302,7 +348,7 @@ function Context:to_messages(opts) end for ci, call in ipairs(t.tool_calls) do local result_text = "" - local next_t = self.turns[i + ci] + local next_t = turns[i + ci] if next_t and next_t.role == "tool" and next_t.tool_call_id == call.id then result_text = next_t.content diff --git a/repl.lua b/repl.lua index fdbdd1a..657919f 100644 --- a/repl.lua +++ b/repl.lua @@ -1007,6 +1007,20 @@ function M.run(config) and config.routing.grammars and req_class and config.routing.grammars[req_class] + -- #87: route-aware context compression. When the routed model + -- preset has `local_compress = true`, ctx:to_messages keeps only + -- the last N turns and tail-truncates oversized content for + -- THIS request. Cloud routes (req_cfg.local_compress nil/false) + -- get the full context unchanged. 
Defaults from config.context.compress; + -- per-model opt-in keeps the design surface predictable. + local compress_opts + if req_cfg and req_cfg.local_compress then + local cc = (config.context and config.context.compress) or {} + compress_opts = { + keep_turns = cc.keep_turns or 2, + max_turn_chars = cc.max_turn_chars or 800, + } + end local depth = 0 local final_resp = "" @@ -1017,7 +1031,10 @@ function M.run(config) local tool_calls_seen = {} local redact_mode = secrets_mode_for(req_cfg) local scrubbed_msgs = scrub_messages( - ctx:to_messages({ system_prompt_override = sys_override }), + ctx:to_messages({ + system_prompt_override = sys_override, + compress = compress_opts, + }), redact_mode) -- Streaming rehydrator wraps the on_delta so the user sees real -- values; text_parts accumulates the REHYDRATED chunks so