From ed03f5970193513d4104603ff26b6b0790869f9a Mon Sep 17 00:00:00 2001 From: Ruben de Smet Date: Tue, 19 May 2026 17:29:09 +0200 Subject: [PATCH 1/2] feat(provider): AGENTMEMORY_DISABLE_THINKING to disable LLM thinking mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When AGENTMEMORY_DISABLE_THINKING=true, the OpenAI-compatible provider forces thinking mode OFF on hybrid-reasoning models (Qwen3 family, GLM, Kimi, DeepSeek V4-Flash). Without this, every call burns tokens on a `<think>...</think>` block before the actual answer, and structured-output prompts (graph extraction, XML/JSON-mode summarization, etc.) often truncate inside the thinking block — yielding empty `content` and a meandering `reasoning` field that parsers can't recover. Belt-and-suspenders: send `chat_template_kwargs.enable_thinking=false` as the server-side signal AND prefix `/no_think` to the system message as the client-side fallback (same pattern as gitops-assistant's llm_engine.py:6207-6260, which has a documented "$7 Qwen3-32B incident" from missing this signal). The env var is opt-in (default off), so existing setups with thinking-required models are unaffected. Operators running Qwen3.x / GLM / Kimi behind an OpenAI-compatible endpoint (vLLM, LM Studio, Ollama, etc.) can set this to recover deterministic structured-output behaviour. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/providers/openai.ts | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/providers/openai.ts b/src/providers/openai.ts index bca2370f..4c5321f1 100644 --- a/src/providers/openai.ts +++ b/src/providers/openai.ts @@ -77,17 +77,39 @@ export class OpenAIProvider implements MemoryProvider { private async call(systemPrompt: string, userPrompt: string): Promise { const url = buildChatUrl(this.baseUrl, this.isAzure, this.azureApiVersion); + // AGENTMEMORY_DISABLE_THINKING=true forces thinking mode OFF on + // hybrid-reasoning models (Qwen3, GLM, Kimi, DeepSeek V4-Flash). + // Without this, every call burns tokens on a ... + // block before the actual answer, and structured-output prompts + // (graph extraction, XML/JSON-mode summarization, etc.) often + // truncate inside the thinking block — yielding empty `content` + // and a meandering `reasoning` field that parsers can't recover. + // + // Belt-and-suspenders: send `chat_template_kwargs.enable_thinking= + // false` as the server-side signal AND prefix `/no_think` to the + // system message as the client-side fallback (same pattern as + // gitops-assistant's llm_engine.py:6207-6260, which has a + // documented "$7 Qwen3-32B incident" from missing this signal). + const disableThinking = + (process.env["AGENTMEMORY_DISABLE_THINKING"] || "").toLowerCase() === "true"; + const effectiveSystemPrompt = disableThinking + ? 
`/no_think\n\n${systemPrompt}` + : systemPrompt; + const body: Record = { model: this.model, max_tokens: this.maxTokens, messages: [ - { role: "system", content: systemPrompt }, + { role: "system", content: effectiveSystemPrompt }, { role: "user", content: userPrompt }, ], }; if (this.reasoningEffort) { body.reasoning_effort = this.reasoningEffort; } + if (disableThinking) { + body.chat_template_kwargs = { enable_thinking: false }; + } // Bound the request via the shared fetchWithTimeout helper, which // owns the AbortController + clearTimeout cleanup for every raw-fetch From c91a69326166a02fd32d9c3ff07e9c730fbdd483 Mon Sep 17 00:00:00 2001 From: Ruben de Smet Date: Wed, 20 May 2026 22:10:16 +0200 Subject: [PATCH 2/2] openai: read AGENTMEMORY_DISABLE_THINKING via getEnvVar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit nitpick on #569 — every other env var in this file goes through the getEnvVar helper, this one was the only direct process.env read. Aligns with the rest of the module. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/providers/openai.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/providers/openai.ts b/src/providers/openai.ts index 4c5321f1..6def787f 100644 --- a/src/providers/openai.ts +++ b/src/providers/openai.ts @@ -91,7 +91,7 @@ export class OpenAIProvider implements MemoryProvider { // gitops-assistant's llm_engine.py:6207-6260, which has a // documented "$7 Qwen3-32B incident" from missing this signal). const disableThinking = - (process.env["AGENTMEMORY_DISABLE_THINKING"] || "").toLowerCase() === "true"; + (getEnvVar("AGENTMEMORY_DISABLE_THINKING") || "").toLowerCase() === "true"; const effectiveSystemPrompt = disableThinking ? `/no_think\n\n${systemPrompt}` : systemPrompt;