From ed03f5970193513d4104603ff26b6b0790869f9a Mon Sep 17 00:00:00 2001
From: Ruben de Smet <ruben@lunascens.io>
Date: Tue, 19 May 2026 17:29:09 +0200
Subject: [PATCH 1/2] feat(provider): AGENTMEMORY_DISABLE_THINKING to disable
 LLM thinking mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When AGENTMEMORY_DISABLE_THINKING=true, the OpenAI-compatible provider
forces thinking mode OFF on hybrid-reasoning models (Qwen3 family,
GLM, Kimi, DeepSeek V4-Flash). Without this, every call burns tokens
on a <think>...</think> block before the actual answer, and structured-
output prompts (graph extraction, XML/JSON-mode summarization, etc.)
often truncate inside the thinking block — yielding empty `content`
and a meandering `reasoning` field that parsers can't recover.

Belt-and-suspenders: send `chat_template_kwargs.enable_thinking=false`
as the server-side signal AND prepend `/no_think` to the system message
as the client-side fallback (same pattern as gitops-assistant's
llm_engine.py:6207-6260, which documents a "$7 Qwen3-32B incident"
caused by omitting this signal).

The env var is opt-in (default off), so existing setups with thinking-
required models are unaffected. Operators running Qwen3.x / GLM / Kimi
behind an OpenAI-compatible endpoint (vLLM, LM Studio, Ollama, etc.)
can set it to `true` (matched case-insensitively; any other value
leaves thinking enabled) to recover deterministic structured-output
behaviour.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/providers/openai.ts | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/src/providers/openai.ts b/src/providers/openai.ts
index bca2370f..4c5321f1 100644
--- a/src/providers/openai.ts
+++ b/src/providers/openai.ts
@@ -77,17 +77,39 @@ export class OpenAIProvider implements MemoryProvider {
 
   private async call(systemPrompt: string, userPrompt: string): Promise<string> {
     const url = buildChatUrl(this.baseUrl, this.isAzure, this.azureApiVersion);
+    // AGENTMEMORY_DISABLE_THINKING=true forces thinking mode OFF on
+    // hybrid-reasoning models (Qwen3, GLM, Kimi, DeepSeek V4-Flash).
+    // Without this, every call burns tokens on a <think>...</think>
+    // block before the actual answer, and structured-output prompts
+    // (graph extraction, XML/JSON-mode summarization, etc.) often
+    // truncate inside the thinking block — yielding empty `content`
+    // and a meandering `reasoning` field that parsers can't recover.
+    //
+    // Belt-and-suspenders: set `chat_template_kwargs.enable_thinking`
+    // to false as the server-side signal AND prepend `/no_think` to the
+    // system message as the client-side fallback (same pattern as
+    // gitops-assistant's llm_engine.py:6207-6260, which has a
+    // documented "$7 Qwen3-32B incident" from missing this signal).
+    const disableThinking =
+      (process.env["AGENTMEMORY_DISABLE_THINKING"] || "").toLowerCase() === "true";
+    const effectiveSystemPrompt = disableThinking
+      ? `/no_think\n\n${systemPrompt}`
+      : systemPrompt;
+
     const body: Record<string, unknown> = {
       model: this.model,
       max_tokens: this.maxTokens,
       messages: [
-        { role: "system", content: systemPrompt },
+        { role: "system", content: effectiveSystemPrompt },
         { role: "user", content: userPrompt },
       ],
     };
     if (this.reasoningEffort) {
       body.reasoning_effort = this.reasoningEffort;
     }
+    if (disableThinking) {
+      body.chat_template_kwargs = { enable_thinking: false };
+    }
 
     // Bound the request via the shared fetchWithTimeout helper, which
     // owns the AbortController + clearTimeout cleanup for every raw-fetch

From c91a69326166a02fd32d9c3ff07e9c730fbdd483 Mon Sep 17 00:00:00 2001
From: Ruben de Smet <ruben@lunascens.io>
Date: Wed, 20 May 2026 22:10:16 +0200
Subject: [PATCH 2/2] openai: read AGENTMEMORY_DISABLE_THINKING via getEnvVar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodeRabbit nitpick on #569 — every other env var in this file goes
through the getEnvVar helper; this one was the only direct process.env
read. This change aligns it with the rest of the module.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/providers/openai.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/providers/openai.ts b/src/providers/openai.ts
index 4c5321f1..6def787f 100644
--- a/src/providers/openai.ts
+++ b/src/providers/openai.ts
@@ -91,7 +91,7 @@ export class OpenAIProvider implements MemoryProvider {
     // gitops-assistant's llm_engine.py:6207-6260, which has a
     // documented "$7 Qwen3-32B incident" from missing this signal).
     const disableThinking =
-      (process.env["AGENTMEMORY_DISABLE_THINKING"] || "").toLowerCase() === "true";
+      (getEnvVar("AGENTMEMORY_DISABLE_THINKING") || "").toLowerCase() === "true";
     const effectiveSystemPrompt = disableThinking
       ? `/no_think\n\n${systemPrompt}`
       : systemPrompt;