From 01f2678df4484ad959a6385f8596152b25fc00a3 Mon Sep 17 00:00:00 2001
From: hexin <he.xin@h3c.com>
Date: Tue, 12 May 2026 11:46:09 +0800
Subject: [PATCH] feat(engine): allow DEEPSEEK_MAX_OUTPUT_TOKENS env override
 for tight-context providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `effective_max_output_tokens` heuristic defaults to 64K for any model
not in the known-context-window table. This is fine for DeepSeek's hosted
API (1M context) but causes immediate HTTP 400s on self-hosted providers
with tight `max-model-len`.

Example: vLLM serving Qwen3.6 with `--max-model-len 65536` rejects
requests because 64000 (output) + ~1500 (input) exceeds the limit by 1
token.

This change lets the operator set `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` (or
whatever fits their deployment) to override the heuristic. The env var
takes precedence over the model-table lookup when set to a positive
integer; otherwise the existing behavior is preserved.

No new config struct field — env-only override keeps the public API
unchanged. Useful for embedded users (e.g. pinvou3) who need to control
output budget without forking the engine config schema.
---
 crates/tui/src/core/engine/context.rs | 14 +++++
 crates/tui/src/core/engine/tests.rs   | 82 +++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)
diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs
index 6a28d6b45..726f1a920 100644
--- a/crates/tui/src/core/engine/context.rs
+++ b/crates/tui/src/core/engine/context.rs
@@ -28,7 +28,21 @@ const API_MAX_OUTPUT_TOKENS: u32 = 65_536;
 /// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider
 /// limits (128K+ total). For non-V4 models with smaller context windows, caps
 /// at half the context window.
+///
+/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive
+/// integer, this function returns that value directly. Use this for self-hosted
+/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the
+/// model-table heuristic above would over-allocate. Example: vLLM serving
+/// Qwen3.6 with `--max-model-len 65536` should set
+/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the
+/// provider's hard limit.
 pub(super) fn effective_max_output_tokens(model: &str) -> u32 {
+    if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS")
+        && let Ok(n) = raw.trim().parse::<u32>()
+        && n > 0
+    {
+        return n;
+    }
     let window = context_window_for_model(model).unwrap_or(128_000);
     if window >= 500_000 {
         // V4-class models on large-context providers: use 64K which is safe
diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs
index e68f5fb24..347957e1f 100644
--- a/crates/tui/src/core/engine/tests.rs
+++ b/crates/tui/src/core/engine/tests.rs
@@ -915,6 +915,9 @@ fn detects_context_length_errors_from_provider_payloads() {
 
 #[test]
 fn context_budget_reserves_output_and_headroom() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // the internal effective_max_output_tokens() call sees a stable env.
+    let _lock = lock_test_env();
     // V4 has a 1M context window — the only family that comfortably hosts
     // a 256K output reservation without saturating the input budget to 0.
     let budget = context_input_budget("deepseek-v4-pro")
@@ -926,6 +929,9 @@ fn context_budget_reserves_output_and_headroom() {
 
 #[test]
 fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // v4_cap and flash_cap below see the same env state.
+    let _lock = lock_test_env();
     // V4 models have a 1M context window but the API request cap must stay
     // well below common provider limits (e.g., 131K total on self-hosted
     // vLLM/SGLang). The cap should never exceed 65K.
@@ -943,8 +949,84 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() {
     assert_eq!(v4_cap, flash_cap);
 }
 
+struct ScopedDeepSeekMaxOutputTokens {
+    previous: Option<OsString>,
+}
+
+impl ScopedDeepSeekMaxOutputTokens {
+    fn set(value: &str) -> Self {
+        let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        // Safety: tests using this helper serialize with lock_test_env() and
+        // restore the original value in Drop.
+        unsafe {
+            std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", value);
+        }
+        Self { previous }
+    }
+
+    fn unset() -> Self {
+        let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        // Safety: see set().
+        unsafe {
+            std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS");
+        }
+        Self { previous }
+    }
+}
+
+impl Drop for ScopedDeepSeekMaxOutputTokens {
+    fn drop(&mut self) {
+        // Safety: tests using this helper serialize with lock_test_env().
+        unsafe {
+            if let Some(previous) = self.previous.take() {
+                std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", previous);
+            } else {
+                std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS");
+            }
+        }
+    }
+}
+
+#[test]
+fn effective_max_output_tokens_env_override_returns_positive_value() {
+    let _lock = lock_test_env();
+    let _guard = ScopedDeepSeekMaxOutputTokens::set("16384");
+
+    // Override applies regardless of model — V4 hosted, V4 flash, sub-500K
+    // self-hosted all return the env value verbatim.
+    assert_eq!(effective_max_output_tokens("deepseek-v4-pro"), 16_384);
+    assert_eq!(effective_max_output_tokens("deepseek-v4-flash"), 16_384);
+    assert_eq!(effective_max_output_tokens("qwen3-32b-256k"), 16_384);
+}
+
+#[test]
+fn effective_max_output_tokens_env_override_rejects_zero_and_invalid() {
+    let _lock = lock_test_env();
+    // Establish the heuristic baseline with the env unset.
+    let baseline = {
+        let _guard = ScopedDeepSeekMaxOutputTokens::unset();
+        effective_max_output_tokens("deepseek-v4-pro")
+    };
+    assert!(baseline > 0);
+
+    // 0, non-numeric, and empty values must all fall through to the heuristic
+    // rather than producing a zero/garbage cap that would silently break
+    // request budgeting.
+    for raw in ["0", "abc", "", "  ", "-1"] {
+        let _guard = ScopedDeepSeekMaxOutputTokens::set(raw);
+        assert_eq!(
+            effective_max_output_tokens("deepseek-v4-pro"),
+            baseline,
+            "env={raw:?} should fall through to heuristic"
+        );
+    }
+}
+
 #[test]
 fn internal_context_budget_tiers_reserved_output_by_window() {
+    // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so
+    // both branches below see a stable env.
+    let _lock = lock_test_env();
     // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS
     // headroom so long V4 sessions don't compact prematurely.
     let internal_budget =