From 01f2678df4484ad959a6385f8596152b25fc00a3 Mon Sep 17 00:00:00 2001 From: hexin Date: Tue, 12 May 2026 11:46:09 +0800 Subject: [PATCH] feat(engine): allow DEEPSEEK_MAX_OUTPUT_TOKENS env override for tight-context providers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `effective_max_output_tokens` heuristic defaults to 64K for any model not in the known-context-window table. This is fine for DeepSeek's hosted API (1M context) but causes immediate HTTP 400s on self-hosted providers with tight `max-model-len`. Example: vLLM serving Qwen3.6 with `--max-model-len 65536` rejects requests because 64000 (output) + ~1500 (input) exceeds the limit by 1 token. This change lets the operator set `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` (or whatever fits their deployment) to override the heuristic. The env var takes precedence over the model-table lookup when set to a positive integer; otherwise the existing behavior is preserved. No new config struct field — env-only override keeps the public API unchanged. Useful for embedded users (e.g. pinvou3) who need to control output budget without forking the engine config schema. --- crates/tui/src/core/engine/context.rs | 14 +++++ crates/tui/src/core/engine/tests.rs | 82 +++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/crates/tui/src/core/engine/context.rs b/crates/tui/src/core/engine/context.rs index 6a28d6b45..726f1a920 100644 --- a/crates/tui/src/core/engine/context.rs +++ b/crates/tui/src/core/engine/context.rs @@ -28,7 +28,21 @@ const API_MAX_OUTPUT_TOKENS: u32 = 65_536; /// model. Uses `API_MAX_OUTPUT_TOKENS` (64K) which fits within common provider /// limits (128K+ total). For non-V4 models with smaller context windows, caps /// at half the context window. +/// +/// Override: when the env var `DEEPSEEK_MAX_OUTPUT_TOKENS` is set to a positive +/// integer, this function returns that value directly. 
Use this for self-hosted +/// providers (vLLM/SGLang) whose `max-model-len` is tight and where the +/// model-table heuristic above would over-allocate. Example: vLLM serving +/// Qwen3.6 with `--max-model-len 65536` should set +/// `DEEPSEEK_MAX_OUTPUT_TOKENS=16384` so input + output stays well under the +/// provider's hard limit. pub(super) fn effective_max_output_tokens(model: &str) -> u32 { + if let Ok(raw) = std::env::var("DEEPSEEK_MAX_OUTPUT_TOKENS") + && let Ok(n) = raw.trim().parse::<u32>() + && n > 0 + { + return n; + } let window = context_window_for_model(model).unwrap_or(128_000); if window >= 500_000 { // V4-class models on large-context providers: use 64K which is safe diff --git a/crates/tui/src/core/engine/tests.rs b/crates/tui/src/core/engine/tests.rs index e68f5fb24..347957e1f 100644 --- a/crates/tui/src/core/engine/tests.rs +++ b/crates/tui/src/core/engine/tests.rs @@ -915,6 +915,9 @@ fn detects_context_length_errors_from_provider_payloads() { #[test] fn context_budget_reserves_output_and_headroom() { + // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so + // the internal effective_max_output_tokens() call sees a stable env. + let _lock = lock_test_env(); // V4 has a 1M context window — the only family that comfortably hosts // a 256K output reservation without saturating the input budget to 0. let budget = context_input_budget("deepseek-v4-pro") @@ -926,6 +929,9 @@ fn context_budget_reserves_output_and_headroom() { #[test] fn effective_max_output_tokens_caps_api_request_for_large_window_models() { + // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so + // v4_cap and flash_cap below see the same env state. + let _lock = lock_test_env(); // V4 models have a 1M context window but the API request cap must stay // well below common provider limits (e.g., 131K total on self-hosted // vLLM/SGLang). The cap should never exceed 65K. 
@@ -943,8 +949,84 @@ fn effective_max_output_tokens_caps_api_request_for_large_window_models() { assert_eq!(v4_cap, flash_cap); } +struct ScopedDeepSeekMaxOutputTokens { + previous: Option<std::ffi::OsString>, +} + +impl ScopedDeepSeekMaxOutputTokens { + fn set(value: &str) -> Self { + let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS"); + // Safety: tests using this helper serialize with lock_test_env() and + // restore the original value in Drop. + unsafe { + std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", value); + } + Self { previous } + } + + fn unset() -> Self { + let previous = std::env::var_os("DEEPSEEK_MAX_OUTPUT_TOKENS"); + // Safety: see set(). + unsafe { + std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS"); + } + Self { previous } + } +} + +impl Drop for ScopedDeepSeekMaxOutputTokens { + fn drop(&mut self) { + // Safety: tests using this helper serialize with lock_test_env(). + unsafe { + if let Some(previous) = self.previous.take() { + std::env::set_var("DEEPSEEK_MAX_OUTPUT_TOKENS", previous); + } else { + std::env::remove_var("DEEPSEEK_MAX_OUTPUT_TOKENS"); + } + } + } +} + +#[test] +fn effective_max_output_tokens_env_override_returns_positive_value() { + let _lock = lock_test_env(); + let _guard = ScopedDeepSeekMaxOutputTokens::set("16384"); + + // Override applies regardless of model — V4 hosted, V4 flash, sub-500K + // self-hosted all return the env value verbatim. + assert_eq!(effective_max_output_tokens("deepseek-v4-pro"), 16_384); + assert_eq!(effective_max_output_tokens("deepseek-v4-flash"), 16_384); + assert_eq!(effective_max_output_tokens("qwen3-32b-256k"), 16_384); +} + +#[test] +fn effective_max_output_tokens_env_override_rejects_zero_and_invalid() { + let _lock = lock_test_env(); + // Establish the heuristic baseline with the env unset. 
+ let baseline = { + let _guard = ScopedDeepSeekMaxOutputTokens::unset(); + effective_max_output_tokens("deepseek-v4-pro") + }; + assert!(baseline > 0); + + // 0, non-numeric, and empty values must all fall through to the heuristic + // rather than producing a zero/garbage cap that would silently break + // request budgeting. + for raw in ["0", "abc", "", " ", "-1"] { + let _guard = ScopedDeepSeekMaxOutputTokens::set(raw); + assert_eq!( + effective_max_output_tokens("deepseek-v4-pro"), + baseline, + "env={raw:?} should fall through to heuristic" + ); + } +} + #[test] fn internal_context_budget_tiers_reserved_output_by_window() { + // Serialize with other tests that mutate DEEPSEEK_MAX_OUTPUT_TOKENS so + // both branches below see a stable env. + let _lock = lock_test_env(); // Large-context (>=500K) models reserve the full TURN_MAX_OUTPUT_TOKENS // headroom so long V4 sessions don't compact prematurely. let internal_budget =