From 72c7bce205f7071c11c1efd1e7f2920e2f3e3863 Mon Sep 17 00:00:00 2001
From: Denys Yurchenko
Date: Wed, 4 Mar 2026 17:16:15 +0200
Subject: [PATCH] feat: add reasoning_effort support for litellm backend

---
 .../models/endpoints/litellm_model.py         |  38 ++++-
 src/lighteval/models/model_input.py           |  14 +-
 tests/unit/logging/test_evaluation_tracker.py |   1 +
 .../models/endpoints/test_endpoint_model.py   |   1 +
 .../models/endpoints/test_litellm_model.py    | 149 ++++++++++++++++++
 tests/unit/models/endpoints/test_tgi_model.py |   1 +
 tests/unit/models/test_model_input.py         |  28 ++++
 7 files changed, 224 insertions(+), 8 deletions(-)
 create mode 100644 tests/unit/models/endpoints/test_litellm_model.py

diff --git a/src/lighteval/models/endpoints/litellm_model.py b/src/lighteval/models/endpoints/litellm_model.py
index 87332d1d7..9034f206e 100644
--- a/src/lighteval/models/endpoints/litellm_model.py
+++ b/src/lighteval/models/endpoints/litellm_model.py
@@ -173,12 +173,27 @@ def _prepare_stop_sequence(self, stop_sequence):
         stop_sequence = [s for s in stop_sequence if s and s.strip()]
         return stop_sequence
 
+    @staticmethod
+    def _is_o_series_model(model_name: str) -> bool:
+        base_model_name = model_name.split("/")[-1].lower()
+        return base_model_name.startswith(("o1", "o3", "o4"))
+
     def _prepare_max_new_tokens(self, max_new_tokens) -> int | None:
         """Calculate completion tokens based on max_new_tokens."""
         if not max_new_tokens or max_new_tokens <= 0:
             return None
 
-        if supports_reasoning(self.model):
+        reasoning_effort = self.generation_parameters.reasoning_effort
+        should_boost_for_reasoning = isinstance(reasoning_effort, str) and reasoning_effort.strip().lower() != "none"
+
+        if supports_reasoning(self.model) and reasoning_effort is None:
+            logger.warning(
+                f"Model {self.model} supports reasoning but no reasoning_effort is set. "
+                "Token budget will not be boosted for reasoning. If you want the model to reason, "
+                "set reasoning_effort explicitly (e.g., 'low', 'medium', 'high')."
+            )
+
+        if supports_reasoning(self.model) and should_boost_for_reasoning:
             # We need to allow more tokens to include reasoning tokens
             max_new_tokens = min(max_new_tokens * 10, self.max_length)
 
@@ -212,12 +227,23 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
             "timeout": self.timeout,
         }
 
-        if "o1" in self.model:
-            logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.")
+        litellm_generation_kwargs = self.generation_parameters.to_litellm_dict()
+        model_supports_reasoning = supports_reasoning(self.model)
+        # O-series models reject sampling params (temperature, top_p, stop); only pass reasoning_effort
+        if self._is_o_series_model(self.model):
+            logger.warning("O-series models do not support temperature, top_p, stop sequence. Disabling.")
+            reasoning_effort = litellm_generation_kwargs.get("reasoning_effort")
+            if reasoning_effort is not None:
+                kwargs["reasoning_effort"] = reasoning_effort
         else:
-            kwargs.update(self.generation_parameters.to_litellm_dict())
-
-        if kwargs.get("max_completion_tokens", None) is None:
+            kwargs.update(litellm_generation_kwargs)
+
+        # OpenAI non-reasoning models reject max_tokens and max_completion_tokens set at the same time;
+        # drop max_completion_tokens and keep max_tokens (already set above)
+        is_openai_non_reasoning_model = self.provider == "openai" and not model_supports_reasoning
+        if is_openai_non_reasoning_model:
+            kwargs.pop("max_completion_tokens", None)
+        elif kwargs.get("max_completion_tokens", None) is None:
             kwargs["max_completion_tokens"] = max_new_tokens
 
         for attempt in range(self.API_MAX_RETRY):
diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py
index ad41c23eb..9e587f58d 100644
--- a/src/lighteval/models/model_input.py
+++ b/src/lighteval/models/model_input.py
@@ -52,6 +52,9 @@ class GenerationParameters(BaseModel, extra="forbid"):
     # response format to be followed by the model,
     # more info here https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format
     response_format: str | None = None  # inference_providers
+    # Provider-agnostic reasoning control for litellm; litellm maps it to each provider's
+    # native format (e.g., OpenAI reasoning_effort, Anthropic thinking, Google thinkingBudget).
+    reasoning_effort: str | None = None  # litellm
 
     @classmethod
     def from_dict(cls, config_dict: dict):
@@ -118,6 +121,7 @@ def to_litellm_dict(self) -> dict:
             "seed": self.seed,
             "repetition_penalty": self.repetition_penalty,
             "frequency_penalty": self.frequency_penalty,
+            "reasoning_effort": self.reasoning_effort,
         }
         return {k: v for k, v in args.items() if v is not None}
 
@@ -157,7 +161,12 @@ def to_vllm_dict(self) -> dict:
         # Task specific sampling params to set in model: n, best_of, use_beam_search
         # Generation specific params to set in model: logprobs, prompt_logprobs
 
-        x = {sampling_params_to_vllm_naming.get(k, k): v for k, v in self.model_dump().items() if v is not None}
+        x = {
+            sampling_params_to_vllm_naming.get(k, k): v
+            # Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
+            for k, v in self.model_dump(exclude={"reasoning_effort"}).items()
+            if v is not None
+        }
         # VLLM max_tokens is 16 by default, however the pipeline expect the max_tokens to be None, if the user didn't specify it
         if not x.get("max_tokens"):
             x["max_tokens"] = None
@@ -172,7 +181,8 @@ def to_vllm_openai_dict(self) -> dict:
         """
         # Task specific sampling params to set in model: n, best_of, use_beam_search
         # Generation specific params to set in model: logprobs, prompt_logprobs
-        return {k: v for k, v in self.model_dump().items() if v is not None}
+        # Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
+        return {k: v for k, v in self.model_dump(exclude={"reasoning_effort"}).items() if v is not None}
 
     def to_transformers_dict(self) -> dict:
         """Selects relevant generation and sampling parameters for transformers models.
diff --git a/tests/unit/logging/test_evaluation_tracker.py b/tests/unit/logging/test_evaluation_tracker.py
index 45c5790d0..f35db1751 100644
--- a/tests/unit/logging/test_evaluation_tracker.py
+++ b/tests/unit/logging/test_evaluation_tracker.py
@@ -250,6 +250,7 @@ def setUp(self):
             "truncate_prompt": None,
             "cache_implementation": None,
             "response_format": None,
+            "reasoning_effort": None,
         }  # ruff: noqa: E501
         self.dummy_ref_config = {
             "model_name": "test/case",
diff --git a/tests/unit/models/endpoints/test_endpoint_model.py b/tests/unit/models/endpoints/test_endpoint_model.py
index 4f009ca9a..35b11ab5f 100644
--- a/tests/unit/models/endpoints/test_endpoint_model.py
+++ b/tests/unit/models/endpoints/test_endpoint_model.py
@@ -70,6 +70,7 @@ class TestInferenceEndpointModelConfig:
                 "top_p": 0.9,
                 "truncate_prompt": None,
                 "response_format": None,
+                "reasoning_effort": None,
             },
             "cache_dir": "~/.cache/huggingface/lighteval",
         },
diff --git a/tests/unit/models/endpoints/test_litellm_model.py b/tests/unit/models/endpoints/test_litellm_model.py
new file mode 100644
index 000000000..0190a1690
--- /dev/null
+++ b/tests/unit/models/endpoints/test_litellm_model.py
@@ -0,0 +1,149 @@
+# MIT License
+
+# Copyright (c) 2026 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from unittest.mock import Mock, patch
+
+import pytest
+
+from lighteval.models.endpoints.litellm_model import LiteLLMClient
+from lighteval.models.model_input import GenerationParameters
+from lighteval.utils.imports import is_package_available
+
+
+pytestmark = pytest.mark.skipif(not is_package_available("litellm"), reason="litellm extra is not installed")
+
+
+def _build_client(model_name: str, generation_parameters: GenerationParameters) -> LiteLLMClient:
+    client = LiteLLMClient.__new__(LiteLLMClient)
+    client.model = model_name
+    client.provider = "openai"
+    client.base_url = None
+    client.api_key = None
+    client.generation_parameters = generation_parameters
+    client._max_length = 10_000
+    client.API_MAX_RETRY = 1
+    client.API_RETRY_SLEEP = 0
+    client.API_RETRY_MULTIPLIER = 1
+    client.timeout = None
+    return client
+
+
+@pytest.mark.parametrize(
+    "reasoning_effort, supports_reasoning_value, expected_prepared_max_new_tokens",
+    [
+        (None, True, 100),
+        ("none", True, 100),
+        ("low", False, 100),
+        ("low", True, 1000),
+    ],
+)
+def test_prepare_max_new_tokens_boosts_only_with_reasoning_effort(
+    reasoning_effort: str | None, supports_reasoning_value: bool, expected_prepared_max_new_tokens: int
+):
+    client = _build_client("openai/o3-mini", GenerationParameters(reasoning_effort=reasoning_effort))
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=supports_reasoning_value):
+        assert client._prepare_max_new_tokens(100) == expected_prepared_max_new_tokens
+
+
+def test_call_api_o_series_keeps_reasoning_effort_but_drops_sampling_params():
+    client = _build_client("openai/o3-mini", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low"))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["reasoning_effort"] == "low"
+    assert "temperature" not in completion_kwargs
+    assert "top_p" not in completion_kwargs
+
+
+def test_call_api_non_o_series_passes_full_litellm_generation_kwargs():
+    client = _build_client(
+        "google/gemini-2.5-flash", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low")
+    )
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["temperature"] == 0.2
+    assert completion_kwargs["top_p"] == 0.9
+    assert completion_kwargs["reasoning_effort"] == "low"
+
+
+def test_call_api_openai_non_reasoning_uses_only_max_tokens():
+    client = _build_client("openai/gpt-4.1-nano", GenerationParameters(max_new_tokens=96))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["max_tokens"] == 64
+    assert "max_completion_tokens" not in completion_kwargs
+
+
+def test_call_api_openai_reasoning_keeps_max_completion_tokens():
+    client = _build_client("openai/gpt-5-mini", GenerationParameters(max_new_tokens=96, reasoning_effort="low"))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=True):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["max_tokens"] == 640
+    assert completion_kwargs["max_completion_tokens"] == 96
diff --git a/tests/unit/models/endpoints/test_tgi_model.py b/tests/unit/models/endpoints/test_tgi_model.py
index e784bc0d4..23acd7256 100644
--- a/tests/unit/models/endpoints/test_tgi_model.py
+++ b/tests/unit/models/endpoints/test_tgi_model.py
@@ -58,6 +58,7 @@ class TestTGIModelConfig:
                 "top_p": None,
                 "truncate_prompt": None,
                 "response_format": None,
+                "reasoning_effort": None,
             },
             "cache_dir": "~/.cache/huggingface/lighteval",
         },
diff --git a/tests/unit/models/test_model_input.py b/tests/unit/models/test_model_input.py
index 7c06df445..e4fa8f856 100644
--- a/tests/unit/models/test_model_input.py
+++ b/tests/unit/models/test_model_input.py
@@ -47,3 +47,31 @@ def test_extract_num_samples(self, model_args: str, expected):
         gen = GenerationParameters.from_model_args(model_args)
         for k, v in expected.items():
             assert getattr(gen, k) == v
+
+    @pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
+    def test_extract_reasoning_effort(self, reasoning_effort: str):
+        model_args = (
+            "pretrained=google/gemini-2.5-flash,"
+            f'generation_parameters={{temperature: 0.2,reasoning_effort: "{reasoning_effort}"}},'
+            "dtype=float16"
+        )
+        gen = GenerationParameters.from_model_args(model_args)
+
+        assert gen.temperature == 0.2
+        assert gen.reasoning_effort == reasoning_effort
+
+    @pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
+    def test_to_litellm_dict_includes_reasoning_effort(self, reasoning_effort: str):
+        gen = GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort=reasoning_effort)
+
+        assert gen.to_litellm_dict() == {"temperature": 0.2, "top_p": 0.9, "reasoning_effort": reasoning_effort}
+
+    def test_vllm_dict_excludes_reasoning_effort(self):
+        gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")
+
+        assert gen.to_vllm_dict() == {"max_tokens": 128, "temperature": 0.1}
+
+    def test_vllm_openai_dict_excludes_reasoning_effort(self):
+        gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")
+
+        assert gen.to_vllm_openai_dict() == {"max_new_tokens": 128, "temperature": 0.1}