From 72c7bce205f7071c11c1efd1e7f2920e2f3e3863 Mon Sep 17 00:00:00 2001
From: Denys Yurchenko
Date: Wed, 4 Mar 2026 17:16:15 +0200
Subject: [PATCH] feat: add reasoning_effort support for litellm backend

---
 .../models/endpoints/litellm_model.py         |  38 ++++-
 src/lighteval/models/model_input.py           |  14 +-
 tests/unit/logging/test_evaluation_tracker.py |   1 +
 .../models/endpoints/test_endpoint_model.py   |   1 +
 .../models/endpoints/test_litellm_model.py    | 149 ++++++++++++++++++
 tests/unit/models/endpoints/test_tgi_model.py |   1 +
 tests/unit/models/test_model_input.py         |  28 ++++
 7 files changed, 224 insertions(+), 8 deletions(-)
 create mode 100644 tests/unit/models/endpoints/test_litellm_model.py

diff --git a/src/lighteval/models/endpoints/litellm_model.py b/src/lighteval/models/endpoints/litellm_model.py
index 87332d1d7..9034f206e 100644
--- a/src/lighteval/models/endpoints/litellm_model.py
+++ b/src/lighteval/models/endpoints/litellm_model.py
@@ -173,12 +173,27 @@ def _prepare_stop_sequence(self, stop_sequence):
         stop_sequence = [s for s in stop_sequence if s and s.strip()]
         return stop_sequence
 
+    @staticmethod
+    def _is_o_series_model(model_name: str) -> bool:
+        base_model_name = model_name.split("/")[-1].lower()
+        return base_model_name.startswith(("o1", "o3", "o4"))
+
     def _prepare_max_new_tokens(self, max_new_tokens) -> int | None:
         """Calculate completion tokens based on max_new_tokens."""
         if not max_new_tokens or max_new_tokens <= 0:
             return None
 
-        if supports_reasoning(self.model):
+        reasoning_effort = self.generation_parameters.reasoning_effort
+        should_boost_for_reasoning = isinstance(reasoning_effort, str) and reasoning_effort.strip().lower() != "none"
+
+        if supports_reasoning(self.model) and reasoning_effort is None:
+            logger.warning(
+                f"Model {self.model} supports reasoning but no reasoning_effort is set. "
+                "Token budget will not be boosted for reasoning. If you want the model to reason, "
+                "set reasoning_effort explicitly (e.g., 'low', 'medium', 'high')."
+            )
+
+        if supports_reasoning(self.model) and should_boost_for_reasoning:
             # We need to allow more tokens to include reasoning tokens
             max_new_tokens = min(max_new_tokens * 10, self.max_length)
 
@@ -212,12 +227,23 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
             "timeout": self.timeout,
         }
 
-        if "o1" in self.model:
-            logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.")
+        litellm_generation_kwargs = self.generation_parameters.to_litellm_dict()
+        model_supports_reasoning = supports_reasoning(self.model)
+        # O-series models reject sampling params (temperature, top_p, stop); only pass reasoning_effort
+        if self._is_o_series_model(self.model):
+            logger.warning("O-series models do not support temperature, top_p, stop sequence. Disabling.")
+            reasoning_effort = litellm_generation_kwargs.get("reasoning_effort")
+            if reasoning_effort is not None:
+                kwargs["reasoning_effort"] = reasoning_effort
         else:
-            kwargs.update(self.generation_parameters.to_litellm_dict())
-
-        if kwargs.get("max_completion_tokens", None) is None:
+            kwargs.update(litellm_generation_kwargs)
+
+        # OpenAI non-reasoning models reject max_tokens and max_completion_tokens set at the same time;
+        # drop max_completion_tokens and keep max_tokens (already set above)
+        is_openai_non_reasoning_model = self.provider == "openai" and not model_supports_reasoning
+        if is_openai_non_reasoning_model:
+            kwargs.pop("max_completion_tokens", None)
+        elif kwargs.get("max_completion_tokens", None) is None:
             kwargs["max_completion_tokens"] = max_new_tokens
 
         for attempt in range(self.API_MAX_RETRY):
diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py
index ad41c23eb..9e587f58d 100644
--- a/src/lighteval/models/model_input.py
+++ b/src/lighteval/models/model_input.py
@@ -52,6 +52,9 @@ class GenerationParameters(BaseModel, extra="forbid"):
     # response format to be followed by the model,
     # more info here https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format
     response_format: str | None = None  # inference_providers
+    # Provider-agnostic reasoning control for litellm; litellm maps it to each provider's
+    # native format (e.g., OpenAI reasoning_effort, Anthropic thinking, Google thinkingBudget).
+    reasoning_effort: str | None = None  # litellm
 
     @classmethod
     def from_dict(cls, config_dict: dict):
@@ -118,6 +121,7 @@ def to_litellm_dict(self) -> dict:
             "seed": self.seed,
             "repetition_penalty": self.repetition_penalty,
             "frequency_penalty": self.frequency_penalty,
+            "reasoning_effort": self.reasoning_effort,
         }
         return {k: v for k, v in args.items() if v is not None}
 
@@ -157,7 +161,12 @@ def to_vllm_dict(self) -> dict:
         # Task specific sampling params to set in model: n, best_of, use_beam_search
         # Generation specific params to set in model: logprobs, prompt_logprobs
 
-        x = {sampling_params_to_vllm_naming.get(k, k): v for k, v in self.model_dump().items() if v is not None}
+        x = {
+            sampling_params_to_vllm_naming.get(k, k): v
+            # Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
+            for k, v in self.model_dump(exclude={"reasoning_effort"}).items()
+            if v is not None
+        }
         # VLLM max_tokens is 16 by default, however the pipeline expect the max_tokens to be None, if the user didn't specify it
         if not x.get("max_tokens"):
             x["max_tokens"] = None
@@ -172,7 +181,8 @@ def to_vllm_openai_dict(self) -> dict:
         """
         # Task specific sampling params to set in model: n, best_of, use_beam_search
         # Generation specific params to set in model: logprobs, prompt_logprobs
-        return {k: v for k, v in self.model_dump().items() if v is not None}
+        # Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
+        return {k: v for k, v in self.model_dump(exclude={"reasoning_effort"}).items() if v is not None}
 
     def to_transformers_dict(self) -> dict:
         """Selects relevant generation and sampling parameters for transformers models.
diff --git a/tests/unit/logging/test_evaluation_tracker.py b/tests/unit/logging/test_evaluation_tracker.py
index 45c5790d0..f35db1751 100644
--- a/tests/unit/logging/test_evaluation_tracker.py
+++ b/tests/unit/logging/test_evaluation_tracker.py
@@ -250,6 +250,7 @@ def setUp(self):
             "truncate_prompt": None,
             "cache_implementation": None,
             "response_format": None,
+            "reasoning_effort": None,
         }  # ruff: noqa: E501
         self.dummy_ref_config = {
             "model_name": "test/case",
diff --git a/tests/unit/models/endpoints/test_endpoint_model.py b/tests/unit/models/endpoints/test_endpoint_model.py
index 4f009ca9a..35b11ab5f 100644
--- a/tests/unit/models/endpoints/test_endpoint_model.py
+++ b/tests/unit/models/endpoints/test_endpoint_model.py
@@ -70,6 +70,7 @@ class TestInferenceEndpointModelConfig:
                 "top_p": 0.9,
                 "truncate_prompt": None,
                 "response_format": None,
+                "reasoning_effort": None,
             },
             "cache_dir": "~/.cache/huggingface/lighteval",
         },
diff --git a/tests/unit/models/endpoints/test_litellm_model.py b/tests/unit/models/endpoints/test_litellm_model.py
new file mode 100644
index 000000000..0190a1690
--- /dev/null
+++ b/tests/unit/models/endpoints/test_litellm_model.py
@@ -0,0 +1,149 @@
+# MIT License
+
+# Copyright (c) 2026 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from unittest.mock import Mock, patch
+
+import pytest
+
+from lighteval.models.endpoints.litellm_model import LiteLLMClient
+from lighteval.models.model_input import GenerationParameters
+from lighteval.utils.imports import is_package_available
+
+
+pytestmark = pytest.mark.skipif(not is_package_available("litellm"), reason="litellm extra is not installed")
+
+
+def _build_client(model_name: str, generation_parameters: GenerationParameters) -> LiteLLMClient:
+    client = LiteLLMClient.__new__(LiteLLMClient)
+    client.model = model_name
+    client.provider = "openai"
+    client.base_url = None
+    client.api_key = None
+    client.generation_parameters = generation_parameters
+    client._max_length = 10_000
+    client.API_MAX_RETRY = 1
+    client.API_RETRY_SLEEP = 0
+    client.API_RETRY_MULTIPLIER = 1
+    client.timeout = None
+    return client
+
+
+@pytest.mark.parametrize(
+    "reasoning_effort, supports_reasoning_value, expected_prepared_max_new_tokens",
+    [
+        (None, True, 100),
+        ("none", True, 100),
+        ("low", False, 100),
+        ("low", True, 1000),
+    ],
+)
+def test_prepare_max_new_tokens_boosts_only_with_reasoning_effort(
+    reasoning_effort: str | None, supports_reasoning_value: bool, expected_prepared_max_new_tokens: int
+):
+    client = _build_client("openai/o3-mini", GenerationParameters(reasoning_effort=reasoning_effort))
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=supports_reasoning_value):
+        assert client._prepare_max_new_tokens(100) == expected_prepared_max_new_tokens
+
+
+def test_call_api_o_series_keeps_reasoning_effort_but_drops_sampling_params():
+    client = _build_client("openai/o3-mini", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low"))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["reasoning_effort"] == "low"
+    assert "temperature" not in completion_kwargs
+    assert "top_p" not in completion_kwargs
+
+
+def test_call_api_non_o_series_passes_full_litellm_generation_kwargs():
+    client = _build_client(
+        "google/gemini-2.5-flash", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low")
+    )
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["temperature"] == 0.2
+    assert completion_kwargs["top_p"] == 0.9
+    assert completion_kwargs["reasoning_effort"] == "low"
+
+
+def test_call_api_openai_non_reasoning_uses_only_max_tokens():
+    client = _build_client("openai/gpt-4.1-nano", GenerationParameters(max_new_tokens=96))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["max_tokens"] == 64
+    assert "max_completion_tokens" not in completion_kwargs
+
+
+def test_call_api_openai_reasoning_keeps_max_completion_tokens():
+    client = _build_client("openai/gpt-5-mini", GenerationParameters(max_new_tokens=96, reasoning_effort="low"))
+    response = Mock()
+    response.choices = [Mock(message=Mock(content="ok"))]
+
+    with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=True):
+        with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
+            client._LiteLLMClient__call_api(
+                prompt=[{"role": "user", "content": "hello"}],
+                return_logits=False,
+                max_new_tokens=64,
+                num_samples=1,
+                stop_sequence=None,
+            )
+
+    completion_kwargs = completion.call_args.kwargs
+    assert completion_kwargs["max_tokens"] == 640
+    assert completion_kwargs["max_completion_tokens"] == 96
diff --git a/tests/unit/models/endpoints/test_tgi_model.py b/tests/unit/models/endpoints/test_tgi_model.py
index e784bc0d4..23acd7256 100644
--- a/tests/unit/models/endpoints/test_tgi_model.py
+++ b/tests/unit/models/endpoints/test_tgi_model.py
@@ -58,6 +58,7 @@ class TestTGIModelConfig:
                 "top_p": None,
                 "truncate_prompt": None,
                 "response_format": None,
+                "reasoning_effort": None,
             },
             "cache_dir": "~/.cache/huggingface/lighteval",
         },
diff --git a/tests/unit/models/test_model_input.py b/tests/unit/models/test_model_input.py
index 7c06df445..e4fa8f856 100644
--- a/tests/unit/models/test_model_input.py
+++ b/tests/unit/models/test_model_input.py
@@ -47,3 +47,31 @@ def test_extract_num_samples(self, model_args: str, expected):
         gen = GenerationParameters.from_model_args(model_args)
         for k, v in expected.items():
             assert getattr(gen, k) == v
+
+    @pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
+    def test_extract_reasoning_effort(self, reasoning_effort: str):
+        model_args = (
+            "pretrained=google/gemini-2.5-flash,"
+            f'generation_parameters={{temperature: 0.2,reasoning_effort: "{reasoning_effort}"}},'
+            "dtype=float16"
+        )
+        gen = GenerationParameters.from_model_args(model_args)
+
+        assert gen.temperature == 0.2
+        assert gen.reasoning_effort == reasoning_effort
+
+    @pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
+    def test_to_litellm_dict_includes_reasoning_effort(self, reasoning_effort: str):
+        gen = GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort=reasoning_effort)
+
+        assert gen.to_litellm_dict() == {"temperature": 0.2, "top_p": 0.9, "reasoning_effort": reasoning_effort}
+
+    def test_vllm_dict_excludes_reasoning_effort(self):
+        gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")
+
+        assert gen.to_vllm_dict() == {"max_tokens": 128, "temperature": 0.1}
+
+    def test_vllm_openai_dict_excludes_reasoning_effort(self):
+        gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")
+
+        assert gen.to_vllm_openai_dict() == {"max_new_tokens": 128, "temperature": 0.1}