38 changes: 32 additions & 6 deletions src/lighteval/models/endpoints/litellm_model.py
@@ -173,12 +173,27 @@ def _prepare_stop_sequence(self, stop_sequence):
stop_sequence = [s for s in stop_sequence if s and s.strip()]
return stop_sequence

@staticmethod
def _is_o_series_model(model_name: str) -> bool:
base_model_name = model_name.split("/")[-1].lower()
return base_model_name.startswith(("o1", "o3", "o4"))

def _prepare_max_new_tokens(self, max_new_tokens) -> int | None:
"""Calculate completion tokens based on max_new_tokens."""
if not max_new_tokens or max_new_tokens <= 0:
return None

if supports_reasoning(self.model):
reasoning_effort = self.generation_parameters.reasoning_effort
should_boost_for_reasoning = isinstance(reasoning_effort, str) and reasoning_effort.strip().lower() != "none"

if supports_reasoning(self.model) and reasoning_effort is None:
logger.warning(
f"Model {self.model} supports reasoning but no reasoning_effort is set. "
"Token budget will not be boosted for reasoning. If you want the model to reason, "
"set reasoning_effort explicitly (e.g., 'low', 'medium', 'high')."
)

if supports_reasoning(self.model) and should_boost_for_reasoning:
# We need to allow more tokens to include reasoning tokens
max_new_tokens = min(max_new_tokens * 10, self.max_length)

@@ -212,12 +227,23 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
"timeout": self.timeout,
}

if "o1" in self.model:
logger.warning("O1 models do not support temperature, top_p, stop sequence. Disabling.")
litellm_generation_kwargs = self.generation_parameters.to_litellm_dict()
model_supports_reasoning = supports_reasoning(self.model)
# O-series models reject sampling params (temperature, top_p, stop); only pass reasoning_effort
if self._is_o_series_model(self.model):
logger.warning("O-series models do not support temperature, top_p, stop sequence. Disabling.")
reasoning_effort = litellm_generation_kwargs.get("reasoning_effort")
if reasoning_effort is not None:
kwargs["reasoning_effort"] = reasoning_effort
else:
kwargs.update(self.generation_parameters.to_litellm_dict())

if kwargs.get("max_completion_tokens", None) is None:
kwargs.update(litellm_generation_kwargs)

# OpenAI non-reasoning models reject max_tokens and max_completion_tokens set at the same time;
# drop max_completion_tokens and keep max_tokens (already set above)
is_openai_non_reasoning_model = self.provider == "openai" and not model_supports_reasoning
if is_openai_non_reasoning_model:
kwargs.pop("max_completion_tokens", None)
elif kwargs.get("max_completion_tokens", None) is None:
kwargs["max_completion_tokens"] = max_new_tokens

for attempt in range(self.API_MAX_RETRY):
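To make the new token-budget behaviour concrete, here is a minimal sketch (not part of the diff) that mirrors the boost rule from _prepare_max_new_tokens above; the concrete numbers are assumptions chosen to match the parametrized test added further down:

# Illustrative sketch only; it re-implements the boost rule shown above
# for a reasoning-capable model, with placeholder values.
max_new_tokens = 100
max_length = 10_000  # assumed context limit of the endpoint
reasoning_effort = "low"  # anything other than None or "none" triggers the boost

should_boost = isinstance(reasoning_effort, str) and reasoning_effort.strip().lower() != "none"
if should_boost:
    # allow room for reasoning tokens, capped by the model's max length
    max_new_tokens = min(max_new_tokens * 10, max_length)

print(max_new_tokens)  # 1000, as asserted in test_prepare_max_new_tokens_boosts_only_with_reasoning_effort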
14 changes: 12 additions & 2 deletions src/lighteval/models/model_input.py
@@ -52,6 +52,9 @@ class GenerationParameters(BaseModel, extra="forbid"):
# response format to be followed by the model,
# more info here https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format
response_format: str | None = None # inference_providers
# Provider-agnostic reasoning control for litellm; litellm maps it to each provider's
# native format (e.g., OpenAI reasoning_effort, Anthropic thinking, Google thinkingBudget).
reasoning_effort: str | None = None # litellm

@classmethod
def from_dict(cls, config_dict: dict):
@@ -118,6 +121,7 @@ def to_litellm_dict(self) -> dict:
"seed": self.seed,
"repetition_penalty": self.repetition_penalty,
"frequency_penalty": self.frequency_penalty,
"reasoning_effort": self.reasoning_effort,
}
return {k: v for k, v in args.items() if v is not None}

@@ -157,7 +161,12 @@ def to_vllm_dict(self) -> dict:

# Task specific sampling params to set in model: n, best_of, use_beam_search
# Generation specific params to set in model: logprobs, prompt_logprobs
x = {sampling_params_to_vllm_naming.get(k, k): v for k, v in self.model_dump().items() if v is not None}
x = {
sampling_params_to_vllm_naming.get(k, k): v
# Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
for k, v in self.model_dump(exclude={"reasoning_effort"}).items()
if v is not None
}
# vLLM's max_tokens defaults to 16, but the pipeline expects max_tokens to be None if the user didn't specify it
if not x.get("max_tokens"):
x["max_tokens"] = None
@@ -172,7 +181,8 @@ def to_vllm_openai_dict(self) -> dict:
"""
# Task specific sampling params to set in model: n, best_of, use_beam_search
# Generation specific params to set in model: logprobs, prompt_logprobs
return {k: v for k, v in self.model_dump().items() if v is not None}
# Exclude reasoning_effort: vLLM's SamplingParams doesn't support it
return {k: v for k, v in self.model_dump(exclude={"reasoning_effort"}).items() if v is not None}

def to_transformers_dict(self) -> dict:
"""Selects relevant generation and sampling parameters for transformers models.
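As a quick illustration of how the new field flows through the serializers (mirroring the unit tests added at the end of this diff), a minimal sketch assuming lighteval is importable:

# Illustrative sketch only; behaviour matches tests/unit/models/test_model_input.py below.
from lighteval.models.model_input import GenerationParameters

gen = GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low")
print(gen.to_litellm_dict())
# {'temperature': 0.2, 'top_p': 0.9, 'reasoning_effort': 'low'}
print("reasoning_effort" in gen.to_vllm_dict())
# False: vLLM's SamplingParams doesn't accept reasoning_effort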
1 change: 1 addition & 0 deletions tests/unit/logging/test_evaluation_tracker.py
@@ -250,6 +250,7 @@ def setUp(self):
"truncate_prompt": None,
"cache_implementation": None,
"response_format": None,
"reasoning_effort": None,
} # ruff: noqa: E501
self.dummy_ref_config = {
"model_name": "test/case",
1 change: 1 addition & 0 deletions tests/unit/models/endpoints/test_endpoint_model.py
@@ -70,6 +70,7 @@ class TestInferenceEndpointModelConfig:
"top_p": 0.9,
"truncate_prompt": None,
"response_format": None,
"reasoning_effort": None,
},
"cache_dir": "~/.cache/huggingface/lighteval",
},
149 changes: 149 additions & 0 deletions tests/unit/models/endpoints/test_litellm_model.py
@@ -0,0 +1,149 @@
# MIT License

# Copyright (c) 2026 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from unittest.mock import Mock, patch

import pytest

from lighteval.models.endpoints.litellm_model import LiteLLMClient
from lighteval.models.model_input import GenerationParameters
from lighteval.utils.imports import is_package_available


pytestmark = pytest.mark.skipif(not is_package_available("litellm"), reason="litellm extra is not installed")


def _build_client(model_name: str, generation_parameters: GenerationParameters) -> LiteLLMClient:
client = LiteLLMClient.__new__(LiteLLMClient)
client.model = model_name
client.provider = "openai"
client.base_url = None
client.api_key = None
client.generation_parameters = generation_parameters
client._max_length = 10_000
client.API_MAX_RETRY = 1
client.API_RETRY_SLEEP = 0
client.API_RETRY_MULTIPLIER = 1
client.timeout = None
return client


@pytest.mark.parametrize(
"reasoning_effort, supports_reasoning_value, expected_prepared_max_new_tokens",
[
(None, True, 100),
("none", True, 100),
("low", False, 100),
("low", True, 1000),
],
)
def test_prepare_max_new_tokens_boosts_only_with_reasoning_effort(
reasoning_effort: str | None, supports_reasoning_value: bool, expected_prepared_max_new_tokens: int
):
client = _build_client("openai/o3-mini", GenerationParameters(reasoning_effort=reasoning_effort))

with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=supports_reasoning_value):
assert client._prepare_max_new_tokens(100) == expected_prepared_max_new_tokens


def test_call_api_o_series_keeps_reasoning_effort_but_drops_sampling_params():
client = _build_client("openai/o3-mini", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low"))
response = Mock()
response.choices = [Mock(message=Mock(content="ok"))]

with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
client._LiteLLMClient__call_api(
prompt=[{"role": "user", "content": "hello"}],
return_logits=False,
max_new_tokens=64,
num_samples=1,
stop_sequence=None,
)

completion_kwargs = completion.call_args.kwargs
assert completion_kwargs["reasoning_effort"] == "low"
assert "temperature" not in completion_kwargs
assert "top_p" not in completion_kwargs


def test_call_api_non_o_series_passes_full_litellm_generation_kwargs():
client = _build_client(
"google/gemini-2.5-flash", GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort="low")
)
response = Mock()
response.choices = [Mock(message=Mock(content="ok"))]

with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
client._LiteLLMClient__call_api(
prompt=[{"role": "user", "content": "hello"}],
return_logits=False,
max_new_tokens=64,
num_samples=1,
stop_sequence=None,
)

completion_kwargs = completion.call_args.kwargs
assert completion_kwargs["temperature"] == 0.2
assert completion_kwargs["top_p"] == 0.9
assert completion_kwargs["reasoning_effort"] == "low"


def test_call_api_openai_non_reasoning_uses_only_max_tokens():
client = _build_client("openai/gpt-4.1-nano", GenerationParameters(max_new_tokens=96))
response = Mock()
response.choices = [Mock(message=Mock(content="ok"))]

with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=False):
with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
client._LiteLLMClient__call_api(
prompt=[{"role": "user", "content": "hello"}],
return_logits=False,
max_new_tokens=64,
num_samples=1,
stop_sequence=None,
)

completion_kwargs = completion.call_args.kwargs
assert completion_kwargs["max_tokens"] == 64
assert "max_completion_tokens" not in completion_kwargs


def test_call_api_openai_reasoning_keeps_max_completion_tokens():
client = _build_client("openai/gpt-5-mini", GenerationParameters(max_new_tokens=96, reasoning_effort="low"))
response = Mock()
response.choices = [Mock(message=Mock(content="ok"))]

with patch("lighteval.models.endpoints.litellm_model.supports_reasoning", return_value=True):
with patch("lighteval.models.endpoints.litellm_model.litellm.completion", return_value=response) as completion:
client._LiteLLMClient__call_api(
prompt=[{"role": "user", "content": "hello"}],
return_logits=False,
max_new_tokens=64,
num_samples=1,
stop_sequence=None,
)

completion_kwargs = completion.call_args.kwargs
assert completion_kwargs["max_tokens"] == 640
assert completion_kwargs["max_completion_tokens"] == 96
1 change: 1 addition & 0 deletions tests/unit/models/endpoints/test_tgi_model.py
@@ -58,6 +58,7 @@ class TestTGIModelConfig:
"top_p": None,
"truncate_prompt": None,
"response_format": None,
"reasoning_effort": None,
},
"cache_dir": "~/.cache/huggingface/lighteval",
},
28 changes: 28 additions & 0 deletions tests/unit/models/test_model_input.py
@@ -47,3 +47,31 @@ def test_extract_num_samples(self, model_args: str, expected):
gen = GenerationParameters.from_model_args(model_args)
for k, v in expected.items():
assert getattr(gen, k) == v

@pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
def test_extract_reasoning_effort(self, reasoning_effort: str):
model_args = (
"pretrained=google/gemini-2.5-flash,"
f'generation_parameters={{temperature: 0.2,reasoning_effort: "{reasoning_effort}"}},'
"dtype=float16"
)
gen = GenerationParameters.from_model_args(model_args)

assert gen.temperature == 0.2
assert gen.reasoning_effort == reasoning_effort

@pytest.mark.parametrize("reasoning_effort", ["low", "medium", "high"])
def test_to_litellm_dict_includes_reasoning_effort(self, reasoning_effort: str):
gen = GenerationParameters(temperature=0.2, top_p=0.9, reasoning_effort=reasoning_effort)

assert gen.to_litellm_dict() == {"temperature": 0.2, "top_p": 0.9, "reasoning_effort": reasoning_effort}

def test_vllm_dict_excludes_reasoning_effort(self):
gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")

assert gen.to_vllm_dict() == {"max_tokens": 128, "temperature": 0.1}

def test_vllm_openai_dict_excludes_reasoning_effort(self):
gen = GenerationParameters(max_new_tokens=128, temperature=0.1, reasoning_effort="low")

assert gen.to_vllm_openai_dict() == {"max_new_tokens": 128, "temperature": 0.1}