Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion benchmarks/utils/llm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@
from pathlib import Path

from openhands.sdk import LLM
from openhands.sdk.llm.utils.model_features import model_matches


# Models where LiteLLM handles reasoning_effort incorrectly.
# LiteLLM maps reasoning_effort="high" to type="adaptive" for 4.6 but to
# type="enabled" with fixed budget_tokens=4096 for 4.7, causing issues.
Comment on lines +6 to +11
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Suggestion: Add a comment linking to the upstream LiteLLM issue (if one exists) so we know when this workaround can be removed:

# Models where LiteLLM handles reasoning_effort incorrectly.
# TODO: Remove this workaround once LiteLLM fixes the mapping.
# See: https://github.com/BerriAI/litellm/issues/XXXXX
# LiteLLM maps reasoning_effort="high" to type="adaptive" for 4.6 but to
# type="enabled" with fixed budget_tokens=4096 for 4.7, causing issues.
OPUS_4_7_MODELS = [
    "claude-opus-4-7",
]

If no issue exists, consider filing one to track this upstream.

# Model identifiers affected by the LiteLLM reasoning_effort mapping issue;
# matched against llm.model via model_matches() in load_llm_config() below.
OPUS_4_7_MODELS = [
"claude-opus-4-7",
]


def load_llm_config(config_path: str | Path) -> LLM:
Expand All @@ -13,4 +22,19 @@ def load_llm_config(config_path: str | Path) -> LLM:
with config_path.open("r", encoding="utf-8") as f:
llm_config = f.read()

return LLM.model_validate_json(llm_config)
llm = LLM.model_validate_json(llm_config)

# FIX: LiteLLM handles reasoning_effort differently for Opus 4.6 vs 4.7.
# For 4.6, reasoning_effort="high" maps to type="adaptive" (model decides).
# For 4.7, it maps to type="enabled" with fixed budget_tokens=4096.
# This causes unexpected behavior (excessive thinking, token limit issues).
# The fix: disable reasoning_effort for Opus 4.7 models to use default behavior.
if model_matches(llm.model, OPUS_4_7_MODELS) and llm.reasoning_effort is not None:
llm = LLM(
**{
**llm.model_dump(),
"reasoning_effort": None,
}
Comment on lines +32 to +37
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Suggestion: Pydantic v2 models support model.model_copy(update={...}) (the v1-style .copy() is deprecated), which is cleaner than manual dict unpacking:

Suggested change
if model_matches(llm.model, OPUS_4_7_MODELS) and llm.reasoning_effort is not None:
llm = LLM(
**{
**llm.model_dump(),
"reasoning_effort": None,
}
if model_matches(llm.model, OPUS_4_7_MODELS) and llm.reasoning_effort is not None:
llm = llm.model_copy(update={"reasoning_effort": None})

This avoids the nested dict unpacking and is more idiomatic for Pydantic models.

)

return llm
Loading