diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 1d6d9f7c..f8e7a23e 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -1,9 +1,9 @@ import asyncio from typing import List -from openai import AsyncOpenAI +from litellm import acompletion +from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall -from eval_protocol.auth import get_fireworks_api_base, get_fireworks_api_key from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest.types import RolloutProcessorConfig @@ -11,11 +11,7 @@ async def default_single_turn_rollout_processor( rows: List[EvaluationRow], config: RolloutProcessorConfig ) -> List[EvaluationRow]: - """Generate a single response from a Fireworks model concurrently.""" - - api_key = get_fireworks_api_key() - api_base = get_fireworks_api_base() - client = AsyncOpenAI(api_key=api_key, base_url=f"{api_base}/inference/v1") + """Generate a single response from any supported model provider using LiteLLM.""" async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" @@ -24,17 +20,35 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] - create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params) + request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + if row.tools is not None: - create_kwargs["tools"] = row.tools - response = await client.chat.completions.create(**create_kwargs) + request_params["tools"] = row.tools + + response = await acompletion(**request_params) + assistant_content = response.choices[0].message.content or "" tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None + + converted_tool_calls = None + if tool_calls: + converted_tool_calls = [ + ChatCompletionMessageToolCall( + id=tool_call.id, + type=tool_call.type, + function={ + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + }, + ) + for tool_call in tool_calls + ] + messages = list(row.messages) + [ Message( role="assistant", content=assistant_content, - tool_calls=tool_calls, + tool_calls=converted_tool_calls, ) ] diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index d56f9427..4780388a 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -29,7 +29,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio @evaluation_test( input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"], dataset_adapter=apps_dataset_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], threshold_of_success=0.33, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index 84e6b4a6..35d1a1b3 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -28,7 +28,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat @evaluation_test( input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"], dataset_adapter=coding_dataset_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], threshold_of_success=0.8, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index 87348ead..b396e12c 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -9,12 +9,13 @@ import json from typing import Any, Dict, List -from fireworks import LLM +import litellm from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test -judge_llm = LLM(model="accounts/fireworks/models/kimi-k2-instruct", deployment_type="serverless") +# Configure the judge model for LiteLLM +JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -31,7 +32,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation @evaluation_test( input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"], dataset_adapter=hallucination_dataset_adapter, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], rollout_processor=default_single_turn_rollout_processor, threshold_of_success=0.33, @@ -77,7 +78,8 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow: """ try: - response = judge_llm.chat.completions.create( + response = litellm.completion( + model=JUDGE_MODEL, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}], temperature=0.1, max_tokens=500, diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py index 4dee7e89..cc2ae4f5 100644 --- a/tests/pytest/test_markdown_highlighting.py +++ b/tests/pytest/test_markdown_highlighting.py @@ -24,7 +24,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/markdown_dataset.jsonl"], dataset_adapter=markdown_dataset_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], threshold_of_success=0.5, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py index ed536004..7239de58 100644 --- a/tests/pytest/test_pytest_function_calling.py +++ b/tests/pytest/test_pytest_function_calling.py @@ -19,7 +19,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/function_calling.jsonl"], - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], mode="pointwise", dataset_adapter=function_calling_to_evaluation_row, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py index c5a59fd8..c1b643d0 100644 --- a/tests/pytest/test_pytest_input_messages.py +++ b/tests/pytest/test_pytest_input_messages.py @@ -10,7 +10,7 @@ Message(role="user", content="What is the capital of France?"), ] ], - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_processor=default_single_turn_rollout_processor, ) def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py index 9dfc278a..8463f873 100644 --- a/tests/pytest/test_pytest_json_schema.py +++ b/tests/pytest/test_pytest_json_schema.py @@ -23,7 +23,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evaluation @evaluation_test( input_dataset=["tests/pytest/data/json_schema.jsonl"], - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], mode="pointwise", rollout_processor=default_single_turn_rollout_processor, dataset_adapter=json_schema_to_evaluation_row, diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py index 24139dcc..05b0022c 100644 --- a/tests/pytest/test_pytest_math_example.py +++ b/tests/pytest/test_pytest_math_example.py @@ -8,7 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.0, diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py index 2ffe899c..fbc59efc 100644 --- a/tests/pytest/test_pytest_math_format_length.py +++ b/tests/pytest/test_pytest_math_format_length.py @@ -11,7 +11,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.0, diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py index daa256f7..51062a1f 100644 --- a/tests/pytest/test_pytest_word_count_example.py +++ b/tests/pytest/test_pytest_word_count_example.py @@ -8,7 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=word_count_to_evaluation_row, - model=["accounts/fireworks/models/kimi-k2-instruct"], + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.3, # Reasonable threshold for word count evaluation