eval-protocol · xzrderek · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -1,21 +1,17 @@
 import asyncio
 from typing import List
 
-from openai import AsyncOpenAI
+from litellm import acompletion
+from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
-from eval_protocol.auth import get_fireworks_api_base, get_fireworks_api_key
 from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
 
 async def default_single_turn_rollout_processor(
     rows: List[EvaluationRow], config: RolloutProcessorConfig
 ) -> List[EvaluationRow]:
-    """Generate a single response from a Fireworks model concurrently."""
-
-    api_key = get_fireworks_api_key()
-    api_base = get_fireworks_api_base()
-    client = AsyncOpenAI(api_key=api_key, base_url=f"{api_base}/inference/v1")
+    """Generate a single response from any supported model provider using LiteLLM."""
 
     async def process_row(row: EvaluationRow) -> EvaluationRow:
         """Process a single row asynchronously."""
@@ -24,17 +20,35 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
         messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
 
-        create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params)
+        request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
+
         if row.tools is not None:
-            create_kwargs["tools"] = row.tools
-        response = await client.chat.completions.create(**create_kwargs)
+            request_params["tools"] = row.tools
+
+        response = await acompletion(**request_params)
+
         assistant_content = response.choices[0].message.content or ""
         tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
+
+        converted_tool_calls = None
+        if tool_calls:
+            converted_tool_calls = [
+                ChatCompletionMessageToolCall(
+                    id=tool_call.id,
+                    type=tool_call.type,
+                    function={
+                        "name": tool_call.function.name,
+                        "arguments": tool_call.function.arguments,
+                    },
+                )
+                for tool_call in tool_calls
+            ]
+
         messages = list(row.messages) + [
             Message(
                 role="assistant",
                 content=assistant_content,
-                tool_calls=tool_calls,
+                tool_calls=converted_tool_calls,
             )
         ]
 

diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
@@ -29,7 +29,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
 @evaluation_test(
     input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
     dataset_adapter=apps_dataset_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     threshold_of_success=0.33,
     rollout_processor=default_single_turn_rollout_processor,

diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
@@ -28,7 +28,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 @evaluation_test(
     input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
     dataset_adapter=coding_dataset_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     threshold_of_success=0.8,
     rollout_processor=default_single_turn_rollout_processor,

diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
@@ -9,12 +9,13 @@
 import json
 from typing import Any, Dict, List
 
-from fireworks import LLM
+import litellm
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 
-judge_llm = LLM(model="accounts/fireworks/models/kimi-k2-instruct", deployment_type="serverless")
+# Judge model id for LiteLLM; the "fireworks_ai/" prefix selects the Fireworks AI provider.
+JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
 
 
 def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -31,7 +32,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
 @evaluation_test(
     input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
     dataset_adapter=hallucination_dataset_adapter,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
     rollout_processor=default_single_turn_rollout_processor,
     threshold_of_success=0.33,
@@ -77,7 +78,8 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
     """
 
     try:
-        response = judge_llm.chat.completions.create(
+        response = litellm.completion(
+            model=JUDGE_MODEL,
             messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
             temperature=0.1,
             max_tokens=500,

diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
@@ -24,7 +24,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
 @evaluation_test(
     input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
     dataset_adapter=markdown_dataset_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     threshold_of_success=0.5,
     rollout_processor=default_single_turn_rollout_processor,

diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py
@@ -19,7 +19,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu
 
 @evaluation_test(
     input_dataset=["tests/pytest/data/function_calling.jsonl"],
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     mode="pointwise",
     dataset_adapter=function_calling_to_evaluation_row,
     rollout_processor=default_single_turn_rollout_processor,

diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
@@ -10,7 +10,7 @@
             Message(role="user", content="What is the capital of France?"),
         ]
     ],
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_processor=default_single_turn_rollout_processor,
 )
 def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:

diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py
@@ -23,7 +23,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evaluation
 
 @evaluation_test(
     input_dataset=["tests/pytest/data/json_schema.jsonl"],
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     mode="pointwise",
     rollout_processor=default_single_turn_rollout_processor,
     dataset_adapter=json_schema_to_evaluation_row,

diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
@@ -8,7 +8,7 @@
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
     dataset_adapter=gsm8k_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
     threshold_of_success=0.0,

diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
@@ -11,7 +11,7 @@
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
     dataset_adapter=gsm8k_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
     threshold_of_success=0.0,

diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
@@ -8,7 +8,7 @@
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
     dataset_adapter=word_count_to_evaluation_row,
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
     threshold_of_success=0.3,  # Reasonable threshold for word count evaluation