Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import asyncio
from typing import List

from openai import AsyncOpenAI
from litellm import acompletion
from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall

from eval_protocol.auth import get_fireworks_api_base, get_fireworks_api_key
from eval_protocol.models import EvaluationRow, Message
from eval_protocol.pytest.types import RolloutProcessorConfig


async def default_single_turn_rollout_processor(
rows: List[EvaluationRow], config: RolloutProcessorConfig
) -> List[EvaluationRow]:
"""Generate a single response from a Fireworks model concurrently."""

api_key = get_fireworks_api_key()
api_base = get_fireworks_api_base()
client = AsyncOpenAI(api_key=api_key, base_url=f"{api_base}/inference/v1")
"""Generate a single response from any supported model provider using LiteLLM."""

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row asynchronously."""
Expand All @@ -24,17 +20,35 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]

create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params)
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}

if row.tools is not None:
create_kwargs["tools"] = row.tools
response = await client.chat.completions.create(**create_kwargs)
request_params["tools"] = row.tools

response = await acompletion(**request_params)

assistant_content = response.choices[0].message.content or ""
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None

converted_tool_calls = None
if tool_calls:
converted_tool_calls = [
ChatCompletionMessageToolCall(
id=tool_call.id,
type=tool_call.type,
function={
"name": tool_call.function.name,
"arguments": tool_call.function.arguments,
},
)
for tool_call in tool_calls
]

messages = list(row.messages) + [
Message(
role="assistant",
content=assistant_content,
tool_calls=tool_calls,
tool_calls=converted_tool_calls,
)
]

Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_apps_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
@evaluation_test(
input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
dataset_adapter=apps_dataset_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
threshold_of_success=0.33,
rollout_processor=default_single_turn_rollout_processor,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_basic_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
@evaluation_test(
input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
dataset_adapter=coding_dataset_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
threshold_of_success=0.8,
rollout_processor=default_single_turn_rollout_processor,
Expand Down
10 changes: 6 additions & 4 deletions tests/pytest/test_hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
import json
from typing import Any, Dict, List

from fireworks import LLM
import litellm

from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test

judge_llm = LLM(model="accounts/fireworks/models/kimi-k2-instruct", deployment_type="serverless")
# Configure the judge model for LiteLLM
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"


def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
Expand All @@ -31,7 +32,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
@evaluation_test(
input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
dataset_adapter=hallucination_dataset_adapter,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
rollout_processor=default_single_turn_rollout_processor,
threshold_of_success=0.33,
Expand Down Expand Up @@ -77,7 +78,8 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
"""

try:
response = judge_llm.chat.completions.create(
response = litellm.completion(
model=JUDGE_MODEL,
messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
temperature=0.1,
max_tokens=500,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_markdown_highlighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
@evaluation_test(
input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
dataset_adapter=markdown_dataset_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
threshold_of_success=0.5,
rollout_processor=default_single_turn_rollout_processor,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_function_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu

@evaluation_test(
input_dataset=["tests/pytest/data/function_calling.jsonl"],
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
mode="pointwise",
dataset_adapter=function_calling_to_evaluation_row,
rollout_processor=default_single_turn_rollout_processor,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_input_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Message(role="user", content="What is the capital of France?"),
]
],
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_processor=default_single_turn_rollout_processor,
)
def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evaluation

@evaluation_test(
input_dataset=["tests/pytest/data/json_schema.jsonl"],
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
mode="pointwise",
rollout_processor=default_single_turn_rollout_processor,
dataset_adapter=json_schema_to_evaluation_row,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_math_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
@evaluation_test(
input_dataset=["development/gsm8k_sample.jsonl"],
dataset_adapter=gsm8k_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0}],
max_dataset_rows=5,
threshold_of_success=0.0,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_math_format_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
@evaluation_test(
input_dataset=["development/gsm8k_sample.jsonl"],
dataset_adapter=gsm8k_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0}],
max_dataset_rows=5,
threshold_of_success=0.0,
Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_word_count_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
@evaluation_test(
input_dataset=["development/gsm8k_sample.jsonl"],
dataset_adapter=word_count_to_evaluation_row,
model=["accounts/fireworks/models/kimi-k2-instruct"],
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
rollout_input_params=[{"temperature": 0.0}],
max_dataset_rows=5,
threshold_of_success=0.3, # Reasonable threshold for word count evaluation
Expand Down
Loading