From 3427da128b675849049c880634b16d799902fb7b Mon Sep 17 00:00:00 2001
From: benjibc <youfychenbc5000@gmail.com>
Date: Fri, 1 Aug 2025 07:32:53 +0000
Subject: [PATCH 1/8] feat: Add pointwise evaluation mode with pytest
 integration

- Add mode='pointwise' parameter to @evaluation_test decorator
- Enable elegant row-by-row evaluation where core logic is separated from test configuration
- Add comprehensive word_count example using pointwise mode with haikus dependency
- Update README.md with clean architecture documentation and Mermaid diagram
- Show parameterized evaluation components in visual diagram
- Include both pointwise and batch mode examples
- Add dataset adapter helper for word_count evaluation
- Deprecate old @reward_function pattern in favor of pytest-based approach

This provides a much more elegant API where users define just the core evaluation logic
and everything else (models, datasets, thresholds, rollout strategies) is parameterized
in the decorator, with full pytest integration for testing and CI/CD.
---
 README.md                                     | 655 ++++--------------
 eval_protocol/pytest/pytest_utils.py          | 121 ++--
 pyproject.toml                                |   7 +
 .../helper/word_count_to_evaluation_row.py    |  14 +
 .../pytest/test_pytest_word_count_example.py  |  78 +++
 uv.lock                                       | 123 +++-
 6 files changed, 449 insertions(+), 549 deletions(-)
 create mode 100644 tests/pytest/helper/word_count_to_evaluation_row.py
 create mode 100644 tests/pytest/test_pytest_word_count_example.py

diff --git a/README.md b/README.md
index 66807d65..635a3f10 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,54 @@
 # Eval Protocol
 
-**Eval Protocol: Author, reproduce, and evaluate reward functions seamlessly on Fireworks, TRL, and your own infrastructure.**
+**A modern Python framework for authoring, testing, and deploying LLM evaluation functions with pytest integration.**
 
 ## Key Features
 
-*   **Easy-to-use Decorator**: Define reward functions with a simple `@reward_function` decorator.
-*   **Local Testing**: Quickly test your reward functions with sample data.
-*   **Flexible Evaluation**: Evaluate model outputs based on single or multiple custom metrics.
-*   **Seamless Deployment**: Deploy your reward functions to platforms like Fireworks AI.
-*   **Comprehensive CLI**: Manage reward functions, preview evaluations (`eval-protocol preview`), deploy (`eval-protocol deploy`), and run complex evaluation pipelines (`eval-protocol run`).
-*   **Simplified Dataset Integration**: Direct integration with HuggingFace datasets and on-the-fly format conversion.
-*   **Extensible**: Designed to be adaptable for various LLM evaluation scenarios.
+- **Pytest-Based Evaluation**: Write evaluation functions as pytest tests with full parameterization
+- **Pointwise Mode**: Define core evaluation logic once, parameterize everything else (models, datasets, thresholds)
+- **Local Testing**: Quickly test your evaluation functions with sample data
+- **Seamless Deployment**: Deploy your evaluation functions to platforms like Fireworks AI
+- **Dataset Integration**: Direct integration with HuggingFace datasets and custom formats
+- **Flexible Rollout Processing**: Support for single-turn, multi-turn, and custom rollout strategies
+
+## Architecture: Parameterized Evaluation Components
+
+The core innovation of Eval Protocol is separating evaluation logic from test configuration. Here's how the different elements are parameterized:
+
+```mermaid
+graph TD
+    A["@evaluation_test Decorator"] --> B["Model Configuration"]
+    A --> C["Dataset Configuration"]
+    A --> D["Rollout Processing"]
+    A --> E["Test Parameters"]
+    A --> F["Success Criteria"]
+    
+    B --> B1["model=['gpt-4o-mini']"]
+    B --> B2["input_params=[{temperature: 0.0}]"]
+    
+    C --> C1["input_dataset=['data.jsonl']"]
+    C --> C2["dataset_adapter=custom_adapter"]
+    C --> C3["max_dataset_rows=100"]
+    
+    D --> D1["rollout_processor=single_turn"]
+    D --> D2["rollout_processor=multi_turn"]
+    D --> D3["rollout_processor=custom"]
+    
+    E --> E1["mode='pointwise'"]
+    E --> E2["num_runs=3"]
+    E --> E3["aggregation_method='mean'"]
+    
+    F --> F1["threshold_of_success=0.8"]
+    
+    G["Core Evaluation Function"] --> H["def test_my_evaluate(messages, **kwargs)"]
+    H --> I["return EvaluateResult(score=..., metrics=...)"]
+    
+    A --> G
+    
+    style A fill:#e1f5fe
+    style G fill:#f3e5f5
+    style H fill:#fff3e0
+```
 
 ## Installation
 
@@ -18,548 +56,153 @@
 pip install eval-protocol
 ```
 
-### Optional TRL Extras
-
-Install the additional dependencies required for running the TRL-based training
-examples:
-
-```bash
-pip install "eval-protocol[trl]"
-```
-
-## Getting Started
-
-Eval Protocol simplifies the creation and deployment of reward functions for evaluating AI model outputs.
-
-### 1. Creating a Reward Function for Tool Calling
-
-Eval Protocol allows you to define custom logic to evaluate model responses. Here's an example of how you might use the built-in `exact_tool_match_reward` for evaluating tool/function calls. This reward function checks if the model's generated tool calls exactly match the expected ones.
-
-```python
-# This is a conceptual example of how exact_tool_match_reward is defined and used.
-# You would typically import it from eval_protocol.rewards.function_calling.
-# For actual usage, you configure it in your YAML files for `eval-protocol run`.
-
-from eval_protocol import reward_function
-from eval_protocol.models import EvaluateResult, Message, MetricResult
-from typing import List, Dict, Any, Optional, Union
-
-# Definition of exact_tool_match_reward (simplified for brevity, see source for full details)
-# from eval_protocol.rewards.function_calling import exact_tool_match_reward, eval_tool_call
-
-@reward_function
-def exact_tool_match_reward(
-    messages: Union[List[Message], List[Dict[str, Any]]],
-    ground_truth: Optional[Dict[str, Any]] = None,
-    **kwargs,
-) -> EvaluateResult:
-    if not messages:
-        return EvaluateResult(
-            score=0.0, reason="No messages provided for evaluation.", metrics={}
-        )
-
-    generation_message_obj = messages[-1]
-    generation_dict: Dict[str, Any]
-
-    if isinstance(generation_message_obj, Message):
-        generation_dict = {
-            "role": generation_message_obj.role,
-            "content": generation_message_obj.content,
-        }
-        if generation_message_obj.tool_calls:
-            generation_dict["tool_calls"] = [
-                tc.model_dump() if hasattr(tc, "model_dump") else tc
-                for tc in generation_message_obj.tool_calls
-            ]
-    elif isinstance(generation_message_obj, dict):
-        generation_dict = generation_message_obj
-    else:
-        # Handle error for unexpected type
-        return EvaluateResult(score=0.0, reason="Unexpected generation message type.", metrics={})
-
-    if ground_truth is None:
-        # Handle missing ground truth (e.g., score 0 if generation has tool calls, 1 if not)
-        # This logic is simplified here.
-        has_gen_tc = bool(generation_dict.get("tool_calls") or "<tool_call>" in generation_dict.get("content", ""))
-        score = 0.0 if has_gen_tc else 1.0
-        return EvaluateResult(score=score, reason="Ground truth not provided.", metrics={})
-
-    # Ensure ground_truth is a dict (it might be a JSON string from some datasets)
-    if isinstance(ground_truth, str):
-        try:
-            ground_truth = json.loads(ground_truth)
-        except json.JSONDecodeError:
-            return EvaluateResult(score=0.0, reason="Ground truth string failed to parse.", metrics={})
-
-    if not isinstance(ground_truth, dict):
-         return EvaluateResult(score=0.0, reason="Ground truth is not a dictionary.", metrics={})
-
-    # This simplified check compares generated tool calls with the expected ones.
-    expected_tcs = ground_truth.get("tool_calls", [])
-    generated_tcs = generation_dict.get("tool_calls", [])
-
-    # This is a highly simplified check. The actual function is much more robust.
-    is_match = (len(expected_tcs) == len(generated_tcs)) # Placeholder
-    score = 1.0 if is_match else 0.0
-
-    reason = f"Exact tool match evaluation score: {score}"
-    return EvaluateResult(score=score, reason=reason, metrics={
-        "tool_call_match": MetricResult(score=score, success=is_match, reason=reason)
-    })
-
-```
-This example illustrates the structure. The actual `exact_tool_match_reward` in `eval_protocol.rewards.function_calling` handles complex parsing and comparison of tool calls.
-
-### 2. Testing Your Reward Function with a Dataset
-
-Effective testing of a reward function involves evaluating it against a representative dataset. The key is the **dataset/reward function pair**: your dataset should provide the necessary `ground_truth` information that your reward function expects.
-
-**Crafting Your Dataset:**
-
-1.  **Define `ground_truth`**: For each sample in your dataset, the `ground_truth_for_eval` (or a similarly named field specified in your dataset configuration) must contain the information your reward function needs to make a judgment.
-    *   For `exact_tool_match_reward`, `ground_truth` should be a dictionary, often with a `tool_calls` key. This key would hold a list of expected tool calls, each specifying the `name` and `arguments` of the function call. Example:
-        ```json
-        {
-          "role": "assistant",
-          "tool_calls": [
-            {
-              "name": "get_weather",
-              "arguments": {"location": "San Francisco, CA", "unit": "celsius"}
-            }
-          ]
-        }
-        ```
-2.  **Format**: Datasets are typically JSONL files, where each line is a JSON object representing a sample. Each sample should include:
-    *   `messages`: The input conversation history for the model.
-    *   `tools` (optional, for tool calling): A list of available tools the model can use.
-    *   `ground_truth_for_eval`: The expected output or data for the reward function (e.g., the structure shown above for tool calling).
-    *   An `id` for tracking.
-
-**Example Test Snippet (Conceptual):**
-
-While `eval-protocol run` is the primary way to evaluate with datasets, here's a conceptual local test:
-
-```python
-from eval_protocol.rewards.function_calling import exact_tool_match_reward # Import the actual function
-from eval_protocol.models import Message
-
-# Sample 1: Correct tool call
-test_messages_correct = [
-    Message(role="user", content="What's the weather in SF?"),
-    Message(role="assistant", tool_calls=[ # Model's generated tool call
-        {"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": '{"location": "San Francisco, CA", "unit": "celsius"}'}}
-    ])
-]
-ground_truth_correct = { # Expected tool call for the reward function
-    "tool_calls": [
-        {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "unit": "celsius"}}
-    ]
-}
-
-# Sample 2: Incorrect tool call
-test_messages_incorrect = [
-    Message(role="user", content="What's the weather in SF?"),
-    Message(role="assistant", tool_calls=[
-        {"id": "call_456", "type": "function", "function": {"name": "get_current_time", "arguments": '{}'}}
-    ])
-]
-# Ground truth remains the same as we expect get_weather
-
-# Test with the actual reward function
-result_correct = exact_tool_match_reward(messages=test_messages_correct, ground_truth=ground_truth_correct)
-print(f"Correct Call - Score: {result_correct.score}, Reason: {result_correct.reason}")
-
-result_incorrect = exact_tool_match_reward(messages=test_messages_incorrect, ground_truth=ground_truth_correct)
-print(f"Incorrect Call - Score: {result_incorrect.score}, Reason: {result_incorrect.reason}")
-```
-This local test helps verify the reward function's logic with specific inputs. For comprehensive evaluation, use `eval-protocol run` with a full dataset (see next section).
-
-### 3. Running Local Evaluations with `eval-protocol run`
-
-For comprehensive local evaluations, especially when working with datasets and complex configurations, the `eval-protocol run` command is the recommended tool. It leverages Hydra for configuration management, allowing you to define your evaluation pipeline (dataset, model, reward function, etc.) in YAML files.
-
-**Example: Math Evaluation using `codeparrot/gsm8k`**
-
-The `examples/math_example` demonstrates evaluating models on math word problems.
-
-```bash
-# Ensure you are in the repository root
-# cd /path/to/eval-protocol
-
-# Run evaluation with the math configuration
-eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
-
-# Override parameters directly from the command line:
-eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf \
-  generation.model_name="accounts/fireworks/models/llama-v3p1-405b-instruct" \
-  evaluation_params.limit_samples=10
-```
-
-**What this command does (typically):**
-*   Loads the specified dataset (e.g., GSM8K directly from HuggingFace).
-*   Applies any dataset-specific prompts or preprocessing defined in the configuration.
-*   Generates model responses (e.g., using the Fireworks API or other configured providers).
-*   Evaluates the generated responses using the specified reward function(s).
-*   Saves detailed evaluation results to `<config_output_name>.jsonl` (e.g., `math_example_results.jsonl`) in a timestamped output directory (e.g., under `outputs/`).
-*   Saves generated prompt/response pairs to `preview_input_output_pairs.jsonl` in the same output directory, suitable for inspection or re-evaluation with `eval-protocol preview`.
-
-**Example: APPS Coding Evaluation**
-
-The `examples/apps_coding_example` shows evaluation on code generation tasks using the `codeparrot/apps` dataset.
-
-```bash
-# Run evaluation with the APPS coding configuration
-eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval
-
-# Example: Limit samples for a quick test
-eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval evaluation_params.limit_samples=2
-
-# Example: Disable generation to test reward function on cached responses
-eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval generation.enabled=false
-```
-
-These examples showcase how `eval-protocol run` can be adapted for different tasks and datasets through configuration files.
-
-For more details on this command, Hydra configuration, and advanced usage, see the [CLI Overview](docs/cli_reference/cli_overview.mdx) and [Hydra Configuration Guide](docs/developer_guide/hydra_configuration.mdx).
-
-### Fireworks Authentication Setup (Required for Preview/Deploy with Fireworks)
-
-To interact with the Fireworks AI platform for deploying and managing evaluations (including some preview scenarios that might use remote evaluators or if `eval-protocol run` uses a Fireworks-hosted model), Eval Protocol needs your Fireworks AI credentials. You can configure these in two ways:
-
-**A. Environment Variables (Highest Priority)**
-
-Set the following environment variables:
-
-*   `FIREWORKS_API_KEY`: Your Fireworks AI API key. This is required for all interactions with the Fireworks API.
-*   `FIREWORKS_ACCOUNT_ID`: Your Fireworks AI Account ID. This is often required for operations like creating or listing evaluators under your account.
-
-```bash
-export FIREWORKS_API_KEY="your_fireworks_api_key"
-export FIREWORKS_ACCOUNT_ID="your_fireworks_account_id"
-```
-
-**B. Configuration File (Lower Priority)**
-
-Alternatively, you can store your credentials in a configuration file located at `~/.fireworks/auth.ini`. If environment variables are not set, Eval Protocol will look for this file.
-
-Create the file with the following format:
-
-```ini
-[fireworks]
-api_key = YOUR_FIREWORKS_API_KEY
-account_id = YOUR_FIREWORKS_ACCOUNT_ID
-```
-
-Replace `YOUR_FIREWORKS_API_KEY` and `YOUR_FIREWORKS_ACCOUNT_ID` with your actual credentials.
-
-**Credential Sourcing Order:**
-
-Eval Protocol will prioritize credentials in the following order:
-1.  Environment Variables (`FIREWORKS_API_KEY`, `FIREWORKS_ACCOUNT_ID`)
-2.  `~/.fireworks/auth.ini` configuration file
-
-Ensure that the `auth.ini` file has appropriate permissions to protect your sensitive credentials.
-
-The `FIREWORKS_API_KEY` is essential for authenticating your requests to the Fireworks AI service. The `FIREWORKS_ACCOUNT_ID` is used to identify your specific account context for operations that are account-specific, such as managing your evaluators. While the API key authenticates *who* you are, the account ID often specifies *where* (under which account) an operation should take place. Some Fireworks API endpoints may require both.
-
-### 4. Evaluating with Sample Data (Preview)
-
-Create a JSONL file with sample conversations to evaluate:
-
-```json
-{"messages": [{"role": "user", "content": "Tell me about AI"}, {"role": "assistant", "content": "AI refers to systems designed to mimic human intelligence."}]}
-{"messages": [{"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "Machine learning is a subset of AI that focuses on building systems that can learn from data."}]}
-```
-
-Preview your evaluation using the CLI:
-
-```bash
-eval-protocol preview --metrics-folders "word_count=./path/to/metrics" --samples ./path/to/samples.jsonl
-```
-
-For example
-```
-eval-protocol preview --metrics-folders "word_count=examples/metrics/word_count" --samples development/CODING_DATASET.jsonl
-```
-
-### 5. Deploying Your Reward Function
-
-Deploy your reward function to use in training workflows:
-
-```bash
-eval-protocol deploy --id my-evaluator --metrics-folders "word_count=./path/to/metrics" --force
-```
-
-#### Local Development Server
+## Quick Start: Pointwise Evaluation
 
-For local development and testing, you can deploy a reward function as a local server with external tunnel access:
-
-```bash
-# Deploy as local server with automatic tunnel (ngrok/serveo)
-eval-protocol deploy --id test-local-serve-eval --target local-serve --function-ref examples.row_wise.dummy_example.dummy_rewards.simple_echo_reward --verbose --force
-```
-
-**What this does:**
-- Starts a local HTTP server on port 8001 serving your reward function
-- Creates an external tunnel (using ngrok or serveo.net) to make the server publicly accessible
-- Registers the tunnel URL with Fireworks AI for remote evaluation
-- Keeps the server running indefinitely in the background
-
-**Key points:**
-- The CLI returns to prompt after deployment, but the server continues running in background
-- Check running processes: `ps aux | grep -E "(generic_server|ngrok)"`
-- Test locally: `curl -X POST http://localhost:8001/evaluate -H "Content-Type: application/json" -d '{"messages": [{"role": "user", "content": "test"}]}'`
-- Monitor logs: `tail -f logs/eval-protocol-local/generic_server_*.log`
-- Stop server: Kill the background processes manually when done
-
-This is ideal for development, testing webhook integrations, or accessing your reward function from remote services without full cloud deployment.
-
-Or deploy programmatically:
+The modern way to write evaluations in Eval Protocol uses the `@evaluation_test` decorator with `mode="pointwise"`. This separates your core evaluation logic from test configuration:
 
 ```python
-from eval_protocol.evaluation import create_evaluation
-
-evaluator = create_evaluation(
-    evaluator_id="my-evaluator",
-    metric_folders=["word_count=./path/to/metrics"],
-    display_name="My Word Count Evaluator",
-    description="Evaluates responses based on word count",
-    force=True  # Update if already exists
+from typing import List
+from eval_protocol.pytest import evaluation_test, default_single_turn_rollout_processor
+from eval_protocol.models import EvaluateResult, MetricResult, Message
+
+@evaluation_test(
+    input_dataset=["data/sample.jsonl"],
+    model=["gpt-4o-mini"],
+    input_params=[{"temperature": 0.0}],
+    max_dataset_rows=100,
+    threshold_of_success=0.8,
+    rollout_processor=default_single_turn_rollout_processor,
+    mode="pointwise",  # This is the key - enables row-by-row evaluation
 )
-```
-
-## Advanced Usage
-
-### Multiple Metrics
-
-Combine multiple metrics in a single reward function:
+def test_word_count_evaluate(messages: List[Message], **kwargs) -> EvaluateResult:
+    """
+    Core evaluation logic - everything else is parameterized in the decorator.
+    """
+    if not messages:
+        return EvaluateResult(score=0.0, reason="No messages found")
 
-```python
-from eval_protocol import reward_function
-from eval_protocol.models import EvaluateResult, MetricResult, Message # Assuming models are here
-from typing import List, Dict, Any, Optional
-
-@reward_function
-def combined_reward(
-    messages: List[Dict[str, Any]], # Or List[Message]
-    original_messages: Optional[List[Dict[str, Any]]] = None, # Or List[Message]
-    **kwargs: Any
-) -> EvaluateResult:
-    """Evaluate with multiple metrics."""
-    response = messages[-1].get("content", "")
-
-    # Word count metric
+    response = messages[-1].content or ""
     word_count = len(response.split())
-    word_score = min(word_count / 100.0, 1.0)
-    word_metric_success = word_count > 10
-
-    # Specificity metric
-    specificity_markers = ["specifically", "for example", "such as"]
-    marker_count = sum(1 for marker in specificity_markers if marker.lower() in response.lower())
-    specificity_score = min(marker_count / 2.0, 1.0)
-    specificity_metric_success = marker_count > 0
-
-    # Combined score with weighted components
-    final_score = word_score * 0.3 + specificity_score * 0.7
+    score = min(word_count / 100, 1.0)  # Normalize to 0-1
 
     return EvaluateResult(
-        score=final_score,
-        reason=f"Combined score based on word count ({word_count}) and specificity markers ({marker_count})",
+        score=score,
+        reason=f"Word count: {word_count}",
         metrics={
             "word_count": MetricResult(
-                score=word_score,
-                success=word_metric_success,
-                reason=f"Word count: {word_count}"
-            ),
-            "specificity": MetricResult(
-                score=specificity_score,
-                success=specificity_metric_success,
-                reason=f"Found {marker_count} specificity markers"
+                score=score,
+                is_score_valid=word_count > 0,
+                reason=f"Response has {word_count} words"
             )
         }
     )
 ```
 
-### Custom Model Providers
-
-Deploy your reward function with a specific model provider:
-
-```python
-# Deploy with a custom provider
-my_function.deploy(
-    name="my-evaluator-anthropic",
-    description="My evaluator using Claude model",
-    providers=[
-        {
-            "providerType": "anthropic",
-            "modelId": "claude-3-sonnet-20240229"
-        }
-    ],
-    force=True
-)
-```
-
-## Dataset Integration
+## Key Benefits
 
-Eval Protocol provides seamless integration with popular datasets through a simplified configuration system:
+1. **Clean Separation**: Your evaluation function only contains the core logic
+2. **Full Parameterization**: Models, datasets, thresholds, rollout strategies all configured in the decorator
+3. **Pytest Integration**: Run as regular pytest tests with full reporting and CI/CD integration
+4. **Reusability**: Same evaluation logic can be tested with different configurations
 
-### Direct HuggingFace Integration
-
-Load datasets directly from HuggingFace Hub without manual preprocessing:
-
-```bash
-# Evaluate using GSM8K dataset with math-specific prompts
-eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
-```
+## Batch Mode Example
 
-### Derived Datasets
-
-Create specialized dataset configurations that reference base datasets and apply transformations:
-
-```yaml
-# conf/dataset/gsm8k_math_prompts.yaml
-defaults:
-  - base_derived_dataset
-  - _self_
-
-base_dataset: "gsm8k"
-system_prompt: "Solve the following math problem. Show your work clearly. Put the final numerical answer between <answer> and </answer> tags."
-output_format: "evaluation_format"
-derived_max_samples: 5
-```
-
-### Key Benefits
-
-- **No Manual Conversion**: Datasets are converted to evaluation format on-the-fly
-- **System Prompt Integration**: Prompts are part of dataset configuration, not evaluation logic
-- **Flexible Column Mapping**: Automatically adapts different dataset formats
-- **Reusable Configurations**: Base datasets can be extended for different use cases
-
-See the [math example](examples/math_example/) for a complete demonstration of the dataset system.
-
-## Detailed Documentation
-
-For more comprehensive information, including API references, tutorials, and advanced guides, please see our [full documentation](docs/documentation_home.mdx).
-
-## Examples
-
-Check the `examples` directory for complete examples:
-
-- `evaluation_preview_example.py`: How to preview an evaluator.
-- `deploy_example.py`: How to deploy a reward function to Fireworks.
-- `math_example/`: Demonstrates CLI-based evaluation (`eval-protocol run`) and TRL GRPO training for math problems (GSM8K dataset).
-- `apps_coding_example/`: Shows CLI-based evaluation (`eval-protocol run`) for code generation tasks (APPS dataset).
- - `apps_coding_example/`: Shows CLI-based evaluation (`eval-protocol run`) for code generation tasks (APPS dataset).
-
-The OpenEvals project provides a suite of evaluators that can be used directly within Eval Protocol. The helper `eval_protocol.integrations.openeval.adapt` converts any OpenEvals evaluator into a reward function returning an `EvaluateResult`.
+For cases where you need to process the full dataset at once, use the default batch mode:
 
 ```python
-from openevals import exact_match
-from eval_protocol.integrations.openeval import adapt
-
-exact_match_reward = adapt(exact_match)
-result = exact_match_reward(
-    messages=[{"role": "assistant", "content": "hello"}],
-    ground_truth="hello",
+from eval_protocol.pytest import default_single_turn_rollout_processor, evaluate, evaluation_test
+from examples.math_with_format_and_length.main import evaluate as math_fl_evaluate
+from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
+
+@evaluation_test(
+    input_dataset=["development/gsm8k_sample.jsonl"],
+    dataset_adapter=gsm8k_to_evaluation_row,
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    input_params=[{"temperature": 0.0}],
+    max_dataset_rows=5,
+    threshold_of_success=0.0,
+    rollout_processor=default_single_turn_rollout_processor,
+    # No mode specified = defaults to "batch"
 )
-print(result.score)
+def test_math_format_length_dataset(input_dataset, input_params, model):
+    """Run math with format and length evaluation on sample dataset."""
+    return evaluate(input_dataset, math_fl_evaluate)
 ```
 
-The [deepeval](https://github.com/confident-ai/deepeval) project also offers a
-variety of metrics. The helper `eval_protocol.integrations.deepeval.adapt_metric`
-converts a deepeval metric instance into a reward function returning an
-`EvaluateResult`.
+In batch mode, your test function receives the full dataset and must return a list of `EvaluationRow` instances.
 
-```python
-from deepeval.metrics import FaithfulnessMetric
-from eval_protocol.integrations.deepeval import adapt_metric
+## Running Tests
 
-faithfulness_reward = adapt_metric(FaithfulnessMetric())
-result = faithfulness_reward(
-    messages=[{"role": "assistant", "content": "hello"}],
-    ground_truth="hello",
-)
-print(result.score)
-```
+Run your evaluation as a regular pytest test:
 
-The GEval metric family uses an LLM-as-a-judge to score outputs based on
-custom criteria. You can construct a `GEval` metric and adapt it in the same
-way:
+```bash
+# Run specific evaluation
+pytest tests/test_word_count_evaluate.py -v
 
-```python
-from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
-from eval_protocol.integrations.deepeval import adapt_metric
-
-correctness_metric = GEval(
-    name="Correctness",
-    criteria="Determine whether the answer is factually correct",
-    evaluation_params=[
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ],
-)
+# Run all evaluations
+pytest tests/ -k "evaluate" -v
 
-correctness_reward = adapt_metric(correctness_metric)
-result = correctness_reward(
-    messages=[{"role": "user", "content": "Who wrote 1984?"}, {"role": "assistant", "content": "George Orwell"}],
-    ground_truth="George Orwell",
-)
-print(result.score)
+# Run with specific parameters
+pytest tests/test_word_count_evaluate.py::test_word_count_evaluate -v
 ```
 
-## Command Line Interface
-
-Eval Protocol includes a CLI for common operations:
-
-```bash
-# Show help
-eval-protocol --help
+## Dataset Integration
 
-# Preview an evaluator
-eval-protocol preview --metrics-folders "metric=./path" --samples ./samples.jsonl
+Create dataset adapters to convert your data format to evaluation rows:
 
-# Deploy an evaluator
-eval-protocol deploy --id my-evaluator --metrics-folders "metric=./path" --force
+```python
+from typing import Any, Dict, List
+from eval_protocol.models import EvaluationRow, Message
+
+def my_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """Convert your dataset format to EvaluationRow format."""
+    return [
+        EvaluationRow(
+            messages=[Message(role="user", content=row["question"])],
+            ground_truth=row["answer"]
+        )
+        for row in data
+    ]
 ```
 
-## Community and Support
+## Configuration Options
 
-*   **GitHub Issues**: For bug reports and feature requests, please use [GitHub Issues](https://github.com/eval-protocol/python-sdk/issues).
-*   **GitHub Discussions**: (If enabled) For general questions, ideas, and discussions.
-*   Please also review our [Contributing Guidelines](development/CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md).
+The `@evaluation_test` decorator supports extensive configuration:
 
-## Development
+### Core Parameters
+- **`mode`**: `"pointwise"` (alias `"rowwise"`) for row-by-row evaluation, `"batch"` (default) for full dataset processing
+- **`model`**: List of model identifiers to test against
+- **`input_dataset`**: Paths to JSONL dataset files
+- **`dataset_adapter`**: Function to convert dataset format to EvaluationRow format
 
-### Type Checking
+### Evaluation Control
+- **`threshold_of_success`**: Minimum aggregated score required for the test to pass
+- **`max_dataset_rows`**: Limit dataset size for faster testing
+- **`num_runs`**: Number of evaluation runs for stability testing
+- **`aggregation_method`**: How to combine scores (`"mean"`, `"max"`, `"min"`)
 
-The codebase uses mypy for static type checking. To run type checking:
+### Model Configuration
+- **`input_params`**: Generation parameters like temperature, top_p
+- **`rollout_processor`**: Strategy for model interaction (single-turn, multi-turn, custom)
 
-```bash
-# Install development dependencies
-pip install -e ".[dev]"
-
-# Run mypy
-mypy eval_protocol
-```
-
-Our CI pipeline enforces type checking, so please ensure your code passes mypy checks before submitting PRs.
+## Examples
 
-### Running Tests
+- **Word Count Evaluation (Pointwise)**: `tests/pytest/test_pytest_word_count_example.py`
+- **Math Evaluation (Batch)**: `tests/pytest/test_pytest_math_example.py`
+- **Math Format & Length (Batch)**: `tests/pytest/test_pytest_math_format_length.py`
+- **Migration Guide**: `tests/pytest/README_word_count_migration.md`
 
-```bash
-# Install test dependencies
-pip install -e ".[dev]"
+## Migration from Old Pattern
 
-# Run tests
-pytest
-```
+The old `@reward_function` decorator is deprecated. The new pytest-based approach provides:
 
-## Code of Conduct
+- Better separation of concerns
+- Full pytest integration
+- Easier testing and CI/CD integration
+- More flexible parameterization
 
-We are dedicated to providing a welcoming and inclusive experience for everyone. Please review and adhere to our [Code of Conduct](CODE_OF_CONDUCT.md).
+See the migration guide for detailed examples.
 
 ## License
 
-Eval Protocol is released under the Apache License 2.0.
+Eval Protocol is released under the Apache License 2.0.
\ No newline at end of file
diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/pytest_utils.py
index afb0b8f4..ad04eb1c 100644
--- a/eval_protocol/pytest/pytest_utils.py
+++ b/eval_protocol/pytest/pytest_utils.py
@@ -57,6 +57,7 @@ def evaluation_test(
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
+    mode: str = "batch",  # "batch" (default) or "pointwise"/"rowwise"
 ) -> Callable[
     [TestFunction],
     TestFunction,
@@ -80,6 +81,8 @@ def evaluation_test(
             below this threshold.
         num_runs: Number of times to repeat the evaluation.
         max_dataset_rows: Limit dataset to the first N rows.
+        mode: Evaluation mode. "batch" (default) expects test function to handle
+            full dataset. "pointwise"/"rowwise" applies test function to each row.
 
     Usage:
     With an input dataset and input params, the test function will be called with the following arguments:
@@ -125,11 +128,18 @@ def decorator(
         is_async = inspect.iscoroutinefunction(test_func)
 
         sig = inspect.signature(test_func)
-        if "input_dataset" not in sig.parameters:
-            raise ValueError("test_func must have a parameter named 'input_dataset'")
-
-        if "model" not in sig.parameters:
-            raise ValueError("test_func must have a parameter named 'model'")
+        
+        # For pointwise/rowwise mode, we expect a different signature
+        if mode in ["pointwise", "rowwise"]:
+            # Pointwise mode: function should accept messages and other row-level params
+            if "messages" not in sig.parameters:
+                raise ValueError(f"In {mode} mode, test_func must have a parameter named 'messages'")
+        else:
+            # Batch mode: function should accept input_dataset and model
+            if "input_dataset" not in sig.parameters:
+                raise ValueError("test_func must have a parameter named 'input_dataset'")
+            if "model" not in sig.parameters:
+                raise ValueError("test_func must have a parameter named 'model'")
 
         def execute_with_params(
             test_func: TestFunction,
@@ -205,13 +215,22 @@ def generate_combinations(model: List[ModelParam]):
             param_tuples.append(tuple(param_tuple))
 
         # Determine the parameter names for the test function
-        test_param_names = ["model"]
-        if input_dataset is not None:
-            test_param_names.append("dataset_path")
-        if input_params is not None:
-            test_param_names.append("input_params")
-        if input_messages is not None:
-            test_param_names.append("input_messages")
+        if mode in ["pointwise", "rowwise"]:
+            # For pointwise mode, parametrize like batch mode but without input_messages
+            test_param_names = ["model"]
+            if input_dataset is not None:
+                test_param_names.append("dataset_path")
+            if input_params is not None:
+                test_param_names.append("input_params")
+        else:
+            # For batch mode, use the original parameter names
+            test_param_names = ["model"]
+            if input_dataset is not None:
+                test_param_names.append("dataset_path")
+            if input_params is not None:
+                test_param_names.append("input_params")
+            if input_messages is not None:
+                test_param_names.append("input_messages")
 
         # Create wrapper function with exact signature that pytest expects
         def create_wrapper_with_signature():
@@ -257,29 +276,33 @@ def wrapper_body(**kwargs):
 
                 all_results: List[EvaluationRow] = []
                 for _ in range(num_runs):
-                    # Each run reuses the same processed rows
-                    results = execute_with_params(
-                        test_func,
-                        model=model_name,
-                        input_dataset=input_dataset,
-                        input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
-                    )
-                    if results is None:
-                        raise ValueError(
-                            f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
-                        )
-                    if not isinstance(results, list):
-                        raise ValueError(
-                            f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
-                        )
-                    if not results:
-                        raise ValueError(
-                            f"Test function {test_func.__name__} returned an empty list. You must return a non-empty list of EvaluationRow instances from your test function decorated with @evaluation_test."
-                        )
-                    if not all(isinstance(r, EvaluationRow) for r in results):
-                        raise ValueError(
-                            f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                    if mode in ["pointwise", "rowwise"]:
+                        # Pointwise mode: apply the evaluator function to each row
+                        results = evaluate(input_dataset, test_func)
+                    else:
+                        # Batch mode: call the test function with the full dataset
+                        results = execute_with_params(
+                            test_func,
+                            model=model_name,
+                            input_dataset=input_dataset,
+                            input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
                         )
+                        if results is None:
+                            raise ValueError(
+                                f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
+                            )
+                        if not isinstance(results, list):
+                            raise ValueError(
+                                f"Test function {test_func.__name__} did not return a list of EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                            )
+                        if not results:
+                            raise ValueError(
+                                f"Test function {test_func.__name__} returned an empty list. You must return a non-empty list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                            )
+                        if not all(isinstance(r, EvaluationRow) for r in results):
+                            raise ValueError(
+                                f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
+                            )
                     all_results.extend(results)
 
                 scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
@@ -292,14 +315,28 @@ def wrapper_body(**kwargs):
             # Create a function with the exact signature pytest expects without using exec
             from functools import wraps
 
-            @wraps(test_func)
-            def wrapper(**kwargs):
-                return wrapper_body(**kwargs)
-
-            parameters = [
-                inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names
-            ]
-            wrapper.__signature__ = inspect.Signature(parameters)
+            if mode in ["pointwise", "rowwise"]:
+                # For pointwise mode, create a wrapper that handles everything internally
+                @wraps(test_func)
+                def wrapper(**kwargs):
+                    return wrapper_body(**kwargs)
+                
+                # Give it the pytest-compatible signature and a name derived from test_func's name
+                parameters = [
+                    inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names
+                ]
+                wrapper.__signature__ = inspect.Signature(parameters)
+                wrapper.__name__ = test_func.__name__.replace('_evaluate', '_dataset') if '_evaluate' in test_func.__name__ else f"test_{test_func.__name__}"
+            else:
+                # For batch mode, use the original wrapper
+                @wraps(test_func)
+                def wrapper(**kwargs):
+                    return wrapper_body(**kwargs)
+
+                parameters = [
+                    inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names
+                ]
+                wrapper.__signature__ = inspect.Signature(parameters)
 
             return wrapper
 
diff --git a/pyproject.toml b/pyproject.toml
index f7095386..dcc10ba4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,6 +80,7 @@ dev = [
     "ipykernel>=6.30.0",
     "jupyter>=1.1.1",
     "pip>=25.1.1",
+    "haikus==0.3.8",
 ]
 trl = [
     "torch>=1.9",
@@ -119,3 +120,9 @@ line_length = 119
 
 [tool.uv.sources]
 tau2 = { git = "https://github.com/sierra-research/tau2-bench.git" }
+
+[dependency-groups]
+dev = [
+    "haikus==0.3.8",
+    "pytest>=8.4.1",
+]
diff --git a/tests/pytest/helper/word_count_to_evaluation_row.py b/tests/pytest/helper/word_count_to_evaluation_row.py
new file mode 100644
index 00000000..f0517dd0
--- /dev/null
+++ b/tests/pytest/helper/word_count_to_evaluation_row.py
@@ -0,0 +1,14 @@
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluationRow, Message
+
+
+def word_count_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """Convert gsm8k-format rows into a list of EvaluationRow objects for word_count evaluation."""
+    return [
+        EvaluationRow(
+            messages=[Message(role="user", content=row["user_query"])], 
+            ground_truth=row["ground_truth_for_eval"]
+        )
+        for row in data
+    ] 
\ No newline at end of file
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
new file mode 100644
index 00000000..c608d6fb
--- /dev/null
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -0,0 +1,78 @@
+from typing import List
+from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
+from eval_protocol.models import EvaluateResult, MetricResult, Message
+from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row
+from haikus import haikus
+
+
+@evaluation_test(
+    input_dataset=["development/gsm8k_sample.jsonl"],
+    dataset_adapter=word_count_to_evaluation_row,
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    input_params=[{"temperature": 0.0}],
+    max_dataset_rows=5,
+    threshold_of_success=0.3,  # Reasonable threshold for word count evaluation
+    rollout_processor=default_single_turn_rollout_processor,
+    mode="pointwise",  # Use pointwise mode for elegant row-by-row evaluation
+)
+def test_word_count_evaluate(messages: List[Message], **kwargs) -> EvaluateResult:
+    """
+    Pointwise word count evaluator - just the core evaluation logic.
+    Everything else (models, datasets, thresholds) is parameterized in the decorator.
+    """
+    if not messages:
+        return EvaluateResult(score=0.0, reason="No messages found", is_score_valid=False)
+
+    last_message = messages[-1]
+    content = last_message.content if last_message and last_message.content else ""
+
+    # Word count logic
+    word_count = len(content.split())
+    word_count_score = min(word_count / 100, 1.0)
+
+    # Haiku analysis logic
+    haiku_lines = content.splitlines()
+    haiku_analysis_data = {}
+    haiku_metric_score = 0.0
+    haiku_metric_reason = "Content not suitable for haiku analysis."
+    haiku_metric_valid = False
+
+    if len(haiku_lines) in [3, 5]:
+        try:
+            analysis = haikus(haiku_lines)
+            haiku_analysis_data = analysis
+            kigo = analysis.get("kigo", [])
+            haiku_type = analysis.get("type", "unknown")
+
+            if kigo:
+                haiku_metric_score = 1.0
+            elif haiku_type not in ["unknown", "error"]:
+                haiku_metric_score = 0.5
+
+            haiku_metric_reason = f"Haiku analysis - Type: {haiku_type}, Kigo: {', '.join(kigo) if kigo else 'None'}"
+            haiku_metric_valid = True
+        except Exception as e:
+            haiku_metric_reason = f"Haiku analysis failed: {str(e)}"
+            haiku_metric_valid = False
+
+    # Combine metrics
+    metrics = {
+        "word_count": MetricResult(
+            score=word_count_score,
+            is_score_valid=word_count > 0,
+            reason=f"Word count: {word_count}",
+        ),
+        "haiku_analysis": MetricResult(
+            score=haiku_metric_score,
+            is_score_valid=haiku_metric_valid,
+            reason=haiku_metric_reason,
+            data=haiku_analysis_data,
+        ),
+    }
+
+    return EvaluateResult(
+        score=word_count_score,
+        reason=f"Word count: {word_count}. {haiku_metric_reason}",
+        is_score_valid=True,
+        metrics=metrics,
+    )
\ No newline at end of file
diff --git a/uv.lock b/uv.lock
index 431722cf..4a5fdba8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -953,6 +953,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" },
 ]
 
+[[package]]
+name = "deprecated"
+version = "1.2.18"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744, upload-time = "2025-01-27T10:46:25.7Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998, upload-time = "2025-01-27T10:46:09.186Z" },
+]
+
 [[package]]
 name = "dill"
 version = "0.3.8"
@@ -1073,6 +1085,7 @@ dev = [
     { name = "docker" },
     { name = "e2b" },
     { name = "flake8" },
+    { name = "haikus" },
     { name = "ipykernel" },
     { name = "isort" },
     { name = "jupyter" },
@@ -1108,6 +1121,12 @@ trl = [
     { name = "trl" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "haikus" },
+    { name = "pytest" },
+]
+
 [package.metadata]
 requires-dist = [
     { name = "accelerate", marker = "extra == 'trl'", specifier = ">=0.28.0" },
@@ -1119,8 +1138,8 @@ requires-dist = [
     { name = "black", marker = "extra == 'dev'", specifier = ">=21.5b2" },
     { name = "build", marker = "extra == 'dev'" },
     { name = "dataclasses-json", specifier = ">=0.5.7" },
-    { name = "deepdiff", specifier = ">=6.0.0" },
     { name = "datasets" },
+    { name = "deepdiff", specifier = ">=6.0.0" },
     { name = "docker", marker = "extra == 'dev'", specifier = "==7.1.0" },
     { name = "docstring-parser", specifier = ">=0.15" },
     { name = "e2b", marker = "extra == 'dev'" },
@@ -1129,6 +1148,7 @@ requires-dist = [
     { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" },
     { name = "fsspec" },
     { name = "gymnasium", specifier = ">=0.29.0" },
+    { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" },
     { name = "httpx", specifier = ">=0.24.0" },
     { name = "hydra-core", specifier = ">=1.3.2" },
     { name = "ipykernel", specifier = ">=6.30.0" },
@@ -1175,6 +1195,12 @@ requires-dist = [
 ]
 provides-extras = ["dev", "trl", "openevals", "fireworks"]
 
+[package.metadata.requires-dev]
+dev = [
+    { name = "haikus", specifier = "==0.3.8" },
+    { name = "pytest", specifier = ">=8.4.1" },
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.3.0"
@@ -1554,6 +1580,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/9e/984486f2d0a0bd2b024bf4bc1c62688fcafa9e61991f041fb0e2def4a982/h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0", size = 60957, upload-time = "2025-02-01T11:02:26.481Z" },
 ]
 
+[[package]]
+name = "haikus"
+version = "0.3.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pykakasi" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/c2/2309ca1210318e3fc66007d0c8b1c3f959d9b3432d17a8b17a55fc6e145a/haikus-0.3.8.tar.gz", hash = "sha256:0e59cf8bfae8faa51965a9b39d60aa511e68f053a53a1fd956e391e26dbb796e", size = 96508, upload-time = "2024-01-17T04:42:32.572Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/eb/21/89a60826a8973ba43bb24ed3747f26fa93467586bb79703670140901120e/haikus-0.3.8-py3-none-any.whl", hash = "sha256:fc5566b062db047a8128db38a32be4195fcc84482f0f279dad34d825c5ee1799", size = 98532, upload-time = "2024-01-17T04:42:28.572Z" },
+]
+
 [[package]]
 name = "hf-xet"
 version = "1.1.5"
@@ -1850,6 +1888,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c1/11/114d0a5f4dabbdcedc1125dee0888514c3c3b16d3e9facad87ed96fad97c/isort-6.0.1-py3-none-any.whl", hash = "sha256:2dc5d7f65c9678d94c88dfc29161a320eec67328bc97aad576874cb4be1e9615", size = 94186, upload-time = "2025-02-26T21:13:14.911Z" },
 ]
 
+[[package]]
+name = "jaconv"
+version = "0.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d2/e1/670cefc7f00b0e1890e114a37a98ea425f7e06131342aeb9636856ac663c/jaconv-0.4.0.tar.gz", hash = "sha256:32da74b247f276e09a52d6b35c153df2387965cb85a6f034cc8af21d446f8161", size = 17402, upload-time = "2024-07-25T16:35:24.75Z" }
+
 [[package]]
 name = "jaraco-classes"
 version = "3.4.0"
@@ -4072,6 +4116,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
 ]
 
+[[package]]
+name = "pykakasi"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "deprecated" },
+    { name = "jaconv" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ad/32/2a8e213fd744459a03864af7cf4c6142ee061fc915757c8152d147b16015/pykakasi-2.3.0.tar.gz", hash = "sha256:fa052a8e63f59fb8d6569abbe719a8c9f9daf15ed27a67a56ab1705f0f67b0a1", size = 21752447, upload-time = "2024-06-24T04:57:31.233Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0f/e8/11644fe823e05c583b330e9fb81e3e8fc5d079036512a8300fc157be349d/pykakasi-2.3.0-py3-none-any.whl", hash = "sha256:26d21b090048ff45c6a4d8e962426b7951767216008ec30358e8a9d74af77f29", size = 2395003, upload-time = "2024-06-24T04:57:18.101Z" },
+]
+
 [[package]]
 name = "pyproject-hooks"
 version = "1.2.0"
@@ -5406,6 +5463,70 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
 ]
 
+[[package]]
+name = "wrapt"
+version = "1.17.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531, upload-time = "2025-01-14T10:35:45.465Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/d1/1daec934997e8b160040c78d7b31789f19b122110a75eca3d4e8da0049e1/wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984", size = 53307, upload-time = "2025-01-14T10:33:13.616Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/7b/13369d42651b809389c1a7153baa01d9700430576c81a2f5c5e460df0ed9/wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22", size = 38486, upload-time = "2025-01-14T10:33:15.947Z" },
+    { url = "https://files.pythonhosted.org/packages/62/bf/e0105016f907c30b4bd9e377867c48c34dc9c6c0c104556c9c9126bd89ed/wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7", size = 38777, upload-time = "2025-01-14T10:33:17.462Z" },
+    { url = "https://files.pythonhosted.org/packages/27/70/0f6e0679845cbf8b165e027d43402a55494779295c4b08414097b258ac87/wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c", size = 83314, upload-time = "2025-01-14T10:33:21.282Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/77/0576d841bf84af8579124a93d216f55d6f74374e4445264cb378a6ed33eb/wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72", size = 74947, upload-time = "2025-01-14T10:33:24.414Z" },
+    { url = "https://files.pythonhosted.org/packages/90/ec/00759565518f268ed707dcc40f7eeec38637d46b098a1f5143bff488fe97/wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061", size = 82778, upload-time = "2025-01-14T10:33:26.152Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/5a/7cffd26b1c607b0b0c8a9ca9d75757ad7620c9c0a9b4a25d3f8a1480fafc/wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2", size = 81716, upload-time = "2025-01-14T10:33:27.372Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/09/dccf68fa98e862df7e6a60a61d43d644b7d095a5fc36dbb591bbd4a1c7b2/wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c", size = 74548, upload-time = "2025-01-14T10:33:28.52Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/8e/067021fa3c8814952c5e228d916963c1115b983e21393289de15128e867e/wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62", size = 81334, upload-time = "2025-01-14T10:33:29.643Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/0d/9d4b5219ae4393f718699ca1c05f5ebc0c40d076f7e65fd48f5f693294fb/wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563", size = 36427, upload-time = "2025-01-14T10:33:30.832Z" },
+    { url = "https://files.pythonhosted.org/packages/72/6a/c5a83e8f61aec1e1aeef939807602fb880e5872371e95df2137142f5c58e/wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f", size = 38774, upload-time = "2025-01-14T10:33:32.897Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", size = 53308, upload-time = "2025-01-14T10:33:33.992Z" },
+    { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488, upload-time = "2025-01-14T10:33:35.264Z" },
+    { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776, upload-time = "2025-01-14T10:33:38.28Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776, upload-time = "2025-01-14T10:33:40.678Z" },
+    { url = "https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420, upload-time = "2025-01-14T10:33:41.868Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199, upload-time = "2025-01-14T10:33:43.598Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307, upload-time = "2025-01-14T10:33:48.499Z" },
+    { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025, upload-time = "2025-01-14T10:33:51.191Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879, upload-time = "2025-01-14T10:33:52.328Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419, upload-time = "2025-01-14T10:33:53.551Z" },
+    { url = "https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773, upload-time = "2025-01-14T10:33:56.323Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799, upload-time = "2025-01-14T10:33:57.4Z" },
+    { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821, upload-time = "2025-01-14T10:33:59.334Z" },
+    { url = "https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919, upload-time = "2025-01-14T10:34:04.093Z" },
+    { url = "https://files.pythonhosted.org/packages/73/54/3bfe5a1febbbccb7a2f77de47b989c0b85ed3a6a41614b104204a788c20e/wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d", size = 88721, upload-time = "2025-01-14T10:34:07.163Z" },
+    { url = "https://files.pythonhosted.org/packages/25/cb/7262bc1b0300b4b64af50c2720ef958c2c1917525238d661c3e9a2b71b7b/wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b", size = 80899, upload-time = "2025-01-14T10:34:09.82Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/5a/04cde32b07a7431d4ed0553a76fdb7a61270e78c5fd5a603e190ac389f14/wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98", size = 89222, upload-time = "2025-01-14T10:34:11.258Z" },
+    { url = "https://files.pythonhosted.org/packages/09/28/2e45a4f4771fcfb109e244d5dbe54259e970362a311b67a965555ba65026/wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82", size = 86707, upload-time = "2025-01-14T10:34:12.49Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/d2/dcb56bf5f32fcd4bd9aacc77b50a539abdd5b6536872413fd3f428b21bed/wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae", size = 79685, upload-time = "2025-01-14T10:34:15.043Z" },
+    { url = "https://files.pythonhosted.org/packages/80/4e/eb8b353e36711347893f502ce91c770b0b0929f8f0bed2670a6856e667a9/wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9", size = 87567, upload-time = "2025-01-14T10:34:16.563Z" },
+    { url = "https://files.pythonhosted.org/packages/17/27/4fe749a54e7fae6e7146f1c7d914d28ef599dacd4416566c055564080fe2/wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9", size = 36672, upload-time = "2025-01-14T10:34:17.727Z" },
+    { url = "https://files.pythonhosted.org/packages/15/06/1dbf478ea45c03e78a6a8c4be4fdc3c3bddea5c8de8a93bc971415e47f0f/wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991", size = 38865, upload-time = "2025-01-14T10:34:19.577Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/b9/0ffd557a92f3b11d4c5d5e0c5e4ad057bd9eb8586615cdaf901409920b14/wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125", size = 53800, upload-time = "2025-01-14T10:34:21.571Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/ef/8be90a0b7e73c32e550c73cfb2fa09db62234227ece47b0e80a05073b375/wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998", size = 38824, upload-time = "2025-01-14T10:34:22.999Z" },
+    { url = "https://files.pythonhosted.org/packages/36/89/0aae34c10fe524cce30fe5fc433210376bce94cf74d05b0d68344c8ba46e/wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5", size = 38920, upload-time = "2025-01-14T10:34:25.386Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/24/11c4510de906d77e0cfb5197f1b1445d4fec42c9a39ea853d482698ac681/wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8", size = 88690, upload-time = "2025-01-14T10:34:28.058Z" },
+    { url = "https://files.pythonhosted.org/packages/71/d7/cfcf842291267bf455b3e266c0c29dcb675b5540ee8b50ba1699abf3af45/wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6", size = 80861, upload-time = "2025-01-14T10:34:29.167Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/66/5d973e9f3e7370fd686fb47a9af3319418ed925c27d72ce16b791231576d/wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc", size = 89174, upload-time = "2025-01-14T10:34:31.702Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/d3/8e17bb70f6ae25dabc1aaf990f86824e4fd98ee9cadf197054e068500d27/wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2", size = 86721, upload-time = "2025-01-14T10:34:32.91Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/54/f170dfb278fe1c30d0ff864513cff526d624ab8de3254b20abb9cffedc24/wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b", size = 79763, upload-time = "2025-01-14T10:34:34.903Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/98/de07243751f1c4a9b15c76019250210dd3486ce098c3d80d5f729cba029c/wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504", size = 87585, upload-time = "2025-01-14T10:34:36.13Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/f0/13925f4bd6548013038cdeb11ee2cbd4e37c30f8bfd5db9e5a2a370d6e20/wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a", size = 36676, upload-time = "2025-01-14T10:34:37.962Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/ae/743f16ef8c2e3628df3ddfd652b7d4c555d12c84b53f3d8218498f4ade9b/wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845", size = 38871, upload-time = "2025-01-14T10:34:39.13Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/bc/30f903f891a82d402ffb5fda27ec1d621cc97cb74c16fea0b6141f1d4e87/wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192", size = 56312, upload-time = "2025-01-14T10:34:40.604Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/04/c97273eb491b5f1c918857cd26f314b74fc9b29224521f5b83f872253725/wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b", size = 40062, upload-time = "2025-01-14T10:34:45.011Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/ca/3b7afa1eae3a9e7fefe499db9b96813f41828b9fdb016ee836c4c379dadb/wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0", size = 40155, upload-time = "2025-01-14T10:34:47.25Z" },
+    { url = "https://files.pythonhosted.org/packages/89/be/7c1baed43290775cb9030c774bc53c860db140397047cc49aedaf0a15477/wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306", size = 113471, upload-time = "2025-01-14T10:34:50.934Z" },
+    { url = "https://files.pythonhosted.org/packages/32/98/4ed894cf012b6d6aae5f5cc974006bdeb92f0241775addad3f8cd6ab71c8/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb", size = 101208, upload-time = "2025-01-14T10:34:52.297Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/fd/0c30f2301ca94e655e5e057012e83284ce8c545df7661a78d8bfca2fac7a/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681", size = 109339, upload-time = "2025-01-14T10:34:53.489Z" },
+    { url = "https://files.pythonhosted.org/packages/75/56/05d000de894c4cfcb84bcd6b1df6214297b8089a7bd324c21a4765e49b14/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6", size = 110232, upload-time = "2025-01-14T10:34:55.327Z" },
+    { url = "https://files.pythonhosted.org/packages/53/f8/c3f6b2cf9b9277fb0813418e1503e68414cd036b3b099c823379c9575e6d/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6", size = 100476, upload-time = "2025-01-14T10:34:58.055Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377, upload-time = "2025-01-14T10:34:59.3Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986, upload-time = "2025-01-14T10:35:00.498Z" },
+    { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750, upload-time = "2025-01-14T10:35:03.378Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594, upload-time = "2025-01-14T10:35:44.018Z" },
+]
+
 [[package]]
 name = "wsproto"
 version = "1.2.0"

From 82f5d86944e33aab893a965323ddb15eef7da869 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:28:10 -0700
Subject: [PATCH 2/8] save

---
 eval_protocol/pytest/pytest_utils.py          | 141 ++++++++++--------
 eval_protocol/pytest/types.py                 |  13 +-
 tests/pytest/test_markdown_highlighting.py    |  42 +++---
 tests/pytest/test_pytest_async.py             |   2 +-
 ..._pytest_default_agent_rollout_processor.py |   2 +-
 tests/pytest/test_pytest_input_messages.py    |   2 +-
 tests/pytest/test_pytest_math_example.py      |   4 +-
 .../pytest/test_pytest_math_format_length.py  |   4 +-
 .../pytest/test_pytest_word_count_example.py  |  11 +-
 9 files changed, 123 insertions(+), 98 deletions(-)

diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/pytest_utils.py
index ad04eb1c..50f8349e 100644
--- a/eval_protocol/pytest/pytest_utils.py
+++ b/eval_protocol/pytest/pytest_utils.py
@@ -8,6 +8,7 @@
 from eval_protocol.pytest.types import (
     Dataset,
     DatasetPathParam,
+    EvaluationTestMode,
     InputMessagesParam,
     InputParam,
     ModelParam,
@@ -44,6 +45,39 @@ def _aggregate(scores: List[float], method: str) -> float:
     raise ValueError(f"Unknown aggregation method: {method}")
 
 
+def _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names):
+    """
+    Creates a wrapper function with dynamic parameters for pytest parameterization.
+
+    This function takes a test function and creates a wrapper that:
+    1. Preserves the original function's metadata using functools.wraps
+    2. Creates a new function signature with the specified parameter names that maps to pytest.mark.parametrize decorator
+    3. Returns a callable that can be used with pytest.mark.parametrize
+
+    The function signature is dynamically created to match the parameter names expected by
+    pytest.mark.parametrize, ensuring that pytest can properly map the test parameters
+    to the function arguments.
+
+    Args:
+        test_func: The original test function to wrap
+        wrapper_body: The function body that contains the actual test logic
+        test_param_names: List of parameter names for the dynamic signature
+
+    Returns:
+        A wrapper function with the specified parameter signature that calls wrapper_body
+    """
+    from functools import wraps
+
+    @wraps(test_func)
+    def wrapper(**kwargs):
+        return wrapper_body(**kwargs)
+
+    parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
+    wrapper.__signature__ = inspect.Signature(parameters)
+
+    return wrapper
+
+
 def evaluation_test(
     *,
     model: List[ModelParam],
@@ -57,7 +91,7 @@ def evaluation_test(
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
-    mode: str = "batch",  # "batch" (default) or "pointwise"/"rowwise"
+    mode: EvaluationTestMode = "batch",
 ) -> Callable[
     [TestFunction],
     TestFunction,
@@ -82,7 +116,8 @@ def evaluation_test(
         num_runs: Number of times to repeat the evaluation.
         max_dataset_rows: Limit dataset to the first N rows.
         mode: Evaluation mode. "batch" (default) expects test function to handle
-            full dataset. "pointwise"/"rowwise" applies test function to each row.
+            full dataset. "pointwise" applies test function to each row. If your evaluation requires
+            the full rollout of all rows to compute the score, use "batch".
 
     Usage:
     With an input dataset and input params, the test function will be called with the following arguments:
@@ -128,22 +163,31 @@ def decorator(
         is_async = inspect.iscoroutinefunction(test_func)
 
         sig = inspect.signature(test_func)
-        
+
         # For pointwise/rowwise mode, we expect a different signature
-        if mode in ["pointwise", "rowwise"]:
+        if mode == "pointwise":
             # Pointwise mode: function should accept messages and other row-level params
-            if "messages" not in sig.parameters:
-                raise ValueError(f"In {mode} mode, test_func must have a parameter named 'messages'")
+            if "row" not in sig.parameters:
+                raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'")
+
+            # validate that the function has a return type of EvaluationRow
+            if sig.return_annotation is not EvaluationRow:
+                raise ValueError("In pointwise mode, your eval function must return an EvaluationRow instance")
         else:
             # Batch mode: function should accept input_dataset and model
             if "input_dataset" not in sig.parameters:
-                raise ValueError("test_func must have a parameter named 'input_dataset'")
+                raise ValueError("In batch mode, your eval function must have a parameter named 'input_dataset'")
             if "model" not in sig.parameters:
-                raise ValueError("test_func must have a parameter named 'model'")
+                raise ValueError("In batch mode, your eval function must have a parameter named 'model'")
+
+            # validate that the function has a return type of List[EvaluationRow]
+            if sig.return_annotation is not List[EvaluationRow]:
+                raise ValueError("In batch mode, your eval function must return a list of EvaluationRow instances")
 
         def execute_with_params(
             test_func: TestFunction,
             model: str,
+            row: EvaluationRow | None = None,
             input_dataset: List[EvaluationRow] | None = None,
             input_params: InputParam | None = None,
         ):
@@ -154,6 +198,8 @@ def execute_with_params(
                 kwargs["input_params"] = input_params
             if model is not None:
                 kwargs["model"] = model
+            if row is not None:
+                kwargs["row"] = row
             if is_async:
                 # Handle async functions with proper event loop management
                 try:
@@ -173,19 +219,16 @@ def execute_with_params(
             return results
 
         # Calculate all possible combinations of parameters
-        def generate_combinations(model: List[ModelParam]):
+        def generate_combinations():
             combinations = []
 
-            # Always include models
-            model_list = model
-
             # Handle optional parameters with defaults
             datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None]  # type: ignore
             params: List[Optional[InputParam]] = input_params if input_params is not None else [None]  # type: ignore
             messages: List[Optional[InputMessagesParam]] = input_messages if input_messages is not None else [None]  # type: ignore
 
             # Generate all combinations
-            for m in model_list:
+            for m in model:
                 for ds in datasets:
                     for ip in params:
                         for im in messages:
@@ -199,7 +242,7 @@ def generate_combinations(model: List[ModelParam]):
 
             return combinations
 
-        combinations = generate_combinations(model)
+        combinations = generate_combinations()
 
         # Create parameter tuples for pytest.mark.parametrize
         param_tuples = []
@@ -214,23 +257,14 @@ def generate_combinations(model: List[ModelParam]):
                 param_tuple.append(messages)
             param_tuples.append(tuple(param_tuple))
 
-        # Determine the parameter names for the test function
-        if mode in ["pointwise", "rowwise"]:
-            # For pointwise mode, we generate simpler parameter names
-            test_param_names = ["model"]
-            if input_dataset is not None:
-                test_param_names.append("dataset_path")
-            if input_params is not None:
-                test_param_names.append("input_params")
-        else:
-            # For batch mode, use the original parameter names
-            test_param_names = ["model"]
-            if input_dataset is not None:
-                test_param_names.append("dataset_path")
-            if input_params is not None:
-                test_param_names.append("input_params")
-            if input_messages is not None:
-                test_param_names.append("input_messages")
+        # Batch and pointwise modes share the same pytest parameter names
+        test_param_names = ["model"]
+        if input_dataset is not None:
+            test_param_names.append("dataset_path")
+        if input_params is not None:
+            test_param_names.append("input_params")
+        if input_messages is not None:
+            test_param_names.append("input_messages")
 
         # Create wrapper function with exact signature that pytest expects
         def create_wrapper_with_signature():
@@ -276,9 +310,20 @@ def wrapper_body(**kwargs):
 
                 all_results: List[EvaluationRow] = []
                 for _ in range(num_runs):
-                    if mode in ["pointwise", "rowwise"]:
+                    if mode == "pointwise":
                         # Pointwise mode: apply the evaluator function to each row
-                        results = evaluate(input_dataset, test_func)
+                        for row in input_dataset:
+                            result = execute_with_params(
+                                test_func,
+                                model=model_name,
+                                row=row,
+                                input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
+                            )
+                            if result is None or not isinstance(result, EvaluationRow):
+                                raise ValueError(
+                                    f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
+                                )
+                            all_results.append(result)
                     else:
                         # Batch mode: call the test function with the full dataset
                         results = execute_with_params(
@@ -303,7 +348,7 @@ def wrapper_body(**kwargs):
                             raise ValueError(
                                 f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
                             )
-                    all_results.extend(results)
+                        all_results.extend(results)
 
                 scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
                 agg_score = _aggregate(scores, aggregation_method)
@@ -312,33 +357,7 @@ def wrapper_body(**kwargs):
                         agg_score >= threshold_of_success
                     ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
 
-            # Create a function with the exact signature pytest expects without using exec
-            from functools import wraps
-
-            if mode in ["pointwise", "rowwise"]:
-                # For pointwise mode, create a wrapper that handles everything internally
-                @wraps(test_func)
-                def wrapper(**kwargs):
-                    return wrapper_body(**kwargs)
-                
-                # Give it the pytest-compatible signature but the test_func name
-                parameters = [
-                    inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names
-                ]
-                wrapper.__signature__ = inspect.Signature(parameters)
-                wrapper.__name__ = test_func.__name__.replace('_evaluate', '_dataset') if '_evaluate' in test_func.__name__ else f"test_{test_func.__name__}"
-            else:
-                # For batch mode, use the original wrapper
-                @wraps(test_func)
-                def wrapper(**kwargs):
-                    return wrapper_body(**kwargs)
-
-                parameters = [
-                    inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names
-                ]
-                wrapper.__signature__ = inspect.Signature(parameters)
-
-            return wrapper
+            return _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
 
         wrapper = create_wrapper_with_signature()
         wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(wrapper)
diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py
index 8850a5ea..a1e124c8 100644
--- a/eval_protocol/pytest/types.py
+++ b/eval_protocol/pytest/types.py
@@ -3,7 +3,7 @@
 """
 
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Literal
 
 from ..models import EvaluationRow, Message
 
@@ -13,6 +13,17 @@
 InputMessagesParam = List[Message]
 
 Dataset = List[EvaluationRow]
+
+EvaluationTestMode = Literal["batch", "pointwise"]
+"""
+"batch": (default) expects test function to handle full dataset.
+"pointwise": applies test function to each row.
+
+How to choose between "batch" and "pointwise":
+If your evaluation requires the rollout of all rows to be passed into your eval to compute the score, use "batch".
+If your evaluation can be computed pointwise, use "pointwise" as EP can pipeline the rollouts and evals to be faster.
+"""
+
 """
 Test function types
 """
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index d0029c17..5b4504a0 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -15,56 +15,48 @@
 def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
     Convert entries from markdown dataset to EvaluationRow objects.
-    """    
+    """
     return [
-        EvaluationRow(
-            messages=[Message(role="user", content=row["prompt"])], 
-            ground_truth=str(row["num_highlights"])
-        )
+        EvaluationRow(messages=[Message(role="user", content=row["prompt"])], ground_truth=str(row["num_highlights"]))
         for row in data
     ]
 
 
-def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str]=None, **kwargs) -> EvaluateResult:
+def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str] = None, **kwargs) -> EvaluateResult:
     """
     Evaluation function that checks if the model's response contains the required number of formatted sections.
     """
-    
+
     assistant_response = messages[-1].content
-    
+
     if not assistant_response:
-        return EvaluateResult(
-            score=0.0,
-            reason="❌ No assistant response found"
-        )
-    
+        return EvaluateResult(score=0.0, reason="❌ No assistant response found")
+
     required_highlights = int(ground_truth)
 
     # Check if the response contains the required number of formatted sections
     # e.g. **bold** or *italic*
-    
+
     actual_count = 0
     highlights = re.findall(r"\*[^\n\*]*\*", assistant_response)
     double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", assistant_response)
-    
+
     for highlight in highlights:
         if highlight.strip("*").strip():
             actual_count += 1
     for highlight in double_highlights:
         if highlight.removeprefix("**").removesuffix("**").strip():
             actual_count += 1
-    
+
     meets_requirement = actual_count >= required_highlights
-    
+
     if meets_requirement:
         return EvaluateResult(
-            score=1.0,
-            reason=f"✅ Found {actual_count} highlighted sections (required: {required_highlights})"
+            score=1.0, reason=f"✅ Found {actual_count} highlighted sections (required: {required_highlights})"
         )
     else:
         return EvaluateResult(
-            score=0.0,
-            reason=f"❌ Only found {actual_count} highlighted sections (required: {required_highlights})"
+            score=0.0, reason=f"❌ Only found {actual_count} highlighted sections (required: {required_highlights})"
         )
 
 
@@ -72,13 +64,13 @@ def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str
     input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
     dataset_adapter=markdown_dataset_to_evaluation_row,
     model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
-    input_params=[{"temperature": 0.0, "max_tokens": 4096}],  
+    input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     threshold_of_success=1.0,
     rollout_processor=default_single_turn_rollout_processor,
-    num_runs=1
+    num_runs=1,
 )
-def test_markdown_highlighting_evaluation(input_dataset, input_params, model):
+def test_markdown_highlighting_evaluation(input_dataset, input_params, model) -> List[EvaluationRow]:
     """
     Test markdown highlighting validation using batch mode with evaluate().
     """
-    return evaluate(input_dataset, markdown_format_evaluate) 
\ No newline at end of file
+    return evaluate(input_dataset, markdown_format_evaluate)
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
index e37d43a2..d1d01e86 100644
--- a/tests/pytest/test_pytest_async.py
+++ b/tests/pytest/test_pytest_async.py
@@ -16,6 +16,6 @@
     ],
     model=["accounts/fireworks/models/kimi-k2-instruct"],
 )
-async def test_pytest_async(input_dataset: List[EvaluationRow], model):
+async def test_pytest_async(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return input_dataset
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
index e18dcf32..9d2d910c 100644
--- a/tests/pytest/test_pytest_default_agent_rollout_processor.py
+++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -19,6 +19,6 @@
     rollout_processor=default_agent_rollout_processor,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
 )
-def test_pytest_default_agent_rollout_processor(input_dataset: List[EvaluationRow], model):
+def test_pytest_default_agent_rollout_processor(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return input_dataset
diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
index c6f8d52a..176047ac 100644
--- a/tests/pytest/test_pytest_input_messages.py
+++ b/tests/pytest/test_pytest_input_messages.py
@@ -13,6 +13,6 @@
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_input_messages_in_decorator(input_dataset: List[EvaluationRow], model):
+def test_input_messages_in_decorator(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return input_dataset
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index 267c4705..d9cd5a50 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -1,3 +1,5 @@
+from typing import List
+from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluate, evaluation_test
 from examples.math_example.main import evaluate as math_evaluate
 from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
@@ -12,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_dataset(input_dataset, input_params, model):
+def test_math_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
     return evaluate(input_dataset, math_evaluate)
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
index 589685ab..9182aa45 100644
--- a/tests/pytest/test_pytest_math_format_length.py
+++ b/tests/pytest/test_pytest_math_format_length.py
@@ -1,3 +1,5 @@
+from typing import List
+from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluate, evaluation_test
 from examples.math_with_format_and_length.main import evaluate as math_fl_evaluate
 from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
@@ -12,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_format_length_dataset(input_dataset, input_params, model):
+def test_math_format_length_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
     """Run math with format and length evaluation on sample dataset."""
     return evaluate(input_dataset, math_fl_evaluate)
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index c608d6fb..db2403fa 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -1,6 +1,6 @@
 from typing import List
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.models import EvaluateResult, MetricResult, Message
+from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow
 from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row
 from haikus import haikus
 
@@ -15,15 +15,15 @@
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",  # Use pointwise mode for elegant row-by-row evaluation
 )
-def test_word_count_evaluate(messages: List[Message], **kwargs) -> EvaluateResult:
+def test_word_count_evaluate(row: EvaluationRow) -> EvaluationRow:
     """
     Pointwise word count evaluator - just the core evaluation logic.
     Everything else (models, datasets, thresholds) is parameterized in the decorator.
     """
-    if not messages:
+    if not row.messages:
         return EvaluateResult(score=0.0, reason="No messages found", is_score_valid=False)
 
-    last_message = messages[-1]
+    last_message = row.messages[-1]
     content = last_message.content if last_message and last_message.content else ""
 
     # Word count logic
@@ -73,6 +73,5 @@ def test_word_count_evaluate(messages: List[Message], **kwargs) -> EvaluateResul
     return EvaluateResult(
         score=word_count_score,
         reason=f"Word count: {word_count}. {haiku_metric_reason}",
-        is_score_valid=True,
         metrics=metrics,
-    )
\ No newline at end of file
+    )

From 2616f6151fa21b32a52d7e6fcb3487db357aac78 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:30:35 -0700
Subject: [PATCH 3/8] Refactor async handling in evaluation functions by
 introducing a dedicated _execute_function to streamline execution of both
 async and non-async functions.

---
 eval_protocol/pytest/pytest_utils.py | 68 +++++++++++++++-------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/pytest_utils.py
index 50f8349e..2d923d81 100644
--- a/eval_protocol/pytest/pytest_utils.py
+++ b/eval_protocol/pytest/pytest_utils.py
@@ -21,6 +21,40 @@
 from ..models import EvaluateResult, EvaluationRow
 
 
+def _execute_function(func: Callable, **kwargs) -> Any:
+    """
+    Execute a function with proper async handling.
+
+    This is a pure function that handles both async and non-async function execution
+    with proper event loop management for async functions.
+
+    Args:
+        func: The function to execute
+        **kwargs: Arguments to pass to the function
+
+    Returns:
+        The result of the function execution
+    """
+    is_async = asyncio.iscoroutinefunction(func)
+    if is_async:
+        # Handle async functions with proper event loop management
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                # Use existing loop
+                task = loop.create_task(func(**kwargs))
+                results = loop.run_until_complete(task)
+            else:
+                # Loop is closed, create a new one
+                results = asyncio.run(func(**kwargs))
+        except RuntimeError:
+            # No event loop or other issues, create a new one
+            results = asyncio.run(func(**kwargs))
+    else:
+        results = func(**kwargs)
+    return results
+
+
 def evaluate(
     rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any
 ) -> List[EvaluationRow]:
@@ -200,23 +234,7 @@ def execute_with_params(
                 kwargs["model"] = model
             if row is not None:
                 kwargs["row"] = row
-            if is_async:
-                # Handle async functions with proper event loop management
-                try:
-                    loop = asyncio.get_event_loop()
-                    if not loop.is_closed():
-                        # Use existing loop
-                        task = loop.create_task(test_func(**kwargs))
-                        results = loop.run_until_complete(task)
-                    else:
-                        # Loop is closed, create a new one
-                        results = asyncio.run(test_func(**kwargs))
-                except RuntimeError:
-                    # No event loop or other issues, create a new one
-                    results = asyncio.run(test_func(**kwargs))
-            else:
-                results = test_func(**kwargs)
-            return results
+            return _execute_function(test_func, **kwargs)
 
         # Calculate all possible combinations of parameters
         def generate_combinations():
@@ -291,21 +309,7 @@ def wrapper_body(**kwargs):
                     initial_messages=kwargs.get("input_messages") if "input_messages" in kwargs else [],
                 )
                 for row in data:
-                    is_async = inspect.iscoroutinefunction(rollout_processor)
-                    if is_async:
-                        try:
-                            loop = asyncio.get_event_loop()
-                            if not loop.is_closed():
-                                # Use existing loop
-                                task = loop.create_task(rollout_processor(row, config=config))
-                                processed: List[EvaluationRow] = loop.run_until_complete(task)
-                            else:
-                                processed: List[EvaluationRow] = asyncio.run(rollout_processor(row, config=config))
-                        except RuntimeError:
-                            # No event loop or other issues, create a new one
-                            processed: List[EvaluationRow] = asyncio.run(rollout_processor(row, config=config))
-                    else:
-                        processed: List[EvaluationRow] = rollout_processor(row, config=config)
+                    processed: List[EvaluationRow] = _execute_function(rollout_processor, row=row, config=config)
                     input_dataset.extend(processed)
 
                 all_results: List[EvaluationRow] = []

From 9dc3a2ab419c58eba221106def93f3bb5b24c2b1 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:32:31 -0700
Subject: [PATCH 4/8] Add note to test_word_count_evaluate function explaining
 its purpose and limitations

---
 tests/pytest/test_pytest_word_count_example.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index db2403fa..5a13f6f5 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -19,6 +19,11 @@ def test_word_count_evaluate(row: EvaluationRow) -> EvaluationRow:
     """
     Pointwise word count evaluator - just the core evaluation logic.
     Everything else (models, datasets, thresholds) is parameterized in the decorator.
+
+    NOTE: This function does not make any sense since it just counts the number
+    of words in the last message and computes some haiku analysis but only uses
+    the word count to compute the score. It exists only to test and show how to
+    write a pointwise evaluation function.
     """
     if not row.messages:
         return EvaluateResult(score=0.0, reason="No messages found", is_score_valid=False)

From 1722e0f0bd63ccdb7b0108277d66b61a7c6e2fbf Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:39:03 -0700
Subject: [PATCH 5/8] Refactor evaluation_test function to enforce parameter
 type validation for pointwise and batch modes, updating tests to use 'rows'
 instead of 'input_dataset' for consistency.

---
 eval_protocol/pytest/pytest_utils.py             | 16 +++++++++++-----
 tests/pytest/test_markdown_highlighting.py       |  5 ++---
 tests/pytest/test_pytest_async.py                | 10 +++++-----
 ...est_pytest_default_agent_rollout_processor.py | 14 +++++++-------
 tests/pytest/test_pytest_input_messages.py       |  8 ++++----
 tests/pytest/test_pytest_math_example.py         |  4 ++--
 tests/pytest/test_pytest_math_format_length.py   |  4 ++--
 tests/pytest/test_pytest_word_count_example.py   |  1 -
 8 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/pytest_utils.py
index 2d923d81..082a67ac 100644
--- a/eval_protocol/pytest/pytest_utils.py
+++ b/eval_protocol/pytest/pytest_utils.py
@@ -204,15 +204,21 @@ def decorator(
             if "row" not in sig.parameters:
                 raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'")
 
+            # validate that the "row" parameter is annotated as EvaluationRow
+            if sig.parameters["row"].annotation is not EvaluationRow:
+                raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow")
+
             # validate that the function has a return type of EvaluationRow
             if sig.return_annotation is not EvaluationRow:
                 raise ValueError("In pointwise mode, your eval function must return an EvaluationRow instance")
         else:
             # Batch mode: function should accept input_dataset and model
-            if "input_dataset" not in sig.parameters:
-                raise ValueError("In batch mode, your eval function must have a parameter named 'input_dataset'")
-            if "model" not in sig.parameters:
-                raise ValueError("In batch mode, your eval function must have a parameter named 'model'")
+            if "rows" not in sig.parameters:
+                raise ValueError("In batch mode, your eval function must have a parameter named 'rows'")
+
+            # validate that the "rows" parameter is annotated as List[EvaluationRow]
+            if sig.parameters["rows"].annotation is not List[EvaluationRow]:
+                raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]")
 
             # validate that the function has a return type of List[EvaluationRow]
             if sig.return_annotation is not List[EvaluationRow]:
@@ -227,7 +233,7 @@ def execute_with_params(
         ):
             kwargs = {}
             if input_dataset is not None:
-                kwargs["input_dataset"] = list(input_dataset)
+                kwargs["rows"] = input_dataset
             if input_params is not None:
                 kwargs["input_params"] = input_params
             if model is not None:
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index 5b4504a0..e85f0742 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -4,7 +4,6 @@
 This test demonstrates how to check if model responses contain the required number of highlighted sections.
 """
 
-import json
 import re
 from typing import Any, Dict, List, Optional
 
@@ -69,8 +68,8 @@ def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
 )
-def test_markdown_highlighting_evaluation(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_markdown_highlighting_evaluation(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """
     Test markdown highlighting validation using batch mode with evaluate().
     """
-    return evaluate(input_dataset, markdown_format_evaluate)
+    return evaluate(rows, markdown_format_evaluate)
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
index d1d01e86..620683e1 100644
--- a/tests/pytest/test_pytest_async.py
+++ b/tests/pytest/test_pytest_async.py
@@ -1,6 +1,6 @@
 from typing import List
 
-from eval_protocol.models import EvaluationRow
+from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
 from examples.math_example.main import evaluate as math_evaluate
 
@@ -8,14 +8,14 @@
 @evaluation_test(
     input_messages=[
         [
-            {"role": "user", "content": "What is the capital of France?"},
+            Message(role="user", content="What is the capital of France?"),
         ],
         [
-            {"role": "user", "content": "What is the capital of the moon?"},
+            Message(role="user", content="What is the capital of the moon?"),
         ],
     ],
     model=["accounts/fireworks/models/kimi-k2-instruct"],
 )
-async def test_pytest_async(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
index 9d2d910c..06762046 100644
--- a/tests/pytest/test_pytest_default_agent_rollout_processor.py
+++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -1,24 +1,24 @@
 from datetime import datetime
 from typing import List
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import Message, EvaluationRow
 from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test
 
 
 @evaluation_test(
     input_messages=[
         [
-            {
-                "role": "user",
-                "content": "Can you give a summary of the past week in the 'general, model-requests, bug-reports, questions, and feature-requests' channels. For EVERY message or thread has not been resolved, please list them at the end of your response in a table. Be sure to include the exact message, severity, and current status so far. Current Date & Time: {current_date_time}".format(
+            Message(
+                role="user",
+                content="Can you give a summary of the past week in the 'general, model-requests, bug-reports, questions, and feature-requests' channels. For EVERY message or thread has not been resolved, please list them at the end of your response in a table. Be sure to include the exact message, severity, and current status so far. Current Date & Time: {current_date_time}".format(
                     current_date_time=datetime.now().strftime("%B %d, %Y at %I:%M %p")
                 ),
-            }
+            )
         ]
     ],
     rollout_processor=default_agent_rollout_processor,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
 )
-def test_pytest_default_agent_rollout_processor(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
index 176047ac..c5a59fd8 100644
--- a/tests/pytest/test_pytest_input_messages.py
+++ b/tests/pytest/test_pytest_input_messages.py
@@ -1,18 +1,18 @@
 from typing import List
 
-from eval_protocol.models import EvaluationRow
+from eval_protocol.models import Message, EvaluationRow
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 
 
 @evaluation_test(
     input_messages=[
         [
-            {"role": "user", "content": "What is the capital of France?"},
+            Message(role="user", content="What is the capital of France?"),
         ]
     ],
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_input_messages_in_decorator(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index d9cd5a50..367794a0 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -14,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_math_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return evaluate(input_dataset, math_evaluate)
+    return evaluate(rows, math_evaluate)
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
index 9182aa45..ba5dd60b 100644
--- a/tests/pytest/test_pytest_math_format_length.py
+++ b/tests/pytest/test_pytest_math_format_length.py
@@ -14,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_format_length_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_math_format_length_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math with format and length evaluation on sample dataset."""
-    return evaluate(input_dataset, math_fl_evaluate)
+    return evaluate(rows, math_fl_evaluate)
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index 5a13f6f5..baed6050 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -1,4 +1,3 @@
-from typing import List
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow
 from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row

From 54d9b11aa3cf8d232205ff7d160a049e8c948349 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:42:20 -0700
Subject: [PATCH 6/8] move decorator into its own file

---
 .../{pytest_utils.py => evaluation_test.py}   | 106 +-----------------
 eval_protocol/pytest/utils.py                 |  96 ++++++++++++++++
 2 files changed, 102 insertions(+), 100 deletions(-)
 rename eval_protocol/pytest/{pytest_utils.py => evaluation_test.py} (77%)
 create mode 100644 eval_protocol/pytest/utils.py

diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/evaluation_test.py
similarity index 77%
rename from eval_protocol/pytest/pytest_utils.py
rename to eval_protocol/pytest/evaluation_test.py
index 082a67ac..bddbad6c 100644
--- a/eval_protocol/pytest/pytest_utils.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,9 +1,9 @@
-import asyncio
 import inspect
 from typing import Any, Callable, Dict, List, Optional
 
 import pytest
 
+from eval_protocol.models import EvaluationRow
 from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
 from eval_protocol.pytest.types import (
     Dataset,
@@ -16,100 +16,9 @@
     RolloutProcessorConfig,
     TestFunction,
 )
+from eval_protocol.pytest.utils import aggregate, create_dynamically_parameterized_wrapper, execute_function
 
 from ..common_utils import load_jsonl
-from ..models import EvaluateResult, EvaluationRow
-
-
-def _execute_function(func: Callable, **kwargs) -> Any:
-    """
-    Execute a function with proper async handling.
-
-    This is a pure function that handles both async and non-async function execution
-    with proper event loop management for async functions.
-
-    Args:
-        func: The function to execute
-        **kwargs: Arguments to pass to the function
-
-    Returns:
-        The result of the function execution
-    """
-    is_async = asyncio.iscoroutinefunction(func)
-    if is_async:
-        # Handle async functions with proper event loop management
-        try:
-            loop = asyncio.get_event_loop()
-            if not loop.is_closed():
-                # Use existing loop
-                task = loop.create_task(func(**kwargs))
-                results = loop.run_until_complete(task)
-            else:
-                # Loop is closed, create a new one
-                results = asyncio.run(func(**kwargs))
-        except RuntimeError:
-            # No event loop or other issues, create a new one
-            results = asyncio.run(func(**kwargs))
-    else:
-        results = func(**kwargs)
-    return results
-
-
-def evaluate(
-    rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any
-) -> List[EvaluationRow]:
-    """Apply a reward function to each row and attach the result."""
-    evaluated: List[EvaluationRow] = []
-    for row in rows:
-        result = reward_fn(messages=row.messages, ground_truth=row.ground_truth, **kwargs)
-        row.evaluation_result = result
-        evaluated.append(row)
-    return evaluated
-
-
-def _aggregate(scores: List[float], method: str) -> float:
-    if not scores:
-        return 0.0
-    if method == "mean":
-        return sum(scores) / len(scores)
-    if method == "max":
-        return max(scores)
-    if method == "min":
-        return min(scores)
-    raise ValueError(f"Unknown aggregation method: {method}")
-
-
-def _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names):
-    """
-    Creates a wrapper function with dynamic parameters for pytest parameterization.
-
-    This function takes a test function and creates a wrapper that:
-    1. Preserves the original function's metadata using functools.wraps
-    2. Creates a new function signature with the specified parameter names that maps to pytest.mark.parametrize decorator
-    3. Returns a callable that can be used with pytest.mark.parametrize
-
-    The function signature is dynamically created to match the parameter names expected by
-    pytest.mark.parametrize, ensuring that pytest can properly map the test parameters
-    to the function arguments.
-
-    Args:
-        test_func: The original test function to wrap
-        wrapper_body: The function body that contains the actual test logic
-        test_param_names: List of parameter names for the dynamic signature
-
-    Returns:
-        A wrapper function with the specified parameter signature that calls wrapper_body
-    """
-    from functools import wraps
-
-    @wraps(test_func)
-    def wrapper(**kwargs):
-        return wrapper_body(**kwargs)
-
-    parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names]
-    wrapper.__signature__ = inspect.Signature(parameters)
-
-    return wrapper
 
 
 def evaluation_test(
@@ -193,9 +102,6 @@ def test_func(model_name: str, input_messages: List[List[Message]]):
     def decorator(
         test_func: TestFunction,
     ):
-        # Check if the function is async
-        is_async = inspect.iscoroutinefunction(test_func)
-
         sig = inspect.signature(test_func)
 
         # For pointwise/rowwise mode, we expect a different signature
@@ -240,7 +146,7 @@ def execute_with_params(
                 kwargs["model"] = model
             if row is not None:
                 kwargs["row"] = row
-            return _execute_function(test_func, **kwargs)
+            return execute_function(test_func, **kwargs)
 
         # Calculate all possible combinations of parameters
         def generate_combinations():
@@ -315,7 +221,7 @@ def wrapper_body(**kwargs):
                     initial_messages=kwargs.get("input_messages") if "input_messages" in kwargs else [],
                 )
                 for row in data:
-                    processed: List[EvaluationRow] = _execute_function(rollout_processor, row=row, config=config)
+                    processed: List[EvaluationRow] = execute_function(rollout_processor, row=row, config=config)
                     input_dataset.extend(processed)
 
                 all_results: List[EvaluationRow] = []
@@ -361,13 +267,13 @@ def wrapper_body(**kwargs):
                         all_results.extend(results)
 
                 scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
-                agg_score = _aggregate(scores, aggregation_method)
+                agg_score = aggregate(scores, aggregation_method)
                 if threshold_of_success is not None:
                     assert (
                         agg_score >= threshold_of_success
                     ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
 
-            return _create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
+            return create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names)
 
         wrapper = create_wrapper_with_signature()
         wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(wrapper)
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
new file mode 100644
index 00000000..cfe2f05f
--- /dev/null
+++ b/eval_protocol/pytest/utils.py
@@ -0,0 +1,96 @@
+import asyncio
+import inspect
+from typing import Any, Callable, List
+
+from ..models import EvaluateResult, EvaluationRow
+
+
+def execute_function(func: Callable, **kwargs) -> Any:
+    """
+    Execute a function, driving it to completion if it is a coroutine function.
+
+    Synchronous callables are invoked directly. Coroutine functions are run on
+    the current event loop when one is available and open, otherwise on a fresh
+    loop created by ``asyncio.run``. ``func`` is invoked exactly once: an
+    exception raised inside it (including ``RuntimeError``) propagates to the
+    caller instead of triggering a second execution.
+
+    Args:
+        func: The function to execute
+        **kwargs: Arguments to pass to the function
+
+    Returns:
+        The result of the function execution
+
+    Raises:
+        RuntimeError: If called while the current event loop is already running.
+    """
+    if not asyncio.iscoroutinefunction(func):
+        return func(**kwargs)
+    try:
+        # Only the loop lookup is guarded: it raises RuntimeError when the
+        # thread has no usable loop (e.g. after a previous asyncio.run call).
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = None
+    if loop is None or loop.is_closed():
+        # No reusable loop, so let asyncio.run create and tear down a new one.
+        return asyncio.run(func(**kwargs))
+    return loop.run_until_complete(func(**kwargs))
+
+
+def evaluate(
+    rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any
+) -> List[EvaluationRow]:
+    """Score each row in place with ``reward_fn`` and return the rows in a new list."""
+    # Copy the sequence only; the row objects themselves are shared and mutated.
+    evaluated: List[EvaluationRow] = list(rows)
+    for item in evaluated:
+        score = reward_fn(messages=item.messages, ground_truth=item.ground_truth, **kwargs)
+        item.evaluation_result = score
+    return evaluated
+
+
+def aggregate(scores: List[float], method: str) -> float:
+    if not scores:
+        return 0.0
+    if method == "mean":
+        return sum(scores) / len(scores)
+    # Empty input was answered above, so the builtins below never see an empty list.
+    if method not in ("max", "min"):
+        raise ValueError(f"Unknown aggregation method: {method}")
+    pick = max if method == "max" else min
+    return pick(scores)
+
+
+def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param_names):
+    """
+    Build a pytest-parametrizable wrapper around ``wrapper_body``.
+
+    The returned callable:
+    1. Carries the metadata of ``test_func`` (copied with functools.wraps)
+    2. Exposes a signature made of exactly ``test_param_names``, as pytest.mark.parametrize expects
+    3. Forwards every keyword argument it receives to ``wrapper_body``
+
+    Args:
+        test_func: The original test function whose metadata is preserved
+        wrapper_body: The callable holding the actual test logic
+        test_param_names: Parameter names that make up the dynamic signature
+
+    Returns:
+        The wrapper function, ready to be decorated with pytest.mark.parametrize
+    """
+    from functools import wraps
+
+    positional_or_keyword = inspect.Parameter.POSITIONAL_OR_KEYWORD
+
+    def wrapper(**kwargs):
+        # Delegate every keyword argument to the supplied body unchanged.
+        return wrapper_body(**kwargs)
+
+    wrapper = wraps(test_func)(wrapper)
+    # Advertise the parametrized names so pytest can bind them to this **kwargs function.
+    wrapper.__signature__ = inspect.Signature(
+        [inspect.Parameter(name, positional_or_keyword) for name in test_param_names]
+    )
+    return wrapper

From b95909b7422bfbd0866a742cd1e3b2a0b0e0fcd4 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 12:44:37 -0700
Subject: [PATCH 7/8] fix __init__.py imports after splitting pytest_utils

---
 eval_protocol/pytest/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
index ca0f7feb..8a31438a 100644
--- a/eval_protocol/pytest/__init__.py
+++ b/eval_protocol/pytest/__init__.py
@@ -1,8 +1,9 @@
 from .default_agent_rollout_processor import default_agent_rollout_processor
 from .default_no_op_rollout_process import default_no_op_rollout_processor
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
-from .pytest_utils import evaluate, evaluation_test
+from .evaluation_test import evaluation_test
 from .types import RolloutProcessor, RolloutProcessorConfig
+from .utils import evaluate
 
 __all__ = [
     "default_agent_rollout_processor",

From 56bf3abdf410a2744901c5909334d549037661f9 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Sun, 3 Aug 2025 13:11:36 -0700
Subject: [PATCH 8/8] fix: type aggregation_method, drop unused execute_with_params args, return row in word count example

---
 eval_protocol/pytest/evaluation_test.py       | 56 +++----------------
 eval_protocol/pytest/utils.py                 |  7 ++-
 .../pytest/test_pytest_word_count_example.py  |  8 ++-
 3 files changed, 18 insertions(+), 53 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index bddbad6c..8cfd2e1a 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -16,7 +16,12 @@
     RolloutProcessorConfig,
     TestFunction,
 )
-from eval_protocol.pytest.utils import aggregate, create_dynamically_parameterized_wrapper, execute_function
+from eval_protocol.pytest.utils import (
+    AggregationMethod,
+    aggregate,
+    create_dynamically_parameterized_wrapper,
+    execute_function,
+)
 
 from ..common_utils import load_jsonl
 
@@ -29,7 +34,7 @@ def evaluation_test(
     dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
     input_params: Optional[List[InputParam]] = None,
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
-    aggregation_method: str = "mean",
+    aggregation_method: AggregationMethod = "mean",
     threshold_of_success: Optional[float] = None,
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
@@ -58,45 +63,10 @@ def evaluation_test(
             below this threshold.
         num_runs: Number of times to repeat the evaluation.
         max_dataset_rows: Limit dataset to the first N rows.
+        mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
         mode: Evaluation mode. "batch" (default) expects test function to handle
             full dataset. "pointwise" applies test function to each row. If your evaluation requires
             the full rollout of all rows to compute the score, use
-
-    Usage:
-    With an input dataset and input params, the test function will be called with the following arguments:
-
-    ```python
-    @evaluation_test(
-        model=["gpt-4o", "gpt-4o-mini"],
-        input_dataset=["data/test.jsonl"],
-        input_params=[{"temperature": 0.5}],
-        rollout_processor=default_rollout_processor,
-        aggregation_method="mean",
-    )
-    def test_func(dataset_path: str, model_name: str, input_params: Dict[str, Any]):
-        pass
-    ```
-
-    Without an input dataset and input params, the test function will be called with the following arguments:
-
-    ```python
-    @evaluation_test(
-        model=["gpt-4o", "gpt-4o-mini"],
-    )
-    def test_func(model_name: str):
-        pass
-    ```
-
-    With model and input_messages, the test function will be called with the following arguments:
-
-    ```python
-    @evaluation_test(
-        model=["gpt-4o", "gpt-4o-mini"],
-        input_messages=[{"role": "user", "content": "Hello, how are you?"}],
-    )
-    def test_func(model_name: str, input_messages: List[List[Message]]):
-        pass
-    ```
     """
 
     def decorator(
@@ -132,18 +102,12 @@ def decorator(
 
         def execute_with_params(
             test_func: TestFunction,
-            model: str,
             row: EvaluationRow | None = None,
             input_dataset: List[EvaluationRow] | None = None,
-            input_params: InputParam | None = None,
         ):
             kwargs = {}
             if input_dataset is not None:
                 kwargs["rows"] = input_dataset
-            if input_params is not None:
-                kwargs["input_params"] = input_params
-            if model is not None:
-                kwargs["model"] = model
             if row is not None:
                 kwargs["row"] = row
             return execute_function(test_func, **kwargs)
@@ -231,9 +195,7 @@ def wrapper_body(**kwargs):
                         for row in input_dataset:
                             result = execute_with_params(
                                 test_func,
-                                model=model_name,
                                 row=row,
-                                input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
                             )
                             if result is None or not isinstance(result, EvaluationRow):
                                 raise ValueError(
@@ -244,9 +206,7 @@ def wrapper_body(**kwargs):
                         # Batch mode: call the test function with the full dataset
                         results = execute_with_params(
                             test_func,
-                            model=model_name,
                             input_dataset=input_dataset,
-                            input_params=kwargs.get("input_params") if "input_params" in kwargs else None,
                         )
                         if results is None:
                             raise ValueError(
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index cfe2f05f..fccf5f81 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -1,6 +1,6 @@
 import asyncio
 import inspect
-from typing import Any, Callable, List
+from typing import Any, Callable, List, Literal
 
 from ..models import EvaluateResult, EvaluationRow
 
@@ -51,7 +51,10 @@ def evaluate(
     return evaluated
 
 
-def aggregate(scores: List[float], method: str) -> float:
+AggregationMethod = Literal["mean", "max", "min"]
+
+
+def aggregate(scores: List[float], method: AggregationMethod) -> float:
     if not scores:
         return 0.0
     if method == "mean":
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index baed6050..a0cb908c 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -1,7 +1,8 @@
+from haikus import haikus
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow
 from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row
-from haikus import haikus
 
 
 @evaluation_test(
@@ -74,8 +75,9 @@ def test_word_count_evaluate(row: EvaluationRow) -> EvaluationRow:
         ),
     }
 
-    return EvaluateResult(
+    row.evaluation_result = EvaluateResult(
         score=word_count_score,
         reason=f"Word count: {word_count}. {haiku_metric_reason}",
         metrics=metrics,
     )
+    return row