Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2cf2406
don't just log and continue anymore—user should know
Aug 7, 2025
a6c9de2
Add PEP 440 versioning support
Aug 7, 2025
c22dcc9
Enhance default_single_turn_rollout_processor to log messages
Aug 7, 2025
5e2a497
Add pytest as a dependency in pyproject.toml
Aug 7, 2025
6e22d35
Remove unused imports in utils.py to clean up the codebase.
Aug 7, 2025
eeda2a6
Add directory utility functions for finding and creating evaluation p…
Aug 7, 2025
429bd8b
Add PID field to EvaluationRow model
Aug 7, 2025
33d054a
Add 'stopped' status to evaluation protocol and update StatusIndicato…
Aug 7, 2025
94b0e46
Update uv.lock to modify pytest dependency and revision number
Aug 7, 2025
1fe3338
Ensure evaluation watcher is running at the start of evaluation tests
Aug 7, 2025
ee9957f
Add optional PID field to EvaluationRowSchema
Aug 7, 2025
1c12956
Enhance evaluation logging and error handling
Aug 7, 2025
a8a193d
Refactor eval_watcher.py to use structured logging
Aug 7, 2025
33d2ce5
Add logging utilities for eval_protocol package
Aug 7, 2025
5ca5d65
Enhance LocalFSDatasetLoggerAdapter to prevent duplicate row IDs
Aug 7, 2025
755c017
Add singleton lock functionality for process management
Aug 7, 2025
d5090e6
Merge branch 'main' into fix-onboarding-flow
Aug 7, 2025
ca126dd
works
Aug 7, 2025
b980964
Enhance JSON line error handling in load_jsonl function
Aug 7, 2025
598c12a
works!
Aug 7, 2025
2f07296
Refactor evaluation_test.py to ensure singleton watcher is initialized
Aug 7, 2025
dab44da
Update CI workflow to ignore test_eval_watcher.py in coverage reports
Aug 7, 2025
bf3786e
Add signal handler to manage zombie processes in eval_watcher.py
Aug 7, 2025
4ff7912
move
Aug 7, 2025
cc0abb2
Enhance singleton lock functionality and file locking in LocalFSDatas…
Aug 7, 2025
0fdd594
build
Aug 7, 2025
e37b078
Add script alias for eval_protocol CLI in pyproject.toml
Aug 7, 2025
698b04d
Fix import path for braintrust adapters in eval_protocol module
Aug 7, 2025
a7b76d4
Update broadcast_file_update method to restrict broadcasting to .json…
Aug 7, 2025
a0a487f
Fix broadcast_file_update logic to ensure only .jsonl files are broad…
Aug 7, 2025
f84360f
remove a bunch of stuff
Aug 7, 2025
7cd374a
remove ignore test that doesn't exist
Aug 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions eval_protocol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,16 @@

import warnings

from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn

from .auth import get_fireworks_account_id, get_fireworks_api_key
from .common_utils import load_jsonl
from .config import RewardKitConfig, get_config, load_config
from .mcp_env import (
AnthropicPolicy,
OpenAIPolicy,
LiteLLMPolicy,
FireworksPolicy,
LiteLLMPolicy,
OpenAIPolicy,
make,
rollout,
test_mcp,
Expand Down
34 changes: 14 additions & 20 deletions eval_protocol/common_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import json
import logging
import re
from typing import Any, Dict, List

logger = logging.getLogger(__name__)


def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Load a JSONL file into a list of dictionaries.

    Args:
        file_path: Path to the .jsonl file to read.

    Returns:
        A list of dictionaries, one per successfully parsed line.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If a line cannot be parsed as JSON. When the offending
            line contains a "row_id" field, the error message includes it to
            help locate the bad record. (``json.JSONDecodeError`` is a
            subclass of ``ValueError``, so callers may catch either.)
    """
    data: List[Dict[str, Any]] = []
    with open(file_path, "r", encoding="utf-8") as f:
        # enumerate from 1 so reported line numbers match editor conventions
        for line_number, line in enumerate(f, start=1):
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line for file {file_path} at line {line_number}")
                # Best effort: surface the row_id of the broken record, if
                # present, so the bad row can be found in large files.
                row_id_index = line.find("row_id")
                if row_id_index != -1:
                    match = re.search(r'"row_id": (.*),', line[row_id_index:])
                    row_id = match.group(1) if match else "<unparseable>"
                    raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
                raise
    return data
55 changes: 55 additions & 0 deletions eval_protocol/dataset_logger/directory_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
from typing import Optional

# Shared constants for directory discovery
EVAL_PROTOCOL_DIR = ".eval_protocol"
PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
DATASETS_DIR = "datasets"


def find_eval_protocol_dir() -> str:
"""
Find the .eval_protocol directory by looking up the directory tree.

Returns:
Path to the .eval_protocol directory
"""
# recursively look up for a .eval_protocol directory
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if os.path.exists(os.path.join(current_dir, EVAL_PROTOCOL_DIR)):
log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)
else:
# if not found, recursively look up until a pyproject.toml or requirements.txt is found
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if any(os.path.exists(os.path.join(current_dir, f)) for f in PYTHON_FILES):
log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)
else:
# get the PWD that this python process is running in
log_dir = os.path.join(os.getcwd(), EVAL_PROTOCOL_DIR)

# create the .eval_protocol directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

return log_dir


def find_eval_protocol_datasets_dir() -> str:
    """
    Return the path to the ``.eval_protocol/datasets`` directory.

    Resolves the .eval_protocol directory via ``find_eval_protocol_dir``
    and ensures the datasets subdirectory exists before returning it.

    Returns:
        Path to the .eval_protocol/datasets directory
    """
    base_dir = find_eval_protocol_dir()
    datasets_dir = os.path.join(base_dir, DATASETS_DIR)
    # Idempotent: creating an already-existing directory is a no-op.
    os.makedirs(datasets_dir, exist_ok=True)
    return datasets_dir
100 changes: 42 additions & 58 deletions eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,26 @@
from datetime import datetime, timezone
import json
import os
import tempfile
import shutil
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional

from eval_protocol.common_utils import load_jsonl
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class LocalFSDatasetLoggerAdapter(DatasetLogger):
"""
Logger that stores logs in the local filesystem.
Logger that stores logs in the local filesystem with file locking to prevent race conditions.
"""

EVAL_PROTOCOL_DIR = ".eval_protocol"
PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
DATASETS_DIR = "datasets"

def __init__(self):
# recursively look up for a .eval_protocol directory
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if os.path.exists(os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)):
self.log_dir = os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)

# if not found, recursively look up until a pyproject.toml or requirements.txt is found
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if any(os.path.exists(os.path.join(current_dir, f)) for f in self.PYTHON_FILES):
self.log_dir = os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)

# get the PWD that this python process is running in
self.log_dir = os.path.join(os.getcwd(), self.EVAL_PROTOCOL_DIR)

# create the .eval_protocol directory if it doesn't exist
os.makedirs(self.log_dir, exist_ok=True)

# create the datasets subdirectory
self.datasets_dir = os.path.join(self.log_dir, self.DATASETS_DIR)
os.makedirs(self.datasets_dir, exist_ok=True)
self.log_dir = os.path.dirname(find_eval_protocol_datasets_dir())
self.datasets_dir = find_eval_protocol_datasets_dir()

# ensure that log file exists
if not os.path.exists(self.current_jsonl_path):
Expand All @@ -68,44 +43,53 @@ def log(self, row: "EvaluationRow") -> None:
"""Log a row, updating existing row with same ID or appending new row."""
row_id = row.input_metadata.row_id

# Check if row with this ID already exists
if os.path.exists(self.current_jsonl_path):
with open(self.current_jsonl_path, "r") as f:
lines = f.readlines()

# Find the line with matching ID
for i, line in enumerate(lines):
try:
line_data = json.loads(line.strip())
if line_data["input_metadata"]["row_id"] == row_id:
# Update existing row
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
with open(self.current_jsonl_path, "w") as f:
f.writelines(lines)
return
except json.JSONDecodeError:
continue

# If no existing row found, append new row
# Check if row with this ID already exists in any JSONL file
if os.path.exists(self.datasets_dir):
for filename in os.listdir(self.datasets_dir):
if filename.endswith(".jsonl"):
file_path = os.path.join(self.datasets_dir, filename)
if os.path.exists(file_path):
with open(file_path, "r") as f:
lines = f.readlines()

# Find the line with matching ID
for i, line in enumerate(lines):
try:
line_data = json.loads(line.strip())
if line_data["input_metadata"]["row_id"] == row_id:
# Update existing row
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
with open(file_path, "w") as f:
f.writelines(lines)
return
except json.JSONDecodeError:
continue

# If no existing row found, append new row to current file
with open(self.current_jsonl_path, "a") as f:
f.write(row.model_dump_json(exclude_none=True) + os.linesep)

def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
"""Read rows from all JSONL files in the datasets directory."""
"""Read rows from all JSONL files in the datasets directory. Also
ensures that there are no duplicate row IDs."""
from eval_protocol.models import EvaluationRow

if not os.path.exists(self.datasets_dir):
return []

all_rows = []
existing_row_ids = set()
for filename in os.listdir(self.datasets_dir):
if filename.endswith(".jsonl"):
file_path = os.path.join(self.datasets_dir, filename)
try:
data = load_jsonl(file_path)
all_rows.extend([EvaluationRow(**r) for r in data])
except Exception:
continue # skip files that can't be read/parsed
data = load_jsonl(file_path)
for r in data:
row = EvaluationRow(**r)
if row.input_metadata.row_id not in existing_row_ids:
existing_row_ids.add(row.input_metadata.row_id)
else:
raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
all_rows.append(row)

if row_id:
# Filter by row_id if specified
Expand Down
133 changes: 133 additions & 0 deletions eval_protocol/get_pep440_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Cache for PEP 440 version string
import subprocess

_version_cache = {"version": None, "base_version": None}


def get_pep440_version(base_version=None):
"""
Generate a PEP 440 compliant version string based on git information.

This function is inspired by versioneer but doesn't require the full versioneer
setup, making it easier for downstream users to adopt without additional dependencies.

The result is cached statically to avoid repeated git calls.

Args:
base_version: The base version string (e.g., "1.0.0"). If None, will try to
find the most recent version tag in git.

Returns:
A PEP 440 compliant version string that includes:
- Development release number (devN) based on commit count since base_version
- Local version identifier with git commit hash
- Dirty indicator if there are uncommitted changes

Examples:
>>> get_pep440_version("1.0.0")
"1.0.0.dev42+g1234567" # 42 commits since 1.0.0, commit hash 1234567
>>> get_pep440_version("1.0.0") # with uncommitted changes
"1.0.0.dev42+g1234567.dirty" # indicates dirty working directory
>>> get_pep440_version("1.0.0") # no git available
"1.0.0+unknown" # indicates git info not available
"""
# Check if we have a cached version for this base_version
if _version_cache["version"] is not None and _version_cache["base_version"] == base_version:
return _version_cache["version"]
try:
# Check if we're in a git repository
subprocess.run(
["git", "rev-parse", "--git-dir"],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)

# If base_version is None, try to find the most recent version tag
if base_version is None:
try:
base_version = subprocess.check_output(
["git", "describe", "--tags", "--abbrev=0"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
except subprocess.CalledProcessError:
# No tags found, we'll handle this case specially
base_version = None

# Get commit count since base_version
if base_version is None:
# No base version (no tags), just count all commits
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
base_version = "0.0.0" # Use this for the final version string
else:
try:
count = subprocess.check_output(
["git", "rev-list", "--count", f"{base_version}..HEAD"],
universal_newlines=True,
stderr=subprocess.DEVNULL,
).strip()
# If no commits found, try counting from the beginning
if count == "0" or not count:
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
except subprocess.CalledProcessError:
# If base_version tag doesn't exist, count all commits
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()

# Get short commit hash
commit_hash = subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()

# Check for uncommitted changes (dirty working directory)
try:
subprocess.run(
["git", "diff-index", "--quiet", "HEAD", "--"],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
dirty_suffix = ""
except subprocess.CalledProcessError:
dirty_suffix = ".dirty"

# Ensure count is a valid integer
try:
dev_count = int(count)
except (ValueError, TypeError):
dev_count = 0

# Build PEP 440 compliant version string
# Format: <base_version>.dev<count>+g<hash>[.dirty]
version_parts = [base_version]

if dev_count > 0:
version_parts.append(f".dev{dev_count}")

version_parts.append(f"+g{commit_hash}")

if dirty_suffix:
version_parts.append(dirty_suffix)

result = "".join(version_parts)

# Cache the result
_version_cache["version"] = result
_version_cache["base_version"] = base_version

return result

except (subprocess.CalledProcessError, FileNotFoundError, OSError):
# Git is not available or not a git repository
result = f"{base_version}+unknown"

# Cache the result
_version_cache["version"] = result
_version_cache["base_version"] = base_version

return result
Loading
Loading