Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2cf2406
don't just log and continue anymore—user should know
Aug 7, 2025
a6c9de2
Add PEP 440 versioning support
Aug 7, 2025
c22dcc9
Enhance default_single_turn_rollout_processor to log messages
Aug 7, 2025
5e2a497
Add pytest as a dependency in pyproject.toml
Aug 7, 2025
6e22d35
Remove unused imports in utils.py to clean up the codebase.
Aug 7, 2025
eeda2a6
Add directory utility functions for finding and creating evaluation p…
Aug 7, 2025
429bd8b
Add PID field to EvaluationRow model
Aug 7, 2025
33d054a
Add 'stopped' status to evaluation protocol and update StatusIndicato…
Aug 7, 2025
94b0e46
Update uv.lock to modify pytest dependency and revision number
Aug 7, 2025
1fe3338
Ensure evaluation watcher is running at the start of evaluation tests
Aug 7, 2025
ee9957f
Add optional PID field to EvaluationRowSchema
Aug 7, 2025
1c12956
Enhance evaluation logging and error handling
Aug 7, 2025
a8a193d
Refactor eval_watcher.py to use structured logging
Aug 7, 2025
33d2ce5
Add logging utilities for eval_protocol package
Aug 7, 2025
5ca5d65
Enhance LocalFSDatasetLoggerAdapter to prevent duplicate row IDs
Aug 7, 2025
755c017
Add singleton lock functionality for process management
Aug 7, 2025
d5090e6
Merge branch 'main' into fix-onboarding-flow
Aug 7, 2025
ca126dd
works
Aug 7, 2025
b980964
Enhance JSON line error handling in load_jsonl function
Aug 7, 2025
598c12a
works!
Aug 7, 2025
2f07296
Refactor evaluation_test.py to ensure singleton watcher is initialized
Aug 7, 2025
dab44da
Update CI workflow to ignore test_eval_watcher.py in coverage reports
Aug 7, 2025
bf3786e
Add signal handler to manage zombie processes in eval_watcher.py
Aug 7, 2025
4ff7912
move
Aug 7, 2025
cc0abb2
Enhance singleton lock functionality and file locking in LocalFSDatas…
Aug 7, 2025
0fdd594
build
Aug 7, 2025
e37b078
Add script alias for eval_protocol CLI in pyproject.toml
Aug 7, 2025
698b04d
Fix import path for braintrust adapters in eval_protocol module
Aug 7, 2025
a7b76d4
Update broadcast_file_update method to restrict broadcasting to .json…
Aug 7, 2025
a0a487f
Fix broadcast_file_update logic to ensure only .jsonl files are broad…
Aug 7, 2025
f84360f
remove a bunch of stuff
Aug 7, 2025
7cd374a
remove ignore test that doesn't exist
Aug 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions eval_protocol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,16 @@

import warnings

from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn

from .auth import get_fireworks_account_id, get_fireworks_api_key
from .common_utils import load_jsonl
from .config import RewardKitConfig, get_config, load_config
from .mcp_env import (
AnthropicPolicy,
OpenAIPolicy,
LiteLLMPolicy,
FireworksPolicy,
LiteLLMPolicy,
OpenAIPolicy,
make,
rollout,
test_mcp,
Expand Down
34 changes: 14 additions & 20 deletions eval_protocol/common_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import json
import logging
import re
from typing import Any, Dict, List

logger = logging.getLogger(__name__)


def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Load a JSONL file into a list of dictionaries.

    Args:
        file_path: Path to the .jsonl file to read.

    Returns:
        A list of dictionaries, one per successfully parsed line.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If a line cannot be parsed as JSON. When the offending
            line contains a "row_id" field, the error message includes it to
            help locate the bad record. (``json.JSONDecodeError`` is a
            subclass of ``ValueError``, so callers may catch either.)
    """
    data: List[Dict[str, Any]] = []
    with open(file_path, "r", encoding="utf-8") as f:
        # enumerate from 1 so reported line numbers match editor conventions
        for line_number, line in enumerate(f, start=1):
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line for file {file_path} at line {line_number}")
                # Best effort: surface the row_id of the broken record, if
                # present, so the bad row can be found in large files.
                row_id_index = line.find("row_id")
                if row_id_index != -1:
                    match = re.search(r'"row_id": (.*),', line[row_id_index:])
                    row_id = match.group(1) if match else "<unparseable>"
                    raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
                raise
    return data
55 changes: 55 additions & 0 deletions eval_protocol/dataset_logger/directory_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
from typing import Optional

# Shared constants for directory discovery
EVAL_PROTOCOL_DIR = ".eval_protocol"
PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
DATASETS_DIR = "datasets"


def find_eval_protocol_dir() -> str:
"""
Find the .eval_protocol directory by looking up the directory tree.

Returns:
Path to the .eval_protocol directory
"""
# recursively look up for a .eval_protocol directory
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if os.path.exists(os.path.join(current_dir, EVAL_PROTOCOL_DIR)):
log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)
else:
# if not found, recursively look up until a pyproject.toml or requirements.txt is found
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if any(os.path.exists(os.path.join(current_dir, f)) for f in PYTHON_FILES):
log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)
else:
# get the PWD that this python process is running in
log_dir = os.path.join(os.getcwd(), EVAL_PROTOCOL_DIR)

# create the .eval_protocol directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)

return log_dir


def find_eval_protocol_datasets_dir() -> str:
    """
    Return the path to the ``.eval_protocol/datasets`` directory.

    Resolves the .eval_protocol directory via ``find_eval_protocol_dir``
    and ensures the datasets subdirectory exists before returning it.

    Returns:
        Path to the .eval_protocol/datasets directory
    """
    base_dir = find_eval_protocol_dir()
    datasets_dir = os.path.join(base_dir, DATASETS_DIR)
    # Idempotent: creating an already-existing directory is a no-op.
    os.makedirs(datasets_dir, exist_ok=True)
    return datasets_dir
100 changes: 42 additions & 58 deletions eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,26 @@
from datetime import datetime, timezone
import json
import os
import tempfile
import shutil
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional

from eval_protocol.common_utils import load_jsonl
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class LocalFSDatasetLoggerAdapter(DatasetLogger):
"""
Logger that stores logs in the local filesystem.
Logger that stores logs in the local filesystem with file locking to prevent race conditions.
"""

EVAL_PROTOCOL_DIR = ".eval_protocol"
PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
DATASETS_DIR = "datasets"

def __init__(self):
# recursively look up for a .eval_protocol directory
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if os.path.exists(os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)):
self.log_dir = os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)

# if not found, recursively look up until a pyproject.toml or requirements.txt is found
current_dir = os.path.dirname(os.path.abspath(__file__))
while current_dir != "/":
if any(os.path.exists(os.path.join(current_dir, f)) for f in self.PYTHON_FILES):
self.log_dir = os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
break
current_dir = os.path.dirname(current_dir)

# get the PWD that this python process is running in
self.log_dir = os.path.join(os.getcwd(), self.EVAL_PROTOCOL_DIR)

# create the .eval_protocol directory if it doesn't exist
os.makedirs(self.log_dir, exist_ok=True)

# create the datasets subdirectory
self.datasets_dir = os.path.join(self.log_dir, self.DATASETS_DIR)
os.makedirs(self.datasets_dir, exist_ok=True)
self.log_dir = os.path.dirname(find_eval_protocol_datasets_dir())
self.datasets_dir = find_eval_protocol_datasets_dir()

# ensure that log file exists
if not os.path.exists(self.current_jsonl_path):
Expand All @@ -68,44 +43,53 @@ def log(self, row: "EvaluationRow") -> None:
"""Log a row, updating existing row with same ID or appending new row."""
row_id = row.input_metadata.row_id

# Check if row with this ID already exists
if os.path.exists(self.current_jsonl_path):
with open(self.current_jsonl_path, "r") as f:
lines = f.readlines()

# Find the line with matching ID
for i, line in enumerate(lines):
try:
line_data = json.loads(line.strip())
if line_data["input_metadata"]["row_id"] == row_id:
# Update existing row
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
with open(self.current_jsonl_path, "w") as f:
f.writelines(lines)
return
except json.JSONDecodeError:
continue

# If no existing row found, append new row
# Check if row with this ID already exists in any JSONL file
if os.path.exists(self.datasets_dir):
for filename in os.listdir(self.datasets_dir):
if filename.endswith(".jsonl"):
file_path = os.path.join(self.datasets_dir, filename)
if os.path.exists(file_path):
with open(file_path, "r") as f:
lines = f.readlines()

# Find the line with matching ID
for i, line in enumerate(lines):
try:
line_data = json.loads(line.strip())
if line_data["input_metadata"]["row_id"] == row_id:
# Update existing row
lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
with open(file_path, "w") as f:
f.writelines(lines)
return
except json.JSONDecodeError:
continue

# If no existing row found, append new row to current file
with open(self.current_jsonl_path, "a") as f:
f.write(row.model_dump_json(exclude_none=True) + os.linesep)

def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
"""Read rows from all JSONL files in the datasets directory."""
"""Read rows from all JSONL files in the datasets directory. Also
ensures that there are no duplicate row IDs."""
from eval_protocol.models import EvaluationRow

if not os.path.exists(self.datasets_dir):
return []

all_rows = []
existing_row_ids = set()
for filename in os.listdir(self.datasets_dir):
if filename.endswith(".jsonl"):
file_path = os.path.join(self.datasets_dir, filename)
try:
data = load_jsonl(file_path)
all_rows.extend([EvaluationRow(**r) for r in data])
except Exception:
continue # skip files that can't be read/parsed
data = load_jsonl(file_path)
for r in data:
row = EvaluationRow(**r)
if row.input_metadata.row_id not in existing_row_ids:
existing_row_ids.add(row.input_metadata.row_id)
else:
raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
all_rows.append(row)

if row_id:
# Filter by row_id if specified
Expand Down
133 changes: 133 additions & 0 deletions eval_protocol/get_pep440_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Cache for PEP 440 version string
import subprocess

_version_cache = {"version": None, "base_version": None}


def get_pep440_version(base_version=None):
"""
Generate a PEP 440 compliant version string based on git information.

This function is inspired by versioneer but doesn't require the full versioneer
setup, making it easier for downstream users to adopt without additional dependencies.

The result is cached statically to avoid repeated git calls.

Args:
base_version: The base version string (e.g., "1.0.0"). If None, will try to
find the most recent version tag in git.

Returns:
A PEP 440 compliant version string that includes:
- Development release number (devN) based on commit count since base_version
- Local version identifier with git commit hash
- Dirty indicator if there are uncommitted changes

Examples:
>>> get_pep440_version("1.0.0")
"1.0.0.dev42+g1234567" # 42 commits since 1.0.0, commit hash 1234567
>>> get_pep440_version("1.0.0") # with uncommitted changes
"1.0.0.dev42+g1234567.dirty" # indicates dirty working directory
>>> get_pep440_version("1.0.0") # no git available
"1.0.0+unknown" # indicates git info not available
"""
# Check if we have a cached version for this base_version
if _version_cache["version"] is not None and _version_cache["base_version"] == base_version:
return _version_cache["version"]
try:
# Check if we're in a git repository
subprocess.run(
["git", "rev-parse", "--git-dir"],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)

# If base_version is None, try to find the most recent version tag
if base_version is None:
try:
base_version = subprocess.check_output(
["git", "describe", "--tags", "--abbrev=0"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
except subprocess.CalledProcessError:
# No tags found, we'll handle this case specially
base_version = None

# Get commit count since base_version
if base_version is None:
# No base version (no tags), just count all commits
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
base_version = "0.0.0" # Use this for the final version string
else:
try:
count = subprocess.check_output(
["git", "rev-list", "--count", f"{base_version}..HEAD"],
universal_newlines=True,
stderr=subprocess.DEVNULL,
).strip()
# If no commits found, try counting from the beginning
if count == "0" or not count:
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()
except subprocess.CalledProcessError:
# If base_version tag doesn't exist, count all commits
count = subprocess.check_output(
["git", "rev-list", "--count", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()

# Get short commit hash
commit_hash = subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"], universal_newlines=True, stderr=subprocess.DEVNULL
).strip()

# Check for uncommitted changes (dirty working directory)
try:
subprocess.run(
["git", "diff-index", "--quiet", "HEAD", "--"],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
dirty_suffix = ""
except subprocess.CalledProcessError:
dirty_suffix = ".dirty"

# Ensure count is a valid integer
try:
dev_count = int(count)
except (ValueError, TypeError):
dev_count = 0

# Build PEP 440 compliant version string
# Format: <base_version>.dev<count>+g<hash>[.dirty]
version_parts = [base_version]

if dev_count > 0:
version_parts.append(f".dev{dev_count}")

version_parts.append(f"+g{commit_hash}")

if dirty_suffix:
version_parts.append(dirty_suffix)

result = "".join(version_parts)

# Cache the result
_version_cache["version"] = result
_version_cache["base_version"] = base_version

return result

except (subprocess.CalledProcessError, FileNotFoundError, OSError):
# Git is not available or not a git repository
result = f"{base_version}+unknown"

# Cache the result
_version_cache["version"] = result
_version_cache["base_version"] = base_version

return result
Loading
Loading