Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0ff42c5
initial server file watching works
Aug 5, 2025
01cc196
save
Aug 5, 2025
c7ee212
fix error
Aug 5, 2025
33e2663
include dist/ as part of production packaging
Aug 5, 2025
0f8dc1f
add favicon
Aug 5, 2025
9f14be6
implement logger and use in agent rollout processor
Aug 5, 2025
4eeb2c1
log rows after evals too
Aug 5, 2025
290714f
add data
Aug 5, 2025
041e04f
Merge branch 'main' into in-progress-eval-viewer
Aug 5, 2025
9b1e528
feedback -> reason
Aug 5, 2025
c0c437b
add zod types for eval protocol
Aug 5, 2025
742f522
save. fix reloading
Aug 5, 2025
265016a
Update FastAPI dependency to version 0.116.1 and refactor LogsServer …
Aug 5, 2025
bc23c88
Refactor InputMetadata to make row_id optional and update type annota…
Aug 5, 2025
02d44af
save
Aug 5, 2025
960d9a3
Add Python debug configuration for Uvicorn logs server and refactor L…
Aug 5, 2025
5155849
fix
Aug 5, 2025
975bc76
fix
Aug 5, 2025
ee56fc7
save
Aug 5, 2025
011834a
remove unnecessary test
Aug 5, 2025
cc28ff4
remove classnames
Aug 5, 2025
3dc82dc
make threshold of success lower
Aug 5, 2025
32d70b1
remove old links
Aug 5, 2025
779c2ed
revert
Aug 5, 2025
e831eac
add mobx
Aug 5, 2025
eb01dc5
save
Aug 5, 2025
58d8150
set threshold lower
Aug 5, 2025
7caa4d7
data is being set properly
Aug 5, 2025
ac83ce6
refactor App component layout and styling for improved UI
Aug 6, 2025
98f7474
looks good
Aug 6, 2025
97f7cd7
save
Aug 6, 2025
aee6e46
refactor state management by moving GlobalState to a separate file
Aug 6, 2025
50e9967
more updates
Aug 6, 2025
81fc0fc
use table
Aug 6, 2025
d87da9d
refactor Dashboard component to simplify dataset summary display by r…
Aug 6, 2025
242a39e
log for debugging
Aug 6, 2025
ae5e4ae
refactor Row component to replace MessageBubble with ChatInterface an…
Aug 6, 2025
6bcd294
Add logo image and update App component header
Aug 6, 2025
6a04326
Update chat width calculation to set maximum width as 80% of containe…
Aug 6, 2025
1159eae
Refactor Row component to use MetadataSection for displaying Evaluati…
Aug 6, 2025
6dca6e3
first pass eval metadata
Aug 6, 2025
332f25b
add more fields
Aug 6, 2025
75934d2
save
Aug 6, 2025
9be63e2
Refactor ChatInterface component to improve resize handle functionali…
Aug 6, 2025
8e154eb
Add 'passed' field to EvalMetadata and update evaluation logic to det…
Aug 6, 2025
ce3eead
Update Row component to change created date text color from gray-900 …
Aug 6, 2025
0409ef0
Refactor StatusIndicator component to accept status prop, enhancing f…
Aug 6, 2025
c4b9b3c
Refactor Agent class to accept EvaluationRow directly in the construc…
Aug 6, 2025
ffe942e
fix rollout
Aug 6, 2025
a0cb830
Refactor evaluation_test to improve error handling and metadata initi…
Aug 6, 2025
afcc280
Add .eval_protocol to .gitignore and remove outdated JSONL dataset fi…
Aug 6, 2025
ee20bfe
Update test_pytest_mcp_config.py docstring to clarify test purpose fo…
Aug 6, 2025
7ea7007
Remove redundant Metadata header from Row component for cleaner UI.
Aug 6, 2025
3cd2729
Update ChatInterface component to adjust maximum width calculation fo…
Aug 6, 2025
5626b88
make messages more dense
Aug 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ cython_debug/
# Specific to this project (can be kept or reviewed)
firectl
references
.eval_protocol


samples.json # If this is a specific file to ignore
Expand Down
12 changes: 12 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@
"env": {
"PYTHONPATH": "${workspaceFolder}"
}
},
{
"name": "Python: Debug Logs Server (Uvicorn)",
"type": "python",
"request": "launch",
"module": "uvicorn",
"args": ["eval_protocol.utils.logs_server:app", "--reload"],
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"PYTHONPATH": "${workspaceFolder}"
}
}
]
}
37 changes: 37 additions & 0 deletions eval_protocol/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

logger = logging.getLogger(__name__)


from eval_protocol.evaluation import create_evaluation, preview_evaluation

from .cli_commands.agent_eval_cmd import agent_eval_command
Expand All @@ -26,6 +27,7 @@
from .cli_commands.deploy_mcp import deploy_mcp_command
from .cli_commands.preview import preview_command
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
from .cli_commands.logs import logs_command


def parse_args(args=None):
Expand Down Expand Up @@ -285,6 +287,39 @@ def parse_args(args=None):
help="Override the number of parallel rollouts to execute for each task.",
)

# Logs command
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
logs_parser.add_argument(
"--build-dir",
default="dist",
help="Path to the Vite build output directory (default: dist)",
)
logs_parser.add_argument(
"--host",
default="localhost",
help="Host to bind the server to (default: localhost)",
)
logs_parser.add_argument(
"--port",
type=int,
default=4789,
help="Port to bind the server to (default: 4789)",
)
logs_parser.add_argument(
"--index-file",
default="index.html",
help="Name of the main index file (default: index.html)",
)
logs_parser.add_argument(
"--watch-paths",
help="Comma-separated list of paths to watch for file changes (default: current directory)",
)
logs_parser.add_argument(
"--reload",
action="store_true",
help="Enable auto-reload (default: False)",
)

# Run command (for Hydra-based evaluations)
# This subparser intentionally defines no arguments itself.
# All arguments after 'run' will be passed to Hydra by parse_known_args.
Expand Down Expand Up @@ -338,6 +373,8 @@ def main():
return deploy_mcp_command(args)
elif args.command == "agent-eval":
return agent_eval_command(args)
elif args.command == "logs":
return logs_command(args)
elif args.command == "run":
# For the 'run' command, Hydra takes over argument parsing.

Expand Down
40 changes: 40 additions & 0 deletions eval_protocol/cli_commands/logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
CLI command for serving logs with file watching and real-time updates.
"""

import sys
from pathlib import Path

from ..utils.logs_server import serve_logs


def logs_command(args):
    """Serve logs with file watching and real-time updates.

    Args:
        args: Parsed argparse namespace. Uses ``host``, ``port``,
            ``watch_paths`` (comma-separated string or None) and ``reload``.

    Returns:
        int: Exit code — 0 on clean shutdown (including Ctrl+C),
        1 if the server failed to start.
    """
    # Split the comma-separated --watch-paths value, dropping empty entries.
    watch_paths = None
    if args.watch_paths:
        watch_paths = [p.strip() for p in args.watch_paths.split(",") if p.strip()]

    # Plain string literals where there is nothing to interpolate; the
    # originals carried f-prefixes with no placeholders (lint F541).
    print("🚀 Starting Eval Protocol Logs Server")
    print(f"🌐 URL: http://{args.host}:{args.port}")
    print(f"🔌 WebSocket: ws://{args.host}:{args.port}/ws")
    print(f"👀 Watching paths: {watch_paths or ['current directory']}")
    print("Press Ctrl+C to stop the server")
    print("-" * 50)

    try:
        serve_logs(
            host=args.host,
            port=args.port,
            watch_paths=watch_paths,
            reload=args.reload,
        )
        return 0
    except KeyboardInterrupt:
        print("\n🛑 Server stopped by user")
        return 0
    except Exception as e:
        # Broad catch is deliberate at this CLI boundary: report and exit.
        print(f"❌ Error starting server: {e}")
        return 1
3 changes: 3 additions & 0 deletions eval_protocol/dataset_logger/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter

# Module-level singleton used as the default sink for evaluation rows,
# backed by the local-filesystem adapter (writes JSONL files under
# a .eval_protocol/datasets directory).
default_logger = LocalFSDatasetLoggerAdapter()
35 changes: 35 additions & 0 deletions eval_protocol/dataset_logger/dataset_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class DatasetLogger(ABC):
    """
    Interface for persisting and querying EvaluationRow records.

    Concrete adapters (e.g. a local-filesystem implementation) decide
    where and how the rows are actually stored.
    """

    @abstractmethod
    def log(self, row: "EvaluationRow") -> None:
        """
        Persist a single EvaluationRow.

        Args:
            row (EvaluationRow): The evaluation row to store.
        """
        ...

    @abstractmethod
    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """
        Fetch previously logged EvaluationRow records.

        Args:
            row_id (Optional[str]): When provided, restrict the result to
                rows with this row_id.

        Returns:
            List[EvaluationRow]: The matching evaluation rows.
        """
        ...
114 changes: 114 additions & 0 deletions eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from datetime import datetime, timezone
import json
import os
import tempfile
import shutil
from typing import TYPE_CHECKING, List, Optional
from eval_protocol.common_utils import load_jsonl
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class LocalFSDatasetLoggerAdapter(DatasetLogger):
    """
    DatasetLogger that stores EvaluationRow records as date-named JSONL files
    under ``<cwd>/.eval_protocol/datasets/``.
    """

    # Name of the base directory created in the working directory.
    EVAL_PROTOCOL_DIR = ".eval_protocol"
    # Marker files that identify a Python project root.
    PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
    # Subdirectory of the base dir holding the JSONL dataset files.
    DATASETS_DIR = "datasets"

    def __init__(self):
        # NOTE(review): the original __init__ contained two ancestor-directory
        # search loops (first for an existing .eval_protocol dir, then for
        # pyproject.toml/requirements.txt) whose results were unconditionally
        # overwritten by the cwd assignment below, making them dead code.
        # They have been removed to match the actual behavior — confirm the
        # process working directory is the intended base location.
        self.log_dir = os.path.join(os.getcwd(), self.EVAL_PROTOCOL_DIR)
        os.makedirs(self.log_dir, exist_ok=True)

        # Datasets live in a subdirectory of the log dir.
        self.datasets_dir = os.path.join(self.log_dir, self.DATASETS_DIR)
        os.makedirs(self.datasets_dir, exist_ok=True)

        # Ensure today's log file exists so readers never hit a missing file.
        if not os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "w"):
                pass

    @property
    def current_date(self) -> str:
        # UTC keeps file naming consistent across machines, locales and CI.
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")

    @property
    def current_jsonl_path(self) -> str:
        """Path of the current JSONL file (one file per UTC date)."""
        return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")

    def log(self, row: "EvaluationRow") -> None:
        """Log a row, updating an existing row with the same ID or appending a new one."""
        row_id = row.input_metadata.row_id
        serialized = row.model_dump_json(exclude_none=True)

        # If a row with this ID already exists, rewrite the file in place.
        if os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "r") as f:
                lines = f.readlines()

            for i, line in enumerate(lines):
                try:
                    line_data = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue  # skip malformed lines
                # Defensive lookup: the original indexed
                # line_data["input_metadata"]["row_id"] directly, which could
                # raise an uncaught KeyError/TypeError on partial records.
                if (line_data.get("input_metadata") or {}).get("row_id") == row_id:
                    # Write "\n", not os.linesep: the file is opened in text
                    # mode, where os.linesep would be translated to "\r\r\n"
                    # on Windows.
                    lines[i] = serialized + "\n"
                    with open(self.current_jsonl_path, "w") as f:
                        f.writelines(lines)
                    return

        # No existing row with this ID: append a new line.
        with open(self.current_jsonl_path, "a") as f:
            f.write(serialized + "\n")

    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """Read rows from all JSONL files in the datasets directory.

        Args:
            row_id (Optional[str]): When provided, only rows whose
                input_metadata.row_id matches are returned.

        Returns:
            List[EvaluationRow]: Parsed rows (empty if the directory is missing).
        """
        from eval_protocol.models import EvaluationRow

        if not os.path.exists(self.datasets_dir):
            return []

        all_rows: List["EvaluationRow"] = []
        for filename in os.listdir(self.datasets_dir):
            if not filename.endswith(".jsonl"):
                continue
            file_path = os.path.join(self.datasets_dir, filename)
            try:
                data = load_jsonl(file_path)
                all_rows.extend(EvaluationRow(**r) for r in data)
            except Exception:
                continue  # best-effort: skip files that can't be read/parsed

        if row_id:
            # Filter by row_id if specified.
            return [row for row in all_rows if getattr(row.input_metadata, "row_id", None) == row_id]
        return all_rows
34 changes: 34 additions & 0 deletions eval_protocol/human_id/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import random
import itertools
from typing import Hashable
from . import dictionary

__all__ = ["generate_id"]

system_random = random.SystemRandom()


def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=4) -> str:
    """
    Generate a human readable ID.

    :param separator: The string to use to separate words
    :param seed: The seed to use. The same seed will produce the same ID.
        Checked with ``is not None`` so falsy seeds such as ``0`` or ``""``
        are honored deterministically.
    :param word_count: The number of words to use. Minimum of 3.
    :return: A human readable ID
    :raises ValueError: If ``word_count`` is lower than 3.
    """
    if word_count < 3:
        raise ValueError("word_count cannot be lower than 3")

    # Use a seeded PRNG only when a seed was explicitly supplied. The
    # original `if seed:` silently fell back to the system RNG for falsy
    # seeds (0, "", b""), breaking the documented determinism contract.
    random_obj = random.Random(seed) if seed is not None else system_random

    # Start with one verb, one adjective and one noun, then distribute any
    # extra words randomly across the three categories.
    counts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}
    for _ in range(3, word_count):
        counts[random_obj.choice(list(counts.keys()))] += 1

    words = itertools.chain.from_iterable(
        random_obj.sample(word_list, count) for word_list, count in counts.items()
    )
    return separator.join(words)
Loading
Loading