Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0ff42c5
initial server file watching works
Aug 5, 2025
01cc196
save
Aug 5, 2025
c7ee212
fix error
Aug 5, 2025
33e2663
include dist/ as part of production packaging
Aug 5, 2025
0f8dc1f
add favicon
Aug 5, 2025
9f14be6
implement logger and use in agent rollout processor
Aug 5, 2025
4eeb2c1
log rows after evals too
Aug 5, 2025
290714f
add data
Aug 5, 2025
041e04f
Merge branch 'main' into in-progress-eval-viewer
Aug 5, 2025
9b1e528
feedback -> reason
Aug 5, 2025
c0c437b
add zod types for eval protocol
Aug 5, 2025
742f522
save. fix reloading
Aug 5, 2025
265016a
Update FastAPI dependency to version 0.116.1 and refactor LogsServer …
Aug 5, 2025
bc23c88
Refactor InputMetadata to make row_id optional and update type annota…
Aug 5, 2025
02d44af
save
Aug 5, 2025
960d9a3
Add Python debug configuration for Uvicorn logs server and refactor L…
Aug 5, 2025
5155849
fix
Aug 5, 2025
975bc76
fix
Aug 5, 2025
ee56fc7
save
Aug 5, 2025
011834a
remove unnecessary test
Aug 5, 2025
cc28ff4
remove classnames
Aug 5, 2025
3dc82dc
make threshold of success lower
Aug 5, 2025
32d70b1
remove old links
Aug 5, 2025
779c2ed
revert
Aug 5, 2025
e831eac
add mobx
Aug 5, 2025
eb01dc5
save
Aug 5, 2025
58d8150
set threshold lower
Aug 5, 2025
7caa4d7
data is being set properly
Aug 5, 2025
ac83ce6
refactor App component layout and styling for improved UI
Aug 6, 2025
98f7474
looks good
Aug 6, 2025
97f7cd7
save
Aug 6, 2025
aee6e46
refactor state management by moving GlobalState to a separate file
Aug 6, 2025
50e9967
more updates
Aug 6, 2025
81fc0fc
use table
Aug 6, 2025
d87da9d
refactor Dashboard component to simplify dataset summary display by r…
Aug 6, 2025
242a39e
log for debugging
Aug 6, 2025
ae5e4ae
refactor Row component to replace MessageBubble with ChatInterface an…
Aug 6, 2025
6bcd294
Add logo image and update App component header
Aug 6, 2025
6a04326
Update chat width calculation to set maximum width as 80% of containe…
Aug 6, 2025
1159eae
Refactor Row component to use MetadataSection for displaying Evaluati…
Aug 6, 2025
6dca6e3
first pass eval metadata
Aug 6, 2025
332f25b
add more fields
Aug 6, 2025
75934d2
save
Aug 6, 2025
9be63e2
Refactor ChatInterface component to improve resize handle functionali…
Aug 6, 2025
8e154eb
Add 'passed' field to EvalMetadata and update evaluation logic to det…
Aug 6, 2025
ce3eead
Update Row component to change created date text color from gray-900 …
Aug 6, 2025
0409ef0
Refactor StatusIndicator component to accept status prop, enhancing f…
Aug 6, 2025
c4b9b3c
Refactor Agent class to accept EvaluationRow directly in the construc…
Aug 6, 2025
ffe942e
fix rollout
Aug 6, 2025
a0cb830
Refactor evaluation_test to improve error handling and metadata initi…
Aug 6, 2025
afcc280
Add .eval_protocol to .gitignore and remove outdated JSONL dataset fi…
Aug 6, 2025
ee20bfe
Update test_pytest_mcp_config.py docstring to clarify test purpose fo…
Aug 6, 2025
7ea7007
Remove redundant Metadata header from Row component for cleaner UI.
Aug 6, 2025
3cd2729
Update ChatInterface component to adjust maximum width calculation fo…
Aug 6, 2025
5626b88
make messages more dense
Aug 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ cython_debug/
# Specific to this project (can be kept or reviewed)
firectl
references
.eval_protocol


samples.json # If this is a specific file to ignore
Expand Down
12 changes: 12 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@
"env": {
"PYTHONPATH": "${workspaceFolder}"
}
},
{
"name": "Python: Debug Logs Server (Uvicorn)",
"type": "python",
"request": "launch",
"module": "uvicorn",
"args": ["eval_protocol.utils.logs_server:app", "--reload"],
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"PYTHONPATH": "${workspaceFolder}"
}
}
]
}
37 changes: 37 additions & 0 deletions eval_protocol/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

logger = logging.getLogger(__name__)


from eval_protocol.evaluation import create_evaluation, preview_evaluation

from .cli_commands.agent_eval_cmd import agent_eval_command
Expand All @@ -26,6 +27,7 @@
from .cli_commands.deploy_mcp import deploy_mcp_command
from .cli_commands.preview import preview_command
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
from .cli_commands.logs import logs_command


def parse_args(args=None):
Expand Down Expand Up @@ -285,6 +287,39 @@ def parse_args(args=None):
help="Override the number of parallel rollouts to execute for each task.",
)

# Logs command
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
logs_parser.add_argument(
"--build-dir",
default="dist",
help="Path to the Vite build output directory (default: dist)",
)
logs_parser.add_argument(
"--host",
default="localhost",
help="Host to bind the server to (default: localhost)",
)
logs_parser.add_argument(
"--port",
type=int,
default=4789,
help="Port to bind the server to (default: 4789)",
)
logs_parser.add_argument(
"--index-file",
default="index.html",
help="Name of the main index file (default: index.html)",
)
logs_parser.add_argument(
"--watch-paths",
help="Comma-separated list of paths to watch for file changes (default: current directory)",
)
logs_parser.add_argument(
"--reload",
action="store_true",
help="Enable auto-reload (default: False)",
)

# Run command (for Hydra-based evaluations)
# This subparser intentionally defines no arguments itself.
# All arguments after 'run' will be passed to Hydra by parse_known_args.
Expand Down Expand Up @@ -338,6 +373,8 @@ def main():
return deploy_mcp_command(args)
elif args.command == "agent-eval":
return agent_eval_command(args)
elif args.command == "logs":
return logs_command(args)
elif args.command == "run":
# For the 'run' command, Hydra takes over argument parsing.

Expand Down
40 changes: 40 additions & 0 deletions eval_protocol/cli_commands/logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
CLI command for serving logs with file watching and real-time updates.
"""

import sys
from pathlib import Path

from ..utils.logs_server import serve_logs


def logs_command(args):
    """Serve logs with file watching and real-time updates.

    Args:
        args: Parsed argparse namespace. Uses ``host``, ``port``,
            ``watch_paths`` (comma-separated string or None) and ``reload``.

    Returns:
        int: Exit code — 0 on clean shutdown (including Ctrl+C),
        1 if the server failed to start.
    """
    # Split the comma-separated --watch-paths value, dropping empty entries.
    watch_paths = None
    if args.watch_paths:
        watch_paths = [p.strip() for p in args.watch_paths.split(",") if p.strip()]

    # Plain string literals where there is nothing to interpolate; the
    # originals carried f-prefixes with no placeholders (lint F541).
    print("🚀 Starting Eval Protocol Logs Server")
    print(f"🌐 URL: http://{args.host}:{args.port}")
    print(f"🔌 WebSocket: ws://{args.host}:{args.port}/ws")
    print(f"👀 Watching paths: {watch_paths or ['current directory']}")
    print("Press Ctrl+C to stop the server")
    print("-" * 50)

    try:
        serve_logs(
            host=args.host,
            port=args.port,
            watch_paths=watch_paths,
            reload=args.reload,
        )
        return 0
    except KeyboardInterrupt:
        print("\n🛑 Server stopped by user")
        return 0
    except Exception as e:
        # Broad catch is deliberate at this CLI boundary: report and exit.
        print(f"❌ Error starting server: {e}")
        return 1
3 changes: 3 additions & 0 deletions eval_protocol/dataset_logger/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter

# Module-level singleton used as the default sink for evaluation rows,
# backed by the local-filesystem adapter (writes JSONL files under
# a .eval_protocol/datasets directory).
default_logger = LocalFSDatasetLoggerAdapter()
35 changes: 35 additions & 0 deletions eval_protocol/dataset_logger/dataset_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class DatasetLogger(ABC):
    """
    Interface for persisting and querying EvaluationRow records.

    Concrete adapters (e.g. a local-filesystem implementation) decide
    where and how the rows are actually stored.
    """

    @abstractmethod
    def log(self, row: "EvaluationRow") -> None:
        """
        Persist a single EvaluationRow.

        Args:
            row (EvaluationRow): The evaluation row to store.
        """
        ...

    @abstractmethod
    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """
        Fetch previously logged EvaluationRow records.

        Args:
            row_id (Optional[str]): When provided, restrict the result to
                rows with this row_id.

        Returns:
            List[EvaluationRow]: The matching evaluation rows.
        """
        ...
114 changes: 114 additions & 0 deletions eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from datetime import datetime, timezone
import json
import os
import tempfile
import shutil
from typing import TYPE_CHECKING, List, Optional
from eval_protocol.common_utils import load_jsonl
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger

if TYPE_CHECKING:
from eval_protocol.models import EvaluationRow


class LocalFSDatasetLoggerAdapter(DatasetLogger):
    """
    DatasetLogger that stores EvaluationRow records as date-named JSONL files
    under ``<cwd>/.eval_protocol/datasets/``.
    """

    # Name of the base directory created in the working directory.
    EVAL_PROTOCOL_DIR = ".eval_protocol"
    # Marker files that identify a Python project root.
    PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
    # Subdirectory of the base dir holding the JSONL dataset files.
    DATASETS_DIR = "datasets"

    def __init__(self):
        # NOTE(review): the original __init__ contained two ancestor-directory
        # search loops (first for an existing .eval_protocol dir, then for
        # pyproject.toml/requirements.txt) whose results were unconditionally
        # overwritten by the cwd assignment below, making them dead code.
        # They have been removed to match the actual behavior — confirm the
        # process working directory is the intended base location.
        self.log_dir = os.path.join(os.getcwd(), self.EVAL_PROTOCOL_DIR)
        os.makedirs(self.log_dir, exist_ok=True)

        # Datasets live in a subdirectory of the log dir.
        self.datasets_dir = os.path.join(self.log_dir, self.DATASETS_DIR)
        os.makedirs(self.datasets_dir, exist_ok=True)

        # Ensure today's log file exists so readers never hit a missing file.
        if not os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "w"):
                pass

    @property
    def current_date(self) -> str:
        # UTC keeps file naming consistent across machines, locales and CI.
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")

    @property
    def current_jsonl_path(self) -> str:
        """Path of the current JSONL file (one file per UTC date)."""
        return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")

    def log(self, row: "EvaluationRow") -> None:
        """Log a row, updating an existing row with the same ID or appending a new one."""
        row_id = row.input_metadata.row_id
        serialized = row.model_dump_json(exclude_none=True)

        # If a row with this ID already exists, rewrite the file in place.
        if os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "r") as f:
                lines = f.readlines()

            for i, line in enumerate(lines):
                try:
                    line_data = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue  # skip malformed lines
                # Defensive lookup: the original indexed
                # line_data["input_metadata"]["row_id"] directly, which could
                # raise an uncaught KeyError/TypeError on partial records.
                if (line_data.get("input_metadata") or {}).get("row_id") == row_id:
                    # Write "\n", not os.linesep: the file is opened in text
                    # mode, where os.linesep would be translated to "\r\r\n"
                    # on Windows.
                    lines[i] = serialized + "\n"
                    with open(self.current_jsonl_path, "w") as f:
                        f.writelines(lines)
                    return

        # No existing row with this ID: append a new line.
        with open(self.current_jsonl_path, "a") as f:
            f.write(serialized + "\n")

    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """Read rows from all JSONL files in the datasets directory.

        Args:
            row_id (Optional[str]): When provided, only rows whose
                input_metadata.row_id matches are returned.

        Returns:
            List[EvaluationRow]: Parsed rows (empty if the directory is missing).
        """
        from eval_protocol.models import EvaluationRow

        if not os.path.exists(self.datasets_dir):
            return []

        all_rows: List["EvaluationRow"] = []
        for filename in os.listdir(self.datasets_dir):
            if not filename.endswith(".jsonl"):
                continue
            file_path = os.path.join(self.datasets_dir, filename)
            try:
                data = load_jsonl(file_path)
                all_rows.extend(EvaluationRow(**r) for r in data)
            except Exception:
                continue  # best-effort: skip files that can't be read/parsed

        if row_id:
            # Filter by row_id if specified.
            return [row for row in all_rows if getattr(row.input_metadata, "row_id", None) == row_id]
        return all_rows
34 changes: 34 additions & 0 deletions eval_protocol/human_id/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import random
import itertools
from typing import Hashable
from . import dictionary

__all__ = ["generate_id"]

system_random = random.SystemRandom()


def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=4) -> str:
    """
    Generate a human readable ID.

    :param separator: The string to use to separate words
    :param seed: The seed to use. The same seed will produce the same ID.
        Checked with ``is not None`` so falsy seeds such as ``0`` or ``""``
        are honored deterministically.
    :param word_count: The number of words to use. Minimum of 3.
    :return: A human readable ID
    :raises ValueError: If ``word_count`` is lower than 3.
    """
    if word_count < 3:
        raise ValueError("word_count cannot be lower than 3")

    # Use a seeded PRNG only when a seed was explicitly supplied. The
    # original `if seed:` silently fell back to the system RNG for falsy
    # seeds (0, "", b""), breaking the documented determinism contract.
    random_obj = random.Random(seed) if seed is not None else system_random

    # Start with one verb, one adjective and one noun, then distribute any
    # extra words randomly across the three categories.
    counts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}
    for _ in range(3, word_count):
        counts[random_obj.choice(list(counts.keys()))] += 1

    words = itertools.chain.from_iterable(
        random_obj.sample(word_list, count) for word_list, count in counts.items()
    )
    return separator.join(words)
Loading
Loading