Skip to content

Commit ac8b739

Browse files
authored
Merge branch 'main' into support-mcp-server-session-data-reset
2 parents a3f9ca1 + 0fb7071 commit ac8b739

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

50 files changed

+5913
-104
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ cython_debug/
160160
# Specific to this project (can be kept or reviewed)
161161
firectl
162162
references
163+
.eval_protocol
163164

164165

165166
samples.json # If this is a specific file to ignore

.vscode/launch.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,18 @@
3434
"env": {
3535
"PYTHONPATH": "${workspaceFolder}"
3636
}
37+
},
38+
{
39+
"name": "Python: Debug Logs Server (Uvicorn)",
40+
"type": "python",
41+
"request": "launch",
42+
"module": "uvicorn",
43+
"args": ["eval_protocol.utils.logs_server:app", "--reload"],
44+
"console": "integratedTerminal",
45+
"justMyCode": false,
46+
"env": {
47+
"PYTHONPATH": "${workspaceFolder}"
48+
}
3749
}
3850
]
3951
}

eval_protocol/cli.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
logger = logging.getLogger(__name__)
1616

17+
1718
from eval_protocol.evaluation import create_evaluation, preview_evaluation
1819

1920
from .cli_commands.agent_eval_cmd import agent_eval_command
@@ -26,6 +27,7 @@
2627
from .cli_commands.deploy_mcp import deploy_mcp_command
2728
from .cli_commands.preview import preview_command
2829
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
30+
from .cli_commands.logs import logs_command
2931

3032

3133
def parse_args(args=None):
@@ -285,6 +287,39 @@ def parse_args(args=None):
285287
help="Override the number of parallel rollouts to execute for each task.",
286288
)
287289

290+
# Logs command
291+
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292+
logs_parser.add_argument(
293+
"--build-dir",
294+
default="dist",
295+
help="Path to the Vite build output directory (default: dist)",
296+
)
297+
logs_parser.add_argument(
298+
"--host",
299+
default="localhost",
300+
help="Host to bind the server to (default: localhost)",
301+
)
302+
logs_parser.add_argument(
303+
"--port",
304+
type=int,
305+
default=4789,
306+
help="Port to bind the server to (default: 4789)",
307+
)
308+
logs_parser.add_argument(
309+
"--index-file",
310+
default="index.html",
311+
help="Name of the main index file (default: index.html)",
312+
)
313+
logs_parser.add_argument(
314+
"--watch-paths",
315+
help="Comma-separated list of paths to watch for file changes (default: current directory)",
316+
)
317+
logs_parser.add_argument(
318+
"--reload",
319+
action="store_true",
320+
help="Enable auto-reload (default: False)",
321+
)
322+
288323
# Run command (for Hydra-based evaluations)
289324
# This subparser intentionally defines no arguments itself.
290325
# All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -338,6 +373,8 @@ def main():
338373
return deploy_mcp_command(args)
339374
elif args.command == "agent-eval":
340375
return agent_eval_command(args)
376+
elif args.command == "logs":
377+
return logs_command(args)
341378
elif args.command == "run":
342379
# For the 'run' command, Hydra takes over argument parsing.
343380

eval_protocol/cli_commands/logs.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""
2+
CLI command for serving logs with file watching and real-time updates.
3+
"""
4+
5+
import sys
6+
from pathlib import Path
7+
8+
from ..utils.logs_server import serve_logs
9+
10+
11+
def logs_command(args):
    """Serve logs with file watching and real-time updates.

    Args:
        args: Parsed argparse namespace providing ``host``, ``port``,
            ``watch_paths`` (comma-separated string or None) and ``reload``.

    Returns:
        int: Process exit code — 0 on clean shutdown (including Ctrl+C),
        1 if the server failed to start.
    """
    # Parse the comma-separated --watch-paths value into a clean list,
    # dropping empty entries (e.g. from trailing commas).
    watch_paths = None
    if args.watch_paths:
        watch_paths = [path.strip() for path in args.watch_paths.split(",") if path.strip()]

    # Plain string, not an f-string: there are no placeholders (lint F541).
    print("🚀 Starting Eval Protocol Logs Server")
    print(f"🌐 URL: http://{args.host}:{args.port}")
    print(f"🔌 WebSocket: ws://{args.host}:{args.port}/ws")
    print(f"👀 Watching paths: {watch_paths or ['current directory']}")
    print("Press Ctrl+C to stop the server")
    print("-" * 50)

    try:
        serve_logs(
            host=args.host,
            port=args.port,
            watch_paths=watch_paths,
            reload=args.reload,
        )
        return 0
    except KeyboardInterrupt:
        print("\n🛑 Server stopped by user")
        return 0
    except Exception as e:
        # Report failures on stderr so scripts can separate them from the
        # informational banner on stdout.
        print(f"❌ Error starting server: {e}", file=sys.stderr)
        return 1
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from eval_protocol.dataset_logger.local_fs_dataset_logger_adapter import LocalFSDatasetLoggerAdapter

# Shared module-level logger instance: importing code uses this default
# rather than constructing its own adapter (constructed once at import time).
default_logger = LocalFSDatasetLoggerAdapter()
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from abc import ABC, abstractmethod
2+
from typing import TYPE_CHECKING, List, Optional
3+
4+
if TYPE_CHECKING:
5+
from eval_protocol.models import EvaluationRow
6+
7+
8+
class DatasetLogger(ABC):
    """Interface for persisting and retrieving :class:`EvaluationRow` logs.

    Concrete subclasses decide where and how rows are stored and must
    implement both :meth:`log` and :meth:`read`.
    """

    @abstractmethod
    def log(self, row: "EvaluationRow") -> None:
        """
        Persist a single evaluation row.

        Args:
            row (EvaluationRow): The evaluation row to store.
        """

    @abstractmethod
    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """
        Fetch previously stored evaluation rows.

        Args:
            row_id (Optional[str]): When given, restrict the result to rows
                with this row_id.

        Returns:
            List[EvaluationRow]: The retrieved evaluation rows.
        """
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from datetime import datetime, timezone
2+
import json
3+
import os
4+
import tempfile
5+
import shutil
6+
from typing import TYPE_CHECKING, List, Optional
7+
from eval_protocol.common_utils import load_jsonl
8+
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
9+
10+
if TYPE_CHECKING:
11+
from eval_protocol.models import EvaluationRow
12+
13+
14+
class LocalFSDatasetLoggerAdapter(DatasetLogger):
    """
    Logger that stores logs in the local filesystem.

    Rows are written as JSONL files under ``<log_dir>/datasets/<date>.jsonl``
    (one file per UTC day). ``log_dir`` is resolved at construction time, in
    order of preference:

    1. An existing ``.eval_protocol`` directory found by walking up from this
       module's location.
    2. ``.eval_protocol`` next to the nearest ``pyproject.toml`` or
       ``requirements.txt`` (assumed project root).
    3. ``.eval_protocol`` in the current working directory.
    """

    EVAL_PROTOCOL_DIR = ".eval_protocol"
    PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
    DATASETS_DIR = "datasets"

    def __init__(self):
        # NOTE: previously both directory-search loops ran but their result
        # was unconditionally overwritten with the CWD-based path, making the
        # searches dead code. The documented search order is now honored,
        # with CWD kept as the final fallback.
        self.log_dir = self._resolve_log_dir()

        # create the .eval_protocol directory if it doesn't exist
        os.makedirs(self.log_dir, exist_ok=True)

        # create the datasets subdirectory
        self.datasets_dir = os.path.join(self.log_dir, self.DATASETS_DIR)
        os.makedirs(self.datasets_dir, exist_ok=True)

        # ensure that today's log file exists
        if not os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "w") as f:
                f.write("")

    def _resolve_log_dir(self) -> str:
        """Return the log directory path using the search order in the class docstring."""
        # 1) recursively look up for an existing .eval_protocol directory
        current_dir = os.path.dirname(os.path.abspath(__file__))
        while current_dir != "/":
            candidate = os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
            if os.path.exists(candidate):
                return candidate
            current_dir = os.path.dirname(current_dir)

        # 2) recursively look up until a pyproject.toml or requirements.txt is found
        current_dir = os.path.dirname(os.path.abspath(__file__))
        while current_dir != "/":
            if any(os.path.exists(os.path.join(current_dir, f)) for f in self.PYTHON_FILES):
                return os.path.join(current_dir, self.EVAL_PROTOCOL_DIR)
            current_dir = os.path.dirname(current_dir)

        # 3) fall back to the PWD that this python process is running in
        return os.path.join(os.getcwd(), self.EVAL_PROTOCOL_DIR)

    @property
    def current_date(self) -> str:
        # Use UTC timezone to be consistent across local device/locations/CI
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")

    @property
    def current_jsonl_path(self) -> str:
        """
        The current JSONL file path. Based on the current date.
        """
        return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")

    def log(self, row: "EvaluationRow") -> None:
        """Log a row, updating an existing row with the same ID or appending a new one."""
        row_id = row.input_metadata.row_id
        serialized = row.model_dump_json(exclude_none=True)

        # Check if a row with this ID already exists in today's file.
        if os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "r") as f:
                lines = f.readlines()

            # Find the line with a matching ID and rewrite the file in place.
            for i, line in enumerate(lines):
                try:
                    line_data = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue  # skip malformed lines
                # .get() guards against rows that lack the metadata keys
                # (direct indexing could raise an uncaught KeyError).
                if line_data.get("input_metadata", {}).get("row_id") == row_id:
                    # Use "\n", not os.linesep: os.linesep is "\r\n" on
                    # Windows, which breaks the one-object-per-"\n"-line
                    # contract of JSONL.
                    lines[i] = serialized + "\n"
                    with open(self.current_jsonl_path, "w") as out:
                        out.writelines(lines)
                    return

        # If no existing row found, append new row.
        with open(self.current_jsonl_path, "a") as f:
            f.write(serialized + "\n")

    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """Read rows from all JSONL files in the datasets directory."""
        # Imported lazily to avoid an import cycle with eval_protocol.models.
        from eval_protocol.models import EvaluationRow

        if not os.path.exists(self.datasets_dir):
            return []

        all_rows = []
        for filename in os.listdir(self.datasets_dir):
            if filename.endswith(".jsonl"):
                file_path = os.path.join(self.datasets_dir, filename)
                try:
                    data = load_jsonl(file_path)
                    all_rows.extend([EvaluationRow(**r) for r in data])
                except Exception:
                    continue  # skip files that can't be read/parsed

        if row_id:
            # Filter by row_id if specified
            return [row for row in all_rows if getattr(row.input_metadata, "row_id", None) == row_id]
        else:
            return all_rows

eval_protocol/human_id/__init__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import random
2+
import itertools
3+
from typing import Hashable
4+
from . import dictionary
5+
6+
__all__ = ["generate_id"]

# OS-entropy-backed RNG used whenever no explicit seed is supplied.
system_random = random.SystemRandom()
9+
10+
11+
def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=4) -> str:
    """
    Generate a human readable ID

    :param separator: The string to use to separate words
    :param seed: The seed to use. The same seed will produce the same ID
    :param word_count: The number of words to use. Minimum of 3.
    :return: A human readable ID
    :raises ValueError: If word_count is lower than 3
    """
    if word_count < 3:
        raise ValueError("word_count cannot be lower than 3")

    # Compare against None, not truthiness: falsy seeds such as 0, 0.0 or ""
    # are valid seeds and must still produce deterministic output.
    random_obj = random.Random(seed) if seed is not None else system_random

    # Always one verb, one adjective and one noun; distribute any extra
    # words randomly across the three categories.
    word_counts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}
    for _ in range(3, word_count):
        word_counts[random_obj.choice(list(word_counts.keys()))] += 1

    # Sample without replacement within each category, then flatten the
    # per-category samples into a single word sequence.
    words = itertools.chain.from_iterable(
        random_obj.sample(category, count) for category, count in word_counts.items()
    )
    return separator.join(words)

0 commit comments

Comments
 (0)