Skip to content

Commit c0137e2

Browse files
author
Dylan Huang
authored
Aggregated metrics part 7 (#58)
* use gpt-oss-120b for fewer rate limits and faster tests * fix TypeError * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix
1 parent d4167ce commit c0137e2

File tree

7 files changed

+45
-68
lines changed

7 files changed

+45
-68
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ def _log_eval_error(
500500
if r.eval_metadata is not None:
501501
r.eval_metadata.status = "finished"
502502
r.eval_metadata.passed = passed
503-
default_logger.log(r)
503+
default_logger.log(r)
504504

505505
# Optional: print and/or persist a summary artifact for CI
506506
try:

eval_protocol/utils/logs_server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,6 @@ def __init__(
248248

249249
# Subscribe to events and start listening for cross-process events
250250
event_bus.subscribe(self._handle_event)
251-
event_bus.start_listening()
252251

253252
logger.info(f"LogsServer initialized on {host}:{port}")
254253

@@ -292,6 +291,7 @@ def start_loops(self):
292291
"""Start the broadcast loop and evaluation watcher."""
293292
self.websocket_manager.start_broadcast_loop()
294293
self.evaluation_watcher.start()
294+
event_bus.start_listening()
295295

296296
async def run_async(self):
297297
"""

examples/aime2025_chat_completion/tests/test_aime2025.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6060
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
6161
rollout_processor=default_single_turn_rollout_processor,
6262
aggregation_method="mean",
63-
threshold_of_success=None,
63+
passed_threshold=None,
6464
num_runs=2,
6565
max_dataset_rows=2,
6666
max_concurrent_rollouts=4,

examples/gpqa/tests/test_gpqa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]:
6969
], # default to low effort; override via CLI plugin
7070
rollout_processor=default_single_turn_rollout_processor,
7171
aggregation_method="mean",
72-
threshold_of_success=None,
72+
passed_threshold=None,
7373
num_runs=8,
7474
mode="pointwise",
7575
)

examples/healthbench/tests/test_evaluation.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
from typing import Dict, List
2-
31
import json
2+
from typing import Dict, List
43

54
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
6-
from eval_protocol.pytest.evaluation_test import evaluation_test
75
from eval_protocol.pytest.default_single_turn_rollout_process import (
86
default_single_turn_rollout_processor,
97
)
8+
from eval_protocol.pytest.evaluation_test import evaluation_test
109

11-
12-
SYSTEM_PROMPT = (
13-
"You are a clinician assistant. Provide safe, accurate guidance."
14-
)
10+
SYSTEM_PROMPT = "You are a clinician assistant. Provide safe, accurate guidance."
1511

1612

1713
# Inline two small HealthBench-like samples and attach tiny rubrics in-memory
@@ -56,7 +52,7 @@
5652
rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}],
5753
rollout_processor=default_single_turn_rollout_processor,
5854
aggregation_method="mean",
59-
threshold_of_success=None,
55+
passed_threshold=None,
6056
num_runs=1,
6157
max_dataset_rows=2,
6258
mode="pointwise",
@@ -91,5 +87,3 @@ def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow:
9187
},
9288
)
9389
return row
94-
95-

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
2828
@evaluation_test(
2929
input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
3030
dataset_adapter=markdown_dataset_to_evaluation_row,
31-
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
31+
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
3333
passed_threshold=0.5,
3434
rollout_processor=default_single_turn_rollout_processor,

tests/test_logs_server.py

Lines changed: 36 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from unittest.mock import AsyncMock, MagicMock, Mock, patch
88

9+
import httpx
910
import psutil
1011
import pytest
1112
from fastapi import FastAPI
@@ -332,10 +333,14 @@ async def test_handle_event(self, temp_build_dir):
332333
# The event should be queued for broadcasting
333334
assert not server.websocket_manager._broadcast_queue.empty()
334335

335-
def test_create_app_factory(self, temp_build_dir):
336+
@pytest.mark.asyncio
337+
async def test_create_app_factory(self, temp_build_dir):
336338
"""Test the create_app factory function."""
337-
app = create_app(build_dir=str(temp_build_dir))
338-
assert isinstance(app, FastAPI)
339+
with patch("eval_protocol.utils.logs_server.LogsServer.start_loops") as mock_start_loops:
340+
app = create_app(build_dir=str(temp_build_dir))
341+
assert isinstance(app, FastAPI)
342+
# Verify that start_loops was called
343+
mock_start_loops.assert_called_once()
339344

340345
def test_serve_logs_convenience_function(self, temp_build_dir):
341346
"""Test the serve_logs convenience function."""
@@ -475,13 +480,11 @@ def test_health_endpoint(self, temp_build_dir_with_files):
475480
assert data["status"] == "ok"
476481

477482
@pytest.mark.asyncio
478-
async def test_server_runs_on_specific_port(self, temp_build_dir_with_files):
479-
"""Integration test: verify that LogsServer actually runs on the specified port (async requests)."""
483+
async def test_server_runs_on_specific_port(self):
484+
"""Integration test: verify that LogsServer runs on specified port and handles port parameters correctly."""
485+
import multiprocessing
480486
import socket
481487

482-
import httpx
483-
484-
# Find an available port for testing
485488
def find_free_port():
486489
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
487490
s.bind(("", 0))
@@ -491,54 +494,34 @@ def find_free_port():
491494

492495
test_port = find_free_port()
493496

494-
# Create and start server in background
495-
server = LogsServer(build_dir=str(temp_build_dir_with_files), port=test_port)
496-
497-
# Start server in background task
498-
server_task = asyncio.create_task(server.run_async())
499-
500-
try:
501-
# Wait longer for server to start and be ready
502-
await asyncio.sleep(3)
503-
504-
async with httpx.AsyncClient() as client:
505-
# Test that we can actually connect to the server on the specified port
506-
response = await client.get(f"http://localhost:{test_port}/", timeout=10)
507-
assert response.status_code == 200
508-
assert "Test" in response.text
497+
# Start server with dynamic port and build_dir
498+
server_process = multiprocessing.Process(target=serve_logs, kwargs={"port": test_port}, daemon=True)
499+
server_process.start()
509500

510-
# Test the health endpoint
511-
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
512-
assert response.status_code == 200
513-
data = response.json()
514-
assert data["status"] == "ok"
515-
516-
finally:
517-
# Clean up
518-
server_task.cancel()
501+
# Wait for server to be ready
502+
for _ in range(20):
519503
try:
520-
await server_task
521-
except asyncio.CancelledError:
504+
response = httpx.get(f"http://localhost:{test_port}/health", timeout=1)
505+
if response.status_code == 200:
506+
break
507+
except httpx.RequestError:
522508
pass
523-
524-
def test_serve_logs_port_parameter_integration(self, temp_build_dir_with_files):
525-
"""Integration test: verify that serve_logs function actually works with port parameter."""
526-
# This test verifies that serve_logs creates LogsServer with the correct port
527-
# without actually starting the server
528-
test_port = 9999
529-
530-
# Use a different approach - mock the LogsServer class and verify the port parameter
531-
with patch("eval_protocol.utils.logs_server.LogsServer") as mock_logs_server_class:
532-
mock_server_instance = Mock()
533-
mock_logs_server_class.return_value = mock_server_instance
534-
535-
# Call serve_logs with specific port
536-
serve_logs(port=test_port)
537-
538-
# Verify that LogsServer was created with the correct port
539-
mock_logs_server_class.assert_called_once_with(port=test_port)
540-
# Verify that the run method was called on the instance
541-
mock_server_instance.run.assert_called_once()
509+
await asyncio.sleep(1)
510+
511+
async with httpx.AsyncClient() as client:
512+
# Test health endpoint
513+
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
514+
assert response.status_code == 200
515+
data = response.json()
516+
assert data["status"] == "ok"
517+
518+
# Clean up server
519+
if server_process.is_alive():
520+
server_process.terminate()
521+
server_process.join(timeout=2)
522+
if server_process.is_alive():
523+
server_process.kill()
524+
server_process.join(timeout=1)
542525

543526

544527
@pytest.mark.asyncio

0 commit comments

Comments
 (0)