Skip to content

Commit c0137e2

Browse files
author
Dylan Huang
authored
Aggregated metrics part 7 (#58)
* use gpt-oss-120b for fewer rate limits and faster tests * fix TypeError * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix
1 parent d4167ce commit c0137e2

File tree

7 files changed

+45
-68
lines changed

7 files changed

+45
-68
lines changed

eval_protocol/pytest/evaluation_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ def _log_eval_error(
500500
if r.eval_metadata is not None:
501501
r.eval_metadata.status = "finished"
502502
r.eval_metadata.passed = passed
503-
default_logger.log(r)
503+
default_logger.log(r)
504504

505505
# Optional: print and/or persist a summary artifact for CI
506506
try:

eval_protocol/utils/logs_server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,6 @@ def __init__(
248248

249249
# Subscribe to events and start listening for cross-process events
250250
event_bus.subscribe(self._handle_event)
251-
event_bus.start_listening()
252251

253252
logger.info(f"LogsServer initialized on {host}:{port}")
254253

@@ -292,6 +291,7 @@ def start_loops(self):
292291
"""Start the broadcast loop and evaluation watcher."""
293292
self.websocket_manager.start_broadcast_loop()
294293
self.evaluation_watcher.start()
294+
event_bus.start_listening()
295295

296296
async def run_async(self):
297297
"""

examples/aime2025_chat_completion/tests/test_aime2025.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
6060
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
6161
rollout_processor=default_single_turn_rollout_processor,
6262
aggregation_method="mean",
63-
threshold_of_success=None,
63+
passed_threshold=None,
6464
num_runs=2,
6565
max_dataset_rows=2,
6666
max_concurrent_rollouts=4,

examples/gpqa/tests/test_gpqa.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]:
6969
], # default to low effort; override via CLI plugin
7070
rollout_processor=default_single_turn_rollout_processor,
7171
aggregation_method="mean",
72-
threshold_of_success=None,
72+
passed_threshold=None,
7373
num_runs=8,
7474
mode="pointwise",
7575
)

examples/healthbench/tests/test_evaluation.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
1-
from typing import Dict, List
2-
31
import json
2+
from typing import Dict, List
43

54
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
6-
from eval_protocol.pytest.evaluation_test import evaluation_test
75
from eval_protocol.pytest.default_single_turn_rollout_process import (
86
default_single_turn_rollout_processor,
97
)
8+
from eval_protocol.pytest.evaluation_test import evaluation_test
109

11-
12-
SYSTEM_PROMPT = (
13-
"You are a clinician assistant. Provide safe, accurate guidance."
14-
)
10+
SYSTEM_PROMPT = "You are a clinician assistant. Provide safe, accurate guidance."
1511

1612

1713
# Inline two small HealthBench-like samples and attach tiny rubrics in-memory
@@ -56,7 +52,7 @@
5652
rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}],
5753
rollout_processor=default_single_turn_rollout_processor,
5854
aggregation_method="mean",
59-
threshold_of_success=None,
55+
passed_threshold=None,
6056
num_runs=1,
6157
max_dataset_rows=2,
6258
mode="pointwise",
@@ -91,5 +87,3 @@ def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow:
9187
},
9288
)
9389
return row
94-
95-

tests/pytest/test_markdown_highlighting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
2828
@evaluation_test(
2929
input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
3030
dataset_adapter=markdown_dataset_to_evaluation_row,
31-
model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
31+
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
3232
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
3333
passed_threshold=0.5,
3434
rollout_processor=default_single_turn_rollout_processor,

tests/test_logs_server.py

Lines changed: 36 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
from unittest.mock import AsyncMock, MagicMock, Mock, patch
88

9+
import httpx
910
import psutil
1011
import pytest
1112
from fastapi import FastAPI
@@ -332,10 +333,14 @@ async def test_handle_event(self, temp_build_dir):
332333
# The event should be queued for broadcasting
333334
assert not server.websocket_manager._broadcast_queue.empty()
334335

335-
def test_create_app_factory(self, temp_build_dir):
336+
@pytest.mark.asyncio
337+
async def test_create_app_factory(self, temp_build_dir):
336338
"""Test the create_app factory function."""
337-
app = create_app(build_dir=str(temp_build_dir))
338-
assert isinstance(app, FastAPI)
339+
with patch("eval_protocol.utils.logs_server.LogsServer.start_loops") as mock_start_loops:
340+
app = create_app(build_dir=str(temp_build_dir))
341+
assert isinstance(app, FastAPI)
342+
# Verify that start_loops was called
343+
mock_start_loops.assert_called_once()
339344

340345
def test_serve_logs_convenience_function(self, temp_build_dir):
341346
"""Test the serve_logs convenience function."""
@@ -475,13 +480,11 @@ def test_health_endpoint(self, temp_build_dir_with_files):
475480
assert data["status"] == "ok"
476481

477482
@pytest.mark.asyncio
478-
async def test_server_runs_on_specific_port(self, temp_build_dir_with_files):
479-
"""Integration test: verify that LogsServer actually runs on the specified port (async requests)."""
483+
async def test_server_runs_on_specific_port(self):
484+
"""Integration test: verify that LogsServer runs on specified port and handles port parameters correctly."""
485+
import multiprocessing
480486
import socket
481487

482-
import httpx
483-
484-
# Find an available port for testing
485488
def find_free_port():
486489
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
487490
s.bind(("", 0))
@@ -491,54 +494,34 @@ def find_free_port():
491494

492495
test_port = find_free_port()
493496

494-
# Create and start server in background
495-
server = LogsServer(build_dir=str(temp_build_dir_with_files), port=test_port)
496-
497-
# Start server in background task
498-
server_task = asyncio.create_task(server.run_async())
499-
500-
try:
501-
# Wait longer for server to start and be ready
502-
await asyncio.sleep(3)
503-
504-
async with httpx.AsyncClient() as client:
505-
# Test that we can actually connect to the server on the specified port
506-
response = await client.get(f"http://localhost:{test_port}/", timeout=10)
507-
assert response.status_code == 200
508-
assert "Test" in response.text
497+
# Start server with dynamic port and build_dir
498+
server_process = multiprocessing.Process(target=serve_logs, kwargs={"port": test_port}, daemon=True)
499+
server_process.start()
509500

510-
# Test the health endpoint
511-
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
512-
assert response.status_code == 200
513-
data = response.json()
514-
assert data["status"] == "ok"
515-
516-
finally:
517-
# Clean up
518-
server_task.cancel()
501+
# Wait for server to be ready
502+
for _ in range(20):
519503
try:
520-
await server_task
521-
except asyncio.CancelledError:
504+
response = httpx.get(f"http://localhost:{test_port}/health", timeout=1)
505+
if response.status_code == 200:
506+
break
507+
except httpx.RequestError:
522508
pass
523-
524-
def test_serve_logs_port_parameter_integration(self, temp_build_dir_with_files):
525-
"""Integration test: verify that serve_logs function actually works with port parameter."""
526-
# This test verifies that serve_logs creates LogsServer with the correct port
527-
# without actually starting the server
528-
test_port = 9999
529-
530-
# Use a different approach - mock the LogsServer class and verify the port parameter
531-
with patch("eval_protocol.utils.logs_server.LogsServer") as mock_logs_server_class:
532-
mock_server_instance = Mock()
533-
mock_logs_server_class.return_value = mock_server_instance
534-
535-
# Call serve_logs with specific port
536-
serve_logs(port=test_port)
537-
538-
# Verify that LogsServer was created with the correct port
539-
mock_logs_server_class.assert_called_once_with(port=test_port)
540-
# Verify that the run method was called on the instance
541-
mock_server_instance.run.assert_called_once()
509+
await asyncio.sleep(1)
510+
511+
async with httpx.AsyncClient() as client:
512+
# Test health endpoint
513+
response = await client.get(f"http://localhost:{test_port}/health", timeout=10)
514+
assert response.status_code == 200
515+
data = response.json()
516+
assert data["status"] == "ok"
517+
518+
# Clean up server
519+
if server_process.is_alive():
520+
server_process.terminate()
521+
server_process.join(timeout=2)
522+
if server_process.is_alive():
523+
server_process.kill()
524+
server_process.join(timeout=1)
542525

543526

544527
@pytest.mark.asyncio

0 commit comments

Comments
 (0)