aggregated metrics part 8 (move ids under execution_metadata) (#62)

Dylan Huang · web-flow · commit 984e2869bc65 · 2025-08-11T16:10:23.000-07:00
* add --port arg to ep logs * Fix WebSocketManager to reset broadcast task after cancellation * simple tests work * TODO: TestLogsServer * TODO: TestLogsServerIntegration * TODO: test HTML injection - also test TestAsyncWebSocketOperations * add logs server tests * add port parameter tests * use gpt-oss-120b to avoid rate limits * point to port 8000 for dev * whoops * fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --reload" * use gpt-oss-120b since less rate limiting (#57) * Aggregated metrics part 7 (#58) * use gpt-oss-120b for less rate limits and faster tests * fix TypeError * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix * use active logger * cohort -> experiment * vite build * Update model path in pytest configuration to use gpt-oss-120b for improved performance * move ids under execution_metadata * Update model path in pytest configuration to use gpt-oss-20b for testing adjustments (#63)
diff --git a/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py b/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py
@@ -37,9 +37,9 @@ def db_path(self) -> str:
         return self._db_path
 
     def upsert_row(self, data: dict) -> None:
-        rollout_id = data["rollout_id"]
-        if "rollout_id" not in data:
-            raise ValueError("rollout_id is required to upsert a row")
+        rollout_id = data["execution_metadata"]["rollout_id"]
+        if rollout_id is None:
+            raise ValueError("execution_metadata.rollout_id is required to upsert a row")
         if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
             self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
         else:
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -237,6 +237,30 @@ class EvalMetadata(BaseModel):
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
 
 
+class ExecutionMetadata(BaseModel):
+    """Metadata about the execution of the evaluation."""
+
+    invocation_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the invocation that this row belongs to.",
+    )
+
+    experiment_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the experiment that this row belongs to.",
+    )
+
+    rollout_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the rollout that this row belongs to.",
+    )
+
+    run_id: Optional[str] = Field(
+        None,
+        description=("The ID of the run that this row belongs to."),
+    )
+
+
 class RolloutStatus(BaseModel):
     """Status of the rollout."""
 
@@ -281,26 +305,6 @@ class EvaluationRow(BaseModel):
         description="The status of the rollout.",
     )
 
-    invocation_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the invocation that this row belongs to.",
-    )
-
-    experiment_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the experiment that this row belongs to.",
-    )
-
-    rollout_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the rollout that this row belongs to.",
-    )
-
-    run_id: Optional[str] = Field(
-        None,
-        description=("The ID of the run that this row belongs to."),
-    )
-
     # Ground truth reference (moved from EvaluateResult to top level)
     ground_truth: Optional[str] = Field(
         default=None, description="Optional ground truth reference for this evaluation."
@@ -311,6 +315,11 @@ class EvaluationRow(BaseModel):
         default=None, description="The evaluation result for this row/trajectory."
     )
 
+    execution_metadata: ExecutionMetadata = Field(
+        default_factory=ExecutionMetadata,
+        description="Metadata about the execution of the evaluation.",
+    )
+
     # LLM usage statistics
     usage: Optional[CompletionUsage] = Field(
         default=None, description="Token usage statistics from LLM calls during execution."
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -383,8 +383,8 @@ def _log_eval_error(
                         row.input_metadata.session_data["mode"] = mode
                         # Initialize eval_metadata for each row
                         row.eval_metadata = eval_metadata
-                        row.experiment_id = experiment_id
-                        row.invocation_id = invocation_id
+                        row.execution_metadata.experiment_id = experiment_id
+                        row.execution_metadata.invocation_id = invocation_id
 
                         # has to be done in the pytest main process since it's
                         # used to determine whether this eval has stopped
@@ -409,11 +409,11 @@ def _log_eval_error(
 
                         # apply new run_id to fresh_dataset
                         for row in fresh_dataset:
-                            row.run_id = run_id
+                            row.execution_metadata.run_id = run_id
 
                         # generate new rollout_id for each row
                         for row in fresh_dataset:
-                            row.rollout_id = generate_id()
+                            row.execution_metadata.rollout_id = generate_id()
 
                         # log the fresh_dataset
                         for row in fresh_dataset:
diff --git a/tests/dataset_logger/test_sqlite_dataset_logger_adapter.py b/tests/dataset_logger/test_sqlite_dataset_logger_adapter.py
@@ -24,7 +24,7 @@ def test_update_log_and_read():
 
     logger = SqliteDatasetLoggerAdapter(store=store)
     logger.log(row)
-    saved = logger.read(row.rollout_id)[0]
+    saved = logger.read(row.execution_metadata.rollout_id)[0]
     assert row.messages == saved.messages
     assert row.input_metadata == saved.input_metadata
 
@@ -42,7 +42,7 @@ def test_create_log_and_read():
     row = EvaluationRow(input_metadata=input_metadata, messages=messages)
 
     logger.log(row)
-    saved = logger.read(rollout_id=row.rollout_id)[0]
+    saved = logger.read(rollout_id=row.execution_metadata.rollout_id)[0]
     assert row.messages == saved.messages
     assert row.input_metadata == saved.input_metadata
 
diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py
@@ -12,8 +12,8 @@ def __init__(self):
         self._rows: dict[str, EvaluationRow] = {}
 
     def log(self, row: EvaluationRow):
-        print(row.run_id, row.rollout_id)
-        self._rows[row.rollout_id] = row
+        print(row.execution_metadata.run_id, row.execution_metadata.rollout_id)
+        self._rows[row.execution_metadata.rollout_id] = row
 
     def read(self):
         return list(self._rows.values())
@@ -76,10 +76,10 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
         logger=InMemoryLogger(),
     )
     def eval_fn(row: EvaluationRow) -> EvaluationRow:
-        unique_run_ids.add(row.run_id)
-        unique_experiment_ids.add(row.experiment_id)
-        unique_rollout_ids.add(row.rollout_id)
-        unique_invocation_ids.add(row.invocation_id)
+        unique_run_ids.add(row.execution_metadata.run_id)
+        unique_experiment_ids.add(row.execution_metadata.experiment_id)
+        unique_rollout_ids.add(row.execution_metadata.rollout_id)
+        unique_invocation_ids.add(row.execution_metadata.invocation_id)
         unique_row_ids.add(row.input_metadata.row_id)
         return row
 
diff --git a/tests/pytest/test_pytest_mcp_config.py b/tests/pytest/test_pytest_mcp_config.py
@@ -20,7 +20,7 @@
         ]
     ],
     rollout_processor=default_agent_rollout_processor,
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"],
     mode="pointwise",
     mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json",
 )
diff --git a/vite-app/src/GlobalState.tsx b/vite-app/src/GlobalState.tsx
@@ -14,10 +14,10 @@ export class GlobalState {
 
   upsertRows(dataset: EvaluationRow[]) {
     dataset.forEach((row) => {
-      if (!row.rollout_id) {
+      if (!row.execution_metadata?.rollout_id) {
         return;
       }
-      this.dataset[row.rollout_id] = row;
+      this.dataset[row.execution_metadata.rollout_id] = row;
     });
   }
 
diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx
@@ -133,10 +133,10 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
   <MetadataSection
     title="IDs"
     data={{
-      rollout_id: data.rollout_id,
-      experiment_id: data.experiment_id,
-      invocation_id: data.invocation_id,
-      run_id: data.run_id,
+      rollout_id: data.execution_metadata?.rollout_id,
+      experiment_id: data.execution_metadata?.experiment_id,
+      invocation_id: data.execution_metadata?.invocation_id,
+      run_id: data.execution_metadata?.run_id,
     }}
   />
 ));
@@ -197,7 +197,7 @@ const ExpandedContent = observer(
 
 export const EvaluationRow = observer(
   ({ row }: { row: EvaluationRowType; index: number }) => {
-    const rolloutId = row.rollout_id;
+    const rolloutId = row.execution_metadata?.rollout_id;
     const isExpanded = state.isRowExpanded(rolloutId);
 
     const toggleExpanded = () => state.toggleRowExpansion(rolloutId);
@@ -226,7 +226,7 @@ export const EvaluationRow = observer(
 
           {/* Rollout ID */}
           <TableCell className="py-3 text-xs">
-            <RolloutId rolloutId={row.rollout_id} />
+            <RolloutId rolloutId={row.execution_metadata?.rollout_id} />
           </TableCell>
 
           {/* Model */}
diff --git a/vite-app/src/components/EvaluationTable.tsx b/vite-app/src/components/EvaluationTable.tsx
@@ -20,7 +20,7 @@ const TableBody = observer(
       <TableBodyBase>
         {paginatedData.map((row, index) => (
           <EvaluationRow
-            key={row.rollout_id}
+            key={row.execution_metadata?.rollout_id}
             row={row}
             index={startIndex + index}
           />
diff --git a/vite-app/src/types/eval-protocol.ts b/vite-app/src/types/eval-protocol.ts
@@ -94,15 +94,19 @@ export const RolloutStatusSchema = z.object({
   error_message: z.string().optional().describe('Error message if the rollout failed.')
 });
 
+export const ExecutionMetadataSchema = z.object({
+  invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
+  experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
+  rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
+  run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
+});
+
 export const EvaluationRowSchema = z.object({
   messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
   tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
   input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
   rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
-  invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
-  experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
-  rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
-  run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
+  execution_metadata: ExecutionMetadataSchema.optional().describe('Metadata about the execution of the evaluation.'),
   ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
   evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
   usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
diff --git a/vite-app/src/util/pivot.test.ts b/vite-app/src/util/pivot.test.ts
@@ -189,7 +189,7 @@ describe('computePivot', () => {
 
     const res = computePivot({
       data: rows,
-      rowFields: ['$.eval_metadata.name', '$.experiment_id'],
+      rowFields: ['$.eval_metadata.name', '$.execution_metadata.experiment_id'],
       columnFields: ['$.input_metadata.completion_params.model'],
       valueField: '$.evaluation_result.score',
       aggregator: 'avg',

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@`
`20`	`20`	`]`
`21`	`21`	`],`
`22`	`22`	`rollout_processor=default_agent_rollout_processor,`
`23`		`- model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],`
	`23`	`+ model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"],`
`24`	`24`	`mode="pointwise",`
`25`	`25`	`mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json",`
`26`	`26`	`)`
Original file line number	Diff line number	Diff line change
`@@ -14,10 +14,10 @@ export class GlobalState {`
`14`	`14`
`15`	`15`	`upsertRows(dataset: EvaluationRow[]) {`
`16`	`16`	`dataset.forEach((row) => {`
`17`		`- if (!row.rollout_id) {`
	`17`	`+ if (!row.execution_metadata?.rollout_id) {`
`18`	`18`	`return;`
`19`	`19`	`}`
`20`		`- this.dataset[row.rollout_id] = row;`
	`20`	`+ this.dataset[row.execution_metadata.rollout_id] = row;`
`21`	`21`	`});`
`22`	`22`	`}`
`23`	`23`