Skip to content

Commit 984e286

Browse files
author
Dylan Huang
authored
aggregated metrics part 8 (move ids under execution_metadata) (#62)
* add --port arg to ep logs * Fix WebSocketManager to reset broadcast task after cancellation * simple tests work * TODO: TestLogsServer * TODO: TestLogsServerIntegration * TODO: test HTML injection - also test TestAsyncWebSocketOperations * add logs server tests * add port parameter tests * use gpt-oss-120b to avoid rate limits * point to port 8000 for dev * whoops * fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --reload" * use gpt-oss-120b since less rate limiting (#57) * Aggregated metrics part 7 (#58) * use gpt-oss-120b for less rate limits and faster tests * fix TypeError * Refactor LogsServer event handling and improve integration tests - Moved event_bus.start_listening() to the correct location in LogsServer to ensure it starts listening during the broadcast loop. - Updated integration tests to use multiprocessing for server startup and improved health check validation. - Enhanced test_create_app_factory to be asynchronous and added necessary imports for better clarity. * Enhance test_create_app_factory to verify LogsServer start_loops call - Updated the test_create_app_factory to mock and assert that the start_loops method of LogsServer is called during app creation. - Ensured the test remains asynchronous and maintains clarity in its assertions. * fix * use active logger * cohort -> experiment * vite build * Update model path in pytest configuration to use gpt-oss-120b for improved performance * move ids under execution_metadata * Update model path in pytest configuration to use gpt-oss-20b for testing adjustments (#63)
1 parent 70916e6 commit 984e286

File tree

11 files changed

+63
-50
lines changed

11 files changed

+63
-50
lines changed

eval_protocol/dataset_logger/sqlite_evaluation_row_store.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ def db_path(self) -> str:
3737
return self._db_path
3838

3939
def upsert_row(self, data: dict) -> None:
40-
rollout_id = data["rollout_id"]
41-
if "rollout_id" not in data:
42-
raise ValueError("rollout_id is required to upsert a row")
40+
rollout_id = data["execution_metadata"]["rollout_id"]
41+
if rollout_id is None:
42+
raise ValueError("execution_metadata.rollout_id is required to upsert a row")
4343
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
4444
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
4545
else:

eval_protocol/models.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,30 @@ class EvalMetadata(BaseModel):
237237
passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
238238

239239

240+
class ExecutionMetadata(BaseModel):
241+
"""Metadata about the execution of the evaluation."""
242+
243+
invocation_id: Optional[str] = Field(
244+
default_factory=generate_id,
245+
description="The ID of the invocation that this row belongs to.",
246+
)
247+
248+
experiment_id: Optional[str] = Field(
249+
default_factory=generate_id,
250+
description="The ID of the experiment that this row belongs to.",
251+
)
252+
253+
rollout_id: Optional[str] = Field(
254+
default_factory=generate_id,
255+
description="The ID of the rollout that this row belongs to.",
256+
)
257+
258+
run_id: Optional[str] = Field(
259+
None,
260+
description=("The ID of the run that this row belongs to."),
261+
)
262+
263+
240264
class RolloutStatus(BaseModel):
241265
"""Status of the rollout."""
242266

@@ -281,26 +305,6 @@ class EvaluationRow(BaseModel):
281305
description="The status of the rollout.",
282306
)
283307

284-
invocation_id: Optional[str] = Field(
285-
default_factory=generate_id,
286-
description="The ID of the invocation that this row belongs to.",
287-
)
288-
289-
experiment_id: Optional[str] = Field(
290-
default_factory=generate_id,
291-
description="The ID of the experiment that this row belongs to.",
292-
)
293-
294-
rollout_id: Optional[str] = Field(
295-
default_factory=generate_id,
296-
description="The ID of the rollout that this row belongs to.",
297-
)
298-
299-
run_id: Optional[str] = Field(
300-
None,
301-
description=("The ID of the run that this row belongs to."),
302-
)
303-
304308
# Ground truth reference (moved from EvaluateResult to top level)
305309
ground_truth: Optional[str] = Field(
306310
default=None, description="Optional ground truth reference for this evaluation."
@@ -311,6 +315,11 @@ class EvaluationRow(BaseModel):
311315
default=None, description="The evaluation result for this row/trajectory."
312316
)
313317

318+
execution_metadata: ExecutionMetadata = Field(
319+
default_factory=ExecutionMetadata,
320+
description="Metadata about the execution of the evaluation.",
321+
)
322+
314323
# LLM usage statistics
315324
usage: Optional[CompletionUsage] = Field(
316325
default=None, description="Token usage statistics from LLM calls during execution."

eval_protocol/pytest/evaluation_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,8 @@ def _log_eval_error(
383383
row.input_metadata.session_data["mode"] = mode
384384
# Initialize eval_metadata for each row
385385
row.eval_metadata = eval_metadata
386-
row.experiment_id = experiment_id
387-
row.invocation_id = invocation_id
386+
row.execution_metadata.experiment_id = experiment_id
387+
row.execution_metadata.invocation_id = invocation_id
388388

389389
# has to be done in the pytest main process since it's
390390
# used to determine whether this eval has stopped
@@ -409,11 +409,11 @@ def _log_eval_error(
409409

410410
# apply new run_id to fresh_dataset
411411
for row in fresh_dataset:
412-
row.run_id = run_id
412+
row.execution_metadata.run_id = run_id
413413

414414
# generate new rollout_id for each row
415415
for row in fresh_dataset:
416-
row.rollout_id = generate_id()
416+
row.execution_metadata.rollout_id = generate_id()
417417

418418
# log the fresh_dataset
419419
for row in fresh_dataset:

tests/dataset_logger/test_sqlite_dataset_logger_adapter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def test_update_log_and_read():
2424

2525
logger = SqliteDatasetLoggerAdapter(store=store)
2626
logger.log(row)
27-
saved = logger.read(row.rollout_id)[0]
27+
saved = logger.read(row.execution_metadata.rollout_id)[0]
2828
assert row.messages == saved.messages
2929
assert row.input_metadata == saved.input_metadata
3030

@@ -42,7 +42,7 @@ def test_create_log_and_read():
4242
row = EvaluationRow(input_metadata=input_metadata, messages=messages)
4343

4444
logger.log(row)
45-
saved = logger.read(rollout_id=row.rollout_id)[0]
45+
saved = logger.read(rollout_id=row.execution_metadata.rollout_id)[0]
4646
assert row.messages == saved.messages
4747
assert row.input_metadata == saved.input_metadata
4848

tests/pytest/test_pytest_ids.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ def __init__(self):
1212
self._rows: dict[str, EvaluationRow] = {}
1313

1414
def log(self, row: EvaluationRow):
15-
print(row.run_id, row.rollout_id)
16-
self._rows[row.rollout_id] = row
15+
print(row.execution_metadata.run_id, row.execution_metadata.rollout_id)
16+
self._rows[row.execution_metadata.rollout_id] = row
1717

1818
def read(self):
1919
return list(self._rows.values())
@@ -76,10 +76,10 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
7676
logger=InMemoryLogger(),
7777
)
7878
def eval_fn(row: EvaluationRow) -> EvaluationRow:
79-
unique_run_ids.add(row.run_id)
80-
unique_experiment_ids.add(row.experiment_id)
81-
unique_rollout_ids.add(row.rollout_id)
82-
unique_invocation_ids.add(row.invocation_id)
79+
unique_run_ids.add(row.execution_metadata.run_id)
80+
unique_experiment_ids.add(row.execution_metadata.experiment_id)
81+
unique_rollout_ids.add(row.execution_metadata.rollout_id)
82+
unique_invocation_ids.add(row.execution_metadata.invocation_id)
8383
unique_row_ids.add(row.input_metadata.row_id)
8484
return row
8585

tests/pytest/test_pytest_mcp_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
]
2121
],
2222
rollout_processor=default_agent_rollout_processor,
23-
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
23+
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"],
2424
mode="pointwise",
2525
mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json",
2626
)

vite-app/src/GlobalState.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ export class GlobalState {
1414

1515
upsertRows(dataset: EvaluationRow[]) {
1616
dataset.forEach((row) => {
17-
if (!row.rollout_id) {
17+
if (!row.execution_metadata?.rollout_id) {
1818
return;
1919
}
20-
this.dataset[row.rollout_id] = row;
20+
this.dataset[row.execution_metadata.rollout_id] = row;
2121
});
2222
}
2323

vite-app/src/components/EvaluationRow.tsx

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,10 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
133133
<MetadataSection
134134
title="IDs"
135135
data={{
136-
rollout_id: data.rollout_id,
137-
experiment_id: data.experiment_id,
138-
invocation_id: data.invocation_id,
139-
run_id: data.run_id,
136+
rollout_id: data.execution_metadata?.rollout_id,
137+
experiment_id: data.execution_metadata?.experiment_id,
138+
invocation_id: data.execution_metadata?.invocation_id,
139+
run_id: data.execution_metadata?.run_id,
140140
}}
141141
/>
142142
));
@@ -197,7 +197,7 @@ const ExpandedContent = observer(
197197

198198
export const EvaluationRow = observer(
199199
({ row }: { row: EvaluationRowType; index: number }) => {
200-
const rolloutId = row.rollout_id;
200+
const rolloutId = row.execution_metadata?.rollout_id;
201201
const isExpanded = state.isRowExpanded(rolloutId);
202202

203203
const toggleExpanded = () => state.toggleRowExpansion(rolloutId);
@@ -226,7 +226,7 @@ export const EvaluationRow = observer(
226226

227227
{/* Rollout ID */}
228228
<TableCell className="py-3 text-xs">
229-
<RolloutId rolloutId={row.rollout_id} />
229+
<RolloutId rolloutId={row.execution_metadata?.rollout_id} />
230230
</TableCell>
231231

232232
{/* Model */}

vite-app/src/components/EvaluationTable.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ const TableBody = observer(
2020
<TableBodyBase>
2121
{paginatedData.map((row, index) => (
2222
<EvaluationRow
23-
key={row.rollout_id}
23+
key={row.execution_metadata?.rollout_id}
2424
row={row}
2525
index={startIndex + index}
2626
/>

vite-app/src/types/eval-protocol.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,19 @@ export const RolloutStatusSchema = z.object({
9494
error_message: z.string().optional().describe('Error message if the rollout failed.')
9595
});
9696

97+
export const ExecutionMetadataSchema = z.object({
98+
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
99+
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
100+
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
101+
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
102+
});
103+
97104
export const EvaluationRowSchema = z.object({
98105
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
99106
tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
100107
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
101108
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
102-
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
103-
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
104-
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
105-
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
109+
execution_metadata: ExecutionMetadataSchema.optional().describe('Metadata about the execution of the evaluation.'),
106110
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
107111
evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
108112
usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),

0 commit comments

Comments
 (0)