Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
79ce036
add --port arg to ep logs
Aug 11, 2025
f74699a
Merge branch 'main' into aggregated-metrics-part-5
Aug 11, 2025
3c36fab
Fix WebSocketManager to reset broadcast task after cancellation
Aug 11, 2025
2e0be24
simple tests work
Aug 11, 2025
6cdadf3
TODO: TestLogsServer
Aug 11, 2025
88f0f3a
TODO: TestLogsServerIntegration
Aug 11, 2025
1b42179
TODO: test HTML injection
Aug 11, 2025
b10d403
add logs server tests
Aug 11, 2025
c248b68
add port parameter tests
Aug 11, 2025
050a0a1
use gpt-oss-120b to avoid rate limits
Aug 11, 2025
9f67172
Merge branch 'main' into aggregated-metrics-part-5
Aug 11, 2025
6af5d77
point to port 8000 for dev
Aug 11, 2025
685c86a
woops
Aug 11, 2025
dbd1759
Merge branch 'main' into aggregated-metrics-part-6
Aug 11, 2025
12ec78a
fix "uvicorn eval_protocol.utils.logs_server:create_app --factory --r…
Aug 11, 2025
d4167ce
use gpt-oss-120b since less rate limiting (#57)
Aug 11, 2025
c0137e2
Aggregated metrics part 7 (#58)
Aug 11, 2025
47ba989
use active logger
Aug 11, 2025
38390bf
Merge branch 'main' into aggregated-metrics-part-6
Aug 11, 2025
3e63a43
cohort -> experiment
Aug 11, 2025
287897d
vite build
Aug 11, 2025
adae8f6
Update model path in pytest configuration to use gpt-oss-120b for imp…
Aug 11, 2025
95fcf5f
Merge branch 'main' into aggregated-metrics-part-7
Aug 11, 2025
6609394
Merge branch 'aggregated-metrics-part-7' into use-gpt-oss-for-mcp-con…
Aug 11, 2025
f15542c
move ids under execution_metadata
Aug 11, 2025
5ca5746
Merge branch 'main' into use-gpt-oss-for-mcp-config-test
Aug 11, 2025
a2b3760
Merge branch 'use-gpt-oss-for-mcp-config-test' into aggregated-metric…
Aug 11, 2025
af5e798
Merge branch 'main' into aggregated-metrics-part-8
Aug 11, 2025
19f9b05
Update model path in pytest configuration to use gpt-oss-20b for test…
Aug 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions eval_protocol/dataset_logger/sqlite_evaluation_row_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def db_path(self) -> str:
return self._db_path

def upsert_row(self, data: dict) -> None:
rollout_id = data["rollout_id"]
if "rollout_id" not in data:
raise ValueError("rollout_id is required to upsert a row")
rollout_id = data["execution_metadata"]["rollout_id"]
if rollout_id is None:
raise ValueError("execution_metadata.rollout_id is required to upsert a row")
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
else:
Expand Down
49 changes: 29 additions & 20 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,30 @@ class EvalMetadata(BaseModel):
passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")


class ExecutionMetadata(BaseModel):
    """Metadata about the execution of the evaluation.

    Groups the identifiers that tie a row to the invocation, experiment,
    rollout, and run that produced it. The first three IDs are
    auto-generated via ``generate_id`` when not supplied; ``run_id`` is
    left as ``None`` until a run assigns it.
    """

    invocation_id: Optional[str] = Field(
        default_factory=generate_id,
        description="The ID of the invocation that this row belongs to.",
    )

    experiment_id: Optional[str] = Field(
        default_factory=generate_id,
        description="The ID of the experiment that this row belongs to.",
    )

    rollout_id: Optional[str] = Field(
        default_factory=generate_id,
        description="The ID of the rollout that this row belongs to.",
    )

    # NOTE: intentionally NOT auto-generated, unlike the fields above —
    # callers set it explicitly (e.g. `row.execution_metadata.run_id = run_id`).
    # Normalized to keyword form for consistency with the sibling fields;
    # `Field(default=None, ...)` is behaviorally identical to `Field(None, ...)`.
    run_id: Optional[str] = Field(
        default=None,
        description="The ID of the run that this row belongs to.",
    )


class RolloutStatus(BaseModel):
"""Status of the rollout."""

Expand Down Expand Up @@ -281,26 +305,6 @@ class EvaluationRow(BaseModel):
description="The status of the rollout.",
)

invocation_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the invocation that this row belongs to.",
)

experiment_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the experiment that this row belongs to.",
)

rollout_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the rollout that this row belongs to.",
)

run_id: Optional[str] = Field(
None,
description=("The ID of the run that this row belongs to."),
)

# Ground truth reference (moved from EvaluateResult to top level)
ground_truth: Optional[str] = Field(
default=None, description="Optional ground truth reference for this evaluation."
Expand All @@ -311,6 +315,11 @@ class EvaluationRow(BaseModel):
default=None, description="The evaluation result for this row/trajectory."
)

execution_metadata: ExecutionMetadata = Field(
default_factory=ExecutionMetadata,
description="Metadata about the execution of the evaluation.",
)

# LLM usage statistics
usage: Optional[CompletionUsage] = Field(
default=None, description="Token usage statistics from LLM calls during execution."
Expand Down
8 changes: 4 additions & 4 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,8 @@ def _log_eval_error(
row.input_metadata.session_data["mode"] = mode
# Initialize eval_metadata for each row
row.eval_metadata = eval_metadata
row.experiment_id = experiment_id
row.invocation_id = invocation_id
row.execution_metadata.experiment_id = experiment_id
row.execution_metadata.invocation_id = invocation_id

# has to be done in the pytest main process since it's
# used to determine whether this eval has stopped
Expand All @@ -409,11 +409,11 @@ def _log_eval_error(

# apply new run_id to fresh_dataset
for row in fresh_dataset:
row.run_id = run_id
row.execution_metadata.run_id = run_id

# generate new rollout_id for each row
for row in fresh_dataset:
row.rollout_id = generate_id()
row.execution_metadata.rollout_id = generate_id()

# log the fresh_dataset
for row in fresh_dataset:
Expand Down
4 changes: 2 additions & 2 deletions tests/dataset_logger/test_sqlite_dataset_logger_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_update_log_and_read():

logger = SqliteDatasetLoggerAdapter(store=store)
logger.log(row)
saved = logger.read(row.rollout_id)[0]
saved = logger.read(row.execution_metadata.rollout_id)[0]
assert row.messages == saved.messages
assert row.input_metadata == saved.input_metadata

Expand All @@ -42,7 +42,7 @@ def test_create_log_and_read():
row = EvaluationRow(input_metadata=input_metadata, messages=messages)

logger.log(row)
saved = logger.read(rollout_id=row.rollout_id)[0]
saved = logger.read(rollout_id=row.execution_metadata.rollout_id)[0]
assert row.messages == saved.messages
assert row.input_metadata == saved.input_metadata

Expand Down
12 changes: 6 additions & 6 deletions tests/pytest/test_pytest_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def __init__(self):
self._rows: dict[str, EvaluationRow] = {}

def log(self, row: EvaluationRow):
print(row.run_id, row.rollout_id)
self._rows[row.rollout_id] = row
print(row.execution_metadata.run_id, row.execution_metadata.rollout_id)
self._rows[row.execution_metadata.rollout_id] = row

def read(self):
return list(self._rows.values())
Expand Down Expand Up @@ -76,10 +76,10 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
logger=InMemoryLogger(),
)
def eval_fn(row: EvaluationRow) -> EvaluationRow:
unique_run_ids.add(row.run_id)
unique_experiment_ids.add(row.experiment_id)
unique_rollout_ids.add(row.rollout_id)
unique_invocation_ids.add(row.invocation_id)
unique_run_ids.add(row.execution_metadata.run_id)
unique_experiment_ids.add(row.execution_metadata.experiment_id)
unique_rollout_ids.add(row.execution_metadata.rollout_id)
unique_invocation_ids.add(row.execution_metadata.invocation_id)
unique_row_ids.add(row.input_metadata.row_id)
return row

Expand Down
2 changes: 1 addition & 1 deletion tests/pytest/test_pytest_mcp_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
]
],
rollout_processor=default_agent_rollout_processor,
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"],
mode="pointwise",
mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json",
)
Expand Down
4 changes: 2 additions & 2 deletions vite-app/src/GlobalState.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ export class GlobalState {

upsertRows(dataset: EvaluationRow[]) {
dataset.forEach((row) => {
if (!row.rollout_id) {
if (!row.execution_metadata?.rollout_id) {
return;
}
this.dataset[row.rollout_id] = row;
this.dataset[row.execution_metadata.rollout_id] = row;
});
}

Expand Down
12 changes: 6 additions & 6 deletions vite-app/src/components/EvaluationRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
<MetadataSection
title="IDs"
data={{
rollout_id: data.rollout_id,
experiment_id: data.experiment_id,
invocation_id: data.invocation_id,
run_id: data.run_id,
rollout_id: data.execution_metadata?.rollout_id,
experiment_id: data.execution_metadata?.experiment_id,
invocation_id: data.execution_metadata?.invocation_id,
run_id: data.execution_metadata?.run_id,
}}
/>
));
Expand Down Expand Up @@ -197,7 +197,7 @@ const ExpandedContent = observer(

export const EvaluationRow = observer(
({ row }: { row: EvaluationRowType; index: number }) => {
const rolloutId = row.rollout_id;
const rolloutId = row.execution_metadata?.rollout_id;
const isExpanded = state.isRowExpanded(rolloutId);

const toggleExpanded = () => state.toggleRowExpansion(rolloutId);
Expand Down Expand Up @@ -226,7 +226,7 @@ export const EvaluationRow = observer(

{/* Rollout ID */}
<TableCell className="py-3 text-xs">
<RolloutId rolloutId={row.rollout_id} />
<RolloutId rolloutId={row.execution_metadata?.rollout_id} />
</TableCell>

{/* Model */}
Expand Down
2 changes: 1 addition & 1 deletion vite-app/src/components/EvaluationTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const TableBody = observer(
<TableBodyBase>
{paginatedData.map((row, index) => (
<EvaluationRow
key={row.rollout_id}
key={row.execution_metadata?.rollout_id}
row={row}
index={startIndex + index}
/>
Expand Down
12 changes: 8 additions & 4 deletions vite-app/src/types/eval-protocol.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,19 @@ export const RolloutStatusSchema = z.object({
error_message: z.string().optional().describe('Error message if the rollout failed.')
});

/**
 * Zod schema for execution metadata: the optional identifiers
 * (invocation, experiment, rollout, run) attached to an EvaluationRow.
 * Mirrors the Python `ExecutionMetadata` pydantic model in
 * eval_protocol/models.py; keep the two definitions in sync.
 */
export const ExecutionMetadataSchema = z.object({
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
});

export const EvaluationRowSchema = z.object({
messages: z.array(MessageSchema).describe('List of messages in the conversation/trajectory.'),
tools: z.array(z.record(z.string(), z.any())).optional().describe('Available tools/functions that were provided to the agent.'),
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
execution_metadata: ExecutionMetadataSchema.optional().describe('Metadata about the execution of the evaluation.'),
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
evaluation_result: EvaluateResultSchema.optional().describe('The evaluation result for this row/trajectory.'),
usage: CompletionUsageSchema.optional().describe('Token usage statistics from LLM calls during execution.'),
Expand Down
2 changes: 1 addition & 1 deletion vite-app/src/util/pivot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ describe('computePivot', () => {

const res = computePivot({
data: rows,
rowFields: ['$.eval_metadata.name', '$.experiment_id'],
rowFields: ['$.eval_metadata.name', '$.execution_metadata.experiment_id'],
columnFields: ['$.input_metadata.completion_params.model'],
valueField: '$.evaluation_result.score',
aggregator: 'avg',
Expand Down
Loading