Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,9 +286,9 @@ class EvaluationRow(BaseModel):
description="The ID of the invocation that this row belongs to.",
)

cohort_id: Optional[str] = Field(
experiment_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the cohort that this row belongs to.",
description="The ID of the experiment that this row belongs to.",
)

rollout_id: Optional[str] = Field(
Expand Down
16 changes: 8 additions & 8 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,14 @@ def evaluation_test( # noqa: C901
Here are some key concepts to understand the terminology in EP:

- "invocation" is a single execution of a test function. An invocation can
generate 1 or more cohorts. Grouping by invocation might be useful to
generate 1 or more experiments. Grouping by invocation might be useful to
aggregate eval scores across multiple invocations when you want to aggregate
scores across multiple datasets.
- "cohort" is a group of runs with for a combination of parameters. A single
cohort will have multiple runs if num_runs > 1.
- "experiment" is a group of runs for a combination of parameters. A single
experiment will have multiple runs if num_runs > 1.
1. If your evaluation_test has combinations of parameters, it will generate
multiple cohorts per combination of parameters.
2. A new execution of a test function will generate a new cohort.
multiple experiments per combination of parameters.
2. A new execution of a test function will generate a new experiment.
- "run" is a group of rollouts. For multiple num_runs > 1, there will be
multiple "run_id"s.
- "rollout" is the execution/process that produces a "trajectory". You
Expand All @@ -98,7 +98,7 @@ def evaluation_test( # noqa: C901
decorated test. It simply produces a score from 0 to 1 and attaches it
to the row as the "evaluation_result" field.

"invocation", "cohort", "run", "rollout", and "row" each have a unique ID
"invocation", "experiment", "run", "rollout", and "row" each have a unique ID
which can be used to easily group and identify your dataset by.

Args:
Expand Down Expand Up @@ -302,7 +302,7 @@ def wrapper_body(**kwargs):
eval_metadata = None
all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]

cohort_id = generate_id()
experiment_id = generate_id()

def _log_eval_error(
status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
Expand Down Expand Up @@ -383,7 +383,7 @@ def _log_eval_error(
row.input_metadata.session_data["mode"] = mode
# Initialize eval_metadata for each row
row.eval_metadata = eval_metadata
row.cohort_id = cohort_id
row.experiment_id = experiment_id
row.invocation_id = invocation_id

# has to be done in the pytest main process since it's
Expand Down
6 changes: 3 additions & 3 deletions tests/pytest/test_pytest_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
def test_evaluation_test_decorator_ids_single(monkeypatch):
in_memory_logger = InMemoryLogger()
unique_run_ids = set()
unique_cohort_ids = set()
unique_experiment_ids = set()
unique_rollout_ids = set()
unique_invocation_ids = set()
unique_row_ids = set()
Expand All @@ -77,7 +77,7 @@ def test_evaluation_test_decorator_ids_single(monkeypatch):
)
def eval_fn(row: EvaluationRow) -> EvaluationRow:
unique_run_ids.add(row.run_id)
unique_cohort_ids.add(row.cohort_id)
unique_experiment_ids.add(row.experiment_id)
unique_rollout_ids.add(row.rollout_id)
unique_invocation_ids.add(row.invocation_id)
unique_row_ids.add(row.input_metadata.row_id)
Expand All @@ -97,6 +97,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
# Assertions on IDs generated by the decorator logic
assert len(unique_invocation_ids) == 1
assert len(unique_run_ids) == 20 # 4 combinations * 5 runs each
assert len(unique_cohort_ids) == 2 * 2 # 2 datasets * 2 param sets
assert len(unique_experiment_ids) == 2 * 2 # 2 datasets * 2 param sets
assert len(unique_row_ids) == 19 # from the markdown dataset
assert len(unique_rollout_ids) == 19 * 5 * 2 * 2 # rows * runs * datasets * params

Large diffs are not rendered by default.

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions vite-app/dist/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>EP | Log Viewer</title>
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
<script type="module" crossorigin src="/assets/index-t_hsfGP1.js"></script>
<script type="module" crossorigin src="/assets/index-Cvu-Dnw_.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-CGYj40Gx.css">
</head>
<body>
<div id="root"></div>
</body>
</html>
</html>
2 changes: 1 addition & 1 deletion vite-app/src/components/EvaluationRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ const IdSection = observer(({ data }: { data: EvaluationRowType }) => (
title="IDs"
data={{
rollout_id: data.rollout_id,
cohort_id: data.cohort_id,
experiment_id: data.experiment_id,
invocation_id: data.invocation_id,
run_id: data.run_id,
}}
Expand Down
2 changes: 1 addition & 1 deletion vite-app/src/types/eval-protocol.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ export const EvaluationRowSchema = z.object({
input_metadata: InputMetadataSchema.describe('Metadata related to the input (dataset info, model config, session data, etc.).'),
rollout_status: RolloutStatusSchema.default({ status: 'finished' }).describe('The status of the rollout.'),
invocation_id: z.string().optional().describe('The ID of the invocation that this row belongs to.'),
cohort_id: z.string().optional().describe('The ID of the cohort that this row belongs to.'),
experiment_id: z.string().optional().describe('The ID of the experiment that this row belongs to.'),
rollout_id: z.string().optional().describe('The ID of the rollout that this row belongs to.'),
run_id: z.string().optional().describe('The ID of the run that this row belongs to.'),
ground_truth: z.string().optional().describe('Optional ground truth reference for this evaluation.'),
Expand Down
2 changes: 1 addition & 1 deletion vite-app/src/util/pivot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ describe('computePivot', () => {

const res = computePivot({
data: rows,
rowFields: ['$.eval_metadata.name', '$.cohort_id'],
rowFields: ['$.eval_metadata.name', '$.experiment_id'],
columnFields: ['$.input_metadata.completion_params.model'],
valueField: '$.evaluation_result.score',
aggregator: 'avg',
Expand Down
Loading