Skip to content

Commit c6e3192

Browse files
committed
bug fixes
1 parent bf8f228 commit c6e3192

File tree

3 files changed

+71
-2
lines changed

3 files changed

+71
-2
lines changed

eval_protocol/cli_commands/create_rft.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,30 @@ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
271271
return _validate_dataset_jsonl(dataset_jsonl)
272272

273273

274+
def _warn_if_large_dataset(dataset_jsonl: Optional[str], row_threshold: int = 200) -> None:
275+
"""Best-effort warning when local validation will run over a large dataset.
276+
277+
This is primarily to help users of `ep create rft` understand why local validation
278+
might be slow and to point them at --skip-validation when appropriate.
279+
"""
280+
if not dataset_jsonl:
281+
return
282+
try:
283+
# Count non-empty lines in the JSONL; simple full pass for clarity.
284+
with open(dataset_jsonl, "r", encoding="utf-8") as f:
285+
count = sum(1 for line in f if line.strip())
286+
if count > row_threshold:
287+
print(
288+
f"Warning: Local evaluator validation will run over more than {row_threshold} rows "
289+
f"from dataset JSONL at {dataset_jsonl}.\n"
290+
" This may take a while. You can pass --skip-validation to `ep create rft` "
291+
"to skip local pytest-based validation if you are confident in your evaluator."
292+
)
293+
except Exception:
294+
# Best-effort hint only; do not block RFT creation if counting fails.
295+
return
296+
297+
274298
def _validate_evaluator_locally(
275299
project_root: str,
276300
selected_test_file: Optional[str],
@@ -791,6 +815,16 @@ def create_rft_command(args) -> int:
791815

792816
# 3) Optional local validation
793817
if not skip_validation:
818+
# Best-effort hint if the JSONL dataset is large; helps users decide to use --skip-validation.
819+
if dataset_jsonl:
820+
# Resolve dataset_jsonl path relative to CWD if needed (mirror upload logic).
821+
jsonl_path_for_warning = (
822+
dataset_jsonl
823+
if os.path.isabs(dataset_jsonl)
824+
else os.path.abspath(os.path.join(project_root, dataset_jsonl))
825+
)
826+
_warn_if_large_dataset(jsonl_path_for_warning)
827+
794828
# Dataset validation (JSONL must be EvaluationRow-compatible when present)
795829
if not _validate_dataset(dataset_jsonl):
796830
return 1

eval_protocol/cli_commands/local_test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,17 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
3636
def _run_pytest_host(pytest_target: str) -> int:
3737
"""Run pytest against a target on the host and return its exit code."""
3838
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
39-
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
39+
cmd = [
40+
sys.executable,
41+
"-m",
42+
"pytest",
43+
"--ep-success-threshold",
44+
"0.001",
45+
"--ep-num-runs",
46+
"1",
47+
pytest_target,
48+
"-vs",
49+
]
4050
# Print the exact command being executed for easier debugging.
4151
print("Running locally:", " ".join(cmd))
4252
proc = subprocess.run(cmd)
@@ -98,7 +108,7 @@ def _run_pytest_in_docker(
98108

99109
# Build pytest command, always enforcing the same small success threshold as
100110
# the host runner so that all-zero score runs fail consistently.
101-
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
111+
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", "--ep-num-runs", "1", pytest_target, "-vs"]
102112

103113
cmd += [image_tag] + pytest_cmd
104114
print("Running in Docker:", " ".join(cmd))

tests/test_cli_create_rft.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,31 @@ def rft_test_harness(tmp_path, monkeypatch):
4545
return project
4646

4747

48+
def test_warn_if_large_dataset_silent_for_small(tmp_path, capsys):
    """A dataset below the row threshold must not trigger the size warning."""
    dataset_file = tmp_path / "small.jsonl"
    dataset_file.write_text('{"row":1}\n{"row":2}\n', encoding="utf-8")

    cr._warn_if_large_dataset(str(dataset_file), row_threshold=5)

    captured = capsys.readouterr()
    marker = "Warning: Local evaluator validation will run over more than"
    assert marker not in captured.out
    assert marker not in captured.err
58+
59+
60+
def test_warn_if_large_dataset_emits_warning_for_large(tmp_path, capsys):
    """A dataset exceeding the row threshold must emit the size warning."""
    dataset_file = tmp_path / "large.jsonl"
    # Three non-empty rows against a threshold of 2 -> the warning should fire.
    dataset_file.write_text('{"row":1}\n{"row":2}\n{"row":3}\n', encoding="utf-8")

    cr._warn_if_large_dataset(str(dataset_file), row_threshold=2)

    captured = capsys.readouterr()
    everything = captured.out + captured.err
    assert "Warning: Local evaluator validation will run over more than 2 rows" in everything
71+
72+
4873
def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeypatch):
4974
project = rft_test_harness
5075

0 commit comments

Comments
 (0)