@@ -271,6 +271,30 @@ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
271271 return _validate_dataset_jsonl (dataset_jsonl )
272272
273273
def _warn_if_large_dataset(dataset_jsonl: Optional[str], row_threshold: int = 200) -> None:
    """Print a best-effort warning when the dataset JSONL has many rows.

    The hint exists so users of `ep create rft` understand why local
    validation might be slow, and to point them at --skip-validation.

    Args:
        dataset_jsonl: Path to the JSONL file. Falsy values (None or "")
            make this a no-op.
        row_threshold: The warning is printed only when the number of
            non-empty (non-whitespace) lines is strictly greater than this.

    Returns:
        None. The only effect is an optional message printed to stdout.
        Any error while opening, reading, or printing is swallowed so the
        hint can never block RFT creation.
    """
    if not dataset_jsonl:
        return

    # Local import keeps this helper self-contained within the module.
    from itertools import islice

    try:
        # Only "is the count above the threshold?" matters, so stop reading
        # as soon as row_threshold + 1 non-empty lines have been seen instead
        # of scanning a potentially huge file to the end. The clamp keeps
        # islice valid (and the comparison below true) for negative thresholds.
        max_rows_to_count = max(row_threshold + 1, 0)
        with open(dataset_jsonl, "r", encoding="utf-8") as f:
            non_empty_rows = (1 for line in f if line.strip())
            count = sum(islice(non_empty_rows, max_rows_to_count))
        if count > row_threshold:
            print(
                f"Warning: Local evaluator validation will run over more than {row_threshold} rows "
                f"from dataset JSONL at {dataset_jsonl}.\n"
                " This may take a while. You can pass --skip-validation to `ep create rft` "
                "to skip local pytest-based validation if you are confident in your evaluator."
            )
    except Exception:
        # Deliberately broad: this is only a hint, so do not block RFT
        # creation if counting (or printing) fails for any reason.
        return
296+
297+
274298def _validate_evaluator_locally (
275299 project_root : str ,
276300 selected_test_file : Optional [str ],
@@ -791,6 +815,16 @@ def create_rft_command(args) -> int:
791815
792816 # 3) Optional local validation
793817 if not skip_validation :
818+ # Best-effort hint if the JSONL dataset is large; helps users decide to use --skip-validation.
819+ if dataset_jsonl :
820+ # Resolve dataset_jsonl path relative to CWD if needed (mirror upload logic).
821+ jsonl_path_for_warning = (
822+ dataset_jsonl
823+ if os .path .isabs (dataset_jsonl )
824+ else os .path .abspath (os .path .join (project_root , dataset_jsonl ))
825+ )
826+ _warn_if_large_dataset (jsonl_path_for_warning )
827+
794828 # Dataset validation (JSONL must be EvaluationRow-compatible when present)
795829 if not _validate_dataset (dataset_jsonl ):
796830 return 1
0 commit comments