@@ -304,13 +304,6 @@ def wrapper_body(**kwargs):
304304
305305 cohort_id = generate_id ()
306306
307- def _log_eval_error (
308- status : Literal ["finished" , "error" ], rows : Optional [List [EvaluationRow ]] | None , passed : bool
309- ) -> None :
310- log_eval_status_and_rows (eval_metadata , rows , status , passed , default_logger )
311-
312- cohort_id = generate_id ()
313-
314307 def _log_eval_error (
315308 status : Literal ["finished" , "error" ], rows : Optional [List [EvaluationRow ]] | None , passed : bool
316309 ) -> None :
@@ -470,25 +463,9 @@ def _log_eval_error(
470463 sum ([r .evaluation_result .score for r in result if r .evaluation_result ]) / len (result )
471464 for result in all_results
472465 ]
473- print (f"SCORES: { scores } " )
474466 agg_score = aggregate (scores , aggregation_method )
475467 score_std = statistics .stdev (scores ) if len (scores ) > 1 else 0.0
476468
477- # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
478- ci_low : float | None = None
479- ci_high : float | None = None
480- if aggregation_method == "mean" :
481- try :
482- result_ci = compute_fixed_set_mu_ci ([item for sublist in all_results for item in sublist ])
483- mu_ci_low , mu_ci_high = result_ci [1 ], result_ci [2 ]
484- if mu_ci_low is not None and mu_ci_high is not None :
485- ci_low = float (mu_ci_low )
486- ci_high = float (mu_ci_high )
487- # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
488- except Exception :
489- ci_low = None
490- ci_high = None
491-
492469 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
493470 ci_low : float | None = None
494471 ci_high : float | None = None
0 commit comments