diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_mlflow_integration.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_mlflow_integration.py
index 580bb8042ab9..7d98eaaecc9b 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_mlflow_integration.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_mlflow_integration.py
@@ -366,6 +366,8 @@ async def log_redteam_results_to_mlflow(
                 self.logger.debug(f"Logged metric: {risk_category}_{key} = {value}")

         if self._one_dp_project:
+            run_id = getattr(eval_run, "id", "unknown")
+            run_display_name = getattr(eval_run, "display_name", None) or "unknown"
             # Step 1: Upload evaluation results (blob upload + version create)
             evaluation_result_id = None
             try:
@@ -379,7 +381,10 @@
                 )
                 evaluation_result_id = create_evaluation_result_response.id
             except Exception as e:
-                self.logger.error(f"Failed to create evaluation result: {str(e)}")
+                self.logger.error(
+                    f"Failed to create evaluation result for run {run_id} ({run_display_name}): {str(e)}",
+                    exc_info=True,
+                )

             # Step 2: Always update the run status, even if result upload failed
             outputs = None
@@ -399,7 +404,10 @@
                 )
                 self.logger.debug(f"Updated UploadRun: {update_run_response.id}")
             except Exception as e:
-                self.logger.error(f"Failed to update red team run status: {str(e)}")
+                self.logger.error(
+                    f"Failed to update red team run status for run {run_id}: {str(e)}",
+                    exc_info=True,
+                )
         else:
             # Log the entire directory to MLFlow
             try:
@@ -431,19 +439,23 @@ def update_run_status(self, eval_run, status: str) -> None:
         """
         if not self._one_dp_project:
             return
+        run_id = getattr(eval_run, "id", "unknown")
+        run_display_name = getattr(eval_run, "display_name", None)
         try:
             self.generated_rai_client._evaluation_onedp_client.update_red_team_run(
                 name=eval_run.id,
                 red_team=RedTeamUpload(
                     id=eval_run.id,
-                    display_name=getattr(eval_run, "display_name", None)
-                    or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                    display_name=run_display_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
                     status=status,
                 ),
             )
-            self.logger.info(f"Updated red team run status to '{status}'")
+            self.logger.info(f"Updated red team run {run_id} status to '{status}'")
         except Exception as e:
-            self.logger.error(f"Failed to update red team run status to '{status}': {str(e)}")
+            self.logger.error(
+                f"Failed to update red team run {run_id} status to '{status}': {str(e)}",
+                exc_info=True,
+            )

     def _build_instance_results_payload(
         self,
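The changes above follow one pattern: resolve run identifiers defensively up front (getattr with a fallback), then log failures with exc_info=True so the traceback is preserved in the log record. A minimal standalone sketch of that pattern follows; the EvalRun class here is a hypothetical stand-in for the real run object, not part of the SDK.

import logging

logger = logging.getLogger("red_team")


class EvalRun:
    """Hypothetical stand-in for the run object returned by the service client."""

    id = "run-123"
    display_name = None


def upload_results(eval_run) -> None:
    # Resolve identifiers once, tolerating missing attributes, so every
    # later log line can name the run even when given a partial object.
    run_id = getattr(eval_run, "id", "unknown")
    run_display_name = getattr(eval_run, "display_name", None) or "unknown"
    try:
        raise ConnectionError("blob upload failed")  # placeholder failure
    except Exception as e:
        # exc_info=True attaches the full traceback to the log record,
        # which a bare str(e) message would otherwise discard.
        logger.error(
            f"Failed to create evaluation result for run {run_id} ({run_display_name}): {e}",
            exc_info=True,
        )


upload_results(EvalRun())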
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
index b450c99b15c4..d6bcbf514f5f 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
@@ -1432,7 +1432,11 @@ async def scan(
             # Process and return results
             return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path, scan_name)
-        except Exception:
+        except Exception as e:
+            self.logger.error(
+                f"Red team scan execution failed for run {getattr(eval_run, 'id', 'unknown')}: {str(e)}",
+                exc_info=True,
+            )
             # Ensure the run status is updated to Failed if an upload was started
             if not skip_upload and self.mlflow_integration is not None:
                 self.mlflow_integration.update_run_status(eval_run, "Failed")

@@ -1785,7 +1789,10 @@ async def _execute_attacks_with_foundry(
                 objectives_by_risk=objectives_by_risk,
             )

-            # Update red_team_info with Foundry results
+            # Update red_team_info with Foundry results.
+            # The RAIServiceScorer already evaluated each response during attack
+            # execution, so results (attack_success, score) are in the JSONL.
+            # No need for a second evaluation_processor.evaluate() call.
             for strategy_name, risk_data in foundry_results.items():
                 if strategy_name not in self.red_team_info:
                     self.red_team_info[strategy_name] = {}
@@ -1805,48 +1812,9 @@ async def _execute_attacks_with_foundry(
                         "asr": result_data.get("asr", 0.0),
                     }

-                    # Run evaluation if not skipping and we have a data file
-                    if not skip_evals and data_file and os.path.exists(data_file):
-                        progress_bar.set_postfix({"current": f"evaluating {risk_value}"})
-                        try:
-                            # Find the risk category enum from value
-                            risk_category_enum = next(
-                                (rc for rc in self.risk_categories if rc.value == risk_value),
-                                None,
-                            )
-                            if risk_category_enum and self.evaluation_processor:
-                                # Find matching strategy for evaluation
-                                all_strategies = foundry_strategies + special_strategies
-                                strategy_for_eval = next(
-                                    (s for s in all_strategies if get_strategy_name(s) == strategy_name),
-                                    AttackStrategy.Baseline,  # Fallback
-                                )
-
-                                await self.evaluation_processor.evaluate(
-                                    scan_name=None,
-                                    risk_category=risk_category_enum,
-                                    strategy=strategy_for_eval,
-                                    _skip_evals=False,
-                                    data_path=data_file,
-                                    output_path=None,
-                                    red_team_info=self.red_team_info,
-                                )
-                        except Exception as eval_error:
-                            self.logger.warning(f"Evaluation error for {strategy_name}/{risk_value}: {str(eval_error)}")
-                            # Don't fail the whole execution for eval errors
-                            tqdm.write(f"⚠️ Evaluation warning for {strategy_name}/{risk_value}: {str(eval_error)}")
-
                     self.completed_tasks += 1
                     progress_bar.update(1)

-            # Handle Baseline strategy separately if present
-            if AttackStrategy.Baseline in special_strategies:
-                await self._handle_baseline_with_foundry_results(
-                    objectives_by_risk=objectives_by_risk,
-                    progress_bar=progress_bar,
-                    skip_evals=skip_evals,
-                )
-
             self.logger.info("Foundry-based attack execution completed")

         except Exception as e:
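The deleted evaluation pass is redundant because, per the new comments, the scorer writes its verdict into each JSONL row at attack time. A sketch of a consumer reading those fields follows; the exact row layout is an assumption inferred from the keys this diff reads (attack_success, score.metadata.raw_score, score.rationale), not a documented schema.

import json

# Hypothetical JSONL row shape, inferred from the keys referenced in this diff.
row = json.loads(
    '{"conversation": {"messages": []},'
    ' "attack_success": false,'
    ' "attack_strategy": "base64",'
    ' "score": {"rationale": "Response refused the request.",'
    '           "metadata": {"raw_score": 0}}}'
)

# Mirror the consumption order used by the result processor: verdict first,
# then the raw score and rationale nested under "score".
attack_success = row["attack_success"]
score_data = row.get("score", {})
raw_score = score_data.get("metadata", {}).get("raw_score")
print(attack_success, raw_score, score_data.get("rationale", ""))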
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
index 8a5ca5afb317..4d246fbaafa2 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
@@ -191,7 +191,10 @@ def to_red_team_result(
             rows = []
             eval_row_lookup = {}
         else:
-            self.logger.debug(f"No evaluation results available for {strategy_name}/{risk_category}")
+            self.logger.debug(
+                f"Using scorer results from data file for {strategy_name}/{risk_category} "
+                f"(no separate evaluation pass)"
+            )

         # Process data file to extract conversations
         if data_file and os.path.exists(data_file):
@@ -280,6 +283,19 @@
                                     else None
                                 ),
                             }
+                        elif "attack_success" in conv_data:
+                            # Foundry path: RAIServiceScorer already evaluated during
+                            # attack execution. Use scorer results from the JSONL.
+                            attack_success = conv_data["attack_success"]
+                            score_data = conv_data.get("score", {})
+                            if score_data and isinstance(score_data, dict):
+                                score_metadata = score_data.get("metadata", {})
+                                raw_score = score_metadata.get("raw_score")
+                                if raw_score is not None:
+                                    risk_assessment[risk_category] = {
+                                        "severity_label": get_harm_severity_level(raw_score),
+                                        "reason": score_data.get("rationale", ""),
+                                    }

                         # Add to tracking arrays for statistical analysis
                         converters.append(strategy_name)
@@ -579,11 +595,20 @@ def _build_sample_payload(
                 sample_payload["usage"] = usage_dict
                 break

-        # Exclude risk_sub_type and _eval_run_output_item from metadata
+        # Exclude internal/scorer fields from metadata
         metadata = {
             key: value
             for key, value in raw_conversation.items()
-            if key not in {"conversation", "risk_sub_type", "_eval_run_output_item"} and not self._is_missing(value)
+            if key
+            not in {
+                "conversation",
+                "risk_sub_type",
+                "_eval_run_output_item",
+                "attack_success",
+                "attack_strategy",
+                "score",
+            }
+            and not self._is_missing(value)
         }
         if metadata:
             sample_payload["metadata"] = metadata
@@ -1395,7 +1420,7 @@ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[st

     @staticmethod
     def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
+        """Build aggregated pass/fail counts per testing criteria (risk category only).

         Uses ASR semantics:
         - passed: attack was unsuccessful (system defended)
@@ -1404,8 +1429,6 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di

         # Track by risk category (testing_criteria)
         criteria: Dict[str, Dict[str, int]] = {}
-        # Track by attack strategy
-        strategy_criteria: Dict[str, Dict[str, int]] = {}

         for item in output_items:
             for result in item.get("results", []):
@@ -1427,20 +1450,7 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
                 else:
                     bucket["failed"] += 1

-                # Track by attack strategy from properties
-                properties = result.get("properties", {})
-                if isinstance(properties, dict):
-                    attack_technique = properties.get("attack_technique")
-                    if attack_technique:
-                        strategy_bucket = strategy_criteria.setdefault(
-                            str(attack_technique), {"passed": 0, "failed": 0}
-                        )
-                        if passed_value:
-                            strategy_bucket["passed"] += 1
-                        else:
-                            strategy_bucket["failed"] += 1
-
-        # Build results list with risk categories
+        # Build results list with risk categories only (not attack strategies)
         results = [
             {
                 "testing_criteria": criteria_name,
                 "passed": counts["passed"],
                 "failed": counts["failed"],
             }
             for criteria_name, counts in sorted(criteria.items())
         ]

-        # Add attack strategy summaries
-        for strategy_name, counts in sorted(strategy_criteria.items()):
-            results.append(
-                {
-                    "testing_criteria": strategy_name,
-                    "attack_strategy": strategy_name,
-                    "passed": counts["passed"],
-                    "failed": counts["failed"],
-                }
-            )
-
         return results

     @staticmethod
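Under the ASR semantics kept in the docstring, "passed" means the target defended and "failed" means the attack succeeded; after this change only risk categories are aggregated. A self-contained sketch of that surviving aggregation follows. The output-item shape (a "results" list with "name" and "passed" fields) is an assumption for illustration; the diff only shows the bucket updates, not how the criteria name is extracted.

from typing import Any, Dict, List


def compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # ASR semantics: passed = attack unsuccessful (system defended),
    # failed = attack succeeded (harmful content produced).
    criteria: Dict[str, Dict[str, int]] = {}
    for item in output_items:
        for result in item.get("results", []):
            name = result.get("name")  # assumed: risk category, e.g. "violence"
            passed_value = bool(result.get("passed"))
            bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
            bucket["passed" if passed_value else "failed"] += 1
    # One summary entry per risk category, sorted for stable output.
    return [
        {"testing_criteria": name, "passed": counts["passed"], "failed": counts["failed"]}
        for name, counts in sorted(criteria.items())
    ]


items = [
    {"results": [{"name": "violence", "passed": True}, {"name": "violence", "passed": False}]},
]
print(compute_per_testing_criteria(items))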