Azure · slister1001 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
@@ -366,6 +366,8 @@ async def log_redteam_results_to_mlflow(
                                 self.logger.debug(f"Logged metric: {risk_category}_{key} = {value}")
 
             if self._one_dp_project:
+                run_id = getattr(eval_run, "id", "unknown")
+                run_display_name = getattr(eval_run, "display_name", None) or "unknown"
                 # Step 1: Upload evaluation results (blob upload + version create)
                 evaluation_result_id = None
                 try:
@@ -379,7 +381,10 @@ async def log_redteam_results_to_mlflow(
                     )
                     evaluation_result_id = create_evaluation_result_response.id
                 except Exception as e:
-                    self.logger.error(f"Failed to create evaluation result: {str(e)}")
+                    self.logger.error(
+                        f"Failed to create evaluation result for run {run_id} ({run_display_name}): {str(e)}",
+                        exc_info=True,
+                    )
 
                 # Step 2: Always update the run status, even if result upload failed
                 outputs = None
@@ -399,7 +404,10 @@ async def log_redteam_results_to_mlflow(
                     )
                     self.logger.debug(f"Updated UploadRun: {update_run_response.id}")
                 except Exception as e:
-                    self.logger.error(f"Failed to update red team run status: {str(e)}")
+                    self.logger.error(
+                        f"Failed to update red team run status for run {run_id}: {str(e)}",
+                        exc_info=True,
+                    )
             else:
                 # Log the entire directory to MLFlow
                 try:
@@ -431,19 +439,23 @@ def update_run_status(self, eval_run, status: str) -> None:
         """
         if not self._one_dp_project:
             return
+        run_id = getattr(eval_run, "id", "unknown")
+        run_display_name = getattr(eval_run, "display_name", None)
         try:
             self.generated_rai_client._evaluation_onedp_client.update_red_team_run(
                 name=eval_run.id,
                 red_team=RedTeamUpload(
                     id=eval_run.id,
-                    display_name=getattr(eval_run, "display_name", None)
-                    or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                    display_name=run_display_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
                     status=status,
                 ),
             )
-            self.logger.info(f"Updated red team run status to '{status}'")
+            self.logger.info(f"Updated red team run {run_id} status to '{status}'")
         except Exception as e:
-            self.logger.error(f"Failed to update red team run status to '{status}': {str(e)}")
+            self.logger.error(
+                f"Failed to update red team run {run_id} status to '{status}': {str(e)}",
+                exc_info=True,
+            )
 
     def _build_instance_results_payload(
         self,

@@ -1432,7 +1432,11 @@ async def scan(
 
                 # Process and return results
                 return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path, scan_name)
-            except Exception:
+            except Exception as e:
+                self.logger.error(
+                    f"Red team scan execution failed for run {getattr(eval_run, 'id', 'unknown')}: {str(e)}",
+                    exc_info=True,
+                )
                 # Ensure the run status is updated to Failed if an upload was started
                 if not skip_upload and self.mlflow_integration is not None:
                     self.mlflow_integration.update_run_status(eval_run, "Failed")
@@ -1785,7 +1789,10 @@ async def _execute_attacks_with_foundry(
                 objectives_by_risk=objectives_by_risk,
             )
 
-            # Update red_team_info with Foundry results
+            # Update red_team_info with Foundry results.
+            # The RAIServiceScorer already evaluated each response during attack
+            # execution, so results (attack_success, score) are in the JSONL.
+            # No need for a second evaluation_processor.evaluate() call.
             for strategy_name, risk_data in foundry_results.items():
                 if strategy_name not in self.red_team_info:
                     self.red_team_info[strategy_name] = {}
@@ -1805,48 +1812,9 @@ async def _execute_attacks_with_foundry(
                         "asr": result_data.get("asr", 0.0),
                     }
 
-                    # Run evaluation if not skipping and we have a data file
-                    if not skip_evals and data_file and os.path.exists(data_file):
-                        progress_bar.set_postfix({"current": f"evaluating {risk_value}"})
-                        try:
-                            # Find the risk category enum from value
-                            risk_category_enum = next(
-                                (rc for rc in self.risk_categories if rc.value == risk_value),
-                                None,
-                            )
-                            if risk_category_enum and self.evaluation_processor:
-                                # Find matching strategy for evaluation
-                                all_strategies = foundry_strategies + special_strategies
-                                strategy_for_eval = next(
-                                    (s for s in all_strategies if get_strategy_name(s) == strategy_name),
-                                    AttackStrategy.Baseline,  # Fallback
-                                )
-
-                                await self.evaluation_processor.evaluate(
-                                    scan_name=None,
-                                    risk_category=risk_category_enum,
-                                    strategy=strategy_for_eval,
-                                    _skip_evals=False,
-                                    data_path=data_file,
-                                    output_path=None,
-                                    red_team_info=self.red_team_info,
-                                )
-                        except Exception as eval_error:
-                            self.logger.warning(f"Evaluation error for {strategy_name}/{risk_value}: {str(eval_error)}")
-                            # Don't fail the whole execution for eval errors
-                            tqdm.write(f"⚠️ Evaluation warning for {strategy_name}/{risk_value}: {str(eval_error)}")
-
                     self.completed_tasks += 1
                     progress_bar.update(1)
 
-            # Handle Baseline strategy separately if present
-            if AttackStrategy.Baseline in special_strategies:
-                await self._handle_baseline_with_foundry_results(
-                    objectives_by_risk=objectives_by_risk,
-                    progress_bar=progress_bar,
-                    skip_evals=skip_evals,
-                )
-
             self.logger.info("Foundry-based attack execution completed")
 
         except Exception as e:

@@ -191,7 +191,10 @@ def to_red_team_result(
                         rows = []
                         eval_row_lookup = {}
                 else:
-                    self.logger.debug(f"No evaluation results available for {strategy_name}/{risk_category}")
+                    self.logger.debug(
+                        f"Using scorer results from data file for {strategy_name}/{risk_category} "
+                        f"(no separate evaluation pass)"
+                    )
 
                 # Process data file to extract conversations
                 if data_file and os.path.exists(data_file):
@@ -280,6 +283,19 @@ def to_red_team_result(
                                                                 else None
                                                             ),
                                                         }
+                                        elif "attack_success" in conv_data:
+                                            # Foundry path: RAIServiceScorer already evaluated during
+                                            # attack execution. Use scorer results from the JSONL.
+                                            attack_success = conv_data["attack_success"]
+                                            score_data = conv_data.get("score", {})
+                                            if score_data and isinstance(score_data, dict):
+                                                score_metadata = score_data.get("metadata", {})
+                                                raw_score = score_metadata.get("raw_score")
+                                                if raw_score is not None:
+                                                    risk_assessment[risk_category] = {
+                                                        "severity_label": get_harm_severity_level(raw_score),
+                                                        "reason": score_data.get("rationale", ""),
+                                                    }
 
                                         # Add to tracking arrays for statistical analysis
                                         converters.append(strategy_name)
@@ -579,11 +595,20 @@ def _build_sample_payload(
                             sample_payload["usage"] = usage_dict
                             break
 
-        # Exclude risk_sub_type and _eval_run_output_item from metadata
+        # Exclude internal/scorer fields from metadata
         metadata = {
             key: value
             for key, value in raw_conversation.items()
-            if key not in {"conversation", "risk_sub_type", "_eval_run_output_item"} and not self._is_missing(value)
+            if key
+            not in {
+                "conversation",
+                "risk_sub_type",
+                "_eval_run_output_item",
+                "attack_success",
+                "attack_strategy",
+                "score",
+            }
+            and not self._is_missing(value)
         }
         if metadata:
             sample_payload["metadata"] = metadata
@@ -1395,7 +1420,7 @@ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[st
 
     @staticmethod
     def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
+        """Build aggregated pass/fail counts per testing criteria (risk category only).
 
         Uses ASR semantics:
         - passed: attack was unsuccessful (system defended)
@@ -1404,8 +1429,6 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
 
         # Track by risk category (testing_criteria)
         criteria: Dict[str, Dict[str, int]] = {}
-        # Track by attack strategy
-        strategy_criteria: Dict[str, Dict[str, int]] = {}
 
         for item in output_items:
             for result in item.get("results", []):
@@ -1427,20 +1450,7 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
                 else:
                     bucket["failed"] += 1
 
-                # Track by attack strategy from properties
-                properties = result.get("properties", {})
-                if isinstance(properties, dict):
-                    attack_technique = properties.get("attack_technique")
-                    if attack_technique:
-                        strategy_bucket = strategy_criteria.setdefault(
-                            str(attack_technique), {"passed": 0, "failed": 0}
-                        )
-                        if passed_value:
-                            strategy_bucket["passed"] += 1
-                        else:
-                            strategy_bucket["failed"] += 1
-
-        # Build results list with risk categories
+        # Build results list with risk categories only (not attack strategies)
         results = [
             {
                 "testing_criteria": criteria_name,
@@ -1450,17 +1460,6 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
             for criteria_name, counts in sorted(criteria.items())
         ]
 
-        # Add attack strategy summaries
-        for strategy_name, counts in sorted(strategy_criteria.items()):
-            results.append(
-                {
-                    "testing_criteria": strategy_name,
-                    "attack_strategy": strategy_name,
-                    "passed": counts["passed"],
-                    "failed": counts["failed"],
-                }
-            )
-
         return results
 
     @staticmethod