Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,8 @@ async def log_redteam_results_to_mlflow(
self.logger.debug(f"Logged metric: {risk_category}_{key} = {value}")

if self._one_dp_project:
run_id = getattr(eval_run, "id", "unknown")
run_display_name = getattr(eval_run, "display_name", None) or "unknown"
# Step 1: Upload evaluation results (blob upload + version create)
evaluation_result_id = None
try:
Expand All @@ -379,7 +381,10 @@ async def log_redteam_results_to_mlflow(
)
evaluation_result_id = create_evaluation_result_response.id
except Exception as e:
self.logger.error(f"Failed to create evaluation result: {str(e)}")
self.logger.error(
f"Failed to create evaluation result for run {run_id} ({run_display_name}): {str(e)}",
exc_info=True,
)

# Step 2: Always update the run status, even if result upload failed
outputs = None
Expand All @@ -399,7 +404,10 @@ async def log_redteam_results_to_mlflow(
)
self.logger.debug(f"Updated UploadRun: {update_run_response.id}")
except Exception as e:
self.logger.error(f"Failed to update red team run status: {str(e)}")
self.logger.error(
f"Failed to update red team run status for run {run_id}: {str(e)}",
exc_info=True,
)
else:
# Log the entire directory to MLFlow
try:
Expand Down Expand Up @@ -431,19 +439,23 @@ def update_run_status(self, eval_run, status: str) -> None:
"""
if not self._one_dp_project:
return
run_id = getattr(eval_run, "id", "unknown")
run_display_name = getattr(eval_run, "display_name", None)
try:
self.generated_rai_client._evaluation_onedp_client.update_red_team_run(
name=eval_run.id,
red_team=RedTeamUpload(
id=eval_run.id,
display_name=getattr(eval_run, "display_name", None)
or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
display_name=run_display_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
status=status,
),
)
self.logger.info(f"Updated red team run status to '{status}'")
self.logger.info(f"Updated red team run {run_id} status to '{status}'")
except Exception as e:
self.logger.error(f"Failed to update red team run status to '{status}': {str(e)}")
self.logger.error(
f"Failed to update red team run {run_id} status to '{status}': {str(e)}",
exc_info=True,
)

def _build_instance_results_payload(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1432,7 +1432,11 @@ async def scan(

# Process and return results
return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path, scan_name)
except Exception:
except Exception as e:
self.logger.error(
f"Red team scan execution failed for run {getattr(eval_run, 'id', 'unknown')}: {str(e)}",
exc_info=True,
)
# Ensure the run status is updated to Failed if an upload was started
if not skip_upload and self.mlflow_integration is not None:
self.mlflow_integration.update_run_status(eval_run, "Failed")
Expand Down Expand Up @@ -1785,7 +1789,10 @@ async def _execute_attacks_with_foundry(
objectives_by_risk=objectives_by_risk,
)

# Update red_team_info with Foundry results
# Update red_team_info with Foundry results.
# The RAIServiceScorer already evaluated each response during attack
# execution, so results (attack_success, score) are in the JSONL.
# No need for a second evaluation_processor.evaluate() call.
for strategy_name, risk_data in foundry_results.items():
if strategy_name not in self.red_team_info:
self.red_team_info[strategy_name] = {}
Expand All @@ -1805,48 +1812,9 @@ async def _execute_attacks_with_foundry(
"asr": result_data.get("asr", 0.0),
}

# Run evaluation if not skipping and we have a data file
if not skip_evals and data_file and os.path.exists(data_file):
progress_bar.set_postfix({"current": f"evaluating {risk_value}"})
try:
# Find the risk category enum from value
risk_category_enum = next(
(rc for rc in self.risk_categories if rc.value == risk_value),
None,
)
if risk_category_enum and self.evaluation_processor:
# Find matching strategy for evaluation
all_strategies = foundry_strategies + special_strategies
strategy_for_eval = next(
(s for s in all_strategies if get_strategy_name(s) == strategy_name),
AttackStrategy.Baseline, # Fallback
)

await self.evaluation_processor.evaluate(
scan_name=None,
risk_category=risk_category_enum,
strategy=strategy_for_eval,
_skip_evals=False,
data_path=data_file,
output_path=None,
red_team_info=self.red_team_info,
)
except Exception as eval_error:
self.logger.warning(f"Evaluation error for {strategy_name}/{risk_value}: {str(eval_error)}")
# Don't fail the whole execution for eval errors
tqdm.write(f"⚠️ Evaluation warning for {strategy_name}/{risk_value}: {str(eval_error)}")

self.completed_tasks += 1
progress_bar.update(1)

# Handle Baseline strategy separately if present
if AttackStrategy.Baseline in special_strategies:
await self._handle_baseline_with_foundry_results(
objectives_by_risk=objectives_by_risk,
progress_bar=progress_bar,
skip_evals=skip_evals,
)

self.logger.info("Foundry-based attack execution completed")

except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ def to_red_team_result(
rows = []
eval_row_lookup = {}
else:
self.logger.debug(f"No evaluation results available for {strategy_name}/{risk_category}")
self.logger.debug(
f"Using scorer results from data file for {strategy_name}/{risk_category} "
f"(no separate evaluation pass)"
)

# Process data file to extract conversations
if data_file and os.path.exists(data_file):
Expand Down Expand Up @@ -280,6 +283,19 @@ def to_red_team_result(
else None
),
}
elif "attack_success" in conv_data:
# Foundry path: RAIServiceScorer already evaluated during
# attack execution. Use scorer results from the JSONL.
attack_success = conv_data["attack_success"]
score_data = conv_data.get("score", {})
if score_data and isinstance(score_data, dict):
score_metadata = score_data.get("metadata", {})
raw_score = score_metadata.get("raw_score")
if raw_score is not None:
risk_assessment[risk_category] = {
"severity_label": get_harm_severity_level(raw_score),
"reason": score_data.get("rationale", ""),
}

# Add to tracking arrays for statistical analysis
converters.append(strategy_name)
Expand Down Expand Up @@ -579,11 +595,20 @@ def _build_sample_payload(
sample_payload["usage"] = usage_dict
break

# Exclude risk_sub_type and _eval_run_output_item from metadata
# Exclude internal/scorer fields from metadata
metadata = {
key: value
for key, value in raw_conversation.items()
if key not in {"conversation", "risk_sub_type", "_eval_run_output_item"} and not self._is_missing(value)
if key
not in {
"conversation",
"risk_sub_type",
"_eval_run_output_item",
"attack_success",
"attack_strategy",
"score",
}
and not self._is_missing(value)
}
if metadata:
sample_payload["metadata"] = metadata
Expand Down Expand Up @@ -1395,7 +1420,7 @@ def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[st

@staticmethod
def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
"""Build aggregated pass/fail counts per testing criteria (risk category only).

Uses ASR semantics:
- passed: attack was unsuccessful (system defended)
Expand All @@ -1404,8 +1429,6 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di

# Track by risk category (testing_criteria)
criteria: Dict[str, Dict[str, int]] = {}
# Track by attack strategy
strategy_criteria: Dict[str, Dict[str, int]] = {}

for item in output_items:
for result in item.get("results", []):
Expand All @@ -1427,20 +1450,7 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
else:
bucket["failed"] += 1

# Track by attack strategy from properties
properties = result.get("properties", {})
if isinstance(properties, dict):
attack_technique = properties.get("attack_technique")
if attack_technique:
strategy_bucket = strategy_criteria.setdefault(
str(attack_technique), {"passed": 0, "failed": 0}
)
if passed_value:
strategy_bucket["passed"] += 1
else:
strategy_bucket["failed"] += 1

# Build results list with risk categories
# Build results list with risk categories only (not attack strategies)
results = [
{
"testing_criteria": criteria_name,
Expand All @@ -1450,17 +1460,6 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
for criteria_name, counts in sorted(criteria.items())
]

# Add attack strategy summaries
for strategy_name, counts in sorted(strategy_criteria.items()):
results.append(
{
"testing_criteria": strategy_name,
"attack_strategy": strategy_name,
"passed": counts["passed"],
"failed": counts["failed"],
}
)

return results

@staticmethod
Expand Down
Loading