Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions crates/admin-cli/src/machine/health_report/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,11 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option<String
// (admin machine force-delete is unchanged). Merge source `request-online-repair` is separate
// from `tenant-reported-issue`.
HealthReportTemplates::RequestOnlineRepair => {
report.source = "request-online-repair".to_string();
report.source = health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string();
report.alerts[0].id = HealthProbeId::from_str("RequestOnlineRepair")
.expect("RequestOnlineRepair is a valid non-empty HealthProbeId");
report.alerts[0].target = Some("request-online-repair".to_string());
report.alerts[0].target =
Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string());
report.alerts[0].classifications = vec![
HealthAlertClassification::prevent_allocations(),
HealthAlertClassification::suppress_external_alerting(),
Expand All @@ -150,7 +151,7 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option<String
// Template to indicate that the instance has been identified as unhealthy and
// is ready to be picked up by the Repair System for diagnosis and repair.
HealthReportTemplates::RequestRepair => {
report.source = "repair-request".to_string();
report.source = health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string();
report.alerts[0].id = HealthProbeId::from_str("RequestRepair")
.expect("RequestRepair is a valid non-empty HealthProbeId");
report.alerts[0].target = Some("repair-requested".to_string());
Expand Down Expand Up @@ -261,7 +262,7 @@ mod tests {
Some("Hardware diagnostics indicate memory failure".to_string()),
);

assert_eq!(report.source, "repair-request");
assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE);
assert_eq!(report.alerts.len(), 1);

let alert = &report.alerts[0];
Expand Down Expand Up @@ -299,7 +300,7 @@ mod tests {
fn test_request_repair_template_with_empty_message() {
let report = get_health_report(HealthReportTemplates::RequestRepair, None);

assert_eq!(report.source, "repair-request");
assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE);
assert_eq!(report.alerts[0].message, "");
}

Expand Down Expand Up @@ -350,15 +351,21 @@ mod tests {
Some("Online repair handoff for stuck repair workflow".to_string()),
);

assert_eq!(report.source, "request-online-repair");
assert_eq!(
report.source,
health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE
);
assert_eq!(report.alerts.len(), 1);

let alert = &report.alerts[0];
assert_eq!(
alert.id,
HealthProbeId::from_str("RequestOnlineRepair").unwrap()
);
assert_eq!(alert.target, Some("request-online-repair".to_string()));
assert_eq!(
alert.target,
Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string())
);
assert_eq!(
alert.message,
"Online repair handoff for stuck repair workflow"
Expand Down Expand Up @@ -387,7 +394,10 @@ mod tests {
fn test_request_online_repair_template_with_empty_message() {
let report = get_health_report(HealthReportTemplates::RequestOnlineRepair, None);

assert_eq!(report.source, "request-online-repair");
assert_eq!(
report.source,
health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE
);
assert_eq!(report.alerts[0].message, "");
}

Expand Down
10 changes: 10 additions & 0 deletions crates/api-model/src/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ pub struct HealthReportSources {
}

impl HealthReportSources {
/// Reports whether a repair-related health merge override is currently present.
///
/// Returns `true` when `merges` contains an entry keyed by either the
/// `repair-request` source or the `request-online-repair` source, and `false`
/// when neither key is present.
pub fn repair_merge_active(&self) -> bool {
    // The two merge sources that count as "repair in progress".
    let repair_sources = [
        health_report::REPAIR_REQUEST_MERGE_SOURCE,
        health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE,
    ];

    repair_sources
        .iter()
        .any(|source| self.merges.contains_key(*source))
}

#[allow(clippy::should_implement_trait)]
pub fn iter(&self) -> impl Iterator<Item = (&HealthReport, HealthReportApplyMode)> {
self.merges
Expand Down
4 changes: 4 additions & 0 deletions crates/api-model/src/instance/status/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ pub enum TenantState {
Failed,
/// Not sure what happened. Check log for more info
Invalid,
/// Instance is undergoing online repair while otherwise tenant-ready. Set by
/// `instance_status_tenant_state` in the RPC model layer when a repair health merge
/// is active and the instance would otherwise be [`TenantState::Ready`].
Repairing,
}

#[cfg(test)]
Expand Down
11 changes: 7 additions & 4 deletions crates/api/src/handlers/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ fn create_tenant_reported_issue_override(
/// Creates a RequestRepair health override template
fn create_request_repair_override(issue: &rpc::Issue) -> HealthReport {
HealthReport {
source: "repair-request".to_string(),
source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(),
observed_at: Some(chrono::Utc::now()),
alerts: vec![HealthProbeAlert {
id: HealthProbeId::from_str("RequestRepair")
Expand Down Expand Up @@ -443,7 +443,10 @@ async fn handle_instance_release_from_repair_tenant(
machine: &model::machine::Machine,
tenant_organization_id: &str,
) -> Result<(), CarbideError> {
let has_request_repair = machine.health_reports.merges.contains_key("repair-request");
let has_request_repair = machine
.health_reports
.merges
.contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE);

if !has_request_repair {
// No existing RequestRepair override
Expand Down Expand Up @@ -489,7 +492,7 @@ async fn handle_instance_release_from_repair_tenant(
remove_health_override(
txn,
machine_id,
"repair-request",
health_report::REPAIR_REQUEST_MERGE_SOURCE,
"RequestRepair removed - repair completed successfully",
)
.await?;
Expand Down Expand Up @@ -530,7 +533,7 @@ async fn handle_instance_release_from_repair_tenant(
remove_health_override(
txn,
machine_id,
"repair-request",
health_report::REPAIR_REQUEST_MERGE_SOURCE,
"RequestRepair removed for incomplete repair",
)
.await?;
Expand Down
3 changes: 2 additions & 1 deletion crates/api/src/tests/dpu_reprovisioning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,8 @@ async fn assert_reprov_tenant_state(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down
39 changes: 26 additions & 13 deletions crates/api/src/tests/host_bmc_firmware_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1253,7 +1253,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1292,7 +1293,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1341,7 +1343,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1387,7 +1390,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1423,7 +1427,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1481,7 +1486,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1517,7 +1523,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1565,7 +1572,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1633,7 +1641,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1697,7 +1706,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1733,7 +1743,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1767,7 +1778,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down Expand Up @@ -1796,7 +1808,8 @@ async fn test_instance_upgrading_actual_part_2(
host.state.clone().value,
None,
None,
None
None,
&host.health_reports,
)
.unwrap()
.tenant
Expand Down
16 changes: 10 additions & 6 deletions crates/api/src/tests/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5315,7 +5315,7 @@ async fn test_instance_release_backward_compatibility(_: PgPoolOptions, options:
!host_machine
.health_reports
.merges
.contains_key("repair-request"),
.contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE),
"Backward compatibility: RequestRepair override should NOT be applied without issue field"
);

Expand Down Expand Up @@ -5424,7 +5424,7 @@ async fn test_instance_release_repair_tenant(_: PgPoolOptions, options: PgConnec
let has_repair_request_override = host_machine
.health_reports
.merges
.contains_key("repair-request");
.contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE);

assert!(
!has_tenant_reported_override,
Expand Down Expand Up @@ -5522,7 +5522,7 @@ async fn test_instance_release_combined_enhancements(_: PgPoolOptions, options:
let has_repair_request_override = host_machine
.health_reports
.merges
.contains_key("repair-request");
.contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE);

assert!(
!has_repair_request_override,
Expand Down Expand Up @@ -5716,14 +5716,18 @@ async fn test_instance_release_auto_repair_enabled(_: PgPoolOptions, options: Pg
host_machine
.health_reports
.merges
.contains_key("repair-request"),
.contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE),
"Should have RequestRepair override when auto-repair is enabled"
);

// 4. Verify the RequestRepair override content
let repair_override = &host_machine.health_reports.merges["repair-request"];
let repair_override =
&host_machine.health_reports.merges[health_report::REPAIR_REQUEST_MERGE_SOURCE];
let repair_report: health_report::HealthReport = repair_override.clone();
assert_eq!(repair_report.source, "repair-request");
assert_eq!(
repair_report.source,
health_report::REPAIR_REQUEST_MERGE_SOURCE
);
assert_eq!(repair_report.alerts.len(), 1);
assert_eq!(repair_report.alerts[0].id.to_string(), "RequestRepair");
assert!(
Expand Down
11 changes: 7 additions & 4 deletions crates/api/src/tests/machine_health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@ async fn test_request_repair_health_override_template(

// Create a RequestRepair health override using the API
let repair_request_override = health_report::HealthReport {
source: "repair-request".to_string(),
source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(),
triggered_by: None,
observed_at: Some(chrono::Utc::now()),
successes: vec![],
Expand Down Expand Up @@ -1174,7 +1174,10 @@ async fn test_request_repair_health_override_template(
machine.health_sources[1].mode,
HealthReportApplyMode::Merge as i32
);
assert_eq!(machine.health_sources[1].source, "repair-request");
assert_eq!(
machine.health_sources[1].source,
health_report::REPAIR_REQUEST_MERGE_SOURCE
);

// Verify aggregate health includes the override
let aggregate_health = aggregate(machine).unwrap();
Expand Down Expand Up @@ -1232,7 +1235,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined(
};

let repair_request_override = health_report::HealthReport {
source: "repair-request".to_string(),
source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(),
triggered_by: None,
observed_at: Some(chrono::Utc::now()),
successes: vec![],
Expand Down Expand Up @@ -1278,7 +1281,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined(
.map(|o| o.source.clone())
.collect();
assert!(sources.contains(&"tenant-reported-issue".to_string()));
assert!(sources.contains(&"repair-request".to_string()));
assert!(sources.contains(&health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string()));

// All should be merge mode
for override_entry in &machine.health_sources {
Expand Down
9 changes: 7 additions & 2 deletions crates/health-report/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ use std::str::FromStr;

use serde::{Deserialize, Serialize};

/// Source name of the auto-repair (`RequestRepair`) health override.
///
/// The same string is written to the override report's `source` field and is used as the
/// lookup key in `HealthReportSources::merges`, so the two must always match.
pub const REPAIR_REQUEST_MERGE_SOURCE: &str = "repair-request";
/// Source name of the online repair gating (`RequestOnlineRepair`) health override.
///
/// The same string is written to the override report's `source` field, reused as the alert
/// `target` by the admin-cli template, and used as the lookup key in
/// `HealthReportSources::merges`.
pub const REQUEST_ONLINE_REPAIR_MERGE_SOURCE: &str = "request-online-repair";

/// Reports the aggregate health of a system or subsystem
#[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)]
pub struct HealthReport {
Expand Down Expand Up @@ -763,13 +768,13 @@ mod tests {
// Shape matches admin-cli `HealthReportTemplates::RequestOnlineRepair` (merge source
// `request-online-repair`, probe id `RequestOnlineRepair`).
let report = HealthReport {
source: "request-online-repair".to_string(),
source: REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string(),
triggered_by: None,
observed_at: Some(chrono::Utc::now()),
successes: vec![],
alerts: vec![HealthProbeAlert {
id: HealthProbeId::from_str("RequestOnlineRepair").unwrap(),
target: Some("request-online-repair".to_string()),
target: Some(REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()),
in_alert_since: None,
message: "test".to_string(),
tenant_message: None,
Expand Down
Loading
Loading