Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion crates/api/src/handlers/managed_host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,12 @@ pub(crate) async fn set_primary_dpu(
}

/// Maintenance mode: Put a machine into maintenance mode or take it out.
/// Switching a host into maintenance mode prevents an instance being assigned to it.
///
/// Switching a host into maintenance mode prevents an instance being assigned
/// to it and suppresses external alerting on the host. It also excludes the
/// host from state-machine SLA tracking so that machines being worked on by an
/// operator do not page on-call for time-in-state breaches (e.g. stuck-instance
/// alerts) regardless of which state or substate they happen to be in.
pub(crate) async fn set_maintenance(
api: &Api,
request: Request<rpc::MaintenanceRequest>,
Expand Down Expand Up @@ -297,6 +302,7 @@ pub(crate) async fn set_maintenance(
classifications: vec![
health_report::HealthAlertClassification::prevent_allocations(),
health_report::HealthAlertClassification::suppress_external_alerting(),
health_report::HealthAlertClassification::exclude_from_state_machine_sla(),
Copy link
Copy Markdown
Contributor

@krish-nvidia krish-nvidia May 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just semantics, but this will also make time_in_state_above_sla false for non-assigned machines. There’s a Grafana dashboard panel that uses forge_machines_per_state_above_sla{fresh="true", state!="assigned"}, so putting one of those machines into maintenance mode will now remove it from that view too.

I think that’s expected behavior, but maybe update the PR description/comments to call out that this applies to all machine state SLA tracking, not just assigned machines. The unit test also doesn’t create an instance, so it’s already validating this broader behavior 😄

],
}],
}
Expand Down
94 changes: 93 additions & 1 deletion crates/api/src/tests/maintenance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use common::api_fixtures::create_test_env;
use common::api_fixtures::instance::{
default_os_config, default_tenant_config, single_interface_network_config,
};
use model::machine::{FailureCause, FailureDetails, FailureSource, ManagedHostState};
use rpc::forge as rpcf;
use rpc::forge::forge_server::Forge;

Expand Down Expand Up @@ -68,7 +69,8 @@ async fn test_maintenance(db_pool: sqlx::PgPool) -> Result<(), eyre::Report> {
tenant_message: None,
classifications: vec![
"PreventAllocations".to_string(),
"SuppressExternalAlerting".to_string()
"SuppressExternalAlerting".to_string(),
"ExcludeFromStateMachineSla".to_string(),
]
}
);
Expand Down Expand Up @@ -285,3 +287,93 @@ async fn test_maintenance_multi_dpu(db_pool: sqlx::PgPool) -> Result<(), eyre::R

Ok(())
}

/// Maintenance mode must take a host out of state-machine SLA accounting.
///
/// The host is driven into `ManagedHostState::Failed` — the same tactic
/// `test_state_sla` relies on: that state carries a zero-second SLA, so the
/// host counts as "above SLA" immediately and there is no real SLA window to
/// wait out. The reported `state_sla` is then checked at three points: before
/// maintenance is enabled, while it is enabled, and after it is disabled again.
///
/// The above-SLA signal is what drives the `stuckInstanceCritical` Prometheus
/// alert. Note that no instance is created here, so the test covers the SLA
/// exclusion for a host without an instance, not only for stuck instances.
#[crate::sqlx_test]
async fn test_maintenance_suppresses_state_machine_sla_alert(
    db_pool: sqlx::PgPool,
) -> Result<(), eyre::Report> {
    let env = create_test_env(db_pool.clone()).await;
    let (host_id, _dpu_id) = create_managed_host(&env).await.into();
    let target_host: MachineId = host_id;

    // Step 1: park the host in Failed (0-second SLA). Left alone, this is the
    // condition that would drive `carbide_machines_per_state_above_sla > 0`
    // and page on-call through `stuckInstanceCritical`.
    {
        let mut tx = env.db_txn().await;
        let failed_state = ManagedHostState::Failed {
            details: FailureDetails {
                cause: FailureCause::NoError,
                failed_at: chrono::Utc::now(),
                source: FailureSource::NoError,
            },
            machine_id: host_id,
            retry_count: 1,
        };
        db::machine::update_state(&mut tx, &host_id, &failed_state)
            .await
            .unwrap();
        tx.commit().await.unwrap();
    }

    // Step 2: baseline. Without a maintenance override the host must report
    // as above-SLA, i.e. it would be counted by the stuck-instance metric.
    let before = env.find_machine(target_host).await.remove(0);
    let sla_before = before.state_sla.as_ref().unwrap();
    assert!(
        sla_before.time_in_state_above_sla,
        "expected the Failed-state host to be above SLA before maintenance is enabled",
    );

    // Step 3: switch maintenance mode on.
    let enable = rpcf::MaintenanceRequest {
        operation: rpcf::MaintenanceOperation::Enable.into(),
        host_id: Some(target_host),
        reference: Some("https://jira.example.com/ABC-123".to_string()),
    };
    env.api
        .set_maintenance(tonic::Request::new(enable))
        .await
        .unwrap();

    // SetMaintenance attaches the ExcludeFromStateMachineSla classification,
    // so state_sla() short-circuits to no_sla() and the host no longer feeds
    // the stuck-instance Prometheus metric.
    let during = env.find_machine(target_host).await.remove(0);
    let sla_during = during.state_sla.as_ref().unwrap();
    assert!(
        !sla_during.time_in_state_above_sla,
        "maintenance mode must suppress state-machine SLA breach",
    );
    assert!(
        sla_during.sla.is_none(),
        "maintenance mode must produce a no-SLA result regardless of current state",
    );

    // Step 4: switch maintenance mode off again; the breach must come back.
    let disable = rpcf::MaintenanceRequest {
        operation: rpcf::MaintenanceOperation::Disable.into(),
        host_id: Some(target_host),
        reference: None,
    };
    env.api
        .set_maintenance(tonic::Request::new(disable))
        .await
        .unwrap();

    let after = env.find_machine(target_host).await.remove(0);
    let sla_after = after.state_sla.as_ref().unwrap();
    assert!(
        sla_after.time_in_state_above_sla,
        "disabling maintenance should re-expose the above-SLA condition",
    );

    Ok(())
}
2 changes: 2 additions & 0 deletions docs/architecture/health/health_alert_classifications.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ site-wide fleet-health. This is achieved by metrics/alerting queries ignoring th
Hosts with this classification will not be counted towards state machine transition time SLA.
This classification is mostly used to prevent the state machine from continuously alerting when some manual operations are being performed on the machine.

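It is applied automatically (together with `PreventAllocations` and `SuppressExternalAlerting`) when a host is placed into maintenance mode via the `SetMaintenance` RPC, so that stuck-instance / state-machine SLA alerts do not page on-call for hosts an operator is actively working on — regardless of which state or substate the host is in at the time. The exclusion covers SLA tracking for every machine state, not only hosts with an assigned instance, so a host in maintenance mode also drops out of per-state above-SLA metrics and any dashboards built on them.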

### `StopRebootForAutomaticRecoveryFromStateMachine`

For hosts with this classification, the NICo state machine will not automatically
Expand Down
Loading