From f3ec59618afed39edbd7658f31b8b66fbca20bfd Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Sun, 17 May 2026 16:41:59 -0700 Subject: [PATCH 1/7] fix(configurenmxc): make scale-up fabric state deterministic --- Cargo.lock | 5 +- Cargo.toml | 2 +- crates/api-model/src/rack.rs | 17 +++-- crates/api-test-helper/src/mock_rms.rs | 69 +++++++++++++++++++ crates/api/src/rack/rms_client.rs | 54 +++++++++++++++ .../src/state_controller/rack/maintenance.rs | 4 ++ 6 files changed, 144 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a8a50c1f0d..8fd99f36de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6190,7 +6190,7 @@ dependencies = [ [[package]] name = "librms" version = "0.0.12" -source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc1#94a7d944e80075cd57d04bdf0ebf2db497e205db" +source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc2#87cd51aae55b7c23489144500059c8b7c4d1a43c" dependencies = [ "async-trait", "chrono", @@ -6200,6 +6200,7 @@ dependencies = [ "hyper-timeout", "hyper-util", "prost", + "prost-types", "rustls", "rustls-pemfile", "serde", @@ -11206,7 +11207,7 @@ dependencies = [ [[package]] name = "tonic-client-wrapper" version = "1.0.0" -source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc1#94a7d944e80075cd57d04bdf0ebf2db497e205db" +source = "git+https://github.com/NVIDIA/nv-rms-client.git?tag=v0.0.12-rc2#87cd51aae55b7c23489144500059c8b7c4d1a43c" dependencies = [ "async-trait", "heck", diff --git a/Cargo.toml b/Cargo.toml index ab65a3e15a..364399b717 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ authors = ["NVIDIA Carbide Engineering "] [workspace.dependencies] clap = { version = "4", features = ["derive", "env"] } libredfish = { git = "https://github.com/NVIDIA/libredfish.git", tag = "v0.44.2" } -librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc1" } +librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc2" } ansi-to-html = "0.2.2" tokio = { version = "1", features = ["full", "tracing"] } diff --git a/crates/api-model/src/rack.rs b/crates/api-model/src/rack.rs index c7a3cf3d92..064d48becf 100644 --- a/crates/api-model/src/rack.rs +++ b/crates/api-model/src/rack.rs @@ -456,13 +456,16 @@ impl Display for RackMaintenanceState { /// Sub-states of `RackMaintenanceState::ConfigureNmxCluster`. /// -/// `Start` selects a primary switch and asks RMS to configure the -/// NMX cluster. `WaitForFabricStatus` polls -/// `GetScaleUpFabricServicesStatus` and persists the per-switch -/// `fabric_manager_status` before advancing. +/// `Start` selects and persists the primary switch. `DisableScaleUpFabricState` +/// disables ScaleUpFabric state on all scoped switches before +/// `ConfigureScaleUpFabricManager` configures only the primary switch. +/// `WaitForFabricStatus` polls `GetScaleUpFabricServicesStatus` and persists +/// the per-switch `fabric_manager_status` before advancing. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum ConfigureNmxClusterState { Start, + DisableScaleUpFabricState, + ConfigureScaleUpFabricManager, WaitForFabricStatus, } @@ -470,6 +473,12 @@ impl Display for ConfigureNmxClusterState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ConfigureNmxClusterState::Start => write!(f, "Start"), + ConfigureNmxClusterState::DisableScaleUpFabricState => { + write!(f, "DisableScaleUpFabricState") + } + ConfigureNmxClusterState::ConfigureScaleUpFabricManager => { + write!(f, "ConfigureScaleUpFabricManager") + } ConfigureNmxClusterState::WaitForFabricStatus => write!(f, "WaitForFabricStatus"), } } diff --git a/crates/api-test-helper/src/mock_rms.rs b/crates/api-test-helper/src/mock_rms.rs index 65f0d71fee..0d6aaed209 100644 --- a/crates/api-test-helper/src/mock_rms.rs +++ b/crates/api-test-helper/src/mock_rms.rs @@ -170,6 +170,10 @@ pub struct MockRmsApi { Mutex>>, configure_scale_up_fabric_manager_calls: Mutex>, + set_scale_up_fabric_state_responses: + Mutex>>, + set_scale_up_fabric_state_calls: Mutex>, + enable_scale_up_fabric_telemetry_interface_responses: Mutex< VecDeque>, >, @@ -253,6 +257,8 @@ impl MockRmsApi { poll_job_status_calls: Default::default(), configure_scale_up_fabric_manager_responses: Default::default(), configure_scale_up_fabric_manager_calls: Default::default(), + set_scale_up_fabric_state_responses: Default::default(), + set_scale_up_fabric_state_calls: Default::default(), enable_scale_up_fabric_telemetry_interface_responses: Default::default(), enable_scale_up_fabric_telemetry_interface_calls: Default::default(), version_responses: Default::default(), @@ -504,6 +510,14 @@ impl MockRmsApi { rms::ConfigureScaleUpFabricManagerRequest, rms::ConfigureScaleUpFabricManagerResponse ); + impl_enqueue_inspect!( + enqueue_set_scale_up_fabric_state, + set_scale_up_fabric_state_calls, + set_scale_up_fabric_state_responses, + set_scale_up_fabric_state_calls, + rms::SetScaleUpFabricStateRequest, + rms::SetScaleUpFabricStateResponse + ); impl_enqueue_inspect!( enqueue_enable_scale_up_fabric_telemetry_interface, enable_scale_up_fabric_telemetry_interface_calls, @@ -828,6 +842,54 @@ impl RmsApi for MockRmsApi { self.get_firmware_job_status_calls.lock().await.push(cmd); pop_or_err(&mut self.get_firmware_job_status_responses.lock().await) } + async fn add_firmware_object( + &self, + _cmd: rms::AddFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn get_firmware_object( + &self, + _cmd: rms::GetFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn list_firmware_objects( + &self, + _cmd: rms::ListFirmwareObjectsRequest, + ) -> Result { + Ok(rms::ListFirmwareObjectsResponse::default()) + } + async fn delete_firmware_object( + &self, + _cmd: rms::DeleteFirmwareObjectRequest, + ) -> Result { + Ok(rms::OperationResponse::default()) + } + async fn set_default_firmware_object( + &self, + _cmd: rms::SetDefaultFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn apply_firmware_object( + &self, + _cmd: rms::ApplyFirmwareObjectRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_switch_system_image( + &self, + _cmd: rms::ApplySwitchSystemImageRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn get_firmware_object_history( + &self, + _cmd: rms::GetFirmwareObjectHistoryRequest, + ) -> Result { + Ok(rms::GetFirmwareObjectHistoryResponse::default()) + } async fn list_firmware_on_switch( &self, cmd: rms::ListFirmwareOnSwitchCommand, @@ -895,6 +957,13 @@ impl RmsApi for MockRmsApi { .await, ) } + async fn set_scale_up_fabric_state( + &self, + cmd: rms::SetScaleUpFabricStateRequest, + ) -> Result { + self.set_scale_up_fabric_state_calls.lock().await.push(cmd); + pop_or_err(&mut self.set_scale_up_fabric_state_responses.lock().await) + } async fn enable_scale_up_fabric_telemetry_interface( &self, cmd: rms::EnableScaleUpFabricTelemetryInterfaceRequest, diff --git a/crates/api/src/rack/rms_client.rs b/crates/api/src/rack/rms_client.rs index dbc82c81d7..704178e673 100644 --- a/crates/api/src/rack/rms_client.rs +++ b/crates/api/src/rack/rms_client.rs @@ -425,6 +425,54 @@ pub mod test_support { ) -> Result { Ok(rms::GetRackFirmwareInventoryResponse::default()) } + async fn add_firmware_object( + &self, + _cmd: rms::AddFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn get_firmware_object( + &self, + _cmd: rms::GetFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn list_firmware_objects( + &self, + _cmd: rms::ListFirmwareObjectsRequest, + ) -> Result { + Ok(rms::ListFirmwareObjectsResponse::default()) + } + async fn delete_firmware_object( + &self, + _cmd: rms::DeleteFirmwareObjectRequest, + ) -> Result { + Ok(rms::OperationResponse::default()) + } + async fn set_default_firmware_object( + &self, + _cmd: rms::SetDefaultFirmwareObjectRequest, + ) -> Result { + Ok(rms::FirmwareObject::default()) + } + async fn apply_firmware_object( + &self, + _cmd: rms::ApplyFirmwareObjectRequest, + ) -> Result { + Ok(rms::ApplyFirmwareObjectResponse::default()) + } + async fn apply_switch_system_image( + &self, + _cmd: rms::ApplySwitchSystemImageRequest, + ) -> Result { + Ok(rms::ApplySwitchSystemImageResponse::default()) + } + async fn get_firmware_object_history( + &self, + _cmd: rms::GetFirmwareObjectHistoryRequest, + ) -> Result { + Ok(rms::GetFirmwareObjectHistoryResponse::default()) + } async fn list_firmware_on_switch( &self, _cmd: rms::ListFirmwareOnSwitchCommand, @@ -449,6 +497,12 @@ pub mod test_support { ) -> Result { Ok(rms::ConfigureScaleUpFabricManagerResponse::default()) } + async fn set_scale_up_fabric_state( + &self, + _cmd: rms::SetScaleUpFabricStateRequest, + ) -> Result { + Ok(rms::SetScaleUpFabricStateResponse::default()) + } async fn fetch_switch_system_image( &self, _cmd: rms::FetchSwitchSystemImageRequest, diff --git a/crates/api/src/state_controller/rack/maintenance.rs b/crates/api/src/state_controller/rack/maintenance.rs index 15c35ff275..d195209385 100644 --- a/crates/api/src/state_controller/rack/maintenance.rs +++ b/crates/api/src/state_controller/rack/maintenance.rs @@ -1889,6 +1889,10 @@ pub async fn handle_maintenance( }, })) } + ConfigureNmxClusterState::DisableScaleUpFabricState + | ConfigureNmxClusterState::ConfigureScaleUpFabricManager => Ok( + StateHandlerOutcome::wait("ConfigureNmxCluster sub-state is not wired yet".into()), + ), ConfigureNmxClusterState::WaitForFabricStatus => { let switch_inventory = load_rack_switch_firmware_inventory( &ctx.services.db_pool, From c0fa2d7b7fb329ead705548a42fd25b58f274729 Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 08:53:46 -0700 Subject: [PATCH 2/7] test(rack): record nmx rms requests --- crates/api/src/rack/rms_client.rs | 149 ++++++++++++++++++++++++++++-- 1 file changed, 143 insertions(+), 6 deletions(-) diff --git a/crates/api/src/rack/rms_client.rs b/crates/api/src/rack/rms_client.rs index 704178e673..88f94af1b2 100644 --- a/crates/api/src/rack/rms_client.rs +++ b/crates/api/src/rack/rms_client.rs @@ -74,6 +74,19 @@ pub mod test_support { switch_system_image_job_statuses: Arc>>, switch_system_image_job_errors: Arc>>, + submitted_get_device_info_by_device_list_requests: + Arc>>, + queued_get_device_info_by_device_list_responses: + Arc>>>, + submitted_configure_scale_up_fabric_manager_requests: + Arc>>, + queued_configure_scale_up_fabric_manager_responses: Arc< + Mutex>>, + >, + submitted_set_scale_up_fabric_state_requests: + Arc>>, + queued_set_scale_up_fabric_state_responses: + Arc>>>, submitted_set_power_state_by_device_list_requests: Arc>>, queued_set_power_state_by_device_list_responses: @@ -94,6 +107,18 @@ pub mod test_support { queued_switch_system_image_responses: Arc::new(Mutex::new(VecDeque::new())), switch_system_image_job_statuses: Arc::new(Mutex::new(HashMap::new())), switch_system_image_job_errors: Arc::new(Mutex::new(HashMap::new())), + submitted_get_device_info_by_device_list_requests: Arc::new(Mutex::new(Vec::new())), + queued_get_device_info_by_device_list_responses: Arc::new(Mutex::new( + VecDeque::new(), + )), + submitted_configure_scale_up_fabric_manager_requests: Arc::new(Mutex::new( + Vec::new(), + )), + queued_configure_scale_up_fabric_manager_responses: Arc::new(Mutex::new( + VecDeque::new(), + )), + submitted_set_scale_up_fabric_state_requests: Arc::new(Mutex::new(Vec::new())), + queued_set_scale_up_fabric_state_responses: Arc::new(Mutex::new(VecDeque::new())), submitted_set_power_state_by_device_list_requests: Arc::new(Mutex::new(Vec::new())), queued_set_power_state_by_device_list_responses: Arc::new(Mutex::new( VecDeque::new(), @@ -131,6 +156,24 @@ pub mod test_support { .clone(), switch_system_image_job_statuses: self.switch_system_image_job_statuses.clone(), switch_system_image_job_errors: self.switch_system_image_job_errors.clone(), + submitted_get_device_info_by_device_list_requests: self + .submitted_get_device_info_by_device_list_requests + .clone(), + queued_get_device_info_by_device_list_responses: self + .queued_get_device_info_by_device_list_responses + .clone(), + submitted_configure_scale_up_fabric_manager_requests: self + .submitted_configure_scale_up_fabric_manager_requests + .clone(), + queued_configure_scale_up_fabric_manager_responses: self + .queued_configure_scale_up_fabric_manager_responses + .clone(), + submitted_set_scale_up_fabric_state_requests: self + .submitted_set_scale_up_fabric_state_requests + .clone(), + queued_set_scale_up_fabric_state_responses: self + .queued_set_scale_up_fabric_state_responses + .clone(), submitted_set_power_state_by_device_list_requests: self .submitted_set_power_state_by_device_list_requests .clone(), @@ -228,6 +271,63 @@ pub mod test_support { .clone() } + pub async fn queue_get_device_info_by_device_list_response( + &self, + response: Result, + ) { + self.queued_get_device_info_by_device_list_responses + .lock() + .await + .push_back(response); + } + + pub async fn submitted_get_device_info_by_device_list_requests( + &self, + ) -> Vec { + self.submitted_get_device_info_by_device_list_requests + .lock() + .await + .clone() + } + + pub async fn queue_configure_scale_up_fabric_manager_response( + &self, + response: Result, + ) { + self.queued_configure_scale_up_fabric_manager_responses + .lock() + .await + .push_back(response); + } + + pub async fn submitted_configure_scale_up_fabric_manager_requests( + &self, + ) -> Vec { + self.submitted_configure_scale_up_fabric_manager_requests + .lock() + .await + .clone() + } + + pub async fn queue_set_scale_up_fabric_state_response( + &self, + response: Result, + ) { + self.queued_set_scale_up_fabric_state_responses + .lock() + .await + .push_back(response); + } + + pub async fn submitted_set_scale_up_fabric_state_requests( + &self, + ) -> Vec { + self.submitted_set_scale_up_fabric_state_requests + .lock() + .await + .clone() + } + /// Queue a `Result` to be returned on the next call to /// `set_power_state_by_device_list`. Used by power-shelf maintenance /// tests to drive both the success and failure paths of the @@ -270,6 +370,19 @@ pub mod test_support { switch_system_image_job_statuses: Arc>>, switch_system_image_job_errors: Arc>>, + submitted_get_device_info_by_device_list_requests: + Arc>>, + queued_get_device_info_by_device_list_responses: + Arc>>>, + submitted_configure_scale_up_fabric_manager_requests: + Arc>>, + queued_configure_scale_up_fabric_manager_responses: Arc< + Mutex>>, + >, + submitted_set_scale_up_fabric_state_requests: + Arc>>, + queued_set_scale_up_fabric_state_responses: + Arc>>>, submitted_set_power_state_by_device_list_requests: Arc>>, queued_set_power_state_by_device_list_responses: @@ -280,9 +393,17 @@ pub mod test_support { impl RmsApi for MockRmsClient { async fn get_device_info_by_device_list( &self, - _cmd: rms::GetDeviceInfoByDeviceListRequest, + cmd: rms::GetDeviceInfoByDeviceListRequest, ) -> Result { - Ok(rms::GetDeviceInfoByDeviceListResponse::default()) + self.submitted_get_device_info_by_device_list_requests + .lock() + .await + .push(cmd); + self.queued_get_device_info_by_device_list_responses + .lock() + .await + .pop_front() + .unwrap_or(Ok(rms::GetDeviceInfoByDeviceListResponse::default())) } async fn get_node_device_info( &self, @@ -493,15 +614,31 @@ pub mod test_support { } async fn configure_scale_up_fabric_manager( &self, - _cmd: rms::ConfigureScaleUpFabricManagerRequest, + cmd: rms::ConfigureScaleUpFabricManagerRequest, ) -> Result { - Ok(rms::ConfigureScaleUpFabricManagerResponse::default()) + self.submitted_configure_scale_up_fabric_manager_requests + .lock() + .await + .push(cmd); + self.queued_configure_scale_up_fabric_manager_responses + .lock() + .await + .pop_front() + .unwrap_or(Ok(rms::ConfigureScaleUpFabricManagerResponse::default())) } async fn set_scale_up_fabric_state( &self, - _cmd: rms::SetScaleUpFabricStateRequest, + cmd: rms::SetScaleUpFabricStateRequest, ) -> Result { - Ok(rms::SetScaleUpFabricStateResponse::default()) + self.submitted_set_scale_up_fabric_state_requests + .lock() + .await + .push(cmd); + self.queued_set_scale_up_fabric_state_responses + .lock() + .await + .pop_front() + .unwrap_or(Ok(rms::SetScaleUpFabricStateResponse::default())) } async fn fetch_switch_system_image( &self, From 94fd402e6bac3c0d862f79e44d042a83c14c9f5f Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 09:23:48 -0700 Subject: [PATCH 3/7] feat(rack): disable scale-up fabric before nmx primary selection --- crates/api-model/src/rack.rs | 9 +- .../src/state_controller/rack/maintenance.rs | 135 ++++++- .../tests/rack_state_controller/handler.rs | 332 ++++++++++++++---- 3 files changed, 408 insertions(+), 68 deletions(-) diff --git a/crates/api-model/src/rack.rs b/crates/api-model/src/rack.rs index 064d48becf..1b038b318c 100644 --- a/crates/api-model/src/rack.rs +++ b/crates/api-model/src/rack.rs @@ -456,11 +456,12 @@ impl Display for RackMaintenanceState { /// Sub-states of `RackMaintenanceState::ConfigureNmxCluster`. /// -/// `Start` selects and persists the primary switch. `DisableScaleUpFabricState` +/// `Start` advances into the NMX cluster sequence. `DisableScaleUpFabricState` /// disables ScaleUpFabric state on all scoped switches before -/// `ConfigureScaleUpFabricManager` configures only the primary switch. -/// `WaitForFabricStatus` polls `GetScaleUpFabricServicesStatus` and persists -/// the per-switch `fabric_manager_status` before advancing. +/// `ConfigureScaleUpFabricManager` selects, persists, and configures only the +/// primary switch. `WaitForFabricStatus` polls +/// `GetScaleUpFabricServicesStatus` and persists the per-switch +/// `fabric_manager_status` before advancing. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum ConfigureNmxClusterState { Start, diff --git a/crates/api/src/state_controller/rack/maintenance.rs b/crates/api/src/state_controller/rack/maintenance.rs index d195209385..c0100986d4 100644 --- a/crates/api/src/state_controller/rack/maintenance.rs +++ b/crates/api/src/state_controller/rack/maintenance.rs @@ -1718,6 +1718,17 @@ pub async fn handle_maintenance( configure_nmx_cluster, } => match configure_nmx_cluster { ConfigureNmxClusterState::Start => { + tracing::info!( + rack_id = %id, + "Starting ConfigureNmxCluster; advancing to DisableScaleUpFabricState" + ); + Ok(StateHandlerOutcome::transition(RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::DisableScaleUpFabricState, + }, + })) + } + ConfigureNmxClusterState::DisableScaleUpFabricState => { let Some(rms_client) = ctx.services.rms_client.as_ref() else { return transition_to_rack_error(id, state, "RMS client not configured", ctx) .await; @@ -1730,7 +1741,125 @@ pub async fn handle_maintenance( .await .map_err(|error| { StateHandlerError::GenericError(eyre::eyre!( - "failed to load rack switch firmware inventory for ConfigureNmxCluster: {}", + "failed to load rack switch firmware inventory for DisableScaleUpFabricState: {}", + error + )) + })?; + let switch_inventory = filter_switch_inventory_by_scope(switch_inventory, scope); + + if switch_inventory.switches.is_empty() { + return Ok(skip_configure_nmx_cluster_outcome( + id, + "rack has no switches in inventory", + scope, + )); + } + + if let Err(cause) = + validate_switch_inventory_for_nmx_cluster(&switch_inventory.switches) + { + return transition_to_rack_error(id, state, cause, ctx).await; + } + + tracing::info!( + rack_id = %id, + switch_count = switch_inventory.switches.len(), + "Disabling ScaleUpFabric state before selecting ConfigureNmxCluster primary switch" + ); + let response = match rms_client + .set_scale_up_fabric_state(rms::SetScaleUpFabricStateRequest { + nodes: Some(rms::NodeSet { + devices: switch_inventory + .switches + .iter() + .map(|switch| { + build_new_node_info(id, switch, rms::NodeType::Switch) + }) + .collect(), + }), + enabled: Some(false), + verify_ssl: false, + ..Default::default() + }) + .await + { + Ok(response) => response, + Err(error) => { + let error = rack_manager_error("set_scale_up_fabric_state", error); + return transition_to_rack_error(id, state, error.to_string(), ctx).await; + } + }; + + let batch = response.response.unwrap_or_default(); + if batch.status != rms::ReturnCode::Success as i32 || batch.failed_nodes > 0 { + let node_error = batch + .node_results + .iter() + .find(|result| { + result.status != rms::ReturnCode::Success as i32 + || !result.error_message.is_empty() + }) + .map(|result| { + if result.error_message.is_empty() { + format!("status={}", result.status) + } else { + result.error_message.clone() + } + }); + let summary = if !batch.message.trim().is_empty() { + batch.message + } else if let Some(error) = node_error { + error + } else { + format!( + "batch status {}, failed_nodes {}", + batch.status, batch.failed_nodes, + ) + }; + tracing::error!( + rack_id = %id, + batch_status = batch.status, + successful_nodes = batch.successful_nodes, + failed_nodes = batch.failed_nodes, + summary = %summary, + "RMS SetScaleUpFabricState failed", + ); + return transition_to_rack_error( + id, + state, + format!("RMS SetScaleUpFabricState failed: {}", summary), + ctx, + ) + .await; + } + + tracing::info!( + rack_id = %id, + successful_nodes = batch.successful_nodes, + switch_count = switch_inventory.switches.len(), + "ScaleUpFabric state disabled; advancing to ConfigureScaleUpFabricManager" + ); + Ok(StateHandlerOutcome::transition(RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: + ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + })) + } + ConfigureNmxClusterState::ConfigureScaleUpFabricManager => { + let Some(rms_client) = ctx.services.rms_client.as_ref() else { + return transition_to_rack_error(id, state, "RMS client not configured", ctx) + .await; + }; + let switch_inventory = load_rack_switch_firmware_inventory( + &ctx.services.db_pool, + ctx.services.credential_manager.as_ref(), + id, + ) + .await + .map_err(|error| { + StateHandlerError::GenericError(eyre::eyre!( + "failed to load rack switch firmware inventory for ConfigureScaleUpFabricManager: {}", error )) })?; @@ -1889,10 +2018,6 @@ pub async fn handle_maintenance( }, })) } - ConfigureNmxClusterState::DisableScaleUpFabricState - | ConfigureNmxClusterState::ConfigureScaleUpFabricManager => Ok( - StateHandlerOutcome::wait("ConfigureNmxCluster sub-state is not wired yet".into()), - ), ConfigureNmxClusterState::WaitForFabricStatus => { let switch_inventory = load_rack_switch_firmware_inventory( &ctx.services.db_pool, diff --git a/crates/api/src/tests/rack_state_controller/handler.rs b/crates/api/src/tests/rack_state_controller/handler.rs index 03df32a78e..0fce360d32 100644 --- a/crates/api/src/tests/rack_state_controller/handler.rs +++ b/crates/api/src/tests/rack_state_controller/handler.rs @@ -27,9 +27,10 @@ use librms::protos::rack_manager as rms; use model::expected_machine::ExpectedMachineData; use model::expected_rack::ExpectedRack; use model::rack::{ - FirmwareUpgradeDeviceStatus, FirmwareUpgradeJob, FirmwareUpgradeState, MaintenanceActivity, - MaintenanceScope, NvosUpdateState, NvosUpdateSwitchStatus, Rack, RackConfig, - RackFirmwareUpgradeState, RackMaintenanceState, RackPowerState, RackState, RackValidationState, + ConfigureNmxClusterState, FirmwareUpgradeDeviceStatus, FirmwareUpgradeJob, + FirmwareUpgradeState, MaintenanceActivity, MaintenanceScope, NvosUpdateState, + NvosUpdateSwitchStatus, Rack, RackConfig, RackFirmwareUpgradeState, RackMaintenanceState, + RackPowerState, RackState, RackValidationState, }; use model::rack_type::{ RackCapabilitiesSet, RackCapabilityCompute, RackCapabilityPowerShelf, RackCapabilitySwitch, @@ -282,71 +283,91 @@ async fn create_two_compute_rack( Ok((rack_id, host_a, host_b)) } -async fn attach_switch_with_nvos_credentials( +async fn attach_switches_with_nvos_credentials( env: &TestEnv, rack_id: &RackId, -) -> Result> { + count: usize, +) -> Result, Box> { let mut txn = env.pool.begin().await?; - let expected_switch = create_expected_switches(txn.as_mut()) - .await + let expected_switches = create_expected_switches(txn.as_mut()).await; + let selected_switches = expected_switches .into_iter() - .next() - .ok_or("expected at least one switch fixture")?; - - let switch_id = model::switch::switch_id::from_hardware_info( - &expected_switch.serial_number, - "NVIDIA", - "Switch", - carbide_uuid::switch::SwitchIdSource::ProductBoardChassisSerial, - carbide_uuid::switch::SwitchType::NvLink, - )?; - - let new_switch = NewSwitch { - id: switch_id, - config: SwitchConfig { - name: expected_switch.metadata.name.clone(), - enable_nmxc: false, - fabric_manager_config: None, - }, - bmc_mac_address: Some(expected_switch.bmc_mac_address), - metadata: None, - rack_id: Some(rack_id.clone()), - slot_number: Some(0), - tray_index: Some(0), - }; - db_switch::create(txn.as_mut(), &new_switch).await?; + .take(count) + .collect::>(); + if selected_switches.len() != count { + return Err(eyre::eyre!("expected at least {} switch fixtures", count).into()); + } + + let mut switch_ids = Vec::with_capacity(selected_switches.len()); + for (index, expected_switch) in selected_switches.iter().enumerate() { + let switch_id = model::switch::switch_id::from_hardware_info( + &expected_switch.serial_number, + "NVIDIA", + "Switch", + carbide_uuid::switch::SwitchIdSource::ProductBoardChassisSerial, + carbide_uuid::switch::SwitchType::NvLink, + )?; + + let new_switch = NewSwitch { + id: switch_id.clone(), + config: SwitchConfig { + name: expected_switch.metadata.name.clone(), + enable_nmxc: false, + fabric_manager_config: None, + }, + bmc_mac_address: Some(expected_switch.bmc_mac_address.clone()), + metadata: None, + rack_id: Some(rack_id.clone()), + slot_number: Some(index as i32), + tray_index: Some(0), + }; + db_switch::create(txn.as_mut(), &new_switch).await?; + switch_ids.push(switch_id); + } txn.commit().await?; - env.api - .credential_manager - .set_credentials( - &CredentialKey::BmcCredentials { - credential_type: BmcCredentialType::BmcRoot { + for expected_switch in selected_switches { + env.api + .credential_manager + .set_credentials( + &CredentialKey::BmcCredentials { + credential_type: BmcCredentialType::BmcRoot { + bmc_mac_address: expected_switch.bmc_mac_address.clone(), + }, + }, + &Credentials::UsernamePassword { + username: "root".to_string(), + password: "notforprod".to_string(), + }, + ) + .await + .map_err(|error| eyre::eyre!("failed to set switch BMC credentials: {}", error))?; + env.api + .credential_manager + .set_credentials( + &CredentialKey::SwitchNvosAdmin { bmc_mac_address: expected_switch.bmc_mac_address, }, - }, - &Credentials::UsernamePassword { - username: "root".to_string(), - password: "notforprod".to_string(), - }, - ) - .await - .map_err(|error| eyre::eyre!("failed to set switch BMC credentials: {}", error))?; - env.api - .credential_manager - .set_credentials( - &CredentialKey::SwitchNvosAdmin { - bmc_mac_address: expected_switch.bmc_mac_address, - }, - &Credentials::UsernamePassword { - username: "nvos-admin".to_string(), - password: "nvos-pass".to_string(), - }, - ) - .await - .map_err(|error| eyre::eyre!("failed to set switch NVOS credentials: {}", error))?; + &Credentials::UsernamePassword { + username: "nvos-admin".to_string(), + password: "nvos-pass".to_string(), + }, + ) + .await + .map_err(|error| eyre::eyre!("failed to set switch NVOS credentials: {}", error))?; + } - Ok(switch_id) + Ok(switch_ids) +} + +async fn attach_switch_with_nvos_credentials( + env: &TestEnv, + rack_id: &RackId, +) -> Result> { + let mut switch_ids = attach_switches_with_nvos_credentials(env, rack_id, 1).await?; + switch_ids + .pop() + .ok_or_else(|| eyre::eyre!("expected one switch fixture").into()) } pub(crate) fn new_rack_id() -> RackId { @@ -2206,6 +2227,199 @@ async fn test_nvos_update_start_transitions_to_wait_for_complete( Ok(()) } +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_start_advances_to_disable_scale_up_fabric_state( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides(pool.clone(), TestEnvOverrides::default()).await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("Empty")), + &RackConfig::default(), + None, + ) + .await?; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::Start, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: + ConfigureNmxClusterState::DisableScaleUpFabricState, + }, + } + ), + "ConfigureNmxCluster(Start) should transition to DisableScaleUpFabricState, got {:?}", + next_state + ); + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + assert!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_runs_on_all_switches( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides(pool.clone(), TestEnvOverrides::default()).await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("Empty")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + env.rms_sim + .queue_set_scale_up_fabric_state_response(Ok(rms::SetScaleUpFabricStateResponse { + response: Some(rms::NodeBatchResponse { + status: rms::ReturnCode::Success as i32, + successful_nodes: switch_ids.len() as i32, + failed_nodes: 0, + ..Default::default() + }), + ..Default::default() + })) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::DisableScaleUpFabricState, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: + ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + } + ), + "DisableScaleUpFabricState should transition to ConfigureScaleUpFabricManager, got {:?}", + next_state + ); + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + let requests = env + .rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await; + assert_eq!(requests.len(), 1); + let request = &requests[0]; + assert_eq!(request.enabled, Some(false)); + let devices = request + .nodes + .as_ref() + .expect("disable request should include nodes") + .devices + .as_slice(); + assert_eq!(devices.len(), switch_ids.len()); + let node_ids = devices + .iter() + .map(|device| device.node_id.clone()) + .collect::>(); + for switch_id in &switch_ids { + assert!(node_ids.contains(&switch_id.to_string())); + } + assert!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + Ok(()) +} + /// test_configure_nmx_cluster_transitions_to_completed verifies that /// Maintenance::ConfigureNmxCluster transitions to Maintenance::Completed. #[crate::sqlx_test] From 1b958590f05bda95154708b81341fada85dd8bb4 Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 09:42:15 -0700 Subject: [PATCH 4/7] test(rack): cover nmx primary selection after fabric disable --- .../tests/rack_state_controller/handler.rs | 167 +++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) diff --git a/crates/api/src/tests/rack_state_controller/handler.rs b/crates/api/src/tests/rack_state_controller/handler.rs index 0fce360d32..b7ed2c2f19 100644 --- a/crates/api/src/tests/rack_state_controller/handler.rs +++ b/crates/api/src/tests/rack_state_controller/handler.rs @@ -34,7 +34,7 @@ use model::rack::{ }; use model::rack_type::{ RackCapabilitiesSet, RackCapabilityCompute, RackCapabilityPowerShelf, RackCapabilitySwitch, - RackHardwareClass, RackHardwareType, RackProfile, RackProfileConfig, + RackHardwareClass, RackHardwareTopology, RackHardwareType, RackProfile, RackProfileConfig, }; use model::switch::{NewSwitch, SwitchConfig}; use serde_json::json; @@ -160,6 +160,18 @@ pub(crate) fn config_with_rack_profiles() -> crate::cfg::file::CarbideConfig { config } +fn config_with_nmx_cluster_profile() -> crate::cfg::file::CarbideConfig { + let mut config = config_with_rack_profiles(); + config.rack_profiles.rack_profiles.insert( + "NmxCluster".to_string(), + RackProfile { + rack_hardware_topology: Some(RackHardwareTopology::Gb200Nvl72r1C2g4Topology), + ..Default::default() + }, + ); + config +} + fn default_lookup_table_json() -> serde_json::Value { json!({ "devices": { @@ -2420,6 +2432,159 @@ async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_runs_on_all_sw Ok(()) } +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_configure_selects_persists_and_configures_primary_switch( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides( + pool.clone(), + TestEnvOverrides { + config: Some(config_with_nmx_cluster_profile()), + ..Default::default() + }, + ) + .await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("NmxCluster")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + let secondary_switch_id = switch_ids[0].clone(); + let primary_switch_id = switch_ids[1].clone(); + let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); + + env.rms_sim + .queue_get_device_info_by_device_list_response(Ok(rms::GetDeviceInfoByDeviceListResponse { + status: rms::ReturnCode::Success as i32, + node_device_info: vec![ + rms::NodeDeviceInfo { + node_id: secondary_switch_id.to_string(), + tray_index: Some(2), + slot_number: Some(2), + ..Default::default() + }, + rms::NodeDeviceInfo { + node_id: primary_switch_id.to_string(), + tray_index: Some(1), + slot_number: Some(1), + ..Default::default() + }, + ], + ..Default::default() + })) + .await; + env.rms_sim + .queue_configure_scale_up_fabric_manager_response(Ok( + rms::ConfigureScaleUpFabricManagerResponse { + status: rms::ReturnCode::Success as i32, + topology_used: topology_type.clone(), + scale_up_fabric_state_enabled: false, + grpc_enabled: true, + ..Default::default() + }, + )) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::WaitForFabricStatus, + }, + } + ), + "ConfigureScaleUpFabricManager should transition to WaitForFabricStatus, got {:?}", + next_state + ); + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + assert!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .is_empty() + ); + + let device_info_requests = env + .rms_sim + .submitted_get_device_info_by_device_list_requests() + .await; + assert_eq!(device_info_requests.len(), 1); + let device_info_nodes = device_info_requests[0] + .nodes + .as_ref() + .expect("device-info request should include nodes") + .devices + .as_slice(); + assert_eq!(device_info_nodes.len(), switch_ids.len()); + + let configure_requests = env + .rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await; + assert_eq!(configure_requests.len(), 1); + let configure_request = &configure_requests[0]; + assert_eq!(configure_request.topology_type, topology_type); + assert_eq!( + configure_request + .device + .as_ref() + .expect("configure request should include a primary switch") + .node_id, + primary_switch_id.to_string() + ); + + let mut txn = pool.acquire().await?; + let primary_switch = db_switch::find_by_id(&mut txn, &primary_switch_id) + .await? + .expect("primary switch should exist"); + let secondary_switch = db_switch::find_by_id(&mut txn, &secondary_switch_id) + .await? + .expect("secondary switch should exist"); + assert!(primary_switch.is_primary); + assert!(!secondary_switch.is_primary); + + Ok(()) +} + /// test_configure_nmx_cluster_transitions_to_completed verifies that /// Maintenance::ConfigureNmxCluster transitions to Maintenance::Completed. #[crate::sqlx_test] From cd07e70a412068d5a3a5b5c36a7a2ed306dbd7ec Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 09:49:22 -0700 Subject: [PATCH 5/7] test(rack): cover nmx configure failure ordering --- .../tests/rack_state_controller/handler.rs | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) diff --git a/crates/api/src/tests/rack_state_controller/handler.rs b/crates/api/src/tests/rack_state_controller/handler.rs index b7ed2c2f19..73cd76d656 100644 --- a/crates/api/src/tests/rack_state_controller/handler.rs +++ b/crates/api/src/tests/rack_state_controller/handler.rs @@ -2585,6 +2585,346 @@ async fn test_configure_nmx_cluster_configure_selects_persists_and_configures_pr Ok(()) } +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_failure_stops_flow( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides(pool.clone(), TestEnvOverrides::default()).await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("Empty")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + env.rms_sim + .queue_set_scale_up_fabric_state_response(Ok(rms::SetScaleUpFabricStateResponse { + response: Some(rms::NodeBatchResponse { + status: rms::ReturnCode::Failure as i32, + successful_nodes: 1, + failed_nodes: 1, + message: "disable rejected".to_string(), + ..Default::default() + }), + ..Default::default() + })) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::DisableScaleUpFabricState, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => match next_state { + RackState::Error { cause } => { + assert!(cause.contains("RMS SetScaleUpFabricState failed")); + assert!(cause.contains("disable rejected")); + } + other => panic!("Expected Error state, got {:?}", other), + }, + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + assert_eq!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .len(), + 1 + ); + assert!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_configure_selection_failure_stops_before_configure( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides( + pool.clone(), + TestEnvOverrides { + config: Some(config_with_nmx_cluster_profile()), + ..Default::default() + }, + ) + .await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("NmxCluster")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + env.rms_sim + .queue_get_device_info_by_device_list_response(Ok(rms::GetDeviceInfoByDeviceListResponse { + status: rms::ReturnCode::Success as i32, + node_device_info: vec![ + rms::NodeDeviceInfo { + node_id: switch_ids[0].to_string(), + tray_index: Some(1), + slot_number: Some(1), + ..Default::default() + }, + rms::NodeDeviceInfo { + node_id: switch_ids[1].to_string(), + tray_index: Some(1), + slot_number: Some(2), + ..Default::default() + }, + ], + ..Default::default() + })) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => match next_state { + RackState::Error { cause } => { + assert!(cause.contains("duplicate tray_index 1")); + } + other => panic!("Expected Error state, got {:?}", other), + }, + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + assert!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .is_empty() + ); + assert_eq!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .len(), + 1 + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + let mut txn = pool.acquire().await?; + for switch_id in switch_ids { + let switch = db_switch::find_by_id(&mut txn, &switch_id) + .await? + .expect("switch should exist"); + assert!(!switch.is_primary); + } + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_configure_failure_advances_to_wait_for_fabric_status( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides( + pool.clone(), + TestEnvOverrides { + config: Some(config_with_nmx_cluster_profile()), + ..Default::default() + }, + ) + .await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("NmxCluster")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + let primary_switch_id = switch_ids[0].clone(); + let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); + + env.rms_sim + .queue_get_device_info_by_device_list_response(Ok(rms::GetDeviceInfoByDeviceListResponse { + status: rms::ReturnCode::Success as i32, + node_device_info: vec![ + rms::NodeDeviceInfo { + node_id: primary_switch_id.to_string(), + tray_index: Some(1), + slot_number: Some(1), + ..Default::default() + }, + rms::NodeDeviceInfo { + node_id: switch_ids[1].to_string(), + tray_index: Some(2), + slot_number: Some(2), + ..Default::default() + }, + ], + ..Default::default() + })) + .await; + env.rms_sim + .queue_configure_scale_up_fabric_manager_response(Ok( + rms::ConfigureScaleUpFabricManagerResponse { + status: rms::ReturnCode::Failure as i32, + message: "configure rejected".to_string(), + ..Default::default() + }, + )) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let nmx_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &nmx_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::WaitForFabricStatus, + }, + } + ), + "ConfigureScaleUpFabricManager failure should transition to WaitForFabricStatus, got {:?}", + next_state + ); + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + assert!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .is_empty() + ); + assert_eq!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .len(), + 1 + ); + let configure_requests = env + .rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await; + assert_eq!(configure_requests.len(), 1); + assert_eq!(configure_requests[0].topology_type, topology_type); + assert_eq!( + configure_requests[0] + .device + .as_ref() + .expect("configure request should include a primary switch") + .node_id, + primary_switch_id.to_string() + ); + + let mut txn = pool.acquire().await?; + let primary_switch = db_switch::find_by_id(&mut txn, &primary_switch_id) + .await? + .expect("primary switch should exist"); + assert!(primary_switch.is_primary); + + Ok(()) +} + /// test_configure_nmx_cluster_transitions_to_completed verifies that /// Maintenance::ConfigureNmxCluster transitions to Maintenance::Completed. #[crate::sqlx_test] From f663d899159d41f5596bc3abbaf0fe526a76388f Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 10:19:30 -0700 Subject: [PATCH 6/7] test(rack): cover nmx configure end-to-end flow --- .../tests/rack_state_controller/handler.rs | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/crates/api/src/tests/rack_state_controller/handler.rs b/crates/api/src/tests/rack_state_controller/handler.rs index 73cd76d656..3acc6b0146 100644 --- a/crates/api/src/tests/rack_state_controller/handler.rs +++ b/crates/api/src/tests/rack_state_controller/handler.rs @@ -2585,6 +2585,268 @@ async fn test_configure_nmx_cluster_configure_selects_persists_and_configures_pr Ok(()) } +#[crate::sqlx_test] +async fn test_configure_nmx_cluster_runs_start_disable_configure_to_wait_for_fabric_status( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env_with_overrides( + pool.clone(), + TestEnvOverrides { + config: Some(config_with_nmx_cluster_profile()), + ..Default::default() + }, + ) + .await; + + let rack_id = new_rack_id(); + let mut txn = pool.acquire().await?; + db_rack::create( + &mut txn, + &rack_id, + Some(&RackProfileId::new("NmxCluster")), + &RackConfig::default(), + None, + ) + .await?; + drop(txn); + + let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; + let secondary_switch_id = switch_ids[0].clone(); + let primary_switch_id = switch_ids[1].clone(); + let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); + + env.rms_sim + .queue_set_scale_up_fabric_state_response(Ok(rms::SetScaleUpFabricStateResponse { + response: Some(rms::NodeBatchResponse { + status: rms::ReturnCode::Success as i32, + successful_nodes: switch_ids.len() as i32, + failed_nodes: 0, + ..Default::default() + }), + ..Default::default() + })) + .await; + env.rms_sim + .queue_get_device_info_by_device_list_response(Ok(rms::GetDeviceInfoByDeviceListResponse { + status: rms::ReturnCode::Success as i32, + node_device_info: vec![ + rms::NodeDeviceInfo { + node_id: secondary_switch_id.to_string(), + tray_index: Some(2), + slot_number: Some(2), + ..Default::default() + }, + rms::NodeDeviceInfo { + node_id: primary_switch_id.to_string(), + tray_index: Some(1), + slot_number: Some(1), + ..Default::default() + }, + ], + ..Default::default() + })) + .await; + env.rms_sim + .queue_configure_scale_up_fabric_manager_response(Ok( + rms::ConfigureScaleUpFabricManagerResponse { + status: rms::ReturnCode::Success as i32, + topology_used: topology_type.clone(), + scale_up_fabric_state_enabled: false, + grpc_enabled: true, + ..Default::default() + }, + )) + .await; + + let mut rack = get_db_rack(env.db_reader().as_mut(), &rack_id).await; + + let handler_instance = RackStateHandler::default(); + let mut services = env.state_handler_services(); + let mut metrics = RackMetrics::default(); + let mut db_writes = DbWriteBatch::default(); + let mut ctx = StateHandlerContext:: { + services: &mut services, + metrics: &mut metrics, + pending_db_writes: &mut db_writes, + }; + + let start_state = RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::Start, + }, + }; + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &start_state, &mut ctx) + .await?; + let disable_state = match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: + ConfigureNmxClusterState::DisableScaleUpFabricState, + }, + } + ), + "ConfigureNmxCluster(Start) should transition to DisableScaleUpFabricState, got {:?}", + next_state + ); + next_state + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + }; + + assert!( + env.rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &disable_state, &mut ctx) + .await?; + let configure_state = match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: + ConfigureNmxClusterState::ConfigureScaleUpFabricManager, + }, + } + ), + "DisableScaleUpFabricState should transition to ConfigureScaleUpFabricManager, got {:?}", + next_state + ); + next_state + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + }; + + let disable_requests = env + .rms_sim + .submitted_set_scale_up_fabric_state_requests() + .await; + assert_eq!(disable_requests.len(), 1); + let disable_request = &disable_requests[0]; + assert_eq!(disable_request.enabled, Some(false)); + let disable_devices = disable_request + .nodes + .as_ref() + .expect("disable request should include nodes") + .devices + .as_slice(); + assert_eq!(disable_devices.len(), switch_ids.len()); + let disabled_node_ids = disable_devices + .iter() + .map(|device| device.node_id.clone()) + .collect::>(); + for switch_id in &switch_ids { + assert!(disabled_node_ids.contains(&switch_id.to_string())); + } + assert!( + env.rms_sim + .submitted_get_device_info_by_device_list_requests() + .await + .is_empty() + ); + assert!( + env.rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await + .is_empty() + ); + + let outcome = handler_instance + .handle_object_state(&rack_id, &mut rack, &configure_state, &mut ctx) + .await?; + + match outcome { + StateHandlerOutcome::Transition { next_state, .. } => { + assert!( + matches!( + next_state, + RackState::Maintenance { + maintenance_state: RackMaintenanceState::ConfigureNmxCluster { + configure_nmx_cluster: ConfigureNmxClusterState::WaitForFabricStatus, + }, + } + ), + "ConfigureScaleUpFabricManager should transition to WaitForFabricStatus, got {:?}", + next_state + ); + } + other => panic!( + "Expected Transition, got {:?}", + std::mem::discriminant(&other) + ), + } + + let device_info_requests = env + .rms_sim + .submitted_get_device_info_by_device_list_requests() + .await; + assert_eq!(device_info_requests.len(), 1); + let device_info_devices = device_info_requests[0] + .nodes + .as_ref() + .expect("device-info request should include nodes") + .devices + .as_slice(); + assert_eq!(device_info_devices.len(), switch_ids.len()); + + let configure_requests = env + .rms_sim + .submitted_configure_scale_up_fabric_manager_requests() + .await; + assert_eq!(configure_requests.len(), 1); + let configure_request = &configure_requests[0]; + assert_eq!(configure_request.topology_type, topology_type); + assert_eq!( + configure_request + .device + .as_ref() + .expect("configure request should include a primary switch") + .node_id, + primary_switch_id.to_string() + ); + + let mut txn = pool.acquire().await?; + let primary_switch = db_switch::find_by_id(&mut txn, &primary_switch_id) + .await? + .expect("primary switch should exist"); + let secondary_switch = db_switch::find_by_id(&mut txn, &secondary_switch_id) + .await? + .expect("secondary switch should exist"); + assert!(primary_switch.is_primary); + assert!(!secondary_switch.is_primary); + + Ok(()) +} + #[crate::sqlx_test] async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_failure_stops_flow( pool: sqlx::PgPool, From f2d0d1f2f219965b20cb1f8dd5175bd7e21622c9 Mon Sep 17 00:00:00 2001 From: Narasimhan Venkadeswaran Date: Mon, 18 May 2026 11:10:52 -0700 Subject: [PATCH 7/7] clippy fix --- .../tests/rack_state_controller/handler.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/crates/api/src/tests/rack_state_controller/handler.rs b/crates/api/src/tests/rack_state_controller/handler.rs index 3acc6b0146..7fd882b827 100644 --- a/crates/api/src/tests/rack_state_controller/handler.rs +++ b/crates/api/src/tests/rack_state_controller/handler.rs @@ -321,13 +321,13 @@ async fn attach_switches_with_nvos_credentials( )?; let new_switch = NewSwitch { - id: switch_id.clone(), + id: switch_id, config: SwitchConfig { name: expected_switch.metadata.name.clone(), enable_nmxc: false, fabric_manager_config: None, }, - bmc_mac_address: Some(expected_switch.bmc_mac_address.clone()), + bmc_mac_address: Some(expected_switch.bmc_mac_address), metadata: None, rack_id: Some(rack_id.clone()), slot_number: Some(index as i32), @@ -344,7 +344,7 @@ async fn attach_switches_with_nvos_credentials( .set_credentials( &CredentialKey::BmcCredentials { credential_type: BmcCredentialType::BmcRoot { - bmc_mac_address: expected_switch.bmc_mac_address.clone(), + bmc_mac_address: expected_switch.bmc_mac_address, }, }, &Credentials::UsernamePassword { @@ -2348,7 +2348,6 @@ async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_runs_on_all_sw failed_nodes: 0, ..Default::default() }), - ..Default::default() })) .await; @@ -2458,8 +2457,8 @@ async fn test_configure_nmx_cluster_configure_selects_persists_and_configures_pr drop(txn); let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; - let secondary_switch_id = switch_ids[0].clone(); - let primary_switch_id = switch_ids[1].clone(); + let secondary_switch_id = switch_ids[0]; + let primary_switch_id = switch_ids[1]; let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); env.rms_sim @@ -2611,8 +2610,8 @@ async fn test_configure_nmx_cluster_runs_start_disable_configure_to_wait_for_fab drop(txn); let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; - let secondary_switch_id = switch_ids[0].clone(); - let primary_switch_id = switch_ids[1].clone(); + let secondary_switch_id = switch_ids[0]; + let primary_switch_id = switch_ids[1]; let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); env.rms_sim @@ -2623,7 +2622,6 @@ async fn test_configure_nmx_cluster_runs_start_disable_configure_to_wait_for_fab failed_nodes: 0, ..Default::default() }), - ..Default::default() })) .await; env.rms_sim @@ -2875,7 +2873,6 @@ async fn test_configure_nmx_cluster_disable_scale_up_fabric_state_failure_stops_ message: "disable rejected".to_string(), ..Default::default() }), - ..Default::default() })) .await; @@ -3075,7 +3072,7 @@ async fn test_configure_nmx_cluster_configure_failure_advances_to_wait_for_fabri drop(txn); let switch_ids = attach_switches_with_nvos_credentials(&env, &rack_id, 2).await?; - let primary_switch_id = switch_ids[0].clone(); + let primary_switch_id = switch_ids[0]; let topology_type = RackHardwareTopology::Gb200Nvl72r1C2g4Topology.to_string(); env.rms_sim