From 2baa799c44b84b8bd8b64e1fb00d34a11c0374c4 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 12 Jun 2025 18:08:51 +1200 Subject: [PATCH 01/21] [reconfigurator] Pre-checks and post_update actions for RoT bootloader update --- nexus/mgs-updates/Cargo.toml | 2 + nexus/mgs-updates/src/common_sp_update.rs | 15 + .../mgs-updates/src/rot_bootloader_updater.rs | 308 +++++++++++++++++- wicketd/src/update_tracker.rs | 1 + 4 files changed, 316 insertions(+), 10 deletions(-) diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 33811919c9e..57fbcfd1ef9 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -7,10 +7,12 @@ edition = "2021" workspace = true [dependencies] +anyhow.workspace = true chrono.workspace = true futures.workspace = true gateway-client.workspace = true gateway-types.workspace = true +gateway-messages.workspace = true id-map.workspace = true internal-dns-resolver.workspace = true internal-dns-types.workspace = true diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 6afc6fb7367..f297becb791 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -8,6 +8,7 @@ use super::MgsClients; use super::UpdateProgress; use futures::future::BoxFuture; +use gateway_client::types::RotImageError; use gateway_client::types::SpType; use gateway_client::types::SpUpdateStatus; use gateway_types::rot::RotSlot; @@ -258,6 +259,7 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, + // TODO-K: Add WaitingForOngoingUpdate? ) -> BoxFuture<'a, Result>; /// Attempts once to perform any post-update actions (e.g., reset the @@ -267,6 +269,7 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, + // TODO-K: Return a PostUpdateError here ) -> BoxFuture<'a, Result<(), GatewayClientError>>; } @@ -319,6 +322,18 @@ pub enum PrecheckError { WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion }, } +#[derive(Debug, Error)] +pub enum PostUpdateError { + #[error("communicating with MGS")] + GatewayClientError(#[from] GatewayClientError), + + #[error("communicating with RoT: {message:?}")] + RotCommunicationFailed { message: String }, + + #[error("invalid RoT image: {error:?}")] + RotBootloaderImageError { error: RotImageError }, +} + #[derive(Debug)] pub enum FoundVersion { MissingVersion, diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 04aa569b0df..2d77f53f62c 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -6,10 +6,28 @@ use super::MgsClients; use crate::SpComponentUpdateHelper; +use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; +use crate::common_sp_update::error_means_caboose_is_invalid; +use futures::FutureExt; use futures::future::BoxFuture; +use gateway_client::SpComponent; +use gateway_client::types::GetRotBootInfoParams; +use gateway_client::types::RotImageError; +use gateway_client::types::RotState; +use gateway_client::types::SpComponentFirmwareSlot; +use gateway_client::types::SpType; +use gateway_messages::RotBootInfo; +use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; +use nexus_types::deployment::PendingMgsUpdateDetails; +use slog::Logger; +use slog::{debug, info}; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; type GatewayClientError = gateway_client::Error; @@ -18,23 +36,293 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { /// Checks if the component is already updated or ready for update fn precheck<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result> { - // TODO-K: To be completed in a follow up PR - todo!() + async move { + // Verify that the device is the one we think it is. + let state = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client.sp_get(update.sp_type, update.slot_id).await + }) + .await? + .into_inner(); + debug!(log, "found SP state"; "state" => ?state); + if state.model != update.baseboard_id.part_number + || state.serial_number != update.baseboard_id.serial_number + { + return Err(PrecheckError::WrongDevice { + sp_type: update.sp_type, + slot_id: update.slot_id, + expected_part: update.baseboard_id.part_number.clone(), + expected_serial: update.baseboard_id.serial_number.clone(), + found_part: state.model, + found_serial: state.serial_number, + }); + } + + // TODO-K: In the RoT bootloader update code in wicket, there is a set of + // known bootloader FWIDs that don't have cabooses. Is this something we + // should care about here? + // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1817-L1841 + + // TODO-K: There are also older versions of the SP have a bug that prevents + // setting the active slot for the RoT bootloader. Is this something we should + // care about here? + // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1705-L1710 + + // Fetch the caboose from the currently active slot (stage0). + let caboose = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + 0, + ) + .await + }) + .await? + .into_inner(); + debug!(log, "found active slot caboose"; "caboose" => ?caboose); + + // If the version in the currently active slot matches the one we're + // trying to set, then there's nothing to do. + if caboose.version == update.artifact_version.as_str() { + return Ok(PrecheckStatus::UpdateComplete); + } + + // Otherwise, if the version in the currently active slot does not + // match what we expect to find, bail out. It may be that somebody + // else has come along and completed a subsequent update and we + // don't want to roll that back. (If for some reason we *do* want + // to do this update, the planner will have to notice that what's + // here is wrong and update the blueprint.) + let PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } = &update.details + else { + unreachable!( + "pending MGS update details within ReconfiguratorSpUpdater \ + will always be for the RoT bootloader" + ); + }; + if caboose.version != expected_stage0_version.to_string() { + return Err(PrecheckError::WrongActiveVersion { + expected: expected_stage0_version.clone(), + found: caboose.version, + }); + } + + // For the same reason, check that the version in the inactive slot + // matches what we expect to find. + // TODO It's important for us to detect the condition that a caboose + // is invalid because this can happen when devices are programmed + // with a bad image. Unfortunately, MGS currently reports this as a + // 503. Besides being annoying for us to look for, this causes + // `try_all_serially()` to try the other MGS. That's pointless + // here, but not a big deal. + let found_stage0_next_caboose_result = mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_caboose_get( + update.sp_type, + update.slot_id, + // The name for the SP component here is STAGE0 + // it's a little confusing because we're really + // trying to reach STAGE0NEXT, and there is no + // ROT_BOOTLOADER variant. We specify that we + // want STAGE0NEXT by setting the firmware slot + // to 1, which is where it will always be. + &SpComponent::STAGE0.to_string(), + 1, + ) + .await + }) + .await; + let found_stage0_next_version = + match found_stage0_next_caboose_result { + Ok(version) => { + FoundVersion::Version(version.into_inner().version) + } + Err(error) => { + if error_means_caboose_is_invalid(&error) { + FoundVersion::MissingVersion + } else { + return Err(PrecheckError::from(error)); + } + } + }; + match (&expected_stage0_next_version, &found_stage0_next_version) { + // expected garbage, found garbage + ( + ExpectedVersion::NoValidVersion, + FoundVersion::MissingVersion, + ) => (), + // expected a specific version and found it + ( + ExpectedVersion::Version(artifact_version), + FoundVersion::Version(found_stage0_next_version), + ) if artifact_version.to_string() + == *found_stage0_next_version => + { + () + } + // anything else is a mismatch + (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) + | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) + | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { + return Err(PrecheckError::WrongInactiveVersion { + expected: expected_stage0_next_version.clone(), + found: found_stage0_next_version, + }); + } + }; + + // TODO-K: In post_update we'll be restarting the RoT twice to do signature + // checks, and to set stage0 to the new version. What happens if the RoT + // itself is being updated (during the reset stage)? Should we check for that + // here before setting the RoT bootloader as ready to update? + + Ok(PrecheckStatus::ReadyForUpdate) + } + .boxed() } /// Attempts once to perform any post-update actions (e.g., reset the /// device) fn post_update<'a>( &'a self, - _log: &'a slog::Logger, - _mgs_clients: &'a mut MgsClients, - _update: &'a PendingMgsUpdate, + log: &'a slog::Logger, + mgs_clients: &'a mut MgsClients, + update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - // TODO-K: To be completed in a follow up PR - todo!() + // ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); + + // TODO-K: Again, we're resetting the ROT twice here, what happens + // if an RoT update is happening at the same time? + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + // Before setting stage0 to the new version we want to ensure + // the image is good and we're not going to brick the device. + // The RoT will do a signature check when reset. + debug!(log, "attempting to reset device to do bootloader signature check"); + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await?; + + // We now retrieve boot info from the RoT to verify the reset + // has completed and signature checks done. + let _stage0next_error = wait_for_rot_boot_info( + log, + mgs_client.clone(), + update.sp_type, + update.slot_id, + WAIT_FOR_BOOT_TIMEOUT + ).await; + + // If the image is not valid we bail + debug!(log, "attempting to retrieve boot info to verify image validity"); + // TODO-K: uncomment when we return PostUpdateError instead of GatewayClientError + //if let Some(error) = stage0next_error { + // return Err(PostUpdateError::RotBootloaderImageError { + // error: error, + // }); + //} + + debug!(log, "attempting to set RoT bootloader active slot"); + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + persist, + &SpComponentFirmwareSlot { slot: 0 } + ) + .await?; + + debug!(log, "attempting to reset device to set to new RoT bootloader version"); + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await?; + Ok(()) + } + ).boxed() + } +} + +/// Poll the RoT asking for its boot information. This is used to check +/// state after RoT bootloader updates +async fn wait_for_rot_boot_info( + log: &Logger, + mgs_client: Arc, + sp_type: SpType, + sp_slot: u32, + timeout: Duration, +) -> Result, PostUpdateError> { + let mut ticker = tokio::time::interval(Duration::from_secs(1)); + + let start = Instant::now(); + loop { + ticker.tick().await; + match mgs_client + .sp_rot_boot_info( + sp_type, + sp_slot, + SpComponent::ROT.const_as_str(), + &GetRotBootInfoParams { + version: RotBootInfo::HIGHEST_KNOWN_VERSION, + }, + ) + .await + { + Ok(state) => match state.into_inner() { + // The minimum we will ever return is 3. + // Additionally, V2 does not report image errors, so we cannot + // know with certainty if a signature check came back with errors + RotState::V2 { .. } => unreachable!(), + RotState::V3 { stage0next_error, .. } => { + return Ok(stage0next_error); + } + // The RoT is probably still booting + RotState::CommunicationFailed { message } => { + if start.elapsed() < timeout { + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %message, + ); + } else { + return Err(PostUpdateError::RotCommunicationFailed { + message, + }); + } + } + }, + Err(error) => { + if start.elapsed() < timeout { + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %error, + ); + } else { + return Err(PostUpdateError::GatewayClientError(error)); + } + } + } } } diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index 80b10bd8b3b..ff2fc61992d 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -2816,6 +2816,7 @@ impl<'a> SpComponentUpdateContext<'a> { UpdateComponent::RotBootloader => { const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); + // TODO-K: Check steps here // We need to reset the RoT in order to check the signature on what we just // updated registrar From 8072c7d2fff3816bb5ebf546288a5d4fa2d58f7e Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 12 Jun 2025 19:35:30 +1200 Subject: [PATCH 02/21] building blocks in case I can get PostUpdateError to work --- nexus/mgs-updates/src/common_sp_update.rs | 26 +++++++++---------- nexus/mgs-updates/src/driver_update.rs | 16 ++++++++++++ .../mgs-updates/src/rot_bootloader_updater.rs | 21 ++++++++------- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index f297becb791..d94d6a2d072 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -8,7 +8,6 @@ use super::MgsClients; use super::UpdateProgress; use futures::future::BoxFuture; -use gateway_client::types::RotImageError; use gateway_client::types::SpType; use gateway_client::types::SpUpdateStatus; use gateway_types::rot::RotSlot; @@ -269,7 +268,8 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - // TODO-K: Return a PostUpdateError here + // TODO-K: Return a PostUpdateError here? Is it even possible + // due to try_serially ) -> BoxFuture<'a, Result<(), GatewayClientError>>; } @@ -322,17 +322,17 @@ pub enum PrecheckError { WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion }, } -#[derive(Debug, Error)] -pub enum PostUpdateError { - #[error("communicating with MGS")] - GatewayClientError(#[from] GatewayClientError), - - #[error("communicating with RoT: {message:?}")] - RotCommunicationFailed { message: String }, - - #[error("invalid RoT image: {error:?}")] - RotBootloaderImageError { error: RotImageError }, -} +//#[derive(Debug, Error)] +//pub enum PostUpdateError { +// #[error("communicating with MGS")] +// GatewayClientError(#[from] GatewayClientError), +// +// #[error("communicating with RoT: {message:?}")] +// RotCommunicationFailed { message: String }, +// +// #[error("invalid RoT image: {error:?}")] +// RotBootloaderImageError { error: RotImageError }, +//} #[derive(Debug)] pub enum FoundVersion { diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 5ec96d19ca3..b30dfbbacb1 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -355,6 +355,22 @@ pub(crate) async fn apply_update( while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { + // TODO-K: Can I get this to happen? + // match error { + // PostUpdateError::GatewayClientError(error) => { + // if !matches!(error, gateway_client::Error::CommunicationError(_)) { + // let error = InlineErrorChain::new(&error); + // error!(log, "post_update failed"; &error); + // return Err(ApplyUpdateError::SpResetFailed(error.to_string())); + // } + // }, + // PostUpdateError::RotBootloaderImageError { error } => { + // let error = InlineErrorChain::new(&error); + // error!(log, "post_update failed"; &error); + // return Err(ApplyUpdateError::SpResetFailed(error.to_string())); + // }, + // PostUpdateError::RotCommunicationFailed { message: _ } => {}, + // } if !matches!(error, gateway_client::Error::CommunicationError(_)) { let error = InlineErrorChain::new(&error); error!(log, "post_update failed"; &error); diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 2d77f53f62c..d49c39cfc87 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -7,7 +7,6 @@ use super::MgsClients; use crate::SpComponentUpdateHelper; use crate::common_sp_update::FoundVersion; -use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::error_means_caboose_is_invalid; @@ -24,7 +23,7 @@ use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; -use slog::{debug, info}; +use slog::{debug, error, info}; use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -200,7 +199,6 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - // ) -> BoxFuture<'a, Result<(), PostUpdateError>> { const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); // TODO-K: Again, we're resetting the ROT twice here, what happens @@ -227,11 +225,11 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { update.sp_type, update.slot_id, WAIT_FOR_BOOT_TIMEOUT - ).await; + ).await?; // If the image is not valid we bail debug!(log, "attempting to retrieve boot info to verify image validity"); - // TODO-K: uncomment when we return PostUpdateError instead of GatewayClientError + // TODO-K: uncomment if I can get PostUpdateError to work instead of GatewayClientError //if let Some(error) = stage0next_error { // return Err(PostUpdateError::RotBootloaderImageError { // error: error, @@ -272,7 +270,7 @@ async fn wait_for_rot_boot_info( sp_type: SpType, sp_slot: u32, timeout: Duration, -) -> Result, PostUpdateError> { +) -> Result, GatewayClientError> { let mut ticker = tokio::time::interval(Duration::from_secs(1)); let start = Instant::now(); @@ -306,9 +304,12 @@ async fn wait_for_rot_boot_info( "error" => %message, ); } else { - return Err(PostUpdateError::RotCommunicationFailed { - message, - }); + error!( + log, + "failed to get RoT boot info"; + "error" => %message, + ); + return Ok(Some(RotImageError::Unchecked)); } } }, @@ -320,7 +321,7 @@ async fn wait_for_rot_boot_info( "error" => %error, ); } else { - return Err(PostUpdateError::GatewayClientError(error)); + return Err(error); } } } From 2de9298b2aa26e4fdd8c998d0ff8fd731ae8c7b9 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 13 Jun 2025 12:01:07 +1200 Subject: [PATCH 03/21] Introduce PostUpdateError --- Cargo.lock | 1 + clients/gateway-client/Cargo.toml | 1 + clients/gateway-client/src/lib.rs | 2 +- nexus/mgs-updates/src/common_sp_update.rs | 27 ++-- nexus/mgs-updates/src/driver_update.rs | 42 ++--- .../mgs-updates/src/rot_bootloader_updater.rs | 152 +++++++++++------- nexus/mgs-updates/src/rot_updater.rs | 20 ++- nexus/mgs-updates/src/sp_updater.rs | 33 ++-- 8 files changed, 168 insertions(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6f1e172ce7..f986ccc48b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3567,6 +3567,7 @@ dependencies = [ "serde", "serde_json", "slog", + "thiserror 1.0.69", "uuid", ] diff --git a/clients/gateway-client/Cargo.toml b/clients/gateway-client/Cargo.toml index 6e74b903a57..37e015c5d26 100644 --- a/clients/gateway-client/Cargo.toml +++ b/clients/gateway-client/Cargo.toml @@ -21,5 +21,6 @@ serde.workspace = true serde_json.workspace = true schemars.workspace = true slog.workspace = true +thiserror.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index df7b88c13b4..a8c6d5a84dd 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -63,7 +63,7 @@ progenitor::generate_api!( HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, - RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord] }, + RotImageError = { derives = [ thiserror::Error, PartialEq, Eq, PartialOrd, Ord] }, RotState = { derives = [PartialEq, Eq, PartialOrd, Ord] }, SpComponentCaboose = { derives = [PartialEq, Eq] }, SpIdentifier = { derives = [Copy, PartialEq, Hash, Eq] }, diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index d94d6a2d072..f5bfa824c3b 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -8,6 +8,7 @@ use super::MgsClients; use super::UpdateProgress; use futures::future::BoxFuture; +use gateway_client::types::RotImageError; use gateway_client::types::SpType; use gateway_client::types::SpUpdateStatus; use gateway_types::rot::RotSlot; @@ -270,7 +271,8 @@ pub trait SpComponentUpdateHelper { update: &'a PendingMgsUpdate, // TODO-K: Return a PostUpdateError here? Is it even possible // due to try_serially - ) -> BoxFuture<'a, Result<(), GatewayClientError>>; + //) -> BoxFuture<'a, Result<(), GatewayClientError>>; + ) -> BoxFuture<'a, Result<(), PostUpdateError>>; } /// Describes the live state of the component before the update begins @@ -322,17 +324,18 @@ pub enum PrecheckError { WrongInactiveVersion { expected: ExpectedVersion, found: FoundVersion }, } -//#[derive(Debug, Error)] -//pub enum PostUpdateError { -// #[error("communicating with MGS")] -// GatewayClientError(#[from] GatewayClientError), -// -// #[error("communicating with RoT: {message:?}")] -// RotCommunicationFailed { message: String }, -// -// #[error("invalid RoT image: {error:?}")] -// RotBootloaderImageError { error: RotImageError }, -//} +#[derive(Debug, thiserror::Error)] +pub enum PostUpdateError { + #[error("communicating with MGS")] + GatewayClientError(#[from] GatewayClientError), + + // TODO-K: This one may not be necessary + #[error("communicating with RoT: {message:?}")] + RotCommunicationFailed { message: String }, + + #[error("invalid RoT image: {error:?}")] + RotBootloaderImageError { error: RotImageError }, +} #[derive(Debug)] pub enum FoundVersion { diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index b30dfbbacb1..0a79b7b674a 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -4,6 +4,7 @@ //! Concurrent-safe facilities for doing MGS-managed upates +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::STATUS_POLL_INTERVAL; @@ -355,26 +356,27 @@ pub(crate) async fn apply_update( while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { - // TODO-K: Can I get this to happen? - // match error { - // PostUpdateError::GatewayClientError(error) => { - // if !matches!(error, gateway_client::Error::CommunicationError(_)) { - // let error = InlineErrorChain::new(&error); - // error!(log, "post_update failed"; &error); - // return Err(ApplyUpdateError::SpResetFailed(error.to_string())); - // } - // }, - // PostUpdateError::RotBootloaderImageError { error } => { - // let error = InlineErrorChain::new(&error); - // error!(log, "post_update failed"; &error); - // return Err(ApplyUpdateError::SpResetFailed(error.to_string())); - // }, - // PostUpdateError::RotCommunicationFailed { message: _ } => {}, - // } - if !matches!(error, gateway_client::Error::CommunicationError(_)) { - let error = InlineErrorChain::new(&error); - error!(log, "post_update failed"; &error); - return Err(ApplyUpdateError::SpResetFailed(error.to_string())); + match error { + PostUpdateError::GatewayClientError(error) => { + if !matches!( + error, + gateway_client::Error::CommunicationError(_) + ) { + let error = InlineErrorChain::new(&error); + error!(log, "post_update failed"; &error); + return Err(ApplyUpdateError::SpResetFailed( + error.to_string(), + )); + } + } + PostUpdateError::RotBootloaderImageError { error } => { + let error = InlineErrorChain::new(&error); + error!(log, "post_update failed"; &error); + return Err(ApplyUpdateError::SpResetFailed( + error.to_string(), + )); + } + PostUpdateError::RotCommunicationFailed { message: _ } => {} } tokio::time::sleep(RESET_DELAY_INTERVAL).await; diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index d49c39cfc87..6469470b7d6 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -7,6 +7,7 @@ use super::MgsClients; use crate::SpComponentUpdateHelper; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::error_means_caboose_is_invalid; @@ -24,7 +25,6 @@ use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; use slog::{debug, error, info}; -use std::sync::Arc; use std::time::Duration; use std::time::Instant; @@ -198,57 +198,73 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); // TODO-K: Again, we're resetting the ROT twice here, what happens // if an RoT update is happening at the same time? - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - // Before setting stage0 to the new version we want to ensure - // the image is good and we're not going to brick the device. - // The RoT will do a signature check when reset. - debug!(log, "attempting to reset device to do bootloader signature check"); - mgs_client - .sp_component_reset( - update.sp_type, - update.slot_id, - &SpComponent::ROT.to_string(), - ) - .await?; - // We now retrieve boot info from the RoT to verify the reset - // has completed and signature checks done. - let _stage0next_error = wait_for_rot_boot_info( - log, - mgs_client.clone(), - update.sp_type, - update.slot_id, - WAIT_FOR_BOOT_TIMEOUT - ).await?; + async move { + // Before setting stage0 to the new version we want to ensure + // the image is good and we're not going to brick the device. + // The RoT will do a signature check when reset. + debug!( + log, + "attempting to reset device to do bootloader signature check" + ); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + ) + .await + }) + .await?; - // If the image is not valid we bail - debug!(log, "attempting to retrieve boot info to verify image validity"); - // TODO-K: uncomment if I can get PostUpdateError to work instead of GatewayClientError - //if let Some(error) = stage0next_error { - // return Err(PostUpdateError::RotBootloaderImageError { - // error: error, - // }); - //} + // We now retrieve boot info from the RoT to verify the reset + // has completed and signature checks done. + debug!( + log, + "attempting to retrieve boot info to verify image validity" + ); + let stage0next_error = wait_for_stage0_next_image_check( + log, + mgs_clients, + update.sp_type, + update.slot_id, + WAIT_FOR_BOOT_TIMEOUT, + ) + .await?; + // If the image is not valid we bail + if let Some(error) = stage0next_error { + return Err(PostUpdateError::RotBootloaderImageError { + error, + }); + } - debug!(log, "attempting to set RoT bootloader active slot"); - let persist = true; - mgs_client - .sp_component_active_slot_set( - update.sp_type, - update.slot_id, - &SpComponent::STAGE0.to_string(), - persist, - &SpComponentFirmwareSlot { slot: 0 } - ) - .await?; + debug!(log, "attempting to set RoT bootloader active slot"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::STAGE0.to_string(), + persist, + &SpComponentFirmwareSlot { slot: 0 }, + ) + .await?; + Ok(()) + }) + .await?; - debug!(log, "attempting to reset device to set to new RoT bootloader version"); + debug!(log, "attempting to reset device to set to new RoT bootloader version"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { mgs_client .sp_component_reset( update.sp_type, @@ -257,34 +273,44 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { ) .await?; Ok(()) - } - ).boxed() + }) + .await?; + + Ok(()) + } + .boxed() } } /// Poll the RoT asking for its boot information. This is used to check /// state after RoT bootloader updates -async fn wait_for_rot_boot_info( +async fn wait_for_stage0_next_image_check( log: &Logger, - mgs_client: Arc, + mgs_clients: &mut MgsClients, sp_type: SpType, sp_slot: u32, timeout: Duration, + // TODO-K: Change to PostUpdateError? ) -> Result, GatewayClientError> { let mut ticker = tokio::time::interval(Duration::from_secs(1)); let start = Instant::now(); loop { ticker.tick().await; - match mgs_client - .sp_rot_boot_info( - sp_type, - sp_slot, - SpComponent::ROT.const_as_str(), - &GetRotBootInfoParams { - version: RotBootInfo::HIGHEST_KNOWN_VERSION, - }, - ) + + match mgs_clients + .try_all_serially(log, |mgs_client| async move { + mgs_client + .sp_rot_boot_info( + sp_type, + sp_slot, + SpComponent::ROT.const_as_str(), + &GetRotBootInfoParams { + version: RotBootInfo::HIGHEST_KNOWN_VERSION, + }, + ) + .await + }) .await { Ok(state) => match state.into_inner() { @@ -309,6 +335,7 @@ async fn wait_for_rot_boot_info( "failed to get RoT boot info"; "error" => %message, ); + // TODO-K: change this one to RotCommunicationFailed? return Ok(Some(RotImageError::Unchecked)); } } @@ -327,3 +354,14 @@ async fn wait_for_rot_boot_info( } } } + +// match mgs_client +// .sp_rot_boot_info( +// sp_type, +// sp_slot, +// SpComponent::ROT.const_as_str(), +// &GetRotBootInfoParams { +// version: RotBootInfo::HIGHEST_KNOWN_VERSION, +// }, +// ) +// .await diff --git a/nexus/mgs-updates/src/rot_updater.rs b/nexus/mgs-updates/src/rot_updater.rs index b36a2a4b0cd..b87818acfc3 100644 --- a/nexus/mgs-updates/src/rot_updater.rs +++ b/nexus/mgs-updates/src/rot_updater.rs @@ -11,6 +11,7 @@ use super::common_sp_update::SpComponentUpdater; use super::common_sp_update::deliver_update; use crate::SpComponentUpdateHelper; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::error_means_caboose_is_invalid; @@ -396,11 +397,12 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + // We want to set the slot we've just updated as the active one + debug!(log, "attempting to set active slot"); mgs_clients .try_all_serially(log, move |mgs_client| async move { - // We want to set the slot we've just updated as the active one - debug!(log, "attempting to set active slot"); let inactive_slot = match &update.details { PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { expected_active_slot.slot().toggled().to_u16() @@ -421,8 +423,13 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { &SpComponentFirmwareSlot { slot: inactive_slot } ) .await?; + Ok(()) + }) + .await?; - debug!(log, "attempting to reset device"); + debug!(log, "attempting to reset device"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { mgs_client .sp_component_reset( update.sp_type, @@ -431,7 +438,8 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { ) .await?; Ok(()) - }) - .boxed() + }).await?; + Ok(()) + }.boxed() } } diff --git a/nexus/mgs-updates/src/sp_updater.rs b/nexus/mgs-updates/src/sp_updater.rs index e50b4f3061b..88e45e4556d 100644 --- a/nexus/mgs-updates/src/sp_updater.rs +++ b/nexus/mgs-updates/src/sp_updater.rs @@ -9,6 +9,7 @@ use crate::SpComponentUpdateError; use crate::SpComponentUpdateHelper; use crate::UpdateProgress; use crate::common_sp_update::FoundVersion; +use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::SpComponentUpdater; @@ -293,19 +294,23 @@ impl SpComponentUpdateHelper for ReconfiguratorSpUpdater { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - ) -> BoxFuture<'a, Result<(), GatewayClientError>> { - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - debug!(log, "attempting to reset device"); - mgs_client - .sp_component_reset( - update.sp_type, - update.slot_id, - &SpComponent::SP_ITSELF.to_string(), - ) - .await?; - Ok(()) - }) - .boxed() + ) -> BoxFuture<'a, Result<(), PostUpdateError>> { + async move { + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + debug!(log, "attempting to reset device"); + mgs_client + .sp_component_reset( + update.sp_type, + update.slot_id, + &SpComponent::SP_ITSELF.to_string(), + ) + .await?; + Ok(()) + }) + .await?; + Ok(()) + } + .boxed() } } From f578e35cacc60d1c9d3a0c641c12895b8897eba3 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 17 Jun 2025 11:01:49 +1200 Subject: [PATCH 04/21] implement WaitingForOngoingUpdate variant --- nexus/mgs-updates/src/common_sp_update.rs | 5 ++--- nexus/mgs-updates/src/driver_update.rs | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index f5bfa824c3b..958d1d222fd 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -269,9 +269,6 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - // TODO-K: Return a PostUpdateError here? Is it even possible - // due to try_serially - //) -> BoxFuture<'a, Result<(), GatewayClientError>>; ) -> BoxFuture<'a, Result<(), PostUpdateError>>; } @@ -280,6 +277,8 @@ pub trait SpComponentUpdateHelper { pub enum PrecheckStatus { UpdateComplete, ReadyForUpdate, + // TODO-K: Add more detailed comment here + WaitingForOngoingUpdate, } #[derive(Debug, Error)] diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 0a79b7b674a..89bedae7bd7 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -47,6 +47,11 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// How long to wait after resetting the device before expecting it to come up const RESET_TIMEOUT: Duration = Duration::from_secs(60); +/// TODO-K: Write a better comment here +// Is 3 munutes enough? we're giving 1 minute per reset, and we're resetting +// twice, so it makes sense to add one more minute? +const WAIT_FOR_ONGOING_UPDATE_TIMEOUT: Duration = Duration::from_secs(180); + /// Parameters describing a request to update one SP-managed component /// /// This is similar in spirit to the `SpComponentUpdater` trait but uses a @@ -217,7 +222,9 @@ pub(crate) async fn apply_update( // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); match update_helper.precheck(log, &mut mgs_clients, update).await { - Ok(PrecheckStatus::ReadyForUpdate) => (), + // TODO-K: Add more detailed comment here + Ok(PrecheckStatus::ReadyForUpdate) | + Ok(PrecheckStatus::WaitingForOngoingUpdate) => (), Ok(PrecheckStatus::UpdateComplete) => { return Ok(UpdateCompletedHow::FoundNoChangesNeeded); } @@ -616,6 +623,17 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), + // TODO-K: Add more detailed comment here + // Loop for 3 minutes to wait for any ongoing update (maybe more minutes?) + Ok(PrecheckStatus::WaitingForOngoingUpdate) => { + if before.elapsed() >= WAIT_FOR_ONGOING_UPDATE_TIMEOUT { + return Err(UpdateWaitError::Timeout(WAIT_FOR_ONGOING_UPDATE_TIMEOUT)); + } + + tokio::time::sleep(PROGRESS_POLL_INTERVAL).await; + continue; + }, + // An incorrect version in the "inactive" slot, incorrect active slot, // or non-empty pending_persistent_boot_preference/transient_boot_preference // are normal during the upgrade. We have no reason to think these won't From 3ced09184003e553a2366b91498a412fe9db25d6 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 17 Jun 2025 14:19:24 +1200 Subject: [PATCH 05/21] Implement a waiting time for an ongoing RoT bootloader update --- nexus/mgs-updates/src/common_sp_update.rs | 4 +-- nexus/mgs-updates/src/driver_update.rs | 31 +++++++++++-------- .../mgs-updates/src/rot_bootloader_updater.rs | 12 +------ wicketd/src/update_tracker.rs | 1 - 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 958d1d222fd..075c7afcfe9 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -259,7 +259,6 @@ pub trait SpComponentUpdateHelper { log: &'a slog::Logger, mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, - // TODO-K: Add WaitingForOngoingUpdate? ) -> BoxFuture<'a, Result>; /// Attempts once to perform any post-update actions (e.g., reset the @@ -277,8 +276,7 @@ pub trait SpComponentUpdateHelper { pub enum PrecheckStatus { UpdateComplete, ReadyForUpdate, - // TODO-K: Add more detailed comment here - WaitingForOngoingUpdate, + WaitingForOngoingRotBootloaderUpdate, } #[derive(Debug, Error)] diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 89bedae7bd7..a0c3e3114b2 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -33,7 +33,7 @@ use uuid::Uuid; /// How long may the status remain unchanged without us treating this as a /// problem? -pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(120); +pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(180); /// How long to wait between failed attempts to reset the device const RESET_DELAY_INTERVAL: Duration = Duration::from_secs(10); @@ -47,10 +47,11 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// How long to wait after resetting the device before expecting it to come up const RESET_TIMEOUT: Duration = Duration::from_secs(60); -/// TODO-K: Write a better comment here -// Is 3 munutes enough? we're giving 1 minute per reset, and we're resetting -// twice, so it makes sense to add one more minute? -const WAIT_FOR_ONGOING_UPDATE_TIMEOUT: Duration = Duration::from_secs(180); +/// How long to wait for an ongoing RoT bootloader update +const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration = Duration::from_secs(180); + +/// How long to wait between poll attempts on RoT bootloader update status +const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration = Duration::from_secs(10); /// Parameters describing a request to update one SP-managed component /// @@ -222,9 +223,11 @@ pub(crate) async fn apply_update( // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); match update_helper.precheck(log, &mut mgs_clients, update).await { - // TODO-K: Add more detailed comment here Ok(PrecheckStatus::ReadyForUpdate) | - Ok(PrecheckStatus::WaitingForOngoingUpdate) => (), + // This is the first time a Nexus instance is attempting to + // update the RoT bootloader, we don't need to wait for an + // ongoing update. + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => (), Ok(PrecheckStatus::UpdateComplete) => { return Ok(UpdateCompletedHow::FoundNoChangesNeeded); } @@ -623,14 +626,16 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), - // TODO-K: Add more detailed comment here - // Loop for 3 minutes to wait for any ongoing update (maybe more minutes?) - Ok(PrecheckStatus::WaitingForOngoingUpdate) => { - if before.elapsed() >= WAIT_FOR_ONGOING_UPDATE_TIMEOUT { - return Err(UpdateWaitError::Timeout(WAIT_FOR_ONGOING_UPDATE_TIMEOUT)); + // We'll loop for 3 minutes to wait for any ongoing RoT bootloader update. + // We need to wait for 2 resets which have a timeout of 60 seconds each, + // and an attempt to retrieve boot info, which has a time out of 30 seconds. + // We give an additional 30 seconds to as a buffer for the other actions. + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => { + if before.elapsed() >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT { + return Err(UpdateWaitError::Timeout(WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT)); } - tokio::time::sleep(PROGRESS_POLL_INTERVAL).await; + tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL).await; continue; }, diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 6469470b7d6..dfbe2cdd506 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -255,7 +255,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { update.slot_id, &SpComponent::STAGE0.to_string(), persist, - &SpComponentFirmwareSlot { slot: 0 }, + &SpComponentFirmwareSlot { slot: 1 }, ) .await?; Ok(()) @@ -355,13 +355,3 @@ async fn wait_for_stage0_next_image_check( } } -// match mgs_client -// .sp_rot_boot_info( -// sp_type, -// sp_slot, -// SpComponent::ROT.const_as_str(), -// &GetRotBootInfoParams { -// version: RotBootInfo::HIGHEST_KNOWN_VERSION, -// }, -// ) -// .await diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index ff2fc61992d..80b10bd8b3b 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -2816,7 +2816,6 @@ impl<'a> SpComponentUpdateContext<'a> { UpdateComponent::RotBootloader => { const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); - // TODO-K: Check steps here // We need to reset the RoT in order to check the signature on what we just // updated registrar From c15ca0874f4fe0b7e8c91a2e764b155506bd1f1c Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 17 Jun 2025 14:22:49 +1200 Subject: [PATCH 06/21] fmt --- nexus/mgs-updates/src/driver_update.rs | 19 +++++++++++++------ .../mgs-updates/src/rot_bootloader_updater.rs | 1 - 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index a0c3e3114b2..0ff6a37250c 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -48,10 +48,12 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60); const RESET_TIMEOUT: Duration = Duration::from_secs(60); /// How long to wait for an ongoing RoT bootloader update -const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration = Duration::from_secs(180); +const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration = + Duration::from_secs(180); /// How long to wait between poll attempts on RoT bootloader update status -const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration = Duration::from_secs(10); +const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration = + Duration::from_secs(10); /// Parameters describing a request to update one SP-managed component /// @@ -631,13 +633,18 @@ async fn wait_for_update_done( // and an attempt to retrieve boot info, which has a time out of 30 seconds. // We give an additional 30 seconds to as a buffer for the other actions. Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => { - if before.elapsed() >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT { - return Err(UpdateWaitError::Timeout(WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT)); + if before.elapsed() + >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT + { + return Err(UpdateWaitError::Timeout( + WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT, + )); } - tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL).await; + tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL) + .await; continue; - }, + } // An incorrect version in the "inactive" slot, incorrect active slot, // or non-empty pending_persistent_boot_preference/transient_boot_preference diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index dfbe2cdd506..feacd0d3f4c 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -354,4 +354,3 @@ async fn wait_for_stage0_next_image_check( } } } - From 4fb917a703bfc09b68510545b571e1b92453fecc Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 17 Jun 2025 17:31:26 +1200 Subject: [PATCH 07/21] Improved error handling --- nexus/mgs-updates/src/common_sp_update.rs | 1 - nexus/mgs-updates/src/driver_update.rs | 6 +++--- .../mgs-updates/src/rot_bootloader_updater.rs | 18 ++++++++---------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 075c7afcfe9..f4c81c5f1f7 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -326,7 +326,6 @@ pub enum PostUpdateError { #[error("communicating with MGS")] GatewayClientError(#[from] GatewayClientError), - // TODO-K: This one may not be necessary #[error("communicating with RoT: {message:?}")] RotCommunicationFailed { message: String }, diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 0ff6a37250c..7b8524ae36f 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -362,9 +362,9 @@ pub(crate) async fn apply_update( if try_reset { // We retry this until we get some error *other* than a communication - // error. There is intentionally no timeout here. If we've staged an - // update but not managed to reset the device, there's no point where - // we'd want to stop trying to do so. + // error or an RoT bootloader image error. There is intentionally no + // timeout here. If we've staged an update but not managed to reset + // the device, there's no point where we'd want to stop trying to do so. while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index feacd0d3f4c..89356757947 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -28,7 +28,7 @@ use slog::{debug, error, info}; use std::time::Duration; use std::time::Instant; -type GatewayClientError = gateway_client::Error; +const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(30); pub struct ReconfiguratorRotBootloaderUpdater; impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { @@ -186,7 +186,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { // itself is being updated (during the reset stage)? Should we check for that // here before setting the RoT bootloader as ready to update? - Ok(PrecheckStatus::ReadyForUpdate) + Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) } .boxed() } @@ -199,8 +199,6 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), PostUpdateError>> { - const WAIT_FOR_BOOT_TIMEOUT: Duration = Duration::from_secs(30); - // TODO-K: Again, we're resetting the ROT twice here, what happens // if an RoT update is happening at the same time? @@ -235,7 +233,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { mgs_clients, update.sp_type, update.slot_id, - WAIT_FOR_BOOT_TIMEOUT, + WAIT_FOR_BOOT_INFO_TIMEOUT, ) .await?; // If the image is not valid we bail @@ -290,8 +288,7 @@ async fn wait_for_stage0_next_image_check( sp_type: SpType, sp_slot: u32, timeout: Duration, - // TODO-K: Change to PostUpdateError? -) -> Result, GatewayClientError> { +) -> Result, PostUpdateError> { let mut ticker = tokio::time::interval(Duration::from_secs(1)); let start = Instant::now(); @@ -335,8 +332,9 @@ async fn wait_for_stage0_next_image_check( "failed to get RoT boot info"; "error" => %message, ); - // TODO-K: change this one to RotCommunicationFailed? - return Ok(Some(RotImageError::Unchecked)); + return Err(PostUpdateError::RotCommunicationFailed { + message, + }); } } }, @@ -348,7 +346,7 @@ async fn wait_for_stage0_next_image_check( "error" => %error, ); } else { - return Err(error); + return Err(PostUpdateError::GatewayClientError(error)); } } } From dff02541c0b073777df3ed33f3f2278bf82c8390 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 17 Jun 2025 17:38:16 +1200 Subject: [PATCH 08/21] Clean up --- nexus/mgs-updates/Cargo.toml | 1 - nexus/mgs-updates/src/common_sp_update.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/mgs-updates/Cargo.toml b/nexus/mgs-updates/Cargo.toml index 57fbcfd1ef9..269ba745e63 100644 --- a/nexus/mgs-updates/Cargo.toml +++ b/nexus/mgs-updates/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" workspace = true [dependencies] -anyhow.workspace = true chrono.workspace = true futures.workspace = true gateway-client.workspace = true diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index f4c81c5f1f7..50a33cfa860 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -329,7 +329,7 @@ pub enum PostUpdateError { #[error("communicating with RoT: {message:?}")] RotCommunicationFailed { message: String }, - #[error("invalid RoT image: {error:?}")] + #[error("invalid RoT bootloader image: {error:?}")] RotBootloaderImageError { error: RotImageError }, } From 25b0f4047428f6fbdba6552a198f5a7cbc904472 Mon Sep 17 00:00:00 2001 From: karencfv Date: Wed, 18 Jun 2025 20:18:51 +1200 Subject: [PATCH 09/21] add some tests --- nexus/mgs-updates/src/driver_update.rs | 255 ++++++++++++++++++ .../src/test_util/sp_test_state.rs | 56 ++++ .../src/test_util/test_artifacts.rs | 56 ++++ nexus/mgs-updates/src/test_util/updates.rs | 83 +++++- 4 files changed, 446 insertions(+), 4 deletions(-) diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 5754f2c23d4..4578ddba943 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -778,6 +778,91 @@ mod test { finished.expect_sp_success(expected_result); } + /// Tests several happy-path cases of updating an RoT bootloader + #[tokio::test] + async fn test_rot_bootloader_update_basic() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_update_basic", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Basic case: normal update + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + + // Basic case: attempted update, found no changes needed + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Sled, + 1, + &artifacts.rot_bootloader_gimlet_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + // Run the same two tests for a switch RoT bootloader. + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::CompletedUpdate, + ) + .await; + run_one_successful_rot_bootloader_update( + &gwtestctx, + &artifacts, + SpType::Switch, + 0, + &artifacts.rot_bootloader_sidecar_artifact_hash, + UpdateCompletedHow::FoundNoChangesNeeded, + ) + .await; + + artifacts.teardown().await; + gwtestctx.teardown().await; + } + + async fn run_one_successful_rot_bootloader_update( + gwtestctx: &GatewayTestContext, + artifacts: &TestArtifacts, + sp_type: SpType, + slot_id: u32, + artifact_hash: &ArtifactHash, + expected_result: UpdateCompletedHow, + ) { + let desc = UpdateDescription { + gwtestctx, + artifacts, + sp_type, + slot_id, + artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + let in_progress = desc.setup().await; + let finished = in_progress.finish().await; + finished.expect_rot_bootloader_success(expected_result); + } + /// Tests several happy-path cases of updating an RoT #[tokio::test] async fn test_rot_update_basic() { @@ -1414,4 +1499,174 @@ mod test { gwtestctx.teardown().await; } + + /// Tests a bunch of easy fast-failure RoT bootloader cases. + #[tokio::test] + async fn test_rot_bootloader_basic_failures() { + let gwtestctx = gateway_test_utils::setup::test_setup( + "test_rot_bootloader_basic_failures", + SpPort::One, + ) + .await; + let log = &gwtestctx.logctx.log; + let artifacts = TestArtifacts::new(log).await.unwrap(); + + // Test a case of mistaken identity (reported baseboard does not match + // the one that we expect). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: Some(BaseboardId { + part_number: String::from("i86pc"), + serial_number: String::from("SimGimlet0"), + }), + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "in sled slot 1, expected to find part \"i86pc\" serial \ + \"SimGimlet0\", but found part \"i86pc\" serial \ + \"SimGimlet01\"", + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the active version doesn't match what we expect. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: Some( + "not-right".parse().unwrap(), + ), + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find active version \"not-right\", but \ + found \"0.0.200\"" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where the inactive version doesn't match what it should + // (expected invalid, found something else). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::NoValidVersion, + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version NoValidVersion, \ + but found Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Now test a case where the inactive version doesn't match what it + // should (expected a different valid version). + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: Some( + ExpectedVersion::Version( + "something-else".parse().unwrap(), + ), + ), + }, + override_progress_timeout: None, + }; + + desc.setup().await.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::PreconditionFailed(..)); + let message = InlineErrorChain::new(error).to_string(); + eprintln!("{}", message); + assert!(message.contains( + "expected to find inactive version \ + Version(ArtifactVersion(\"something-else\")), but found \ + Version(\"0.0.200\")" + )); + + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + // Test a case where we fail to fetch the artifact. We simulate this by + // tearing down our artifact server before the update starts. + let desc = UpdateDescription { + gwtestctx: &gwtestctx, + artifacts: &artifacts, + sp_type: SpType::Sled, + slot_id: 1, + artifact_hash: &artifacts.sp_gimlet_artifact_hash, + override_baseboard_id: None, + override_expected_sp_component: + ExpectedSpComponent::RotBootloader { + override_expected_stage0: None, + override_expected_stage0_next: None, + }, + override_progress_timeout: None, + }; + let in_progress = desc.setup().await; + artifacts.teardown().await; + in_progress.finish().await.expect_failure(&|error, sp1, sp2| { + assert_matches!(error, ApplyUpdateError::FetchArtifact(..)); + // No changes should have been made in this case. + assert_eq!(sp1, sp2); + }); + + gwtestctx.teardown().await; + } } diff --git a/nexus/mgs-updates/src/test_util/sp_test_state.rs b/nexus/mgs-updates/src/test_util/sp_test_state.rs index 9c9198a4d5c..75b02972c83 100644 --- a/nexus/mgs-updates/src/test_util/sp_test_state.rs +++ b/nexus/mgs-updates/src/test_util/sp_test_state.rs @@ -44,6 +44,16 @@ pub struct SpTestState { /// This can be None if the caboose contents were not valid. pub caboose_rot_b: Option, + /// caboose read from stage 0 (active slot for the RoT bootloader) + /// + /// This is not an `Option` because we never expect to fail to read this. + pub caboose_stage0: SpComponentCaboose, + + /// caboose read from stage 0 next (inactive slot for the RoT bootloader) + /// + /// This can be None if the caboose contents were not valid. + pub caboose_stage0_next: Option, + /// Overall SP state pub sp_state: SpState, @@ -94,6 +104,24 @@ impl SpTestState { ) .await .map(|c| c.into_inner()); + let caboose_stage0 = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 0, + ) + .await? + .into_inner(); + let caboose_stage0_next = mgs_client + .sp_component_caboose_get( + sp_type, + sp_slot, + SpComponent::STAGE0.const_as_str(), + 1, + ) + .await + .map(|c| c.into_inner()); let sp_info = mgs_client.sp_get(sp_type, sp_slot).await?.into_inner(); let sp_boot_info = mgs_client .sp_rot_boot_info( @@ -113,6 +141,10 @@ impl SpTestState { ), caboose_rot_a: ignore_invalid_caboose_error(caboose_rot_a), caboose_rot_b: ignore_invalid_caboose_error(caboose_rot_b), + caboose_stage0, + caboose_stage0_next: ignore_invalid_caboose_error( + caboose_stage0_next, + ), sp_state: sp_info, sp_boot_info, }) @@ -141,6 +173,14 @@ impl SpTestState { } } + pub fn expect_caboose_stage0(&self) -> &SpComponentCaboose { + &self.caboose_stage0 + } + + pub fn expect_caboose_stage0_next(&self) -> &SpComponentCaboose { + self.caboose_stage0_next.as_ref().expect("stage0 next caboose") + } + pub fn expect_caboose_rot_inactive(&self) -> &SpComponentCaboose { let slot = self.expect_rot_active_slot().toggled(); match slot { @@ -255,6 +295,22 @@ impl SpTestState { }, } } + + pub fn expect_stage0_version(&self) -> ArtifactVersion { + self.expect_caboose_stage0() + .version + .parse() + .expect("valid artifact version") + } + + pub fn expect_stage0_next_version(&self) -> ExpectedVersion { + match &self.caboose_stage0_next { + Some(v) => ExpectedVersion::Version( + v.version.parse().expect("valid stage0 next version"), + ), + None => ExpectedVersion::NoValidVersion, + } + } } fn ignore_invalid_caboose_error( diff --git a/nexus/mgs-updates/src/test_util/test_artifacts.rs b/nexus/mgs-updates/src/test_util/test_artifacts.rs index 33cb4853009..4eb753f0c47 100644 --- a/nexus/mgs-updates/src/test_util/test_artifacts.rs +++ b/nexus/mgs-updates/src/test_util/test_artifacts.rs @@ -36,6 +36,8 @@ pub struct TestArtifacts { pub sp_sidecar_artifact_hash: ArtifactHash, pub rot_gimlet_artifact_hash: ArtifactHash, pub rot_sidecar_artifact_hash: ArtifactHash, + pub rot_bootloader_gimlet_artifact_hash: ArtifactHash, + pub rot_bootloader_sidecar_artifact_hash: ArtifactHash, pub artifact_cache: Arc, deployed_cabooses: BTreeMap, resolver: FixedResolver, @@ -108,12 +110,56 @@ impl TestArtifacts { ArtifactHash(digest.finalize().into()) }; + // Make an RoT bootloader update artifact for SimGimlet. + let rot_bootloader_gimlet_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_GIMLET_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_gimlet_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_gimlet_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_gimlet_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_gimlet_artifact); + ArtifactHash(digest.finalize().into()) + }; + + // Make an RoT bootloader update artifact for SimSidecar + let rot_bootloader_sidecar_artifact_caboose = CabooseBuilder::default() + .git_commit("fake-git-commit") + .board(SIM_SIDECAR_BOARD) + .version("0.0.0") + .name("fake-name") + .build(); + let mut builder = HubrisArchiveBuilder::with_fake_image(); + builder + .write_caboose(rot_bootloader_sidecar_artifact_caboose.as_slice()) + .unwrap(); + let rot_bootloader_sidecar_artifact = builder.build_to_vec().unwrap(); + let rot_bootloader_sidecar_artifact_hash = { + let mut digest = sha2::Sha256::default(); + digest.update(&rot_bootloader_sidecar_artifact); + ArtifactHash(digest.finalize().into()) + }; + // Assemble a map of artifact hash to artifact contents. let artifact_data = [ (sp_gimlet_artifact_hash, sp_gimlet_artifact), (sp_sidecar_artifact_hash, sp_sidecar_artifact), (rot_gimlet_artifact_hash, rot_gimlet_artifact), (rot_sidecar_artifact_hash, rot_sidecar_artifact), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact, + ), ] .into_iter() .collect(); @@ -124,6 +170,14 @@ impl TestArtifacts { (sp_sidecar_artifact_hash, sp_sidecar_artifact_caboose), (rot_gimlet_artifact_hash, rot_gimlet_artifact_caboose), (rot_sidecar_artifact_hash, rot_sidecar_artifact_caboose), + ( + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_gimlet_artifact_caboose, + ), + ( + rot_bootloader_sidecar_artifact_hash, + rot_bootloader_sidecar_artifact_caboose, + ), ] .into_iter() .collect(); @@ -155,6 +209,8 @@ impl TestArtifacts { sp_sidecar_artifact_hash, rot_gimlet_artifact_hash, rot_sidecar_artifact_hash, + rot_bootloader_gimlet_artifact_hash, + rot_bootloader_sidecar_artifact_hash, deployed_cabooses, artifact_cache, resolver, diff --git a/nexus/mgs-updates/src/test_util/updates.rs b/nexus/mgs-updates/src/test_util/updates.rs index f59986c6315..fbcc17b0937 100644 --- a/nexus/mgs-updates/src/test_util/updates.rs +++ b/nexus/mgs-updates/src/test_util/updates.rs @@ -54,9 +54,10 @@ pub enum ExpectedSpComponent { override_expected_pending_persistent_boot_preference: Option, override_expected_transient_boot_preference: Option, }, - // TODO-K: Remove once fully implemented - #[allow(dead_code)] - RotBootloader {}, + RotBootloader { + override_expected_stage0: Option, + override_expected_stage0_next: Option, + }, } /// Describes an update operation that can later be executed any number of times @@ -152,7 +153,23 @@ impl UpdateDescription<'_> { expected_transient_boot_preference, } } - ExpectedSpComponent::RotBootloader {} => unimplemented!(), + ExpectedSpComponent::RotBootloader { + override_expected_stage0, + override_expected_stage0_next, + } => { + let expected_stage0_version = override_expected_stage0 + .clone() + .unwrap_or_else(|| sp1.expect_stage0_version()); + let expected_stage0_next_version = + override_expected_stage0_next + .clone() + .unwrap_or_else(|| sp1.expect_stage0_next_version()); + + PendingMgsUpdateDetails::RotBootloader { + expected_stage0_version, + expected_stage0_next_version, + } + } }; let deployed_caboose = self @@ -492,6 +509,64 @@ impl FinishedUpdateAttempt { } } + /// Asserts various conditions associated with successful RoT bootloader updates. + pub fn expect_rot_bootloader_success( + &self, + expected_result: UpdateCompletedHow, + ) { + let how = match self.result { + Ok(how) if how == expected_result => how, + _ => { + panic!( + "unexpected result from apply_update(): {:?}", + self.result, + ); + } + }; + + eprintln!("apply_update() -> {:?}", how); + let sp2 = &self.sp2; + + // The active slot should contain what we just updated to. + let deployed_caboose = &self.deployed_caboose; + assert!(cabooses_equal(&sp2.caboose_stage0, &deployed_caboose)); + + // RoT information should not have changed. + let sp1 = &self.sp1; + assert_eq!(sp1.expect_caboose_rot_a(), sp2.expect_caboose_rot_a()); + assert_eq!(sp1.expect_caboose_rot_b(), sp2.expect_caboose_rot_b()); + + // SP information should not have changed + let sp1 = &self.sp1; + assert_eq!( + sp1.expect_caboose_sp_inactive(), + sp2.expect_caboose_sp_inactive(), + ); + assert_eq!( + sp1.expect_caboose_sp_active(), + sp2.expect_caboose_sp_active(), + ); + + if how == UpdateCompletedHow::FoundNoChangesNeeded { + assert_eq!(sp1.caboose_stage0, sp2.caboose_stage0); + assert_eq!( + sp1.expect_caboose_stage0_next(), + sp2.expect_caboose_stage0_next() + ); + assert_eq!(sp1.sp_state, sp2.sp_state); + assert_eq!(sp1.sp_boot_info, sp2.sp_boot_info); + } else { + // One way or another, an update was completed. Stage 0 + // and Stage 0 next should contain the same contents + assert_eq!( + sp2.expect_caboose_stage0(), + sp2.expect_caboose_stage0_next() + ); + // RoT boot info should have changed + assert_ne!(sp1.sp_boot_info, sp2.sp_boot_info); + } + } + /// Asserts that the update failed and invokes `assert_error(error, /// initial_sp_state, final_sp_state)` for you to make your own assertions /// about why it failed and what state things were left in. From e06418ef160837a6c4b214373642feacc2eb6b4c Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 26 Jun 2025 20:31:39 +1200 Subject: [PATCH 10/21] address comments --- nexus/mgs-updates/src/common_sp_update.rs | 9 ++-- nexus/mgs-updates/src/driver_update.rs | 18 +++---- .../mgs-updates/src/rot_bootloader_updater.rs | 47 +++++++++++----- nexus/mgs-updates/src/rot_updater.rs | 54 +++++++++---------- 4 files changed, 73 insertions(+), 55 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 50a33cfa860..a38d0c21def 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -8,7 +8,6 @@ use super::MgsClients; use super::UpdateProgress; use futures::future::BoxFuture; -use gateway_client::types::RotImageError; use gateway_client::types::SpType; use gateway_client::types::SpUpdateStatus; use gateway_types::rot::RotSlot; @@ -326,11 +325,11 @@ pub enum PostUpdateError { #[error("communicating with MGS")] GatewayClientError(#[from] GatewayClientError), - #[error("communicating with RoT: {message:?}")] - RotCommunicationFailed { message: String }, + #[error("transient error: {message:?}")] + TransientError { message: String }, - #[error("invalid RoT bootloader image: {error:?}")] - RotBootloaderImageError { error: RotImageError }, + #[error("fatal error: {error:?}")] + FatalError { error: String }, } #[derive(Debug)] diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 4578ddba943..f21cf2cee00 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -33,7 +33,12 @@ use uuid::Uuid; /// How long may the status remain unchanged without us treating this as a /// problem? -pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(180); +/// +/// With the RoT bootloader need to wait for 2 resets which have a timeout +/// of 60 seconds each, and an attempt to retrieve boot info, which has a +/// time out of 30 seconds. We then give ourselves a few more minutes to act +/// as a buffer for other pending actions. +pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(600); /// How long to wait between failed attempts to reset the device const RESET_DELAY_INTERVAL: Duration = Duration::from_secs(10); @@ -381,14 +386,13 @@ pub(crate) async fn apply_update( )); } } - PostUpdateError::RotBootloaderImageError { error } => { - let error = InlineErrorChain::new(&error); - error!(log, "post_update failed"; &error); + PostUpdateError::FatalError { error } => { + error!(log, "post_update failed"; "error" => ?error); return Err(ApplyUpdateError::SpResetFailed( error.to_string(), )); } - PostUpdateError::RotCommunicationFailed { message: _ } => {} + PostUpdateError::TransientError { message: _ } => {} } tokio::time::sleep(RESET_DELAY_INTERVAL).await; @@ -628,10 +632,6 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), - // We'll loop for 3 minutes to wait for any ongoing RoT bootloader update. - // We need to wait for 2 resets which have a timeout of 60 seconds each, - // and an attempt to retrieve boot info, which has a time out of 30 seconds. - // We give an additional 30 seconds to as a buffer for the other actions. Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => { if before.elapsed() >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 89356757947..8df402e054a 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -61,15 +61,12 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { }); } - // TODO-K: In the RoT bootloader update code in wicket, there is a set of - // known bootloader FWIDs that don't have cabooses. Is this something we - // should care about here? - // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1817-L1841 - - // TODO-K: There are also older versions of the SP have a bug that prevents - // setting the active slot for the RoT bootloader. Is this something we should - // care about here? - // https://github.com/oxidecomputer/omicron/blob/89ce370f0a96165c777e90a008257a6085897f2a/wicketd/src/update_tracker.rs#L1705-L1710 + // TODO: In the RoT bootloader update code in wicket, there is a set of + // known bootloader FWIDs that don't have cabooses. There are also older + // versions of the SP have a bug that prevents setting the active slot for + // the RoT bootloader. We should reject any update request that comes from + // a system using those devices. + // https://github.com/oxidecomputer/omicron/issues/8457 // Fetch the caboose from the currently active slot (stage0). let caboose = mgs_clients @@ -205,7 +202,9 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { async move { // Before setting stage0 to the new version we want to ensure // the image is good and we're not going to brick the device. - // The RoT will do a signature check when reset. + // We'll reset the device, causing it to check the signature. + // Then we'll validate that signature before we activate the + // new stage0. debug!( log, "attempting to reset device to do bootloader signature check" @@ -237,12 +236,32 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { ) .await?; // If the image is not valid we bail - if let Some(error) = stage0next_error { - return Err(PostUpdateError::RotBootloaderImageError { - error, + if let Some(e) = stage0next_error { + return Err(PostUpdateError::FatalError { + error: e.to_string(), }); } + // This operation is very delicate. Here, we're overwriting the device + // bootloader with the one that we've written to the stage0next slot. + // The hardware has no fallback slot for the bootloader. So if the + // device resets or loses power while we're copying stage0next to the + // stage0 slot, it could still become bricked. + // + // We've already done everything we can to mitigate this: + // + // - The data is already on the device, minimizing the time to copy it + // to where it needs to go. + // - The image has already been verified by the device (and the device + // validates _that_ before starting this operation), so it won't fail + // at boot for that reason. + // - The device can't be externally reset _during_ this operation because + // the same code responsible for processing the reset request will be + // busy doing the copy. + // - We only ever update one RoT stage0 at a time in a rack, so if we brick + // one, only one sled would be affected (still bad). + // + // So we're ready to roll! debug!(log, "attempting to set RoT bootloader active slot"); mgs_clients .try_all_serially(log, move |mgs_client| async move { @@ -332,7 +351,7 @@ async fn wait_for_stage0_next_image_check( "failed to get RoT boot info"; "error" => %message, ); - return Err(PostUpdateError::RotCommunicationFailed { + return Err(PostUpdateError::TransientError { message, }); } diff --git a/nexus/mgs-updates/src/rot_updater.rs b/nexus/mgs-updates/src/rot_updater.rs index c7b0128a095..88bff73cc3f 100644 --- a/nexus/mgs-updates/src/rot_updater.rs +++ b/nexus/mgs-updates/src/rot_updater.rs @@ -399,33 +399,33 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), PostUpdateError>> { async move { - // We want to set the slot we've just updated as the active one - debug!(log, "attempting to set active slot"); - mgs_clients - .try_all_serially(log, move |mgs_client| async move { - let inactive_slot = match &update.details { - PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { - expected_active_slot.slot().toggled().to_u16() - }, - PendingMgsUpdateDetails::Sp { .. } - | PendingMgsUpdateDetails::RotBootloader { .. } => unreachable!( - "pending MGS update details within ReconfiguratorRotUpdater \ - will always be for the RoT" - ) - }; - let persist = true; - mgs_client - .sp_component_active_slot_set( - update.sp_type, - update.slot_id, - &SpComponent::ROT.to_string(), - persist, - &SpComponentFirmwareSlot { slot: inactive_slot } - ) - .await?; - Ok(()) - }) - .await?; + // We want to set the slot we've just updated as the active one + debug!(log, "attempting to set active slot"); + mgs_clients + .try_all_serially(log, move |mgs_client| async move { + let inactive_slot = match &update.details { + PendingMgsUpdateDetails::Rot { expected_active_slot, .. } => { + expected_active_slot.slot().toggled().to_u16() + }, + PendingMgsUpdateDetails::Sp { .. } + | PendingMgsUpdateDetails::RotBootloader { .. } => unreachable!( + "pending MGS update details within ReconfiguratorRotUpdater \ + will always be for the RoT" + ) + }; + let persist = true; + mgs_client + .sp_component_active_slot_set( + update.sp_type, + update.slot_id, + &SpComponent::ROT.to_string(), + persist, + &SpComponentFirmwareSlot { slot: inactive_slot } + ) + .await?; + Ok(()) + }) + .await?; debug!(log, "attempting to reset device"); mgs_clients From b93303f5e8002fe5f802f723ffbb729575b322a8 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 11:57:25 +1200 Subject: [PATCH 11/21] clean up --- nexus/mgs-updates/src/common_sp_update.rs | 2 +- nexus/mgs-updates/src/driver_update.rs | 4 ++-- .../mgs-updates/src/rot_bootloader_updater.rs | 20 ++++++------------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index a38d0c21def..4261797afbe 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -275,7 +275,7 @@ pub trait SpComponentUpdateHelper { pub enum PrecheckStatus { UpdateComplete, ReadyForUpdate, - WaitingForOngoingRotBootloaderUpdate, + WaitingForOngoingUpdate, } #[derive(Debug, Error)] diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index f21cf2cee00..9a2a6c37170 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -234,7 +234,7 @@ pub(crate) async fn apply_update( // This is the first time a Nexus instance is attempting to // update the RoT bootloader, we don't need to wait for an // ongoing update. - Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => (), + Ok(PrecheckStatus::WaitingForOngoingUpdate) => (), Ok(PrecheckStatus::UpdateComplete) => { return Ok(UpdateCompletedHow::FoundNoChangesNeeded); } @@ -632,7 +632,7 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), - Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) => { + Ok(PrecheckStatus::WaitingForOngoingUpdate) => { if before.elapsed() >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT { diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 8df402e054a..8bae9a6fe7d 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -61,13 +61,6 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { }); } - // TODO: In the RoT bootloader update code in wicket, there is a set of - // known bootloader FWIDs that don't have cabooses. There are also older - // versions of the SP have a bug that prevents setting the active slot for - // the RoT bootloader. We should reject any update request that comes from - // a system using those devices. - // https://github.com/oxidecomputer/omicron/issues/8457 - // Fetch the caboose from the currently active slot (stage0). let caboose = mgs_clients .try_all_serially(log, move |mgs_client| async move { @@ -127,12 +120,11 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { .sp_component_caboose_get( update.sp_type, update.slot_id, - // The name for the SP component here is STAGE0 - // it's a little confusing because we're really - // trying to reach STAGE0NEXT, and there is no - // ROT_BOOTLOADER variant. We specify that we - // want STAGE0NEXT by setting the firmware slot - // to 1, which is where it will always be. + // The naming here is a bit confusing because "stage0" + // sometimes refers to the component (RoT bootloader) + // and sometimes refers to the active slot for that + // component. Here, we're accessing the inactive slot + // for it. The component is still "stage0". &SpComponent::STAGE0.to_string(), 1, ) @@ -183,7 +175,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { // itself is being updated (during the reset stage)? Should we check for that // here before setting the RoT bootloader as ready to update? - Ok(PrecheckStatus::WaitingForOngoingRotBootloaderUpdate) + Ok(PrecheckStatus::WaitingForOngoingUpdate) } .boxed() } From 0fbdcf1eb69bfd7a9ae4866907f1857488df479f Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 12:38:02 +1200 Subject: [PATCH 12/21] change conditions for determining whether we wait for an ongoing update --- .../mgs-updates/src/rot_bootloader_updater.rs | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 8bae9a6fe7d..083a80fc5ba 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -77,9 +77,11 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { .into_inner(); debug!(log, "found active slot caboose"; "caboose" => ?caboose); + let found_stage0_version = caboose.version; + // If the version in the currently active slot matches the one we're // trying to set, then there's nothing to do. - if caboose.version == update.artifact_version.as_str() { + if found_stage0_version == update.artifact_version.as_str() { return Ok(PrecheckStatus::UpdateComplete); } @@ -99,10 +101,10 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { will always be for the RoT bootloader" ); }; - if caboose.version != expected_stage0_version.to_string() { + if found_stage0_version != expected_stage0_version.to_string() { return Err(PrecheckError::WrongActiveVersion { expected: expected_stage0_version.clone(), - found: caboose.version, + found: found_stage0_version, }); } @@ -170,12 +172,22 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { } }; - // TODO-K: In post_update we'll be restarting the RoT twice to do signature - // checks, and to set stage0 to the new version. What happens if the RoT - // itself is being updated (during the reset stage)? Should we check for that - // here before setting the RoT bootloader as ready to update? - - Ok(PrecheckStatus::WaitingForOngoingUpdate) + // The status is only considered ready for update if the stage0_next + // version found in the caboose is valid, and it matches what we + // found in the stage_0 caboose. Otherwise, we are waiting for an + // ongoing update. + match found_stage0_next_version { + FoundVersion::Version(v) => { + if v == found_stage0_version { + Ok(PrecheckStatus::ReadyForUpdate) + } else { + Ok(PrecheckStatus::WaitingForOngoingUpdate) + } + } + FoundVersion::MissingVersion => { + Ok(PrecheckStatus::WaitingForOngoingUpdate) + } + } } .boxed() } From 609122c1e828931f917579148e62d504f299c03e Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 13:15:28 +1200 Subject: [PATCH 13/21] Update wait for ongoing update logic --- nexus/mgs-updates/src/driver_update.rs | 62 +++++++++++--------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 9a2a6c37170..6863eca8e81 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -52,14 +52,6 @@ pub const DEFAULT_RETRY_TIMEOUT: Duration = Duration::from_secs(60); /// How long to wait after resetting the device before expecting it to come up const RESET_TIMEOUT: Duration = Duration::from_secs(60); -/// How long to wait for an ongoing RoT bootloader update -const WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT: Duration = - Duration::from_secs(180); - -/// How long to wait between poll attempts on RoT bootloader update status -const ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL: Duration = - Duration::from_secs(10); - /// Parameters describing a request to update one SP-managed component /// /// This is similar in spirit to the `SpComponentUpdater` trait but uses a @@ -226,23 +218,34 @@ pub(crate) async fn apply_update( debug!(log, "loaded artifact contents"); // Check the live state first to see if: - // - this update has already been completed, or + // - this update has already been completed, + // - we are waiting for an ongoing update, or // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); - match update_helper.precheck(log, &mut mgs_clients, update).await { - Ok(PrecheckStatus::ReadyForUpdate) | - // This is the first time a Nexus instance is attempting to - // update the RoT bootloader, we don't need to wait for an - // ongoing update. - Ok(PrecheckStatus::WaitingForOngoingUpdate) => (), - Ok(PrecheckStatus::UpdateComplete) => { - return Ok(UpdateCompletedHow::FoundNoChangesNeeded); - } - Err(error) => { - return Err(ApplyUpdateError::PreconditionFailed(error)); - } - }; + let before = Instant::now(); + loop { + match update_helper.precheck(log, &mut mgs_clients, update).await { + Ok(PrecheckStatus::ReadyForUpdate) => break, + Ok(PrecheckStatus::WaitingForOngoingUpdate) => { + if before.elapsed() >= progress_timeout { + warn!( + log, + "update takeover: timed out while waiting for ongoing update" + ); + break; + } + tokio::time::sleep(PROGRESS_POLL_INTERVAL).await; + continue; + } + Ok(PrecheckStatus::UpdateComplete) => { + return Ok(UpdateCompletedHow::FoundNoChangesNeeded); + } + Err(error) => { + return Err(ApplyUpdateError::PreconditionFailed(error)); + } + }; + } // Start the update. debug!(log, "ready to start update"); status.update(UpdateAttemptStatus::Updating); @@ -632,20 +635,6 @@ async fn wait_for_update_done( // Check if we're done. Ok(PrecheckStatus::UpdateComplete) => return Ok(()), - Ok(PrecheckStatus::WaitingForOngoingUpdate) => { - if before.elapsed() - >= WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT - { - return Err(UpdateWaitError::Timeout( - WAIT_FOR_ONGOING_ROT_BOOTLOADER_UPDATE_TIMEOUT, - )); - } - - tokio::time::sleep(ROT_BOOLOADER_UPDATE_PROGRESS_INTERVAL) - .await; - continue; - } - // An incorrect version in the "inactive" slot, incorrect active slot, // or non-empty pending_persistent_boot_preference/transient_boot_preference // are normal during the upgrade. We have no reason to think these won't @@ -654,6 +643,7 @@ async fn wait_for_update_done( | Err(PrecheckError::WrongInactiveVersion { .. }) | Err(PrecheckError::WrongActiveSlot { .. }) | Err(PrecheckError::EphemeralRotBootPreferenceSet) + | Ok(PrecheckStatus::WaitingForOngoingUpdate) | Ok(PrecheckStatus::ReadyForUpdate) => { if before.elapsed() >= timeout { return Err(UpdateWaitError::Timeout(timeout)); From 0d40a7398b1cd46155ac9007137f394379cb323c Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 13:38:52 +1200 Subject: [PATCH 14/21] Remove ticker --- .../mgs-updates/src/rot_bootloader_updater.rs | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 083a80fc5ba..1289c1aac7c 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -30,6 +30,8 @@ use std::time::Instant; const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(30); +const WAIT_FOR_BOOT_INFO_INTERVAL: Duration = Duration::from_secs(10); + pub struct ReconfiguratorRotBootloaderUpdater; impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { /// Checks if the component is already updated or ready for update @@ -312,12 +314,8 @@ async fn wait_for_stage0_next_image_check( sp_slot: u32, timeout: Duration, ) -> Result, PostUpdateError> { - let mut ticker = tokio::time::interval(Duration::from_secs(1)); - - let start = Instant::now(); + let before = Instant::now(); loop { - ticker.tick().await; - match mgs_clients .try_all_serially(log, |mgs_client| async move { mgs_client @@ -343,34 +341,37 @@ async fn wait_for_stage0_next_image_check( } // The RoT is probably still booting RotState::CommunicationFailed { message } => { - if start.elapsed() < timeout { - info!( - log, - "failed getting RoT boot info (will retry)"; - "error" => %message, - ); - } else { + if before.elapsed() >= timeout { error!( log, "failed to get RoT boot info"; - "error" => %message, + "error" => %message ); return Err(PostUpdateError::TransientError { message, }); } - } - }, - Err(error) => { - if start.elapsed() < timeout { + info!( log, "failed getting RoT boot info (will retry)"; - "error" => %error, + "error" => %message, ); - } else { + tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await; + } + }, + // The RoT might still be booting + Err(error) => { + if before.elapsed() >= timeout { return Err(PostUpdateError::GatewayClientError(error)); } + + info!( + log, + "failed getting RoT boot info (will retry)"; + "error" => %error, + ); + tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await; } } } From 92e97e3a91524c4882b21a47ba6b723bcb5ba628 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 13:50:36 +1200 Subject: [PATCH 15/21] clean up --- nexus/mgs-updates/src/rot_bootloader_updater.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 1289c1aac7c..18aa67e7977 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -202,9 +202,6 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { mgs_clients: &'a mut MgsClients, update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), PostUpdateError>> { - // TODO-K: Again, we're resetting the ROT twice here, what happens - // if an RoT update is happening at the same time? - async move { // Before setting stage0 to the new version we want to ensure // the image is good and we're not going to brick the device. @@ -363,6 +360,11 @@ async fn wait_for_stage0_next_image_check( // The RoT might still be booting Err(error) => { if before.elapsed() >= timeout { + error!( + log, + "failed to get RoT boot info"; + "error" => %error + ); return Err(PostUpdateError::GatewayClientError(error)); } From 69a8a4fbeb5b9d93d8458a0fcaab48df00dd3bd8 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 27 Jun 2025 14:33:05 +1200 Subject: [PATCH 16/21] Missed one --- nexus/mgs-updates/src/common_sp_update.rs | 12 +++++++++++ nexus/mgs-updates/src/driver_update.rs | 25 ++++------------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index 4261797afbe..25daa07e784 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -332,6 +332,18 @@ pub enum PostUpdateError { FatalError { error: String }, } +impl PostUpdateError { + pub fn is_fatal(&self) -> bool { + match self { + PostUpdateError::GatewayClientError(error) => { + !matches!(error, gateway_client::Error::CommunicationError(_)) + } + PostUpdateError::TransientError { .. } => false, + PostUpdateError::FatalError { .. } => true, + } + } +} + #[derive(Debug)] pub enum FoundVersion { MissingVersion, diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 6863eca8e81..0f54637632e 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -4,7 +4,6 @@ //! Concurrent-safe facilities for doing MGS-managed upates -use crate::common_sp_update::PostUpdateError; use crate::common_sp_update::PrecheckError; use crate::common_sp_update::PrecheckStatus; use crate::common_sp_update::STATUS_POLL_INTERVAL; @@ -376,26 +375,10 @@ pub(crate) async fn apply_update( while let Err(error) = update_helper.post_update(log, &mut mgs_clients, update).await { - match error { - PostUpdateError::GatewayClientError(error) => { - if !matches!( - error, - gateway_client::Error::CommunicationError(_) - ) { - let error = InlineErrorChain::new(&error); - error!(log, "post_update failed"; &error); - return Err(ApplyUpdateError::SpResetFailed( - error.to_string(), - )); - } - } - PostUpdateError::FatalError { error } => { - error!(log, "post_update failed"; "error" => ?error); - return Err(ApplyUpdateError::SpResetFailed( - error.to_string(), - )); - } - PostUpdateError::TransientError { message: _ } => {} + if error.is_fatal() { + let error = InlineErrorChain::new(&error); + error!(log, "post_update failed"; &error); + return Err(ApplyUpdateError::SpResetFailed(error.to_string())); } tokio::time::sleep(RESET_DELAY_INTERVAL).await; From 99bd5da1a193bac678dfa2fcbf5fd94d62dfabef Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 8 Jul 2025 13:38:11 +1200 Subject: [PATCH 17/21] fix after merge --- nexus/mgs-updates/src/driver_update.rs | 2 +- .../mgs-updates/src/rot_bootloader_updater.rs | 33 ++++++++++--------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 019ccfd2c86..93422179d14 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -817,7 +817,7 @@ mod test { gwtestctx: &GatewayTestContext, artifacts: &TestArtifacts, sp_type: SpType, - slot_id: u32, + slot_id: u16, artifact_hash: &ArtifactHash, expected_result: UpdateCompletedHow, ) { diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 18aa67e7977..12fb45317cf 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -245,24 +245,25 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { }); } - // This operation is very delicate. Here, we're overwriting the device - // bootloader with the one that we've written to the stage0next slot. - // The hardware has no fallback slot for the bootloader. So if the - // device resets or loses power while we're copying stage0next to the - // stage0 slot, it could still become bricked. + // This operation is very delicate. Here, we're overwriting the + // device bootloader with the one that we've written to the + // stage0next slot. + // The hardware has no fallback slot for the bootloader. So if the + // device resets or loses power while we're copying stage0next to + // the stage0 slot, it could still become bricked. // // We've already done everything we can to mitigate this: // - // - The data is already on the device, minimizing the time to copy it - // to where it needs to go. - // - The image has already been verified by the device (and the device - // validates _that_ before starting this operation), so it won't fail - // at boot for that reason. - // - The device can't be externally reset _during_ this operation because - // the same code responsible for processing the reset request will be - // busy doing the copy. - // - We only ever update one RoT stage0 at a time in a rack, so if we brick - // one, only one sled would be affected (still bad). + // - The data is already on the device, minimizing the time to copy + // it to where it needs to go. + // - The image has already been verified by the device (and the + // device validates _that_ before starting this operation), so it + // won't fail at boot for that reason. + // - The device can't be externally reset _during_ this operation + // because the same code responsible for processing the reset + // request will be busy doing the copy. + // - We only ever update one RoT stage0 at a time in a rack, so if + // we brick one, only one sled would be affected (still bad). // // So we're ready to roll! debug!(log, "attempting to set RoT bootloader active slot"); @@ -308,7 +309,7 @@ async fn wait_for_stage0_next_image_check( log: &Logger, mgs_clients: &mut MgsClients, sp_type: SpType, - sp_slot: u32, + sp_slot: u16, timeout: Duration, ) -> Result, PostUpdateError> { let before = Instant::now(); From dc9e7875d6b3827cda65323d0e5282581d2167ec Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 8 Jul 2025 14:41:23 +1200 Subject: [PATCH 18/21] address comments --- nexus/mgs-updates/src/driver_update.rs | 39 ++++++++++++--- .../mgs-updates/src/rot_bootloader_updater.rs | 49 ++++++++++++------- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 93422179d14..11b1eca38e8 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -32,11 +32,36 @@ use uuid::Uuid; /// How long may the status remain unchanged without us treating this as a /// problem? -/// -/// With the RoT bootloader need to wait for 2 resets which have a timeout -/// of 60 seconds each, and an attempt to retrieve boot info, which has a -/// time out of 30 seconds. We then give ourselves a few more minutes to act -/// as a buffer for other pending actions. +// +// Generally, this value covers two different things: +// +// 1. While we're uploading an image to the SP or it's being prepared, how long +// can the status stay the same before we give up altogether and try again? +// In practice, this would rarely pause for more than a few seconds. +// 2. The period where we might wait for an update to complete -- either our own +// update (in which case this is the period after the final device reset +// until the device comes up reporting the new version) or another instance's +// update (in which case this could cover almost the _entire_ update +// process). +// +// In both cases, if the timeout is reached, the whole update attempt will fail. +// This behavior is only intended to deal with pathological cases, like an MGS +// crash (which could cause an upload to hang indefinitely) or a Nexus crash +// (which could cause any update to hang indefinitely at any point). So we can +// afford to be generous here. Further, we really don't want to trip this +// erroneously in a working system because we're likely to get stuck continuing +// to retry and give up before each attempt finishes. +// +// In terms of sizing this timeout: +// - For all updates, the upload phase generally takes 10-20 seconds. +// - For SP updates, the post-reset phase can take about 30s (with Sidecar SPs +// being the longest). +// - For RoT and RoT bootloader updates, two resets and an intervening "set +// active slot" operation are required. Together, these could take just a +// few seconds. +// +// Adding all the above together, and giving ourselves plenty of margin, we +// choose 10 minutes. pub const PROGRESS_TIMEOUT: Duration = Duration::from_secs(600); /// How long to wait between failed attempts to reset the device @@ -220,7 +245,7 @@ pub(crate) async fn apply_update( // Check the live state first to see if: // - this update has already been completed, - // - we are waiting for an ongoing update, or + // - we should wait a bit because an update may be in-progress, or // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); let before = Instant::now(); @@ -374,7 +399,7 @@ pub(crate) async fn apply_update( if try_reset { // We retry this until we get some error *other* than a communication - // error or an RoT bootloader image error. There is intentionally no + // error or some other transient error. There is intentionally no // timeout here. If we've staged an update but not managed to reset // the device, there's no point where we'd want to stop trying to do so. while let Err(error) = diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 12fb45317cf..31bdc7588e9 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -25,10 +25,11 @@ use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; use slog::{debug, error, info}; +use slog_error_chain::InlineErrorChain; use std::time::Duration; use std::time::Instant; -const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(30); +const WAIT_FOR_BOOT_INFO_TIMEOUT: Duration = Duration::from_secs(120); const WAIT_FOR_BOOT_INFO_INTERVAL: Duration = Duration::from_secs(10); @@ -99,8 +100,9 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { } = &update.details else { unreachable!( - "pending MGS update details within ReconfiguratorSpUpdater \ - will always be for the RoT bootloader" + "pending MGS update details within \ + ReconfiguratorRotBootloaderUpdater will always be for the \ + RoT bootloader" ); }; if found_stage0_version != expected_stage0_version.to_string() { @@ -203,11 +205,11 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { update: &'a PendingMgsUpdate, ) -> BoxFuture<'a, Result<(), PostUpdateError>> { async move { - // Before setting stage0 to the new version we want to ensure - // the image is good and we're not going to brick the device. - // We'll reset the device, causing it to check the signature. - // Then we'll validate that signature before we activate the - // new stage0. + // To protect against bricking itself, the device will only activate + // a new image after it's been verified. Images are only verified at + // device boot time. Thus, we'll reset the device once to cause the + // signature to be verified. Then we can activate the new image and + // reset the device again. debug!( log, "attempting to reset device to do bootloader signature check" @@ -238,10 +240,12 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { WAIT_FOR_BOOT_INFO_TIMEOUT, ) .await?; - // If the image is not valid we bail + // If boot info contains any error with the image loaded onto + // stage0_next, we run the risk of bricking the device if this image + // is loaded onto stage0. We return a fatal error. if let Some(e) = stage0next_error { return Err(PostUpdateError::FatalError { - error: e.to_string(), + error: InlineErrorChain::new(&e).to_string(), }); } @@ -330,10 +334,18 @@ async fn wait_for_stage0_next_image_check( .await { Ok(state) => match state.into_inner() { - // The minimum we will ever return is 3. + // The minimum we will ever return is v3. // Additionally, V2 does not report image errors, so we cannot // know with certainty if a signature check came back with errors - RotState::V2 { .. } => unreachable!(), + RotState::V2 { .. } => { + let error = "unexpected RoT version: 2".to_string(); + error!( + log, + "failed to get RoT boot info"; + "error" => &error + ); + return Err(PostUpdateError::FatalError { error }); + } RotState::V3 { stage0next_error, .. } => { return Ok(stage0next_error); } @@ -345,8 +357,8 @@ async fn wait_for_stage0_next_image_check( "failed to get RoT boot info"; "error" => %message ); - return Err(PostUpdateError::TransientError { - message, + return Err(PostUpdateError::FatalError { + error: message, }); } @@ -360,19 +372,22 @@ async fn wait_for_stage0_next_image_check( }, // The RoT might still be booting Err(error) => { + let e = InlineErrorChain::new(&error); if before.elapsed() >= timeout { error!( log, "failed to get RoT boot info"; - "error" => %error + &e, ); - return Err(PostUpdateError::GatewayClientError(error)); + return Err(PostUpdateError::FatalError { + error: e.to_string(), + }); } info!( log, "failed getting RoT boot info (will retry)"; - "error" => %error, + e, ); tokio::time::sleep(WAIT_FOR_BOOT_INFO_INTERVAL).await; } From 0e76cce6255714489034b5ee4eef1f666f3d84e3 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 8 Jul 2025 15:27:07 +1200 Subject: [PATCH 19/21] simplify checks --- nexus/mgs-updates/src/common_sp_update.rs | 32 ++++++++++++++++++- .../mgs-updates/src/rot_bootloader_updater.rs | 29 ++--------------- nexus/mgs-updates/src/rot_updater.rs | 23 +------------ nexus/mgs-updates/src/sp_updater.rs | 23 +------------ 4 files changed, 36 insertions(+), 71 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index b61c0326053..fb49de4cf94 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -344,12 +344,42 @@ impl PostUpdateError { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum FoundVersion { MissingVersion, Version(String), } +impl FoundVersion { + pub fn matches( + self, + expected: &ExpectedVersion, + ) -> Result<(), PrecheckError> { + match (expected, &self) { + // expected garbage, found garbage + (ExpectedVersion::NoValidVersion, FoundVersion::MissingVersion) => { + () + } + // expected a specific version and found it + ( + ExpectedVersion::Version(artifact_version), + FoundVersion::Version(found_version), + ) if artifact_version.to_string() == *found_version => (), + // anything else is a mismatch + (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) + | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) + | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { + return Err(PrecheckError::WrongInactiveVersion { + expected: expected.clone(), + found: self, + }); + } + }; + + Ok(()) + } +} + pub(crate) fn error_means_caboose_is_invalid( error: &GatewayClientError, ) -> bool { diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 31bdc7588e9..5dfa5927f15 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -20,7 +20,6 @@ use gateway_client::types::RotState; use gateway_client::types::SpComponentFirmwareSlot; use gateway_client::types::SpType; use gateway_messages::RotBootInfo; -use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; @@ -150,31 +149,9 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { } } }; - match (&expected_stage0_next_version, &found_stage0_next_version) { - // expected garbage, found garbage - ( - ExpectedVersion::NoValidVersion, - FoundVersion::MissingVersion, - ) => (), - // expected a specific version and found it - ( - ExpectedVersion::Version(artifact_version), - FoundVersion::Version(found_stage0_next_version), - ) if artifact_version.to_string() - == *found_stage0_next_version => - { - () - } - // anything else is a mismatch - (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) - | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) - | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { - return Err(PrecheckError::WrongInactiveVersion { - expected: expected_stage0_next_version.clone(), - found: found_stage0_next_version, - }); - } - }; + found_stage0_next_version + .clone() + .matches(&expected_stage0_next_version)?; // The status is only considered ready for update if the stage0_next // version found in the caboose is valid, and it matches what we diff --git a/nexus/mgs-updates/src/rot_updater.rs b/nexus/mgs-updates/src/rot_updater.rs index decae3c3fea..cc73d14b844 100644 --- a/nexus/mgs-updates/src/rot_updater.rs +++ b/nexus/mgs-updates/src/rot_updater.rs @@ -22,7 +22,6 @@ use gateway_client::types::RotState; use gateway_client::types::SpComponentFirmwareSlot; use gateway_client::types::SpType; use gateway_types::rot::RotSlot; -use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; @@ -346,27 +345,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotUpdater { } } }; - match (&expected_inactive_version, &found_version) { - // expected garbage, found garbage - ( - ExpectedVersion::NoValidVersion, - FoundVersion::MissingVersion, - ) => (), - // expected a specific version and found it - ( - ExpectedVersion::Version(artifact_version), - FoundVersion::Version(found_version), - ) if artifact_version.to_string() == *found_version => (), - // anything else is a mismatch - (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) - | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) - | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { - return Err(PrecheckError::WrongInactiveVersion { - expected: expected_inactive_version.clone(), - found: found_version, - }); - } - }; + found_version.matches(expected_inactive_version)?; // If transient boot is being used, the persistent preference is not going to match // the active slot. At the moment, this mismatch can also mean one of the partitions diff --git a/nexus/mgs-updates/src/sp_updater.rs b/nexus/mgs-updates/src/sp_updater.rs index 60406d4b2a8..158b1cf097d 100644 --- a/nexus/mgs-updates/src/sp_updater.rs +++ b/nexus/mgs-updates/src/sp_updater.rs @@ -19,7 +19,6 @@ use futures::FutureExt; use futures::future::BoxFuture; use gateway_client::SpComponent; use gateway_client::types::SpType; -use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::PendingMgsUpdate; use nexus_types::deployment::PendingMgsUpdateDetails; use slog::Logger; @@ -260,27 +259,7 @@ impl SpComponentUpdateHelper for ReconfiguratorSpUpdater { } } }; - match (&expected_inactive_version, &found_version) { - // expected garbage, found garbage - ( - ExpectedVersion::NoValidVersion, - FoundVersion::MissingVersion, - ) => (), - // expected a specific version and found it - ( - ExpectedVersion::Version(artifact_version), - FoundVersion::Version(found_version), - ) if artifact_version.to_string() == *found_version => (), - // anything else is a mismatch - (ExpectedVersion::NoValidVersion, FoundVersion::Version(_)) - | (ExpectedVersion::Version(_), FoundVersion::MissingVersion) - | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { - return Err(PrecheckError::WrongInactiveVersion { - expected: expected_inactive_version.clone(), - found: found_version, - }); - } - }; + found_version.matches(expected_inactive_version)?; Ok(PrecheckStatus::ReadyForUpdate) } From 486b54bebc6a6add4082f5d86f25466bc0c4a05b Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 8 Jul 2025 17:32:16 +1200 Subject: [PATCH 20/21] address comments --- nexus/mgs-updates/src/common_sp_update.rs | 1 - nexus/mgs-updates/src/driver_update.rs | 34 +++++-------------- .../mgs-updates/src/rot_bootloader_updater.rs | 17 +--------- 3 files changed, 10 insertions(+), 42 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index fb49de4cf94..cb96fac0b80 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -275,7 +275,6 @@ pub trait SpComponentUpdateHelper { pub enum PrecheckStatus { UpdateComplete, ReadyForUpdate, - WaitingForOngoingUpdate, } #[derive(Debug, Error)] diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 11b1eca38e8..99d14b3a22b 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -248,30 +248,15 @@ pub(crate) async fn apply_update( // - we should wait a bit because an update may be in-progress, or // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); - let before = Instant::now(); - loop { - match update_helper.precheck(log, &mut mgs_clients, update).await { - Ok(PrecheckStatus::ReadyForUpdate) => break, - Ok(PrecheckStatus::WaitingForOngoingUpdate) => { - if before.elapsed() >= progress_timeout { - warn!( - log, - "update takeover: timed out while waiting for ongoing update" - ); - break; - } - - tokio::time::sleep(PROGRESS_POLL_INTERVAL).await; - continue; - } - Ok(PrecheckStatus::UpdateComplete) => { - return Ok(UpdateCompletedHow::FoundNoChangesNeeded); - } - Err(error) => { - return Err(ApplyUpdateError::PreconditionFailed(error)); - } - }; - } + match update_helper.precheck(log, &mut mgs_clients, update).await { + Ok(PrecheckStatus::ReadyForUpdate) => (), + Ok(PrecheckStatus::UpdateComplete) => { + return Ok(UpdateCompletedHow::FoundNoChangesNeeded); + } + Err(error) => { + return Err(ApplyUpdateError::PreconditionFailed(error)); + } + }; // Start the update. debug!(log, "ready to start update"); status.update(UpdateAttemptStatus::Updating); @@ -656,7 +641,6 @@ async fn wait_for_update_done( | Err(PrecheckError::WrongInactiveVersion { .. }) | Err(PrecheckError::WrongActiveSlot { .. }) | Err(PrecheckError::EphemeralRotBootPreferenceSet) - | Ok(PrecheckStatus::WaitingForOngoingUpdate) | Ok(PrecheckStatus::ReadyForUpdate) => { if before.elapsed() >= timeout { return Err(UpdateWaitError::Timeout(timeout)); diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 5dfa5927f15..6d9e4edf560 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -153,22 +153,7 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { .clone() .matches(&expected_stage0_next_version)?; - // The status is only considered ready for update if the stage0_next - // version found in the caboose is valid, and it matches what we - // found in the stage_0 caboose. Otherwise, we are waiting for an - // ongoing update. - match found_stage0_next_version { - FoundVersion::Version(v) => { - if v == found_stage0_version { - Ok(PrecheckStatus::ReadyForUpdate) - } else { - Ok(PrecheckStatus::WaitingForOngoingUpdate) - } - } - FoundVersion::MissingVersion => { - Ok(PrecheckStatus::WaitingForOngoingUpdate) - } - } + Ok(PrecheckStatus::ReadyForUpdate) } .boxed() } From fec4545619a2599c3b516dbf43e3c2330895c28d Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 10 Jul 2025 10:11:21 +1200 Subject: [PATCH 21/21] address comments --- nexus/mgs-updates/src/common_sp_update.rs | 4 ++-- nexus/mgs-updates/src/driver_update.rs | 1 - nexus/mgs-updates/src/rot_bootloader_updater.rs | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nexus/mgs-updates/src/common_sp_update.rs b/nexus/mgs-updates/src/common_sp_update.rs index cb96fac0b80..f4315c41a0c 100644 --- a/nexus/mgs-updates/src/common_sp_update.rs +++ b/nexus/mgs-updates/src/common_sp_update.rs @@ -351,7 +351,7 @@ pub enum FoundVersion { impl FoundVersion { pub fn matches( - self, + &self, expected: &ExpectedVersion, ) -> Result<(), PrecheckError> { match (expected, &self) { @@ -370,7 +370,7 @@ impl FoundVersion { | (ExpectedVersion::Version(_), FoundVersion::Version(_)) => { return Err(PrecheckError::WrongInactiveVersion { expected: expected.clone(), - found: self, + found: self.clone(), }); } }; diff --git a/nexus/mgs-updates/src/driver_update.rs b/nexus/mgs-updates/src/driver_update.rs index 99d14b3a22b..5b55b970a4b 100644 --- a/nexus/mgs-updates/src/driver_update.rs +++ b/nexus/mgs-updates/src/driver_update.rs @@ -245,7 +245,6 @@ pub(crate) async fn apply_update( // Check the live state first to see if: // - this update has already been completed, - // - we should wait a bit because an update may be in-progress, or // - if not, then if our required preconditions are met status.update(UpdateAttemptStatus::Precheck); match update_helper.precheck(log, &mut mgs_clients, update).await { diff --git a/nexus/mgs-updates/src/rot_bootloader_updater.rs b/nexus/mgs-updates/src/rot_bootloader_updater.rs index 6d9e4edf560..bf3a9a61eb4 100644 --- a/nexus/mgs-updates/src/rot_bootloader_updater.rs +++ b/nexus/mgs-updates/src/rot_bootloader_updater.rs @@ -203,8 +203,8 @@ impl SpComponentUpdateHelper for ReconfiguratorRotBootloaderUpdater { ) .await?; // If boot info contains any error with the image loaded onto - // stage0_next, we run the risk of bricking the device if this image - // is loaded onto stage0. We return a fatal error. + // stage0_next, the device won't let us load this image onto stage0. + // We return a fatal error. if let Some(e) = stage0next_error { return Err(PostUpdateError::FatalError { error: InlineErrorChain::new(&e).to_string(),