diff --git a/Cargo.lock b/Cargo.lock index f6708987a4d..5914f8fef3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8148,11 +8148,13 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "chrono", "clap", "dropshot", "expectorate", "futures", "gateway-test-utils", + "illumos-utils", "libc", "nexus-config", "nexus-test-interface", @@ -8160,12 +8162,14 @@ dependencies = [ "omicron-dev-lib", "omicron-nexus", "omicron-rpaths", + "omicron-sled-agent", "omicron-test-utils", "omicron-workspace-hack", "oxide-client", "oxide-tokio-rt", "pq-sys", "signal-hook-tokio", + "sled-agent-types", "subprocess", "tokio", "tokio-postgres", @@ -13088,6 +13092,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "chrono", "derive_more 0.99.20", "dropshot", "futures", diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 21a4bc7210c..0e271a6c15f 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -13,20 +13,24 @@ omicron-rpaths.workspace = true [dependencies] anyhow.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true gateway-test-utils.workspace = true +illumos-utils.workspace = true libc.workspace = true nexus-config.workspace = true nexus-test-interface.workspace = true nexus-test-utils = { workspace = true, features = ["omicron-dev"] } omicron-nexus.workspace = true omicron-workspace-hack.workspace = true +omicron-sled-agent.workspace = true oxide-tokio-rt.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" signal-hook-tokio.workspace = true +sled-agent-types.workspace = true tokio.workspace = true toml.workspace = true diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 9fa5ac0fc05..539432a613d 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -11,12 +11,18 @@ use libc::SIGINT; use nexus_config::NexusConfig; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::DiskTest; +use omicron_sled_agent::sim::ConfigHealthMonitor; use signal_hook_tokio::Signals; use std::fs; const DEFAULT_NEXUS_CONFIG: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../nexus/examples/config.toml"); +const DEFAULT_HEALTH_MONITOR_CONFIG: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../../sled-agent/tests/configs/health_monitor_sim.toml" +); + fn main() -> anyhow::Result<()> { oxide_tokio_rt::run(async { let args = OmicronDevApp::parse(); @@ -57,6 +63,9 @@ struct RunAllArgs { /// Override the nexus configuration file. #[clap(long, default_value = DEFAULT_NEXUS_CONFIG)] nexus_config: Utf8PathBuf, + /// Override the sled agent health monitor configuration file. + #[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] + health_monitor_config: Utf8PathBuf, } impl RunAllArgs { @@ -87,10 +96,23 @@ impl RunAllArgs { .set_port(p); } + let health_monitor_config_str = + fs::read_to_string(&self.health_monitor_config)?; + let sled_agent_health_monitor: ConfigHealthMonitor = + toml::from_str(&health_monitor_config_str).context(format!( + "parsing config: {}", + self.health_monitor_config.as_str() + ))?; + println!("omicron-dev: setting up all services ... 
"); let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< omicron_nexus::Server, - >(&mut config, 0, self.gateway_config.clone()) + >( + &mut config, + 0, + self.gateway_config.clone(), + sled_agent_health_monitor, + ) .await .context("error setting up services")?; diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 17b68412379..d14fbe6819c 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -199,8 +199,8 @@ impl From for SvcState { #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running pub struct SvcInMaintenance { - fmri: String, - zone: String, + pub fmri: String, + pub zone: String, } impl SvcInMaintenance { diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 915cfa58af3..cdb38d53241 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -980,6 +980,12 @@ mod test { None, sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, + // For now we disable the health monitor, we can change this preference + // later if necessary. 
+ sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let agent = diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index a57bf139a9b..cb38253185f 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -7,6 +7,7 @@ use crate::ControlPlaneStarter; use crate::ControlPlaneTestContextSledAgent; use crate::starter::PopulateCrdb; +use crate::starter::SledAgentOptions; use crate::starter::setup_with_config_impl; #[cfg(feature = "omicron-dev")] use anyhow::Context; @@ -85,9 +86,15 @@ impl<'a> ControlPlaneBuilder<'a> { setup_with_config_impl( starter, PopulateCrdb::FromEnvironmentSeed, - sim::SimMode::Explicit, + SledAgentOptions { + sim_mode: sim::SimMode::Explicit, + extra_sled_agents: self.nextra_sled_agents, + sled_agent_health_monitor: sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, + }, self.tls_cert, - self.nextra_sled_agents, DEFAULT_SP_SIM_CONFIG.into(), false, ) @@ -361,6 +368,7 @@ pub async fn omicron_dev_setup_with_config( config: &mut NexusConfig, extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, + sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { let starter = ControlPlaneStarter::::new("omicron-dev", config); @@ -383,9 +391,12 @@ pub async fn omicron_dev_setup_with_config( Ok(setup_with_config_impl( starter, PopulateCrdb::FromSeed { input_tar: seed_tar }, - sim::SimMode::Auto, + SledAgentOptions { + sim_mode: sim::SimMode::Auto, + extra_sled_agents, + sled_agent_health_monitor, + }, None, - extra_sled_agents, gateway_config_file, true, ) diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 696a40b8e88..27281b2affc 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -880,6 +880,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { 
let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -896,6 +897,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1000,6 +1002,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -1016,6 +1019,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1534,15 +1538,26 @@ impl RackInitRequestBuilder { } } +#[derive(Debug, Clone)] +pub(crate) struct SledAgentOptions { + pub sim_mode: sim::SimMode, + pub extra_sled_agents: u16, + pub sled_agent_health_monitor: sim::ConfigHealthMonitor, +} + pub(crate) async fn setup_with_config_impl( mut starter: ControlPlaneStarter<'_, N>, populate: PopulateCrdb, - sim_mode: sim::SimMode, + sled_agent_opts: SledAgentOptions, initial_cert: Option, - extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, second_nexus: bool, ) -> ControlPlaneTestContext { + let SledAgentOptions { + sim_mode, + extra_sled_agents, + sled_agent_health_monitor, + } = sled_agent_opts; const STEP_TIMEOUT: Duration = Duration::from_secs(600); // All setups will start with CRDB and clickhouse @@ -1705,6 +1720,7 @@ pub(crate) async fn setup_with_config_impl( // The first and second sled agents have special UUIDs, and any extra ones // after that are random. 
+ let health_monitor = sled_agent_health_monitor.clone(); starter .init_with_steps( vec![( @@ -1715,6 +1731,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT_UUID.parse().unwrap(), 0, sim_mode, + health_monitor, ) .boxed() }), @@ -1723,6 +1740,7 @@ pub(crate) async fn setup_with_config_impl( ) .await; + let health_monitor = sled_agent_health_monitor.clone(); if extra_sled_agents > 0 { starter .init_with_steps( @@ -1734,6 +1752,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT2_UUID.parse().unwrap(), 1, sim_mode, + health_monitor, ) .boxed() }), @@ -1743,7 +1762,9 @@ pub(crate) async fn setup_with_config_impl( .await; } + let health_monitor = sled_agent_health_monitor.clone(); for index in 1..extra_sled_agents { + let health_monitor = health_monitor.clone(); starter .init_with_steps( vec![( @@ -1754,6 +1775,7 @@ pub(crate) async fn setup_with_config_impl( SledUuid::new_v4(), index.checked_add(1).unwrap(), sim_mode, + health_monitor.clone(), ) .boxed() }), @@ -1839,6 +1861,7 @@ pub(crate) enum PopulateCrdb { /// /// Note: you should probably use the `extra_sled_agents` macro parameter on /// `nexus_test` instead! +#[allow(clippy::too_many_arguments)] pub async fn start_sled_agent( log: Logger, nexus_address: SocketAddr, @@ -1847,6 +1870,7 @@ pub async fn start_sled_agent( update_directory: &Utf8Path, sim_mode: sim::SimMode, simulated_upstairs: &Arc, + health_monitor: sim::ConfigHealthMonitor, ) -> Result { // Generate a baseboard serial number that matches the SP configuration // (SimGimlet00, SimGimlet01, etc.) 
so that inventory can link sled agents @@ -1861,6 +1885,7 @@ pub async fn start_sled_agent( sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, Some(baseboard_serial), + health_monitor, ); start_sled_agent_with_config(log, &config, sled_index, simulated_upstairs) .await diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 70503aa0c1a..fb5373b4407 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1160,6 +1160,10 @@ async fn test_instance_migration_compatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; @@ -1349,6 +1353,10 @@ async fn test_instance_migration_incompatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let turin_sled_id = config.id; @@ -1426,6 +1434,10 @@ async fn test_instance_migration_unknown_sled_type( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::Unknown, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; @@ -7125,6 +7137,10 @@ async fn test_can_start_instance_with_cpu_platform( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ); let new_sled_id = config.id; diff --git 
a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 53ae7d92394..afc325ce8a0 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -79,6 +79,10 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { &update_directory, sim::SimMode::Explicit, &cptestctx.first_sled_agent().simulated_upstairs, + sim::ConfigHealthMonitor { + enabled: false, + sim_health_checks: None, + }, ) .await .unwrap(), diff --git a/sled-agent/health-monitor/Cargo.toml b/sled-agent/health-monitor/Cargo.toml index 0e034220fd0..319eeb7a474 100644 --- a/sled-agent/health-monitor/Cargo.toml +++ b/sled-agent/health-monitor/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true +chrono.workspace = true derive_more.workspace = true dropshot.workspace = true futures.workspace = true diff --git a/sled-agent/health-monitor/src/handle.rs b/sled-agent/health-monitor/src/handle.rs index 030bdbec630..ac2259f4a72 100644 --- a/sled-agent/health-monitor/src/handle.rs +++ b/sled-agent/health-monitor/src/handle.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use crate::health_checks::poll_smf_services_in_maintenance; +use crate::health_checks::sim_smf_services_in_maintenance; use illumos_utils::svcs::SvcsInMaintenanceResult; use sled_agent_types::inventory::HealthMonitorInventory; @@ -21,14 +22,6 @@ pub struct HealthMonitorHandle { } impl HealthMonitorHandle { - /// Returns a `HealthMonitorHandle` that doesn't monitor health and always - /// reports no problems - pub fn stub() -> Self { - let (_tx, smf_services_in_maintenance_rx) = - watch::channel(Ok(SvcsInMaintenanceResult::new())); - Self { smf_services_in_maintenance_rx } - } - pub fn spawn(log: Logger) -> Self { // Spawn a task to retrieve information about services in maintenance info!(log, "Starting SMF service health poller"); @@ -55,4 +48,29 @@ impl HealthMonitorHandle { .clone(), } } + + /// Returns a `HealthMonitorHandle` that doesn't monitor health and always + /// reports no problems unless a `HealthMonitorInventory` with simulated + /// data is passed. + pub fn spawn_sim( + sim_health_checks: Option, + ) -> Self { + let (smf_services_in_maintenance_tx, smf_services_in_maintenance_rx) = + watch::channel(Ok(SvcsInMaintenanceResult::new())); + + if let Some(results) = sim_health_checks { + let HealthMonitorInventory { smf_services_in_maintenance } = + results; + + tokio::spawn(async move { + sim_smf_services_in_maintenance( + smf_services_in_maintenance, + smf_services_in_maintenance_tx, + ) + .await + }); + }; + + Self { smf_services_in_maintenance_rx } + } } diff --git a/sled-agent/health-monitor/src/health_checks.rs b/sled-agent/health-monitor/src/health_checks.rs index ec2611ad9ea..c4df690586a 100644 --- a/sled-agent/health-monitor/src/health_checks.rs +++ b/sled-agent/health-monitor/src/health_checks.rs @@ -43,3 +43,14 @@ pub(crate) async fn poll_smf_services_in_maintenance( }; } } + +pub(crate) async fn sim_smf_services_in_maintenance( + sim_smf_services_in_maintenance: Result, + smf_services_in_maintenance_tx: watch::Sender< + Result, + >, +) { + 
smf_services_in_maintenance_tx.send_modify(|status| { + *status = sim_smf_services_in_maintenance; + }) +} diff --git a/sled-agent/health-monitor/src/lib.rs b/sled-agent/health-monitor/src/lib.rs index fde8d5f55f1..25d313d5d97 100644 --- a/sled-agent/health-monitor/src/lib.rs +++ b/sled-agent/health-monitor/src/lib.rs @@ -4,10 +4,9 @@ //! Machinery for sled-agent to run periodic health checks. //! -//! The initial entry point to this system is [`HealthMonitorHandle::stub()`]. -//! This should be called early in sled-agent startup. Later during the -//! sled-agent start process, sled-agent should spawn each of the polling tasks -//! found in the health_checks module. +//! The initial entry point to this system is [`HealthMonitorHandle::spawn()`]. +//! During the sled-agent start process, sled-agent will spawn each of the +//! polling tasks found in the health_checks module. //! //! The health checks we run are: //! diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index e18ab69c213..482297be97a 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLoggingLevel; use omicron_common::api::internal::nexus::Certificate; use omicron_common::cmd::CmdError; use omicron_common::cmd::fatal; +use omicron_sled_agent::sim::ConfigHealthMonitor; use omicron_sled_agent::sim::RssArgs; use omicron_sled_agent::sim::{ Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, ZpoolConfig, @@ -22,9 +23,15 @@ use omicron_sled_agent::sim::{ }; use omicron_uuid_kinds::SledUuid; use sled_hardware_types::{Baseboard, SledCpuFamily}; +use std::fs; use std::net::SocketAddr; use std::net::SocketAddrV6; +pub const DEFAULT_HEALTH_MONITOR_CONFIG: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/configs/health_monitor_sim.toml" +); + fn parse_sim_mode(src: &str) -> Result { match src { "auto" => Ok(SimMode::Auto), @@ -56,6 +63,10 @@ struct Args { #[clap(action)] 
nexus_lockstep_port: u16, + /// Override the sled agent health monitor configuration file. + #[clap(long, default_value = DEFAULT_HEALTH_MONITOR_CONFIG)] + health_monitor_config: Utf8PathBuf, + #[clap(long, name = "NEXUS_EXTERNAL_IP:PORT", action)] /// If specified, when the simulated sled agent initializes the rack, it /// will record the Nexus service running with the specified external IP @@ -97,6 +108,18 @@ fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); + let health_monitor_config_str = + fs::read_to_string(&args.health_monitor_config) + .context(format!("reading {:?}", &args.health_monitor_config)) + .map_err(CmdError::Failure)?; + let health_monitor: ConfigHealthMonitor = + toml::from_str(&health_monitor_config_str) + .context(format!( + "parsing config: {}", + args.health_monitor_config.as_str() + )) + .map_err(CmdError::Failure)?; + let tmp = camino_tempfile::tempdir() .map_err(|e| CmdError::Failure(anyhow!(e)))?; let config = Config { @@ -127,6 +150,7 @@ async fn do_run() -> Result<(), CmdError> { Some(tmp.path()), ZpoolConfig::TenVirtualU2s, SledCpuFamily::AmdMilan, + health_monitor, ) }; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e42d3c1b0dd..e177f8e210b 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -284,7 +284,7 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 744ebb1bea3..37de1cb7823 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -10,6 +10,7 @@ use dropshot::ConfigDropshot; use omicron_uuid_kinds::SledUuid; use serde::Deserialize; 
use serde::Serialize; +use sled_agent_types::inventory::HealthMonitorInventory; pub use sled_hardware_types::{Baseboard, SledCpuFamily}; use sp_sim::FAKE_GIMLET_MODEL; use std::net::Ipv6Addr; @@ -66,6 +67,16 @@ pub struct ConfigHardware { pub baseboard: Baseboard, } +/// Configuration for the simulated health monitor. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConfigHealthMonitor { + /// Whether to run the real health monitor instead of the simulated one. + /// If true, any `sim_health_checks` results are ignored. + pub enabled: bool, + /// Simulated health check results, reported only when `enabled` is false + pub sim_health_checks: Option, +} + /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct Config { @@ -83,6 +94,8 @@ pub struct Config { pub updates: ConfigUpdates, /// configuration to emulate the sled agent's hardware pub hardware: ConfigHardware, + /// configuration for the sled agent's health monitor + pub health_monitor: ConfigHealthMonitor, } pub enum ZpoolConfig { @@ -101,6 +114,7 @@ impl Config { update_directory: Option<&Utf8Path>, zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, + health_monitor: ConfigHealthMonitor, ) -> Config { Self::for_testing_with_baseboard( id, @@ -110,9 +124,11 @@ impl Config { zpool_config, cpu_family, None, + health_monitor, ) } + #[allow(clippy::too_many_arguments)] pub fn for_testing_with_baseboard( id: SledUuid, sim_mode: SimMode, @@ -121,6 +137,7 @@ impl Config { zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, baseboard_serial: Option, + health_monitor: ConfigHealthMonitor, ) -> Config { // This IP range is guaranteed by RFC 6666 to discard traffic. 
// For tests that don't use a Nexus, we use this address to simulate a @@ -173,6 +190,7 @@ impl Config { revision: 3, }, }, + health_monitor, } } } diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index ef7915293e8..6662ee2c1ba 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -20,8 +20,9 @@ mod upstairs; pub use crate::updates::ConfigUpdates; pub use config::{ - Baseboard, Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, - TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, ZpoolConfig, + Baseboard, Config, ConfigHardware, ConfigHealthMonitor, ConfigStorage, + ConfigZpool, SimMode, TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, + ZpoolConfig, }; pub use server::{RssArgs, Server, run_standalone_server}; pub use sled_agent::SledAgent; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index bb6e9c028e8..ee7b0e0a0e0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -12,9 +12,10 @@ use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::artifact_store::ArtifactStore; +use crate::long_running_tasks::spawn_health_monitor_tasks; use crate::nexus::NexusClient; -use crate::sim::SimulatedUpstairs; use crate::sim::simulatable::Simulatable; +use crate::sim::{ConfigHealthMonitor, SimulatedUpstairs}; use crate::support_bundle::storage::SupportBundleQueryType; use crate::updates::UpdateManager; use anyhow::Context; @@ -168,7 +169,14 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + let ConfigHealthMonitor { enabled, sim_health_checks } = + config.health_monitor.clone(); + + let health_monitor = if enabled { + spawn_health_monitor_tasks(&log).await + } else { + HealthMonitorHandle::spawn_sim(sim_health_checks) + }; Arc::new(SledAgent { id, diff --git a/sled-agent/tests/configs/health_monitor_sim.toml b/sled-agent/tests/configs/health_monitor_sim.toml 
new file mode 100644 index 00000000000..0ef10f01b28 --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim.toml @@ -0,0 +1,7 @@ +# +# Sled agent health monitor: example config file +# +# With this configuration, all health checks will appear as successful. +# + +enabled = false \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_enabled.toml b/sled-agent/tests/configs/health_monitor_sim_enabled.toml new file mode 100644 index 00000000000..6926edbde82 --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim_enabled.toml @@ -0,0 +1,8 @@ +# +# Sled agent health monitor: example config file +# +# With this configuration, the actual health monitor will be running, performing +# the actual health checks against the machine this is running on. +# + +enabled = true \ No newline at end of file diff --git a/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml new file mode 100644 index 00000000000..2a2dacfcbd1 --- /dev/null +++ b/sled-agent/tests/configs/health_monitor_sim_unhealthy.toml @@ -0,0 +1,19 @@ +# +# Sled agent health monitor: example config file +# +# With this configuration, we have injected some dummy failed health check +# results. +# + +enabled = false + +[sim_health_checks.smf_services_in_maintenance.ok] +services = [ + { fmri = "svc:/system/fake-service-1:default", zone = "oxz_fake_zone_1" }, + { fmri = "svc:/network/fake-service-2:default", zone = "oxz_fake_zone_2" }, + { fmri = "svc:/application/fake-service-3:default", zone = "global" } +] + +errors = [] + +time_of_status = "2026-04-12T23:20:50.52Z" \ No newline at end of file