diff --git a/Cargo.lock b/Cargo.lock index 1c4ddecd6e7..92d2a5bd2e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5205,6 +5205,7 @@ dependencies = [ "http", "ipnetwork", "itertools 0.14.0", + "key-manager-types", "libc", "macaddr", "nix 0.30.1", @@ -5232,6 +5233,7 @@ dependencies = [ "toml 0.8.23", "uuid", "whoami", + "zfs-atomic-change-key", "zone 0.3.1", ] @@ -5794,6 +5796,7 @@ version = "0.1.0" dependencies = [ "async-trait", "hkdf", + "key-manager-types", "omicron-common", "omicron-workspace-hack", "secrecy 0.10.3", @@ -5804,6 +5807,14 @@ dependencies = [ "zeroize", ] +[[package]] +name = "key-manager-types" +version = "0.1.0" +dependencies = [ + "omicron-workspace-hack", + "secrecy 0.10.3", +] + [[package]] name = "knuffel" version = "3.2.0" @@ -13124,6 +13135,7 @@ dependencies = [ "illumos-utils", "installinator-common", "key-manager", + "key-manager-types", "ntp-admin-client", "omicron-common", "omicron-test-utils", @@ -13134,6 +13146,7 @@ dependencies = [ "regex", "schemars 0.8.22", "scopeguard", + "secrecy 0.10.3", "serde", "serde_json", "sha2", @@ -13149,6 +13162,7 @@ dependencies = [ "test-strategy", "thiserror 2.0.18", "tokio", + "trust-quorum-types", "tufaceous-artifact", "xshell", "zone 0.3.1", @@ -17236,6 +17250,19 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "zfs-atomic-change-key" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/zfs-atomic-change-key#0324c273eb8acc837eff4ffaea17cd773a76fd93" +dependencies = [ + "hex", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "zeroize", +] + [[package]] name = "zfs-test-harness" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 200f38df400..613a080774e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,6 +74,7 @@ members = [ "internal-dns/types/versions", "ipcc", "key-manager", + "key-manager/types", "live-tests", "live-tests/macros", "nexus", @@ -247,6 +248,7 @@ default-members = [ "internal-dns/types/versions", "ipcc", "key-manager", + "key-manager/types", "live-tests", "live-tests/macros", "nexus", @@ -553,6 +555,7 @@ ipnetwork = { version = "0.21", features = ["schemars", "serde"] } ispf = { git = "https://github.com/oxidecomputer/ispf" } jiff = "0.2.15" key-manager = { path = "key-manager" } +key-manager-types = { path = "key-manager/types" } kstat-rs = "0.2.4" libc = "0.2.174" libipcc = { git = "https://github.com/oxidecomputer/ipcc-rs", rev = "524eb8f125003dff50b9703900c6b323f00f9e1b" } @@ -829,6 +832,7 @@ wicketd-client = { path = "clients/wicketd-client" } xshell = "0.2.7" zerocopy = "0.8.26" zeroize = { version = "1.8.1", features = ["zeroize_derive", "std"] } +zfs-atomic-change-key = { git = "https://github.com/oxidecomputer/zfs-atomic-change-key" } zfs-test-harness = { path = "sled-storage/zfs-test-harness" } zip = { version = "4.2.0", default-features = false, features = ["deflate","bzip2"] } zone = { version = "0.3.1", default-features = false, features = ["async"] } diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index 2c1908d761a..332c4b74850 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -24,6 +24,7 @@ futures.workspace = true http.workspace = true ipnetwork.workspace = true itertools.workspace = true +key-manager-types.workspace = true libc.workspace = true macaddr.workspace = true nix.workspace = true @@ -44,6 +45,7 @@ tokio.workspace = true uuid.workspace = true whoami.workspace = true zone.workspace = true +zfs-atomic-change-key.workspace = true tofino.workspace = true rustix.workspace = true diff --git 
a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index 9baede417ef..59dba8a65c5 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -206,6 +206,42 @@ pub struct GetValueError { #[error("Failed to list snapshots: {0}")] pub struct ListSnapshotsError(#[from] crate::ExecutionError); +/// Error returned by [`Zfs::change_key`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to change encryption key for dataset '{name}'")] +pub struct ChangeKeyError { + pub name: String, + #[source] + pub err: anyhow::Error, +} + +/// Error returned by [`Zfs::load_key`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to load encryption key for dataset '{name}'")] +pub struct LoadKeyError { + pub name: String, + #[source] + pub err: crate::ExecutionError, +} + +/// Error returned by [`Zfs::dataset_exists`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to check if dataset '{name}' exists")] +pub struct DatasetExistsError { + pub name: String, + #[source] + pub err: crate::ExecutionError, +} + +/// Error returned by [`Zfs::unload_key`]. +#[derive(thiserror::Error, Debug)] +#[error("Failed to unload encryption key for dataset '{name}'")] +pub struct UnloadKeyError { + pub name: String, + #[source] + pub err: crate::ExecutionError, +} + #[derive(Debug, thiserror::Error)] #[error( "Failed to create snapshot '{snap_name}' from filesystem '{filesystem}': {err}" @@ -523,11 +559,19 @@ pub struct DatasetProperties { /// string so that unexpected compression formats don't prevent inventory /// from being collected. pub compression: String, + /// The encryption key epoch for this dataset. + /// + /// Only present on encrypted datasets that directly hold a key (e.g., + /// crypt datasets on U.2s). Not present on datasets that inherit + /// encryption from a parent. + pub epoch: Option<u64>, } impl DatasetProperties { - const ZFS_GET_PROPS: &'static str = - "oxide:uuid,name,mounted,avail,used,quota,reservation,compression"; + const ZFS_GET_PROPS: &'static str = concat!( + "oxide:uuid,oxide:epoch,", + "name,mounted,avail,used,quota,reservation,compression", + ); } impl TryFrom<&DatasetProperties> for SharedDatasetConfig { @@ -648,6 +692,18 @@ impl DatasetProperties { .get("compression") .map(|(prop, _source)| prop.to_string()) .ok_or_else(|| anyhow!("Missing 'compression'"))?; + // The epoch property is only present on encrypted datasets. + // Like oxide:uuid, we ignore inherited values. + let epoch = props + .get("oxide:epoch") + .filter(|(prop, source)| { + !source.starts_with("inherited") && *prop != "-" + }) + .map(|(prop, _source)| { + prop.parse::<u64>() + .context("Failed to parse 'oxide:epoch'") + }) + .transpose()?; Ok(DatasetProperties { id, @@ -658,6 +714,7 @@ quota, reservation, compression, + epoch, }) }) .collect::<Result<Vec<_>, _>>() @@ -1197,7 +1254,7 @@ impl Zfs { name: &str, mountpoint: &Mountpoint, ) -> Result<(), EnsureDatasetErrorRaw> { - let mount_info = Self::dataset_exists(name, mountpoint).await?; + let mount_info = Self::dataset_mount_info(name, mountpoint).await?; if !mount_info.exists { return Err(EnsureDatasetErrorRaw::DoesNotExist); } @@ -1246,7 +1303,7 @@ impl Zfs { additional_options, }: DatasetEnsureArgs<'_>, ) -> Result<(), EnsureDatasetErrorRaw> { - let dataset_info = Self::dataset_exists(name, &mountpoint).await?; + let dataset_info = Self::dataset_mount_info(name, &mountpoint).await?; // Non-zoned datasets with an explicit mountpoint and the // "canmount=on" property should be mounted within the global zone.
@@ -1365,9 +1422,29 @@ impl Zfs { Ok(()) } - // Return (true, mounted) if the dataset exists, (false, false) otherwise, - // where mounted is if the dataset is mounted. - async fn dataset_exists( + /// Check if a ZFS dataset exists. + pub async fn dataset_exists( + name: &str, + ) -> Result<bool, DatasetExistsError> { + let mut cmd = Command::new(ZFS); + cmd.args(&["list", "-H", name]); + match execute_async(&mut cmd).await { + Ok(_) => Ok(true), + Err(crate::ExecutionError::CommandFailure(ref info)) + if info.stderr.contains("does not exist") => + { + Ok(false) + } + Err(err) => Err(DatasetExistsError { name: name.to_string(), err }), + } + } + + /// Get mount info for a dataset, validating its mountpoint. + /// + /// Returns (exists=true, mounted) if the dataset exists with the expected + /// mountpoint, (exists=false, mounted=false) if it doesn't exist. + /// Returns an error if the dataset exists but has an unexpected mountpoint. + async fn dataset_mount_info( name: &str, mountpoint: &Mountpoint, ) -> Result<DatasetMountInfo, EnsureDatasetErrorRaw> { @@ -1523,6 +1600,62 @@ impl Zfs { }) } + /// Atomically change the encryption key and set the oxide:epoch property. + /// + /// This operation is used for ZFS key rotation when a new Trust Quorum + /// epoch is committed. + pub async fn change_key( + dataset: &str, + key: &key_manager_types::VersionedAes256GcmDiskEncryptionKey, + ) -> Result<(), ChangeKeyError> { + // FIXME: Replace the use of `zfs_atomic_change_key` with a native + // invocation of `zfs change-key` using the `-o oxide:epoch` option to + // set the epoch. At time of writing, the `zfs change-key` command does + // not support setting user properties inline, but a patch is pending to + // add this feature. + + let ds = zfs_atomic_change_key::Dataset::new(dataset).map_err(|e| { + ChangeKeyError { + name: dataset.to_string(), + err: anyhow::anyhow!("invalid dataset name: {e}"), + } + })?; + + ds.change_key(zfs_atomic_change_key::Key::hex(*key.expose_secret())) + .property("oxide:epoch", key.epoch().to_string()) + .await + .map_err(|e| ChangeKeyError { + name: dataset.to_string(), + err: anyhow::Error::from(e), + }) + } + + /// Load the encryption key for an encrypted ZFS dataset. + /// + /// This makes the dataset accessible for mounting. The key must have + /// previously been written to the dataset's keylocation. + pub async fn load_key(name: &str) -> Result<(), LoadKeyError> { + let mut cmd = Command::new(PFEXEC); + cmd.args(&[ZFS, "load-key", name]); + execute_async(&mut cmd) + .await + .map(|_| ()) + .map_err(|err| LoadKeyError { name: name.to_string(), err }) + } + + /// Unload the encryption key for an encrypted ZFS dataset. + /// + /// This is used for cleanup after failed key operations or during + /// trial decryption recovery. The dataset must not be mounted.
+ pub async fn unload_key(name: &str) -> Result<(), UnloadKeyError> { + let mut cmd = Command::new(PFEXEC); + cmd.args(&[ZFS, "unload-key", name]); + execute_async(&mut cmd) + .await + .map(|_| ()) + .map_err(|err| UnloadKeyError { name: name.to_string(), err }) + } + /// Calls "zfs get" to acquire multiple values /// /// - `names`: The properties being acquired diff --git a/key-manager/Cargo.toml b/key-manager/Cargo.toml index a38be5beff9..c0107576d27 100644 --- a/key-manager/Cargo.toml +++ b/key-manager/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] async-trait.workspace = true hkdf.workspace = true +key-manager-types.workspace = true omicron-common.workspace = true secrecy.workspace = true sha3.workspace = true diff --git a/key-manager/src/lib.rs b/key-manager/src/lib.rs index 029a5d42b51..4868cde4abb 100644 --- a/key-manager/src/lib.rs +++ b/key-manager/src/lib.rs @@ -9,7 +9,9 @@ use std::fmt::Debug; use async_trait::async_trait; use hkdf::Hkdf; -use secrecy::{ExposeSecret, ExposeSecretMut, SecretBox}; +use key_manager_types::Aes256GcmDiskEncryptionKey; +pub use key_manager_types::VersionedAes256GcmDiskEncryptionKey; +use secrecy::{ExposeSecret, SecretBox}; use sha3::Sha3_256; use slog::{Logger, o, warn}; use tokio::sync::{mpsc, oneshot}; @@ -52,27 +54,6 @@ pub enum Error { SecretRetrieval(#[from] SecretRetrieverError), } -/// Derived Disk Encryption key -#[derive(Default)] -struct Aes256GcmDiskEncryptionKey(SecretBox<[u8; 32]>); - -/// A Disk encryption key for a given epoch to be used with ZFS datasets for -/// U.2 devices -pub struct VersionedAes256GcmDiskEncryptionKey { - epoch: u64, - key: Aes256GcmDiskEncryptionKey, -} - -impl VersionedAes256GcmDiskEncryptionKey { - pub fn epoch(&self) -> u64 { - self.epoch - } - - pub fn expose_secret(&self) -> &[u8; 32] { - &self.key.0.expose_secret() - } -} - /// A request sent from a [`StorageKeyRequester`] to the [`KeyManager`]. 
enum StorageKeyRequest { GetKey { @@ -255,11 +236,11 @@ impl KeyManager { disk_id.model.as_bytes(), disk_id.serial.as_bytes(), ], - key.0.expose_secret_mut(), + key.expose_secret_mut(), ) .unwrap(); - Ok(VersionedAes256GcmDiskEncryptionKey { epoch, key }) + Ok(VersionedAes256GcmDiskEncryptionKey::new(epoch, key)) } /// Return the epochs for all secrets which are loaded @@ -309,6 +290,9 @@ pub enum SecretRetrieverError { #[error("Trust quorum error: {0}")] TrustQuorum(String), + + #[error("Timeout retrieving secret")] + Timeout, } /// A mechanism for retrieving a secrets to use as input key material to HKDF- @@ -405,7 +389,7 @@ mod tests { }; let epoch = 0; let key = km.disk_encryption_key(epoch, &disk_id).await.unwrap(); - assert_eq!(key.epoch, epoch); + assert_eq!(key.epoch(), epoch); // Key derivation is deterministic based on disk_id and loaded secrets let key2 = km.disk_encryption_key(epoch, &disk_id).await.unwrap(); @@ -436,8 +420,8 @@ mod tests { let epoch = 0; let key1 = km.disk_encryption_key(epoch, &id_1).await.unwrap(); let key2 = km.disk_encryption_key(epoch, &id_2).await.unwrap(); - assert_eq!(key1.epoch, epoch); - assert_eq!(key2.epoch, epoch); + assert_eq!(key1.epoch(), epoch); + assert_eq!(key2.epoch(), epoch); assert_ne!(key1.expose_secret(), key2.expose_secret()); } diff --git a/key-manager/types/Cargo.toml b/key-manager/types/Cargo.toml new file mode 100644 index 00000000000..1c4b16c28ec --- /dev/null +++ b/key-manager/types/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "key-manager-types" +version = "0.1.0" +edition.workspace = true +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +secrecy.workspace = true +omicron-workspace-hack.workspace = true diff --git a/key-manager/types/src/lib.rs b/key-manager/types/src/lib.rs new file mode 100644 index 00000000000..c74636db410 --- /dev/null +++ b/key-manager/types/src/lib.rs @@ -0,0 +1,45 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for disk encryption keys used by the key-manager crate. + +use secrecy::{ExposeSecret, ExposeSecretMut, SecretBox}; + +/// Derived Disk Encryption key +#[derive(Debug, Default)] +pub struct Aes256GcmDiskEncryptionKey(SecretBox<[u8; 32]>); + +impl Aes256GcmDiskEncryptionKey { + /// Expose the secret key bytes mutably for writing. + /// + /// This is intended for use by the key-manager crate during key derivation. + pub fn expose_secret_mut(&mut self) -> &mut [u8; 32] { + self.0.expose_secret_mut() + } +} + +/// A Disk encryption key for a given epoch to be used with ZFS datasets for +/// U.2 devices +#[derive(Debug)] +pub struct VersionedAes256GcmDiskEncryptionKey { + epoch: u64, + key: Aes256GcmDiskEncryptionKey, +} + +impl VersionedAes256GcmDiskEncryptionKey { + /// Create a new versioned disk encryption key. + /// + /// This is intended for use by the key-manager crate during key derivation. 
+ pub fn new(epoch: u64, key: Aes256GcmDiskEncryptionKey) -> Self { + Self { epoch, key } + } + + pub fn epoch(&self) -> u64 { + self.epoch + } + + pub fn expose_secret(&self) -> &[u8; 32] { + &self.key.0.expose_secret() + } +} diff --git a/sled-agent/config-reconciler/Cargo.toml b/sled-agent/config-reconciler/Cargo.toml index 75695b7069e..f4f3360682b 100644 --- a/sled-agent/config-reconciler/Cargo.toml +++ b/sled-agent/config-reconciler/Cargo.toml @@ -25,12 +25,14 @@ iddqd.workspace = true illumos-utils.workspace = true installinator-common.workspace = true key-manager.workspace = true +key-manager-types.workspace = true sled-agent-types-versions.workspace = true ntp-admin-client.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true rand.workspace = true regex.workspace = true +secrecy.workspace = true serde.workspace = true sha2.workspace = true sled-agent-api.workspace = true @@ -42,6 +44,7 @@ slog-error-chain.workspace = true strum.workspace = true thiserror.workspace = true tokio.workspace = true +trust-quorum-types.workspace = true tufaceous-artifact.workspace = true zone.workspace = true diff --git a/sled-agent/config-reconciler/src/dataset_serialization_task.rs b/sled-agent/config-reconciler/src/dataset_serialization_task.rs index 877609ba255..39f760300fb 100644 --- a/sled-agent/config-reconciler/src/dataset_serialization_task.rs +++ b/sled-agent/config-reconciler/src/dataset_serialization_task.rs @@ -26,12 +26,14 @@ use illumos_utils::zfs::DestroyDatasetError; use illumos_utils::zfs::Mountpoint; use illumos_utils::zfs::WhichDatasets; use illumos_utils::zfs::Zfs; +use key_manager_types::VersionedAes256GcmDiskEncryptionKey; use omicron_common::disk::DatasetConfig; use omicron_common::disk::DatasetKind; use omicron_common::disk::DatasetName; use omicron_common::disk::SharedDatasetConfig; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use sled_agent_types::inventory::InventoryDataset; use sled_agent_types::inventory::OrphanedDataset; use sled_storage::config::MountConfig; @@ -160,6 +162,47 @@ pub enum NestedDatasetListError { }, } +/// Error returned when ZFS encryption key rotation fails. +#[derive(Debug, thiserror::Error)] +#[error("failed to rotate encryption key for dataset {dataset}")] +pub struct KeyRotationError { + dataset: String, + #[source] + err: anyhow::Error, +} + +/// Information for rekeying a single dataset. +#[derive(Debug)] +pub struct DatasetRekeyInfo { + /// Full ZFS dataset name (e.g., `oxp_<uuid>/crypt`) + pub dataset_name: String, + /// The new encryption key with its epoch. + pub key: VersionedAes256GcmDiskEncryptionKey, +} + +/// Request to rekey a batch of datasets. +#[derive(Debug, Default)] +pub struct RekeyRequest { + /// Datasets to rekey, keyed by physical disk UUID. + pub disks: BTreeMap<PhysicalDiskUuid, DatasetRekeyInfo>, +} + +/// Result of a batch rekey operation. +#[derive(Debug, Default)] +pub struct RekeyResult { + /// Disks that were successfully rekeyed. + pub succeeded: BTreeSet<PhysicalDiskUuid>, + /// Disks that failed to rekey. + pub failed: BTreeSet<PhysicalDiskUuid>, +} + +impl RekeyResult { + /// Returns true if any rekey operations failed. + pub fn has_failures(&self) -> bool { + !self.failed.is_empty() + } +} + #[derive(Debug)] pub(crate) struct DatasetEnsureResult { pub(crate) config: DatasetConfig, @@ -330,6 +373,18 @@ impl DatasetTaskHandle { .await } + /// Rotate encryption keys for multiple datasets.
+ pub async fn rekey_datasets( + &self, + request: RekeyRequest, + ) -> Result<RekeyResult, DatasetTaskError> { + self.try_send_request(|tx| DatasetTaskRequest::DatasetsRekey { + request, + tx, + }) + .await + } + async fn try_send_request<T, F>( &self, make_request: F, @@ -421,6 +476,9 @@ impl DatasetTask { self.nested_dataset_list(dataset, options, zfs).await, ); } + DatasetTaskRequest::DatasetsRekey { request, tx } => { + _ = tx.0.send(self.datasets_rekey(request, zfs).await); + } } } @@ -1117,6 +1175,90 @@ impl DatasetTask { }) .collect()) } + + /// Rotate encryption keys for multiple datasets. + /// + /// For each request, checks if the dataset is already at the target epoch + /// (idempotent), and if not, rotates the key and sets the epoch property + /// (atomically with the key rotation). + /// + /// This is a batch operation so that the caller can ensure that all + /// datasets are updated together with no intervening operations. It does + /// not guarantee that the operation will succeed atomically across all + /// datasets. + async fn datasets_rekey<T: ZfsImpl>( + &mut self, + request: RekeyRequest, + zfs: &T, + ) -> RekeyResult { + let log = self.log.new(slog::o!("request" => "datasets_rekey")); + let mut succeeded = BTreeSet::new(); + let mut failed = BTreeSet::new(); + + // Batch fetch current epochs for all datasets to check for idempotency + let dataset_names: Vec<String> = + request.disks.values().map(|r| r.dataset_name.clone()).collect(); + let current_epochs: BTreeMap<String, Option<u64>> = match zfs + .get_dataset_properties(&dataset_names, WhichDatasets::SelfOnly) + .await + { + Ok(props) => props.into_iter().map(|p| (p.name, p.epoch)).collect(), + Err(e) => { + // If we can't read properties, log and proceed with all rekeys + warn!( + log, + "Could not read dataset properties, proceeding with rekeys"; + "error" => %e, + ); + BTreeMap::new() + } + }; + + for (disk_id, req) in request.disks { + let new_epoch = req.key.epoch(); + let current_epoch = + current_epochs.get(&req.dataset_name).copied().flatten(); + + // Check current epoch - skip if already at target + if current_epoch == Some(new_epoch) { + info!( + log, + "Dataset already at target epoch, skipping"; + "dataset" => &req.dataset_name, + "epoch" => new_epoch, + ); + succeeded.insert(disk_id); + continue; + } + + info!( + log, + "Rotating encryption key"; + "dataset" => &req.dataset_name, + "current_epoch" => ?current_epoch, + "new_epoch" => new_epoch, + ); + + match zfs.change_key(&req.dataset_name, &req.key).await { + Ok(()) => { + succeeded.insert(disk_id); + } + Err(e) => { + warn!( + log, + "Failed to rotate encryption key"; + "dataset" => &req.dataset_name, + "current_epoch" => ?current_epoch, + "new_epoch" => new_epoch, + "error" => %e, + ); + failed.insert(disk_id); + } + } + } + + RekeyResult { succeeded, failed } + } } #[derive(Debug)] @@ -1162,6 +1304,10 @@ enum DatasetTaskRequest { >, >, }, + DatasetsRekey { + request: RekeyRequest, + tx: DebugIgnore<oneshot::Sender<RekeyResult>>, + }, } #[derive(Debug)] @@ -1239,6 +1385,16 @@ trait ZfsImpl: Send + Sync + 'static { datasets: &[String], which: WhichDatasets, ) -> impl Future<Output = anyhow::Result<Vec<DatasetProperties>>> + Send; + + /// Atomically change the encryption key and set the oxide:epoch property. + /// + /// This is used for ZFS key rotation when a new Trust Quorum epoch is + /// committed.
+ fn change_key( + &self, + dataset: &str, + key: &VersionedAes256GcmDiskEncryptionKey, + ) -> impl Future<Output = Result<(), KeyRotationError>> + Send; } struct RealZfs; @@ -1284,12 +1440,24 @@ impl ZfsImpl for RealZfs { ) -> anyhow::Result<Vec<DatasetProperties>> { Zfs::get_dataset_properties(datasets, which).await } + + async fn change_key( + &self, + dataset: &str, + key: &VersionedAes256GcmDiskEncryptionKey, + ) -> Result<(), KeyRotationError> { + Zfs::change_key(dataset, key).await.map_err(|err| KeyRotationError { + dataset: dataset.to_string(), + err: err.into(), + }) + } } #[cfg(test)] mod tests { use super::*; use crate::CurrentlyManagedZpoolsReceiver; + use anyhow::anyhow; use assert_matches::assert_matches; use illumos_utils::zfs::DestroyDatasetErrorVariant; use omicron_common::api::external::ByteCount; @@ -1371,6 +1539,7 @@ mod tests { .as_ref() .map(|sd| sd.compression.to_string()) .unwrap_or_else(|| "on".to_string()), + epoch: args.encryption_details.as_ref().map(|ed| ed.epoch), }; state.datasets.insert(props.name.clone(), props); Ok(()) } @@ -1471,6 +1640,23 @@ mod tests { .map(|(_, props)| props.clone()) .collect()) } + + async fn change_key( + &self, + dataset: &str, + key: &VersionedAes256GcmDiskEncryptionKey, + ) -> Result<(), KeyRotationError> { + let mut state = self.inner.lock().unwrap(); + + // Verify dataset exists and update its epoch + let props = + state.datasets.get_mut(dataset).ok_or(KeyRotationError { + dataset: dataset.to_string(), + err: anyhow!("dataset does not exist"), + })?; + props.epoch = Some(key.epoch()); + Ok(()) + } } #[derive(Debug, Arbitrary, PartialEq, Eq, PartialOrd, Ord)] @@ -2051,6 +2237,7 @@ mod tests { quota: None, reservation: None, compression: CompressionAlgorithm::On.to_string(), + epoch: None, }, ); } diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs index 4da9addf7ab..a1b93972fd3 100644 --- a/sled-agent/config-reconciler/src/handle.rs +++ b/sled-agent/config-reconciler/src/handle.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::oneshot; use tokio::sync::watch; +use trust_quorum_types::types::Epoch; #[cfg(feature = "testing")] use camino_tempfile::Utf8TempDir; @@ -254,8 +255,11 @@ impl ConfigReconcilerHandle { /// Spawn the primary config reconciliation task. /// /// This method can effectively only be called once, because the caller must - /// supply the `token` returned by `spawn_ledger_task()` when this handle was created. + /// supply the `token` returned by `spawn_ledger_task()` when this handle + /// was created. /// + /// The `committed_epoch_rx` is used to receive notifications when a new + /// Trust Quorum epoch is committed, triggering ZFS key rotation.
pub fn spawn_reconciliation_task< T: SledAgentFacilities, U: SledAgentArtifactStore + Clone, >( &self, sled_agent_facilities: T, sled_agent_artifact_store: U, + committed_epoch_rx: watch::Receiver<Option<Epoch>>, token: ConfigReconcilerSpawnToken, ) { let ConfigReconcilerSpawnToken { @@ -292,6 +297,7 @@ impl ConfigReconcilerHandle { external_disks_tx, former_zone_root_archiver, raw_disks_rx, + committed_epoch_rx, sled_agent_facilities, sled_agent_artifact_store, reconciler_task_log, diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 9344392f2b3..b28980cd882 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -24,25 +24,32 @@ use sled_agent_types::inventory::OmicronSledConfig; use sled_agent_types::inventory::OrphanedDataset; use sled_agent_types::inventory::RemoveMupdateOverrideInventory; use sled_storage::config::MountConfig; +use sled_storage::dataset::CRYPT_DATASET; use sled_storage::dataset::LOCAL_STORAGE_DATASET; use sled_storage::dataset::LOCAL_STORAGE_UNENCRYPTED_DATASET; use sled_storage::dataset::U2_DEBUG_DATASET; use sled_storage::dataset::ZONE_DATASET; use sled_storage::disk::Disk; use slog::Logger; +use slog::debug; +use slog::error; use slog::info; use slog::warn; use std::collections::BTreeMap; +use std::collections::BTreeSet; use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; use std::time::Instant; use tokio::sync::watch; +use trust_quorum_types::types::Epoch; use crate::InternalDisksReceiver; use crate::SledAgentArtifactStore; use crate::TimeSyncConfig; +use crate::dataset_serialization_task::DatasetRekeyInfo; use crate::dataset_serialization_task::DatasetTaskHandle; +use crate::dataset_serialization_task::RekeyRequest; use crate::debug_collector::FormerZoneRootArchiver; use crate::host_phase_2::BootPartitionReconciler; use crate::ledger::CurrentSledConfig; @@ -75,6 +82,7 @@ pub(crate) fn spawn( external_disks_tx: watch::Sender>, former_zone_root_archiver: FormerZoneRootArchiver, raw_disks_rx: RawDisksReceiver, + committed_epoch_rx: watch::Receiver<Option<Epoch>>, sled_agent_facilities: T, sled_agent_artifact_store: U, log: Logger, @@ -101,6 +109,7 @@ pub(crate) fn spawn( datasets, zones, boot_partitions, + committed_epoch_rx, log, } .run(sled_agent_facilities, sled_agent_artifact_store), @@ -305,6 +314,9 @@ struct ReconcilerTask { datasets: OmicronDatasets, zones: OmicronZones, boot_partitions: BootPartitionReconciler, + /// Receiver for committed epoch notifications from trust quorum. + /// When a new epoch is committed, we need to rotate ZFS encryption keys. + committed_epoch_rx: watch::Receiver<Option<Epoch>>, log: Logger, } @@ -408,6 +420,30 @@ impl ReconcilerTask { ); continue; } + + // Cancel-safe per docs on `changed()` + // + // Handle committed epoch changes from trust quorum. When a new + // epoch is committed, we need to rotate ZFS encryption keys for + // all managed U.2 crypt datasets. The rekey operation is + // performed as part of normal reconciliation in do_reconciliation. + result = self.committed_epoch_rx.changed() => { + match result { + Ok(()) => { + info!( + self.log, + "starting reconciliation due to epoch change" + ); + continue; + } + Err(_closed) => { + warn!( + self.log, + "committed_epoch watch channel closed" + ); + } + } + } } } } @@ -547,6 +583,16 @@ impl ReconcilerTask { ) .await; + // Check if any disks need rekeying to the current committed epoch.
+ // We use borrow_and_update() to mark the epoch as seen, so we don't + // trigger another reconciliation for the same epoch change. + let current_epoch = *self.committed_epoch_rx.borrow_and_update(); + let rekey_result = if let Some(epoch) = current_epoch { + self.rekey_for_epoch(epoch).await + } else { + ReconciliationResult::NoRetryNeeded + }; + // Ensure all the datasets we want exist. self.datasets .ensure_datasets_if_needed( @@ -600,12 +646,13 @@ impl ReconcilerTask { } // We'll retry even if there have been no config changes if (a) time - // isn't sync'd yet or (b) any of our disk/dataset/zone attempts failed - // with a retryable error. + // isn't sync'd yet, (b) any of our disk/dataset/zone attempts failed + // with a retryable error, or (c) any rekey operations failed. let result = if !timesync_status.is_synchronized() || self.external_disks.has_retryable_error() || self.zones.has_retryable_error() || self.datasets.has_retryable_error() + || matches!(rekey_result, ReconciliationResult::ShouldRetry) { ReconciliationResult::ShouldRetry } else { @@ -633,6 +680,127 @@ impl ReconcilerTask { result } + + /// Rotate encryption keys for managed U.2 crypt datasets that need rekeying. + /// + /// This is called during reconciliation when the committed epoch has changed. + /// Disks whose cached epoch is less than the target epoch OR whose epoch + /// is unknown (None) are candidates for rekeying. The actual rekey operation + /// has an idempotency check that skips disks already at the target epoch. + /// On success, updates the cached epoch in `external_disks`. + async fn rekey_for_epoch( + &mut self, + target_epoch: Epoch, + ) -> ReconciliationResult { + // Log an error if any disk has an epoch ahead of target (should + // not happen, but guard against it). + for info in self.external_disks.disk_rekey_info() { + if let Some(e) = info.cached_epoch { + if e > target_epoch { + error!( + self.log, + "Disk has epoch ahead of target"; + "disk_id" => %info.disk_id, + "cached_epoch" => %e, + "target_epoch" => %target_epoch, + ); + } + } + } + + // Filter to disks that need rekeying. `None < Some(_)` so disks + // with unknown epoch are included (idempotency check in + // datasets_rekey will skip if already at target). 
+ let disks_needing_rekey: Vec<_> = self + .external_disks + .disk_rekey_info() + .filter(|i| i.cached_epoch < Some(target_epoch)) + .collect(); + + if disks_needing_rekey.is_empty() { + debug!(self.log, "No datasets need rekeying"; "epoch" => %target_epoch); + return ReconciliationResult::NoRetryNeeded; + } + + // Build request, tracking key derivation failures separately + let mut failed = BTreeSet::new(); + let mut request = RekeyRequest::default(); + + for info in disks_needing_rekey { + match self + .key_requester + .get_key(target_epoch.0, info.disk.identity().clone()) + .await + { + Ok(key) => { + let dataset_name = + format!("{}/{}", info.disk.zpool_name(), CRYPT_DATASET); + request.disks.insert( + info.disk_id, + DatasetRekeyInfo { dataset_name, key }, + ); + } + Err(e) => { + error!( + self.log, + "Failed to derive key"; + "disk_id" => %info.disk_id, + "error" => %e, + ); + failed.insert(info.disk_id); + } + } + } + + if request.disks.is_empty() { + warn!( + self.log, + "No rekey requests (all key derivations failed)"; + "epoch" => %target_epoch + ); + return ReconciliationResult::ShouldRetry; + } + + let disk_count = request.disks.len(); + info!( + self.log, + "Rotating encryption keys"; + "count" => disk_count, + "epoch" => %target_epoch, + ); + + // Send request to dataset task + let result = match self.datasets.rekey_datasets(request).await { + Ok(r) => r, + Err(e) => { + error!( + self.log, + "Failed to send rekey request to dataset task"; + "disk_count" => disk_count, + "error" => %e, + ); + return ReconciliationResult::ShouldRetry; + } + }; + + // Update cached epochs for successful rekeys + self.external_disks.apply_rekey_result(&result, target_epoch); + + let has_failures = !failed.is_empty() || result.has_failures(); + + info!( + self.log, + "Key rotation complete"; + "succeeded" => result.succeeded.len(), + "failed" => failed.len() + result.failed.len(), + ); + + if has_failures { + ReconciliationResult::ShouldRetry + } else { + ReconciliationResult::NoRetryNeeded + } + } } enum ReconciliationResult { diff --git a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs index 2cae109c382..223948afcb6 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs @@ -14,7 +14,10 @@ use super::CurrentlyManagedZpools; use crate::dataset_serialization_task::DatasetEnsureError; use crate::dataset_serialization_task::DatasetEnsureResult; +use crate::dataset_serialization_task::DatasetTaskError; use crate::dataset_serialization_task::DatasetTaskHandle; +use crate::dataset_serialization_task::RekeyRequest; +use crate::dataset_serialization_task::RekeyResult; use iddqd::IdOrdItem; use iddqd::IdOrdMap; use iddqd::id_upcast; @@ -24,17 +27,31 @@ use omicron_common::disk::DatasetConfig; use omicron_common::disk::DatasetKind; use omicron_common::disk::DatasetName; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_agent_types::inventory::OmicronZoneConfig; use sled_agent_types::inventory::OrphanedDataset; use sled_storage::config::MountConfig; use sled_storage::dataset::ZONE_DATASET; +use sled_storage::disk::Disk; use slog::Logger; use slog::info; use slog::warn; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::sync::Arc; +use trust_quorum_types::types::Epoch; + +/// Information about a managed disk for 
rekey filtering. +/// +/// Used by `rekey_for_epoch` to determine which disks need key rotation +/// based on their cached epoch. Disks with `cached_epoch < target` or +/// `cached_epoch = None` (unknown) are candidates for rekeying. +pub(super) struct DiskRekeyInfo<'a> { + pub disk: &'a Disk, + pub disk_id: PhysicalDiskUuid, + pub cached_epoch: Option<Epoch>, +} #[derive(Debug, thiserror::Error)] pub(super) enum ZoneDatasetDependencyError { @@ -299,6 +316,14 @@ impl OmicronDatasets { pub(crate) fn orphaned_datasets(&self) -> &IdOrdMap<OrphanedDataset> { &self.orphaned_datasets } + + /// Forward rekey requests to the dataset task. + pub(super) async fn rekey_datasets( + &self, + request: RekeyRequest, + ) -> Result<RekeyResult, DatasetTaskError> { + self.dataset_task.rekey_datasets(request).await + } } #[derive(Debug)] @@ -322,3 +347,26 @@ enum DatasetState { Ensured, FailedToEnsure(Arc<DatasetEnsureError>), } + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset_serialization_task::RekeyResult; + use std::collections::BTreeSet; + + #[test] + fn test_rekey_result_has_failures() { + let mut failed = BTreeSet::new(); + failed.insert(PhysicalDiskUuid::new_v4()); + let result = RekeyResult { succeeded: BTreeSet::new(), failed }; + assert!(result.has_failures()); + + let mut succeeded = BTreeSet::new(); + succeeded.insert(PhysicalDiskUuid::new_v4()); + let result = RekeyResult { succeeded, failed: BTreeSet::new() }; + assert!(!result.has_failures()); + + let result = RekeyResult::default(); + assert!(!result.has_failures()); + } +} diff --git a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs index 6db0a195e7d..2f61021a141 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs @@ -26,6 +26,7 @@ use omicron_uuid_kinds::ZpoolUuid; use rand::distr::{Alphanumeric, SampleString}; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_storage::config::MountConfig; +use sled_storage::dataset::CRYPT_DATASET; use sled_storage::dataset::DatasetError; use sled_storage::dataset::ZONE_DATASET; use sled_storage::disk::Disk; @@ -44,7 +45,10 @@ use std::future::Future; use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::watch; +use trust_quorum_types::types::Epoch; +use super::datasets::DiskRekeyInfo; +use crate::dataset_serialization_task::RekeyResult; use crate::debug_collector::FormerZoneRootArchiver; use crate::disks_common::MaybeUpdatedDisk; use crate::disks_common::update_properties_from_raw_disk; @@ -293,6 +297,33 @@ impl ExternalDisks { Arc::clone(&*self.currently_managed_zpools_tx.borrow()) } + /// Returns rekey info for all managed disks. + pub(super) fn disk_rekey_info( + &self, + ) -> impl Iterator<Item = DiskRekeyInfo<'_>> { + self.disks.iter().filter_map(|disk_state| match &disk_state.state { + DiskState::Managed(disk) => Some(DiskRekeyInfo { + disk, + disk_id: disk_state.config.id, + cached_epoch: disk_state.epoch, + }), + DiskState::FailedToManage(_) => None, + }) + } + + /// Apply the results of a rekey operation, updating cached epochs for succeeded disks.
+ pub(super) fn apply_rekey_result( + &mut self, + result: &RekeyResult, + target_epoch: Epoch, + ) { + for &disk_id in &result.succeeded { + if let Some(mut disk_state) = self.disks.get_mut(&disk_id) { + disk_state.epoch = Some(target_epoch); + } + } + } + fn update_output_watch_channels(&self) { let current_disks = self .disks @@ -568,14 +599,21 @@ impl ExternalDisks { disk_adopter: &T, log: &Logger, ) -> ExternalDiskState { - match current.map(|d| &d.state) { + match current { // If we're already managing this disk, check whether there are any // new properties to update. - Some(DiskState::Managed(disk)) => { - self.update_disk_properties(disk, config, raw_disk, log) + Some(ExternalDiskState { + state: DiskState::Managed(disk), + epoch, + .. + }) => { + self.update_disk_properties(disk, config, raw_disk, *epoch, log) } // If we previously failed to manage this disk, try again. - Some(DiskState::FailedToManage(prev_err)) => { + Some(ExternalDiskState { + state: DiskState::FailedToManage(prev_err), + .. + }) => { info!( log, "Retrying management of disk"; "disk_identity" => ?config.identity, @@ -611,6 +649,7 @@ impl ExternalDisks { disk: &Disk, config: OmicronPhysicalDiskConfig, raw_disk: &RawDisk, + current_epoch: Option<Epoch>, log: &Logger, ) -> ExternalDiskState { // Make sure the incoming config's zpool ID matches our @@ -637,7 +676,7 @@ impl ExternalDisks { MaybeUpdatedDisk::Unchanged => disk.clone(), }; - ExternalDiskState::managed(config, disk) + ExternalDiskState::managed(config, disk, current_epoch) } async fn start_managing_disk( @@ -651,12 +690,13 @@ impl ExternalDisks { .adopt_disk(raw_disk, &self.mount_config, config.pool_id, log) .await { - Ok(disk) => { + Ok(AdoptedDisk { disk, epoch }) => { info!( log, "Successfully started management of disk"; "disk_identity" => ?config.identity, + "epoch" => ?epoch, ); - ExternalDiskState::managed(config, disk) + ExternalDiskState::managed(config, disk, epoch) } Err(err) => { warn!( @@ -674,18 +714,25 @@ struct ExternalDiskState { config: OmicronPhysicalDiskConfig, state: DiskState, + /// The current encryption epoch for this disk's crypt dataset. + /// None if the disk is not yet managed or doesn't have encryption. + epoch: Option<Epoch>, } impl ExternalDiskState { - fn managed(config: OmicronPhysicalDiskConfig, disk: Disk) -> Self { - Self { config, state: DiskState::Managed(disk) } + fn managed( + config: OmicronPhysicalDiskConfig, + disk: Disk, + epoch: Option<Epoch>, + ) -> Self { + Self { config, state: DiskState::Managed(disk), epoch } + } fn failed( config: OmicronPhysicalDiskConfig, err: DiskManagementError, ) -> Self { - Self { config, state: DiskState::FailedToManage(err) } + Self { config, state: DiskState::FailedToManage(err), epoch: None } } } @@ -705,17 +752,31 @@ enum DiskState { FailedToManage(DiskManagementError), } +/// Result of successfully adopting a disk. +struct AdoptedDisk { + /// The adopted disk. + disk: Disk, + /// The current encryption epoch for this disk's crypt dataset. + /// None if the disk doesn't have encryption or the epoch could not be read. + epoch: Option<Epoch>, +} + /// Helper to allow unit tests to run without interacting with the real [`Disk`] /// implementation. In production, the only implementor of this trait is /// [`RealDiskAdopter`]. trait DiskAdopter { + /// Adopt a disk, returning the disk and its current encryption epoch. + /// + /// The epoch is read from the oxide:epoch property on the crypt dataset + /// after successful adoption.
Returns `None` for the epoch if the disk + /// is not encrypted or the epoch could not be read. fn adopt_disk( &self, raw_disk: RawDisk, mount_config: &MountConfig, pool_id: ZpoolUuid, log: &Logger, - ) -> impl Future<Output = Result<Disk, DiskManagementError>> + Send; + ) -> impl Future<Output = Result<AdoptedDisk, DiskManagementError>> + Send; fn archive_and_destroy_former_zone_roots( &self, @@ -737,8 +798,8 @@ impl DiskAdopter for RealDiskAdopter<'_> { mount_config: &MountConfig, pool_id: ZpoolUuid, log: &Logger, - ) -> Result<Disk, DiskManagementError> { - Disk::new( + ) -> Result<AdoptedDisk, DiskManagementError> { + let disk = Disk::new( log, mount_config, raw_disk, @@ -757,7 +818,50 @@ impl DiskAdopter for RealDiskAdopter<'_> { } _ => DiskManagementError::Other(err_string), } - }) + })?; + + // Read the epoch from the crypt dataset after successful adoption. + // This tells us what encryption key the disk is currently using. + let crypt_dataset = format!("{}/{}", disk.zpool_name(), CRYPT_DATASET); + let epoch = match Zfs::get_oxide_value(&crypt_dataset, "epoch").await { + Ok(epoch_str) => match epoch_str.parse::<u64>() { + Ok(epoch_val) => { + debug!( + log, + "Read epoch from adopted disk"; + "zpool" => %disk.zpool_name(), + "epoch" => epoch_val, + ); + // ZFS stores epoch as u64; we wrap it in the Epoch + // newtype for type safety in the reconciler. + Some(Epoch(epoch_val)) + } + Err(e) => { + warn!( + log, + "Failed to parse epoch from adopted disk"; + "zpool" => %disk.zpool_name(), + "epoch_str" => &epoch_str, + "error" => %e, + ); + None + } + }, + Err(e) => { + // This could happen if the disk doesn't have an encrypted + // crypt dataset (shouldn't happen in production) or if there + // was an error reading the property. + warn!( + log, + "Failed to read epoch from adopted disk"; + "zpool" => %disk.zpool_name(), + "error" => %e, + ); + None + } + }; + + Ok(AdoptedDisk { disk, epoch }) } async fn archive_and_destroy_former_zone_roots( @@ -971,7 +1075,7 @@ mod tests { _mount_config: &MountConfig, pool_id: ZpoolUuid, _log: &Logger, - ) -> Result<Disk, DiskManagementError> { + ) -> Result<AdoptedDisk, DiskManagementError> { // ExternalDisks should only adopt U2 disks assert_eq!(raw_disk.variant(), DiskVariant::U2); let disk = Disk::Real(PooledDisk { @@ -988,7 +1092,8 @@ mod tests { firmware: raw_disk.firmware().clone(), }); self.requests.lock().unwrap().push(raw_disk); - Ok(disk) + // In tests, use epoch 0 as the initial epoch + Ok(AdoptedDisk { disk, epoch: Some(Epoch(0)) }) } async fn archive_and_destroy_former_zone_roots( diff --git a/sled-agent/src/bootstrap/secret_retriever/tq.rs b/sled-agent/src/bootstrap/secret_retriever/tq.rs index 6f7b25a7c10..1e5c0b9bb5c 100644 --- a/sled-agent/src/bootstrap/secret_retriever/tq.rs +++ b/sled-agent/src/bootstrap/secret_retriever/tq.rs @@ -17,6 +17,7 @@ use key_manager::{ }; use secrecy::ExposeSecret; use std::time::Duration; +use tokio::time::{sleep, timeout}; use trust_quorum::NodeTaskHandle; use trust_quorum_protocol::ReconstructedRackSecret; use trust_quorum_types::types::Epoch; @@ -27,6 +28,11 @@ use trust_quorum_types::types::Epoch; /// interval until the secret becomes available. const POLL_INTERVAL: Duration = Duration::from_millis(500); +/// Timeout for share collection (2 minutes). +/// +/// If shares cannot be collected within this time, an error is returned. +const POLL_TIMEOUT: Duration = Duration::from_secs(120); + /// A [`key_manager::SecretRetriever`] for use with full Trust Quorum (TQ).
pub(super) struct TqSecretRetriever { pub(super) salt: [u8; 32], @@ -50,20 +56,24 @@ impl TqSecretRetriever { async fn load_latest_with_retry( &self, ) -> Result<(Epoch, ReconstructedRackSecret), SecretRetrieverError> { - loop { - match self.handle.load_latest_rack_secret().await { - Ok(Some((epoch, secret))) => return Ok((epoch, secret)), - Ok(None) => { - // Share collection in progress, keep polling - tokio::time::sleep(POLL_INTERVAL).await; - } - Err(e) => { - return Err(SecretRetrieverError::TrustQuorum( - e.to_string(), - )); + timeout(POLL_TIMEOUT, async { + loop { + match self.handle.load_latest_rack_secret().await { + Ok(Some((epoch, secret))) => return Ok((epoch, secret)), + Ok(None) => { + // Share collection in progress, keep polling + sleep(POLL_INTERVAL).await; + } + Err(e) => { + return Err(SecretRetrieverError::TrustQuorum( + e.to_string(), + )); + } } } - } + }) + .await + .map_err(|_| SecretRetrieverError::Timeout)? } /// Poll for a specific epoch's rack secret until available. @@ -73,20 +83,24 @@ impl TqSecretRetriever { &self, epoch: Epoch, ) -> Result<ReconstructedRackSecret, SecretRetrieverError> { - loop { - match self.handle.load_rack_secret(epoch).await { - Ok(Some(secret)) => return Ok(secret), - Ok(None) => { - // Share collection in progress, keep polling - tokio::time::sleep(POLL_INTERVAL).await; - } - Err(e) => { - return Err(SecretRetrieverError::TrustQuorum( - e.to_string(), - )); + timeout(POLL_TIMEOUT, async { + loop { + match self.handle.load_rack_secret(epoch).await { + Ok(Some(secret)) => return Ok(secret), + Ok(None) => { + // Share collection in progress, keep polling + sleep(POLL_INTERVAL).await; + } + Err(e) => { + return Err(SecretRetrieverError::TrustQuorum( + e.to_string(), + )); + } } } - } + }) + .await + .map_err(|_| SecretRetrieverError::Timeout)?
} } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 042d1d49e20..50b15d49130 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -830,17 +830,6 @@ impl SledAgentApi for SledAgentImpl { let sa = rqctx.context(); let request = body.into_inner(); - // Perform some minimal validation - if request.start_request.body.use_trust_quorum - && !request.start_request.body.is_lrtq_learner - { - return Err(HttpError::for_bad_request( - None, - "New sleds must be LRTQ learners if trust quorum is in use" - .to_string(), - )); - } - crate::sled_agent::sled_add( sa.logger().clone(), sa.sprockets().clone(), diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 4de28473bfc..41ad853134b 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -1419,6 +1419,7 @@ impl StorageInner { quota: dataset.inner.quota, reservation: dataset.inner.reservation, compression: dataset.inner.compression.to_string(), + epoch: None, }); } } @@ -1438,6 +1439,7 @@ impl StorageInner { quota: config.quota, reservation: config.reservation, compression: config.compression.to_string(), + epoch: None, }); } } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 75717cc6443..2ee17498da7 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -626,6 +626,7 @@ impl SledAgent { SledAgentArtifactStoreWrapper(Arc::clone( &long_running_task_handles.artifact_store, )), + long_running_task_handles.trust_quorum.committed_epoch_rx(), config_reconciler_spawn_token, ); diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 3aa3c18f5e4..8447e9bf0f5 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -172,6 +172,15 @@ pub enum DatasetError { #[error("Failed to make datasets encrypted")] EncryptionMigration(#[from] DatasetEncryptionMigrationError), + #[error(transparent)] + ZfsQueryFailed(#[from] zfs::DatasetExistsError), + + #[error( + "Trial decryption recovery failed for dataset '{dataset}' \ after trying epochs {latest_epoch} down to 0" + )] + TrialDecryptionFailed { dataset: String, latest_epoch: u64 }, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -210,8 +219,9 @@ pub(crate) async fn ensure_zpool_has_datasets( let keypath: Keypath = illumos_utils::zfs::Keypath::new(disk_identity, &mount_config.root); + let name = format!("{}/{}", zpool_name, dataset); let epoch = if let Ok(epoch_str) = - Zfs::get_oxide_value(dataset, "epoch").await + Zfs::get_oxide_value(&name, "epoch").await { if let Ok(epoch) = epoch_str.parse::<u64>() { epoch @@ -222,22 +232,37 @@ } } else { // We got an error trying to call `Zfs::get_oxide_value` - // which indicates that the dataset doesn't exist or there - // was a problem running the command. - // - // Note that `Zfs::get_oxide_value` will succeed even if - // the epoch is missing. `epoch_str` will show up as a dash - // (`-`) and will not parse into a `u64`. So we don't have - // to worry about that case here as it is handled above. + // which indicates that the dataset doesn't exist, or + // the epoch property is missing (returns "-"). // - // If the error indicated that the command failed for some - // other reason, but the dataset actually existed, we will - // try to create the dataset below and that will fail. So - // there is no harm in just loading the latest secret here.
- info!(log, "Loading latest secret"; "disk_id"=>?disk_identity); - let epoch = key_requester.load_latest_secret().await?; - info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>?disk_identity); - epoch + // Check if the dataset actually exists to distinguish + // between these two cases. + let exists = Zfs::dataset_exists(&name).await?; + if exists { + // Dataset exists but epoch property is missing - this is + // an unexpected state that shouldn't happen in normal + // operation. Attempt to recover by trial decryption. + warn!( + log, + "Epoch property missing from existing crypt dataset, \ + attempting recovery by trial decryption"; + "dataset" => &name + ); + recover_epoch_by_trial_decryption( + &name, + disk_identity, + mount_config, + key_requester, + log, + ) + .await? + } else { + // Dataset doesn't exist - use latest epoch to create it. + info!(log, "Loading latest secret"; "disk_id"=>?disk_identity); + let epoch = key_requester.load_latest_secret().await?; + info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>?disk_identity); + epoch + } }; info!(log, "Retrieving key"; "epoch"=>%epoch, "disk_id"=>?disk_identity); @@ -256,9 +281,8 @@ pub(crate) async fn ensure_zpool_has_datasets( info!( log, - "Ensuring encrypted filesystem: {} for epoch {}", dataset, epoch + "Ensuring encrypted filesystem: {} for epoch {}", name, epoch ); - let name = format!("{}/{}", zpool_name, dataset); let result = Zfs::ensure_dataset(zfs::DatasetEnsureArgs { name: &name, mountpoint: Mountpoint(mountpoint), @@ -332,6 +356,9 @@ pub enum DatasetEncryptionMigrationError { #[error("Cannot create new encrypted dataset")] DatasetCreation(#[from] illumos_utils::zfs::EnsureDatasetError), + #[error(transparent)] + DatasetExistsCheck(#[from] illumos_utils::zfs::DatasetExistsError), + #[error("Missing stdout stream during 'zfs send' command")] MissingStdoutForZfsSend, } @@ -460,8 +487,8 @@ async fn ensure_zpool_dataset_is_encrypted( let encrypted_dataset = encrypted_dataset.full_name(); let (unencrypted_dataset_exists, encrypted_dataset_exists) = ( - dataset_exists(&unencrypted_dataset).await?, - dataset_exists(&encrypted_dataset).await?, + Zfs::dataset_exists(&unencrypted_dataset).await?, + Zfs::dataset_exists(&encrypted_dataset).await?, ); match (unencrypted_dataset_exists, encrypted_dataset_exists) { @@ -548,15 +575,6 @@ async fn ensure_zpool_dataset_is_encrypted( .await; } -// Returns true if the dataset exists. -async fn dataset_exists( - dataset: &str, -) -> Result { - let mut command = tokio::process::Command::new(illumos_utils::zfs::ZFS); - let cmd = command.args(&["list", "-H", dataset]); - Ok(cmd.status().await?.success()) -} - // Destroys the dataset and all children, recursively. async fn zfs_destroy( dataset: &str, @@ -676,6 +694,147 @@ async fn finalize_encryption_migration( Ok(()) } +/// Recover the encryption epoch for an existing crypt dataset by trial decryption. +/// +/// This function is called when an encrypted dataset exists but its `oxide:epoch` +/// property is missing or corrupt. It attempts to mount the dataset with keys +/// from the latest epoch down to 0, finding the correct key by trial and error. +/// +/// If successful, it also sets the `oxide:epoch` property so future boots don't +/// need recovery. +/// +/// This is an unexpected situation that indicates something went wrong (e.g., +/// property was accidentally cleared, or a crash during rekey). The function +/// emits warnings to indicate this abnormal state. 
+async fn recover_epoch_by_trial_decryption( + dataset_name: &str, + disk_identity: &DiskIdentity, + mount_config: &MountConfig, + key_requester: &StorageKeyRequester, + log: &Logger, +) -> Result<u64, DatasetError> { + // Get the latest epoch to know where to start our search + let latest_epoch = key_requester.load_latest_secret().await?; + + info!( + log, + "Attempting trial decryption recovery"; + "dataset" => dataset_name, + "latest_epoch" => latest_epoch, + ); + + let keypath = Keypath::new(disk_identity, &mount_config.root); + + // Try each epoch from latest down to 0 + for epoch in (0..=latest_epoch).rev() { + // Unload any previously-loaded key before each attempt. ZFS retains + // loaded keys across process restarts, so if we crashed after a + // successful load-key but before setting the epoch property, the + // correct key would still be loaded and our load-key would fail. + if let Err(e) = Zfs::unload_key(dataset_name).await { + debug!( + log, + "Failed to unload key as expected during trial decryption"; + "dataset" => dataset_name, + "error" => %e, + ); + } else { + warn!( + log, + "Successfully unloaded key during trial decryption, \ + likely due to prior crash before setting oxide:epoch"; + "dataset" => dataset_name, + ); + } + + // Get the key for this epoch + let key = + match key_requester.get_key(epoch, disk_identity.clone()).await { + Ok(k) => k, + Err(e) => { + debug!( + log, + "Failed to get key for epoch, skipping"; + "epoch" => epoch, + "error" => %e, + ); + continue; + } + }; + + // Write the keyfile + let mut keyfile = + match KeyFile::create(keypath.clone(), key.expose_secret(), log) + .await + { + Ok(kf) => kf, + Err(e) => { + warn!( + log, + "Failed to create keyfile for trial decryption"; + "epoch" => epoch, + "error" => %e, + ); + continue; + } + }; + + // Try to load the encryption key for this dataset + // Using `zfs load-key` to test if this is the right key + let load_result = Zfs::load_key(dataset_name).await; + + // Always clean up the keyfile + if let Err(e) = keyfile.zero_and_unlink().await { + warn!( + log, + "Failed to clean up keyfile after trial decryption attempt"; + "epoch" => epoch, + "error" => %e, + ); + } + + if load_result.is_ok() { + info!( + log, + "Successfully recovered epoch by trial decryption"; + "dataset" => dataset_name, + "epoch" => epoch, + ); + + // Set the epoch property so future boots don't need recovery + if let Err(e) = + Zfs::set_oxide_value(dataset_name, "epoch", &epoch.to_string()) + .await + { + warn!( + log, + "Failed to set epoch property after recovery \ + (dataset will work but recovery may be needed again)"; + "dataset" => dataset_name, + "epoch" => epoch, + "error" => %e, + ); + } + + return Ok(epoch); + } + + debug!( + log, + "Trial decryption failed for epoch, trying next"; + "dataset" => dataset_name, + "epoch" => epoch, + ); + + // No need to unload here -- we unload at the start of each iteration + } + + Err(DatasetError::TrialDecryptionFailed { + dataset: dataset_name.to_string(), + latest_epoch, + }) +} + #[cfg(test)] mod test { use super::*; diff --git a/trust-quorum/src/task.rs b/trust-quorum/src/task.rs index 695e0ab48cd..4e39e4d6a4d 100644 --- a/trust-quorum/src/task.rs +++ b/trust-quorum/src/task.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use thiserror::Error; use tokio::sync::mpsc::error::SendError; use tokio::sync::oneshot::error::RecvError; -use tokio::sync::{mpsc, oneshot}; +use tokio::sync::{mpsc, oneshot, watch}; use trust_quorum_protocol::{ CommitError, LoadRackSecretError, LrtqUpgradeError, Node, NodeCallerCtx as
_, NodeCommonCtx as _, NodeCtx, PersistentState, @@ -203,6 +203,7 @@ pub struct NodeTaskHandle { baseboard_id: BaseboardId, tx: mpsc::Sender, listen_addr: SocketAddrV6, + committed_epoch_rx: watch::Receiver<Option<Epoch>>, } impl NodeTaskHandle { @@ -392,6 +393,14 @@ impl NodeTaskHandle { let res = rx.await?; Ok(res) } + + /// Get a receiver for committed epoch notifications. + /// + /// The channel sends notifications when a new epoch is committed. + /// This is used by the config reconciler to trigger ZFS key rotation. + pub fn committed_epoch_rx(&self) -> watch::Receiver<Option<Epoch>> { + self.committed_epoch_rx.clone() + } } pub struct NodeTask { @@ -416,6 +425,12 @@ pub struct NodeTask { proxy_tracker: proxy::Tracker, /// Handle to receive updates to new reference measurements measurements: Arc, + + /// Sender for committed epoch notifications. + /// + /// Used to notify subscribers (e.g., config reconciler) when a new epoch + /// is committed, triggering ZFS key rotation. + committed_epoch_tx: watch::Sender<Option<Epoch>>, } impl NodeTask { @@ -472,6 +487,12 @@ impl NodeTask { ) .await; + // Create watch channel for committed epoch notifications. + // Initialize with the current committed epoch (if any). + let initial_epoch = ctx.persistent_state().latest_committed_epoch(); + let (committed_epoch_tx, committed_epoch_rx) = + watch::channel(initial_epoch); + let node = Node::new(&log, &mut ctx); let conn_mgr = ConnMgr::new( &log, @@ -495,8 +516,14 @@ impl NodeTask { network_config, proxy_tracker: proxy::Tracker::new(), measurements, + committed_epoch_tx, + }, + NodeTaskHandle { + baseboard_id, + tx, + listen_addr, + committed_epoch_rx, }, - NodeTaskHandle { baseboard_id, tx, listen_addr }, ) } @@ -830,6 +857,20 @@ impl NodeTask { self.ctx.persistent_state().clone(), ) .await; + + // Notify subscribers if latest committed epoch changed + let new_epoch = + self.ctx.persistent_state().latest_committed_epoch(); + let log = &self.log; + self.committed_epoch_tx.send_if_modified(|current| { + if *current != new_epoch { + info!(log, "Committed epoch changed"; "epoch" => ?new_epoch); + *current = new_epoch; + true + } else { + false + } + }); } }