Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

129 changes: 129 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,11 @@ struct TrustQuorumArgs {
}

// Subcommands of `omdb nexus trust-quorum`.
//
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments on variants into user-visible help text.
#[derive(Debug, Subcommand)]
// NOTE(review): presumably needed because `RemoveSled` embeds
// `DbUrlOptions` and is much larger than the other variants -- confirm.
#[allow(clippy::large_enum_variant)]
enum TrustQuorumCommands {
// Look up a trust quorum configuration; the arguments carry an epoch
// selector (`TrustQuorumEpochOrLatest`).
GetConfig(TrustQuorumConfigArgs),
// Takes no arguments. NOTE(review): presumably dispatched to
// `cmd_nexus_trust_quorum_lrtq_upgrade` -- confirm against the dispatcher.
LrtqUpgrade,
// Remove a sled from the trust quorum. The dispatcher only runs this after
// `check_allow_destructive()` succeeds.
RemoveSled(TrustQuorumRemoveSledArgs),
}

#[derive(Debug, Clone, Copy, Args)]
Expand All @@ -590,6 +592,15 @@ struct TrustQuorumConfigArgs {
epoch: TrustQuorumEpochOrLatest,
}

// Arguments for `omdb nexus trust-quorum remove-sled`.
#[derive(Debug, Args)]
struct TrustQuorumRemoveSledArgs {
// Removing a sled is _extremely_ dangerous, so in addition to the Nexus
// client we also require a database connection. It is used to look up the
// sled (serial number and rack) so the operator can be asked to confirm
// before anything is sent to Nexus.
#[clap(flatten)]
db_url_opts: DbUrlOptions,
// ID of the sled to remove; used both for the database lookup and as the
// argument of the Nexus `trust_quorum_remove_sled` request.
sled_id: SledUuid,
}

#[derive(Debug, Clone, Copy)]
pub(crate) enum TrustQuorumEpochOrLatest {
Latest,
Expand Down Expand Up @@ -912,6 +923,15 @@ impl NexusArgs {
let token = omdb.check_allow_destructive()?;
cmd_nexus_trust_quorum_lrtq_upgrade(&client, token).await
}
NexusCommands::TrustQuorum(TrustQuorumArgs {
command: TrustQuorumCommands::RemoveSled(args),
}) => {
let token = omdb.check_allow_destructive()?;
cmd_nexus_trust_quorum_remove_sled(
&client, args, omdb, log, token,
)
.await
}
NexusCommands::UpdateStatus(args) => {
cmd_nexus_update_status(&client, args).await
}
Expand Down Expand Up @@ -4320,6 +4340,13 @@ async fn cmd_nexus_sled_expunge_with_datastore(
}
}

eprintln!(
"WARNING: Are you sure that you have removed this sled from the latest \
trust quorum configuration for rack {}?. Please double check with: \
`omdb nexus trust-quorum get-config <RACK_ID> latest`\n",
sled.rack_id
);

eprintln!(
"WARNING: This operation will PERMANENTLY and IRRECOVABLY mark sled \
{} ({}) expunged. To proceed, type the sled's serial number.",
Expand Down Expand Up @@ -4562,6 +4589,108 @@ async fn cmd_nexus_trust_quorum_lrtq_upgrade(
Ok(())
}

/// Runs `omdb nexus trust-quorum remove-sled`
///
/// Connects to the database using the command's `db_url_opts`, delegates the
/// actual work to `cmd_nexus_trust_quorum_remove_sled_with_datastore`, and
/// terminates the datastore before handing back that call's result.
async fn cmd_nexus_trust_quorum_remove_sled(
    client: &nexus_lockstep_client::Client,
    args: &TrustQuorumRemoveSledArgs,
    omdb: &Omdb,
    log: &slog::Logger,
    destruction_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
    let datastore = args.db_url_opts.connect(omdb, log).await?;

    // Deliberately no `?` here: the outcome is held so that the datastore is
    // terminated on both the success and the failure path.
    let outcome = cmd_nexus_trust_quorum_remove_sled_with_datastore(
        &datastore,
        client,
        args,
        log,
        destruction_token,
    )
    .await;

    datastore.terminate().await;

    outcome
}

/// Runs `omdb nexus trust-quorum remove-sled`, but borrowing a datastore
///
/// The sled is first looked up in the database so that its serial number and
/// rack are known. The operator is then shown a series of warnings and must
/// type the sled's serial number to confirm. Only after that is Nexus asked
/// to start the trust quorum reconfiguration; the epoch it returns is printed
/// so the reconfiguration can be polled.
///
/// `_destruction_token` is never read. Taking it by value means a caller has
/// to have obtained one first (the dispatcher gets it from
/// `check_allow_destructive`).
///
/// # Errors
///
/// Returns an error if the sled lookup fails, if the confirmation prompt
/// returns an error, or if the request to Nexus fails.
async fn cmd_nexus_trust_quorum_remove_sled_with_datastore(
    datastore: &Arc<DataStore>,
    client: &nexus_lockstep_client::Client,
    args: &TrustQuorumRemoveSledArgs,
    log: &slog::Logger,
    _destruction_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
    use nexus_db_queries::context::OpContext;
    let opctx = OpContext::for_tests(log.clone(), datastore.clone());
    let opctx = &opctx;

    // First, we need to look up the sled so we know its serial number (used
    // for the confirmation prompt) and its rack (used in the messages below).
    let (_authz_sled, sled) = LookupPath::new(opctx, datastore)
        .sled_id(args.sled_id)
        .fetch()
        .await
        .with_context(|| format!("failed to find sled {}", args.sled_id))?;

    // Helper to get confirmation messages from the user.
    let mut prompt = ConfirmationPrompt::new();

    println!(
        "WARNING: This is step 1 of the process to expunge a sled. If you \
         remove a sled from the trust quorum and reboot it, it will not be able \
         to unlock its storage and participate in the control plane. However, \
         the Reconfigurator will not yet know the sled is expunged and may \
         still try to use it."
    );

    println!(
        "Therefore, you must treat this action in conjunction with a reboot as \
         the software equivalent of physically removing the sled from the rack \
         before expungement."
    );

    println!(
        "After this sled is removed from the trust quorum, you must reboot it \
         and expunge it to complete the process."
    );

    println!(
        "WARNING: This operation will PERMANENTLY and IRRECOVERABLY remove sled \
         {} ({}) from the trust-quorum for rack {}. To proceed, type the \
         sled's serial number.",
        args.sled_id,
        sled.serial_number(),
        sled.rack_id
    );
    prompt.read_and_validate("sled serial number", sled.serial_number())?;

    // Everything below is printed *before* the request is sent, so the
    // operator still has the follow-up instructions if the request times out.
    println!(
        "About to start the trust quorum reconfiguration to remove the sled."
    );

    println!(
        "If this operation fails with a timeout, please check the latest trust \
         quorum configuration to see whether or not to proceed with rack reboot \
         and expungement."
    );

    // Print the actual rack ID rather than a placeholder so the command can be
    // copied as-is (only the epoch still has to be filled in).
    println!(
        "You can poll the trust quorum reconfiguration with \
         `omdb nexus trust-quorum get-config {} <EPOCH | latest>`\n",
        sled.rack_id
    );

    println!(
        "Once the trust quorum configuration is committed, please reboot \
         the sled and proceed to call `omdb nexus sled expunge`.\n"
    );

    let epoch = client
        .trust_quorum_remove_sled(&args.sled_id.into_untyped_uuid())
        .await
        .context("trust quorum remove sled")?
        .into_inner();

    println!("Started trust quorum reconfiguration at epoch {epoch}\n");

    Ok(())
}

/// Runs `omdb nexus support-bundles create`
async fn cmd_nexus_support_bundles_create(
client: &nexus_lockstep_client::Client,
Expand Down
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ omicron-rpaths.workspace = true

[dependencies]
anyhow.workspace = true
async-bb8-diesel.workspace = true
assert_matches.workspace = true
async-trait.workspace = true
base64.workspace = true
Expand All @@ -34,6 +35,7 @@ crucible-pantry-client.workspace = true
crucible-common.workspace = true
dns-service-client.workspace = true
dpd-client.workspace = true
diesel.workspace = true
ereport-types.workspace = true
mg-admin-client.workspace = true
dropshot.workspace = true
Expand Down
1 change: 1 addition & 0 deletions nexus/lockstep-api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ omicron-uuid-kinds.workspace = true
omicron-workspace-hack.workspace = true
schemars.workspace = true
serde.workspace = true
sled-hardware-types.workspace = true
trust-quorum-types.workspace = true
uuid.workspace = true
15 changes: 15 additions & 0 deletions nexus/lockstep-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,21 @@ pub trait NexusLockstepApi {
async fn trust_quorum_lrtq_upgrade(
rqctx: RequestContext<Self::Context>,
) -> Result<HttpResponseOk<Epoch>, HttpError>;

/// Remove a sled from the trust quorum
///
/// This is a required first step towards expunging a sled
///
/// Return the epoch of the proposed configuration so it can be polled
/// asynchronously.
// NOTE(review): the `///` text above is presumably surfaced in the generated
// API description, so it is intentionally left untouched here -- confirm.
// The `{sled}` path segment is presumably bound through
// `params::SledSelector` -- verify against that type's fields.
#[endpoint {
method = POST,
path = "/trust-quorum/remove/{sled}"
}]
async fn trust_quorum_remove_sled(
rqctx: RequestContext<Self::Context>,
path_params: Path<params::SledSelector>,
) -> Result<HttpResponseOk<Epoch>, HttpError>;
}

/// Path parameters for Rack requests.
Expand Down
43 changes: 41 additions & 2 deletions nexus/src/app/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::InstanceUuid;
use omicron_uuid_kinds::PhysicalDiskUuid;
use omicron_uuid_kinds::PropolisUuid;
use omicron_uuid_kinds::RackUuid;
use omicron_uuid_kinds::SledUuid;
use omicron_uuid_kinds::ZpoolUuid;
use sled_agent_client::Client as SledAgentClient;
use sled_agent_types::inventory::SledRole;
use sled_hardware_types::BaseboardId;
use std::net::SocketAddrV6;
use std::sync::Arc;
use uuid::Uuid;
Expand Down Expand Up @@ -109,8 +111,45 @@ impl super::Nexus {
sled_id: SledUuid,
) -> Result<SledPolicy, Error> {
let sled_lookup = self.sled_lookup(opctx, &sled_id)?;
let (authz_sled,) =
sled_lookup.lookup_for(authz::Action::Modify).await?;
let (authz_sled, sled) =
sled_lookup.fetch_for(authz::Action::Modify).await?;

let rack_id = RackUuid::from_untyped_uuid(sled.rack_id);

// If the sled still exists in the latest committed trust quorum
// configuration, it cannot be expunged.
//
// This is just a sanity check. There is an inherent TOCTOU here, since
// we aren't combining the check with the policy change inside
// a transaction. However, it is unlikely that an operator would be
// creating a different trust quorum configuration while trying to
// expunge this sled, since both have to be done from omdb by an oxide
// employee right now.
//
// When we add an external API, we'll probably want to come back and add
// some extra safeguards including checking that there are no ongoing
// configurations currently, as the last committed configuration can be
// different from the current configuration. However, if we checked that
// here we'd have to do yet more DB lookups to check that the state of
// the last configuration wasn't aborted. This is probably good enough
// for now given that the user already has to confirm a bunch of prompts
// before kicking off the operation.
let (tq_latest_committed_config, _) = self
.tq_load_latest_possible_committed_config(opctx, rack_id)
.await?;
let baseboard_id = BaseboardId {
part_number: sled.part_number().to_string(),
serial_number: sled.serial_number().to_string(),
};
if tq_latest_committed_config.members.contains_key(&baseboard_id) {
return Err(Error::conflict(format!(
"Cannot expunge sled {sled_id}, as its baseboard \
{baseboard_id} is still a member of the latest committed \
trust quorum configuration at epoch {} for rack {rack_id}",
tq_latest_committed_config.epoch
)));
}

let prev_policy = self
.db_datastore
.sled_set_policy_to_expunged(opctx, &authz_sled)
Expand Down
Loading
Loading