Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/flake-patterns.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,21 @@ NOTE: The `serial_test` crate **does not work** within Omicron: it depends on an

* https://github.com/oxidecomputer/omicron/commit/c9dec4111860[`c9dec4111860`] ensures that learner nodes' ledger file names cannot collide with those of regular peer nodes. (Learner 1 and peer node `pc-b-1` both generated `test-1-network-config-ledger`.)
* https://github.com/oxidecomputer/omicron/commit/067c79302158[`067c79302158`] changes a scenario from being a separate test to being listed within `test_replicated`.

=== Ephemeral port reuse races [[ephemeral-port-reuse-races]]

**What this is:** A test performs the following sequence of operations:

. Start a service on an ephemeral TCP port (i.e., bind to port 0).
. Kill the service.
. Restart the service, attempting to bind to the port determined in step 1.

In between steps 2 and 3, a different process such as a test running concurrently can grab the same port. This can result in test flakiness.

**Why this is bad:** This is a test flake at best and cross-test interference at worst.

**How to fix this:** Use the `RetargetableTcpProxy` available at `test-utils/src/dev/tcp-proxy.rs`. That provides a persistent port that stays bound throughout process restarts.

**Example commits:**

* https://github.com/oxidecomputer/omicron/commit/AAAAA[`AAAAA`] introduces the TCP proxy.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to link to this PR to avoid having to update it after it lands?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've generally been linking to commit hashes so that automated tools like Claude can just git/jj show them and not have to reach out to the network.

9 changes: 7 additions & 2 deletions nexus/reconfigurator/execution/src/test_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,13 @@ pub fn overridables_for_test(
let sled_id = id_str.parse().unwrap();
let ip = Ipv6Addr::LOCALHOST;
let mgs_port = cptestctx.gateway.get(&switch_slot).unwrap().port;
let dendrite_port =
cptestctx.dendrite.read().unwrap().get(&switch_slot).unwrap().port;
let dendrite_port = cptestctx
.dendrite
.read()
.unwrap()
.get(&switch_slot)
.unwrap()
.port();
let mgd_port = cptestctx.mgd.get(&switch_slot).unwrap().port;
let ddm_port = cptestctx.ddm.get(&switch_slot).unwrap().port;
overrides.override_switch_zone_ip(sled_id, ip);
Expand Down
47 changes: 11 additions & 36 deletions nexus/src/app/sagas/instance_start.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1165,7 +1165,6 @@ async fn sis_ensure_running(
mod test {
use core::time::Duration;
use nexus_types::identity::Asset as _;
use std::net::SocketAddrV6;

use crate::app::sagas::disk_delete::test::ExpungeTestHarness;
use crate::app::sagas::disk_delete::test::create_disk;
Expand Down Expand Up @@ -1397,19 +1396,14 @@ mod test {
.expect("unable to update switch1 settings");

// Shutdown one of the switch daemons
let mut switch0_dpd = cptestctx
.dendrite
.write()
.unwrap()
.remove(&SwitchSlot::Switch0)
.expect("there should be at least one dendrite running");

let switch0_port = switch0_dpd.port;

switch0_dpd
.cleanup()
.await
.expect("switch0 process should get cleaned up");
let switch0_port = {
let dendrite_guard = cptestctx.dendrite.read().unwrap();
dendrite_guard
.get(&SwitchSlot::Switch0)
.expect("a dendrite instance should exist for switch0")
.port()
};
cptestctx.stop_dendrite(SwitchSlot::Switch0).await;

let log = &opctx.log;

Expand Down Expand Up @@ -1455,7 +1449,7 @@ mod test {
dendrite_guard
.get(&SwitchSlot::Switch1)
.expect("two dendrites should be present in test context")
.port
.port()
};

let client_state = dpd_client::ClientState {
Expand Down Expand Up @@ -1505,28 +1499,9 @@ mod test {
.await
.expect("NAT entry should appear on switch1");

// Reuse the port number from the removed Switch0 to start a new dendrite instance
let nexus_address = cptestctx.internal_client.bind_address;
let mgs = cptestctx.gateway.get(&SwitchSlot::Switch0).unwrap();
let mgs_address =
SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into();

// Test fault recovery for nat propogation
// Start a new dendrite instance for switch0
let new_switch0 =
omicron_test_utils::dev::dendrite::DendriteInstance::start(
switch0_port,
Some(nexus_address),
Some(mgs_address),
)
.await
.unwrap();

cptestctx
.dendrite
.write()
.unwrap()
.insert(SwitchSlot::Switch0, new_switch0);
// Start a new dpd for switch0
cptestctx.restart_dendrite(SwitchSlot::Switch0).await;

// Ensure that the nat entry for the address has made it onto the new switch0 dendrite.
// This might take some time while the new dendrite comes online.
Expand Down
56 changes: 4 additions & 52 deletions nexus/src/app/sagas/instance_update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2554,7 +2554,7 @@ mod test {
.unwrap();

// Shutdown switch 0.
shutdown_switch0(&cptestctx).await;
cptestctx.stop_dendrite(SwitchSlot::Switch0).await;
assert!(switch0_dpd_client.dpd_uptime().await.is_err());

// Okay, now that we've taken down one of the simulated switches, we
Expand Down Expand Up @@ -2656,7 +2656,7 @@ mod test {
.await;

// Shut down switch 0.
let switch0_port = shutdown_switch0(&cptestctx).await;
cptestctx.stop_dendrite(SwitchSlot::Switch0).await;
assert!(switch0_dpd_client.dpd_uptime().await.is_err());

// Run the instance-update saga to complete the migration.
Expand Down Expand Up @@ -2696,7 +2696,7 @@ mod test {
.unwrap();

// Restart switch 0 and verify it also gets the new entries.
restart_switch0(&cptestctx, switch0_port).await;
cptestctx.restart_dendrite(SwitchSlot::Switch0).await;
wait_for_n_nat_entries(
log,
&switch0_dpd_client,
Expand Down Expand Up @@ -2819,7 +2819,7 @@ mod test {
let port = dendrite_guard
.get(&switch_slot)
.expect("dendrite should be present for this switch slot")
.port;
.port();
let client_state = dpd_client::ClientState {
tag: String::from("nexus"),
log: cptestctx.logctx.log.new(o!(
Expand Down Expand Up @@ -2913,54 +2913,6 @@ mod test {
})
}

/// Shut down switch 0's dendrite, returning the port it was listening on.
async fn shutdown_switch0(cptestctx: &ControlPlaneTestContext) -> u16 {
let mut switch0_dpd = cptestctx
.dendrite
.write()
.unwrap()
.remove(&SwitchSlot::Switch0)
.expect("switch 0 dendrite should be running");

let port = switch0_dpd.port;

switch0_dpd
.cleanup()
.await
.expect("switch0 process should get cleaned up");

port
}

/// Restart switch 0's dendrite on the given port.
async fn restart_switch0(
cptestctx: &ControlPlaneTestContext,
switch0_port: u16,
) {
use std::net::Ipv6Addr;
use std::net::SocketAddrV6;

let nexus_address = cptestctx.internal_client.bind_address;
let mgs = cptestctx.gateway.get(&SwitchSlot::Switch0).unwrap();
let mgs_address =
SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into();

let new_switch0 =
omicron_test_utils::dev::dendrite::DendriteInstance::start(
switch0_port,
Some(nexus_address),
Some(mgs_address),
)
.await
.unwrap();

cptestctx
.dendrite
.write()
.unwrap()
.insert(SwitchSlot::Switch0, new_switch0);
}

// === migration test helpers ===

#[derive(Clone, Copy, Default)]
Expand Down
4 changes: 2 additions & 2 deletions nexus/test-utils/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,12 @@ pub fn dpd_client<N: NexusServer>(
let dendrite_guard = cptestctx.dendrite.read().unwrap();
let (switch_slot, dendrite_instance) = dendrite_guard
.iter()
.next()
.find(|(_, instance)| instance.is_dpd_running())
.expect("No dendrite instances running for test");

// Copy the values we need while the guard is still alive
let switch_slot = *switch_slot;
let port = dendrite_instance.port;
let port = dendrite_instance.port();
drop(dendrite_guard);

let client_state = dpd_client::ClientState {
Expand Down
103 changes: 47 additions & 56 deletions nexus/test-utils/src/nexus_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ pub struct ControlPlaneTestContext<N> {
pub oximeter: Oximeter,
pub producer: ProducerServer,
pub gateway: BTreeMap<SwitchSlot, GatewayTestContext>,
/// All dpd instances, whether currently running or not, indexed by switch
/// slot.
pub dendrite: RwLock<HashMap<SwitchSlot, dev::dendrite::DendriteInstance>>,
/// Ports of stopped dendrite instances (for use by start_dendrite)
pub stopped_dendrite_ports: RwLock<HashMap<SwitchSlot, u16>>,
pub mgd: HashMap<SwitchSlot, dev::maghemite::MgdInstance>,
pub ddm: HashMap<SwitchSlot, dev::maghemite::DdmInstance>,
pub external_dns_zone_name: String,
Expand Down Expand Up @@ -214,80 +214,71 @@ impl<N: NexusServer> ControlPlaneTestContext<N> {

/// Stop a Dendrite instance for testing failure scenarios.
///
/// Stores the port so that [`Self::restart_dendrite`] can restart on the same port.
/// Panics if no Dendrite was found for the given switch slot, or it was
/// already stopped. (But note that the Dendrite instance is temporarily
/// removed from the switch slot while it is being stopped, so trying to
/// call this function with the same switch slot concurrently may panic.)
pub async fn stop_dendrite(&self, switch_slot: SwitchSlot) {
use slog::debug;
let log = &self.logctx.log;
debug!(log, "Stopping Dendrite"; "switch_slot" => ?switch_slot);

let dendrite_opt =
{ self.dendrite.write().unwrap().remove(&switch_slot) };
if let Some(mut dendrite) = dendrite_opt {
// Store the port for later restart via start_dendrite
self.stopped_dendrite_ports
.write()
.unwrap()
.insert(switch_slot, dendrite.port);
dendrite.cleanup().await.unwrap();
}
// Extract from mutex first to avoid holding lock across await
let mut dendrite =
self.dendrite.write().unwrap().remove(&switch_slot).unwrap_or_else(
|| {
panic!(
"a dendrite instance should exist \
for switch slot {switch_slot:?}"
);
},
);

let prior_dpd_state = dendrite.stop_dpd().await.unwrap();
assert_eq!(
prior_dpd_state,
dev::dendrite::PriorDpdState::Running,
"dendrite should have been running before stop_dendrite"
);
self.dendrite.write().unwrap().insert(switch_slot, dendrite);
}

/// Restart a Dendrite instance for testing drift correction scenarios.
///
/// Simulates a switch restart where DPD loses its programmed state.
/// Restarts on the same port so test DNS stays valid.
///
/// Works both when Dendrite is currently running (will stop and restart)
/// or when it was previously stopped via [`Self::stop_dendrite`].
/// Works both when Dendrite is currently running (will stop and restart) or
/// when it was previously stopped via [`Self::stop_dendrite`].
///
/// Panics if no Dendrite was found for the given switch slot. (But note
/// that the Dendrite instance is temporarily removed from the internal map
/// while it is being restarted, so trying to call this function with the
/// same switch slot concurrently may panic.)
pub async fn restart_dendrite(&self, switch_slot: SwitchSlot) {
use slog::debug;
let log = self.logctx.log.new(slog::o!(
"switch_slot" => format!("{switch_slot:?}"),
));
debug!(log, "Restarting Dendrite");

// Get port either from running instance or from stored port after stop
// Extract from mutex first to avoid holding lock across await
let old = self.dendrite.write().unwrap().remove(&switch_slot);
let port = if let Some(mut old) = old {
let port = old.port;
debug!(
log, "Shutting down old dpd instance for restart";
"port" => port,
// Extract from mutex first to avoid holding the lock across an await
// point. This does mean that while restart_dendrite is running, other
// code wouldn't be able to find the dpd instance via the switch slot.
let mut dendrite =
self.dendrite.write().unwrap().remove(&switch_slot).unwrap_or_else(
|| {
panic!(
"a dendrite instance should exist \
for switch slot {switch_slot:?}"
);
},
);
old.cleanup().await.unwrap();
port
} else {
// Must have been stopped - get stored port
let port = self.stopped_dendrite_ports
.write()
.unwrap()
.remove(&switch_slot)
.expect("Dendrite not running and no stored port from stop_dendrite");
debug!(
log, "Reusing port from previously-shut-down dpd instance";
"port" => port,
);
port
};

let mgs = self.gateway.get(&switch_slot).unwrap();
let mgs_addr = std::net::SocketAddrV6::new(
std::net::Ipv6Addr::LOCALHOST,
mgs.port,
0,
0,
)
.into();
let port = dendrite.port();
debug!(log, "Restarting dpd behind its proxy"; "port" => port);

let dendrite =
omicron_test_utils::dev::dendrite::DendriteInstance::start(
port,
Some(self.internal_client.bind_address),
Some(mgs_addr),
)
.await
.unwrap();
let prior_dpd_state = dendrite.restart_dpd().await.unwrap();
debug!(log, "Restarted dpd"; "prior_dpd_state" => ?prior_dpd_state);

// Wait for Dendrite to be ready before returning.
// We check `switch_identifiers()` rather than just `dpd_uptime()`
Expand Down Expand Up @@ -332,7 +323,7 @@ impl<N: NexusServer> ControlPlaneTestContext<N> {
for (_, gateway) in self.gateway {
gateway.teardown().await;
}
for (_, mut dendrite) in self.dendrite.into_inner().unwrap() {
for (_, dendrite) in self.dendrite.into_inner().unwrap() {
dendrite.cleanup().await.unwrap();
}
for (_, mut mgd) in self.mgd {
Expand Down
Loading
Loading