From d981b5ef551d893439d9c1a863f067b8e251b4bd Mon Sep 17 00:00:00 2001 From: Onyeka Obi Date: Wed, 1 Jul 2026 02:19:49 -0700 Subject: [PATCH] [nexus] Don't re-fetch a disk after committing attach/detach (fixes #10695) `DataStore::instance_attach_disk` and `DataStore::instance_detach_disk` each ran an atomic CTE that commits the disk's state transition, and then performed a second, non-transactional `disk_get` re-fetch, acquiring an additional pool connection while still holding the first, purely to return the richer `datastore::Disk` type. Under connection-pool exhaustion that post-commit re-fetch fails with "Backends exist, and appear online, but all claims are used" *after* the attach/detach has already committed. In a saga this is fatal: steno gives a forward action a single attempt and never runs the undo of a *failed* node, so a disk left `Attached` by an attach whose re-fetch failed has no detach scheduled. During unwind its create-record undo then fails permanently ("disk cannot be deleted in state \"attached\""), which wedges the instance-create unwind and leaves the instance stuck in `starting` (#10695). `instance_detach_disk` has the mirror problem on the saga *undo* path: a re-fetch failure after the detach commits turns a successful detach into a permanent undo failure. Make the committing CTE the last fallible operation in both methods by returning the `model::Disk` the CTE already produces, and move the `datastore::Disk` upgrade to the two API callers in `app/instance.rs`, where a post-commit failure is recoverable (the client simply retries; attach and detach are idempotent). The instance-create saga discards the returned disk entirely, so the re-fetch was pure waste on that path. Signed-off-by: Onyeka Obi --- nexus/db-queries/src/db/datastore/disk.rs | 51 +++++++++++++++-------- nexus/src/app/instance.rs | 15 ++++++- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index f1fb522c526..34d4094c6a4 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ b/nexus/db-queries/src/db/datastore/disk.rs @@ -940,7 +940,7 @@ impl DataStore { authz_instance: &authz::Instance, authz_disk: &authz::Disk, max_disks: u32, - ) -> Result<(Instance, Disk), Error> { + ) -> Result<(Instance, model::Disk), Error> { use nexus_db_schema::schema::disk; use nexus_db_schema::schema::instance; use nexus_db_schema::schema::sled_resource_vmm; @@ -1095,12 +1095,11 @@ impl DataStore { let conn = self.pool_connection_authorized(opctx).await?; - let instance = match query.attach_and_get_result_async(&conn).await { - Ok((instance, _db_disk)) => { - // We'll re-fetch the datastore::Disk later, so ignore the - // model::Disk here - instance - } + let (instance, db_disk) = match query + .attach_and_get_result_async(&conn) + .await + { + Ok((instance, db_disk)) => (instance, db_disk), Err(e) => match e { AttachError::CollectionNotFound => { @@ -1128,12 +1127,12 @@ impl DataStore { api::external::DiskState::Attached(id) if id == authz_instance.id() => { - collection + (collection, resource) } api::external::DiskState::Attaching(id) if id == authz_instance.id() => { - collection + (collection, resource) } // Ok-to-attach disk states: Inspect the state to infer // why we did not attach. @@ -1239,9 +1238,21 @@ impl DataStore { }, }; - // Re-fetch the disk to get the updates - let disk = self.disk_get(&opctx, authz_disk.id()).await?; - Ok((instance, disk)) + // NB: The attach mutation above is the *last fallible operation* in this + // method, and we return the `model::Disk` produced by the CTE directly. + // + // We deliberately do NOT re-fetch here to build a richer + // `datastore::Disk`: a re-fetch acquires an additional pool connection + // *after* the attach has already committed, and if that acquisition + // fails (e.g. under connection-pool exhaustion) the caller observes an + // error even though the disk is now attached. In a saga that is fatal: + // steno never runs the undo of a failed forward node, so the committed + // attach would leak and later wedge the unwind (see omicron#10695). + // + // Callers that need a `datastore::Disk` should upgrade the returned + // `model::Disk` via `disk_get`, in a context where a post-commit failure + // is recoverable (i.e. not from a saga action). + Ok((instance, db_disk)) } pub async fn instance_detach_disk( @@ -1249,7 +1260,7 @@ impl DataStore { opctx: &OpContext, authz_instance: &authz::Instance, authz_disk: &authz::Disk, - ) -> Result { + ) -> Result { use nexus_db_schema::schema::{disk, instance}; opctx.authorize(authz::Action::Modify, authz_instance).await?; @@ -1275,7 +1286,7 @@ impl DataStore { let conn = self.pool_connection_authorized(opctx).await?; - let _db_disk = Instance::detach_resource( + let db_disk = Instance::detach_resource( authz_instance.id(), authz_disk.id(), instance::table @@ -1390,9 +1401,15 @@ impl DataStore { } })?; - let disk = self.disk_get(&opctx, authz_disk.id()).await?; - - Ok(disk) + // NB: As with `instance_attach_disk`, the detach mutation above is the + // *last fallible operation* in this method: we return the `model::Disk` + // produced by the CTE directly rather than re-fetching. A post-commit + // re-fetch here would be especially dangerous because `instance_detach_disk` + // is used as a saga *undo* action (`sic_attach_disk_to_instance_undo`); + // steno gives undo actions a single attempt with no retry, so a failure + // after the detach commits would permanently wedge the unwind even + // though the detach succeeded (see omicron#10695). + Ok(db_disk) } pub async fn disk_update_runtime( diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 9d7a34ddce6..b117652142a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1924,7 +1924,7 @@ impl super::Nexus { // - Update the DB if the request succeeded (hopefully to "Attached"). // - If the instance is not running... // - Update the disk state in the DB to "Attached". - let (_instance, disk) = self + let (_instance, _db_disk) = self .db_datastore .instance_attach_disk( &opctx, @@ -1934,6 +1934,14 @@ impl super::Nexus { ) .await?; + // `instance_attach_disk` returns a `model::Disk` and deliberately does + // not re-fetch a `datastore::Disk` internally (see omicron#10695): a + // re-fetch after the attach commits can fail (e.g. pool exhaustion) and + // would leak the committed attach if it ran inside a saga action. It is + // safe to upgrade to a `datastore::Disk` here, outside the mutation, + // because a failure in this API path is recoverable (the client can + // retry; attach is idempotent). + let disk = self.db_datastore.disk_get(&opctx, authz_disk.id()).await?; Ok(disk) } @@ -1972,10 +1980,13 @@ impl super::Nexus { // - Update the DB if the request succeeded (hopefully to "Detached"). // - If the instance is not running... // - Update the disk state in the DB to "Detached". - let disk = self + let _db_disk = self .db_datastore .instance_detach_disk(&opctx, &authz_instance, &authz_disk) .await?; + // As in `instance_attach_disk`, upgrade the returned `model::Disk` to a + // `datastore::Disk` outside the datastore mutation (see omicron#10695). + let disk = self.db_datastore.disk_get(&opctx, authz_disk.id()).await?; Ok(disk) }