From d981b5ef551d893439d9c1a863f067b8e251b4bd Mon Sep 17 00:00:00 2001
From: Onyeka Obi <softwareengineerasaservant@isurvivable.cv>
Date: Wed, 1 Jul 2026 02:19:49 -0700
Subject: [PATCH] [nexus] Don't re-fetch a disk after committing attach/detach
 (fixes #10695)

`DataStore::instance_attach_disk` and `DataStore::instance_detach_disk`
each ran an atomic CTE that commits the disk's state transition, and then
performed a second, non-transactional `disk_get` re-fetch, acquiring an
additional pool connection while still holding the first, purely to
return the richer `datastore::Disk` type.

Under connection-pool exhaustion that post-commit re-fetch fails with
"Backends exist, and appear online, but all claims are used" *after* the
attach/detach has already committed.  In a saga this is fatal: steno gives
a forward action a single attempt and never runs the undo of a *failed*
node, so a disk left `Attached` by an attach whose re-fetch failed has no
detach scheduled.  During unwind its create-record undo then fails
permanently ("disk cannot be deleted in state \"attached\""), which wedges
the instance-create unwind and leaves the instance stuck in `starting`
(#10695).  `instance_detach_disk` has the mirror problem on the saga *undo*
path: a re-fetch failure after the detach commits turns a successful
detach into a permanent undo failure.

Make the committing CTE the last fallible operation in both methods by
returning the `model::Disk` the CTE already produces, and move the
`datastore::Disk` upgrade to the two API callers in `app/instance.rs`,
where a post-commit failure is recoverable (the client simply retries;
attach and detach are idempotent).  The instance-create saga discards the
returned disk entirely, so the re-fetch was pure waste on that path.

Signed-off-by: Onyeka Obi <softwareengineerasaservant@isurvivable.cv>
---
 nexus/db-queries/src/db/datastore/disk.rs | 51 +++++++++++++++--------
 nexus/src/app/instance.rs                 | 15 ++++++-
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs
index f1fb522c526..34d4094c6a4 100644
--- a/nexus/db-queries/src/db/datastore/disk.rs
+++ b/nexus/db-queries/src/db/datastore/disk.rs
@@ -940,7 +940,7 @@ impl DataStore {
         authz_instance: &authz::Instance,
         authz_disk: &authz::Disk,
         max_disks: u32,
-    ) -> Result<(Instance, Disk), Error> {
+    ) -> Result<(Instance, model::Disk), Error> {
         use nexus_db_schema::schema::disk;
         use nexus_db_schema::schema::instance;
         use nexus_db_schema::schema::sled_resource_vmm;
@@ -1095,12 +1095,11 @@ impl DataStore {
 
         let conn = self.pool_connection_authorized(opctx).await?;
 
-        let instance = match query.attach_and_get_result_async(&conn).await {
-            Ok((instance, _db_disk)) => {
-                // We'll re-fetch the datastore::Disk later, so ignore the
-                // model::Disk here
-                instance
-            }
+        let (instance, db_disk) = match query
+            .attach_and_get_result_async(&conn)
+            .await
+        {
+            Ok((instance, db_disk)) => (instance, db_disk),
 
             Err(e) => match e {
                 AttachError::CollectionNotFound => {
@@ -1128,12 +1127,12 @@ impl DataStore {
                         api::external::DiskState::Attached(id)
                             if id == authz_instance.id() =>
                         {
-                            collection
+                            (collection, resource)
                         }
                         api::external::DiskState::Attaching(id)
                             if id == authz_instance.id() =>
                         {
-                            collection
+                            (collection, resource)
                         }
                         // Ok-to-attach disk states: Inspect the state to infer
                         // why we did not attach.
@@ -1239,9 +1238,21 @@ impl DataStore {
             },
         };
 
-        // Re-fetch the disk to get the updates
-        let disk = self.disk_get(&opctx, authz_disk.id()).await?;
-        Ok((instance, disk))
+        // NB: The attach mutation above is the *last fallible operation* in this
+        // method, and we return the `model::Disk` produced by the CTE directly.
+        //
+        // We deliberately do NOT re-fetch here to build a richer
+        // `datastore::Disk`: a re-fetch acquires an additional pool connection
+        // *after* the attach has already committed, and if that acquisition
+        // fails (e.g. under connection-pool exhaustion) the caller observes an
+        // error even though the disk is now attached. In a saga that is fatal:
+        // steno never runs the undo of a failed forward node, so the committed
+        // attach would leak and later wedge the unwind (see omicron#10695).
+        //
+        // Callers that need a `datastore::Disk` should upgrade the returned
+        // `model::Disk` via `disk_get`, in a context where a post-commit failure
+        // is recoverable (i.e. not from a saga action).
+        Ok((instance, db_disk))
     }
 
     pub async fn instance_detach_disk(
@@ -1249,7 +1260,7 @@ impl DataStore {
         opctx: &OpContext,
         authz_instance: &authz::Instance,
         authz_disk: &authz::Disk,
-    ) -> Result<Disk, Error> {
+    ) -> Result<model::Disk, Error> {
         use nexus_db_schema::schema::{disk, instance};
 
         opctx.authorize(authz::Action::Modify, authz_instance).await?;
@@ -1275,7 +1286,7 @@ impl DataStore {
 
         let conn = self.pool_connection_authorized(opctx).await?;
 
-        let _db_disk = Instance::detach_resource(
+        let db_disk = Instance::detach_resource(
             authz_instance.id(),
             authz_disk.id(),
             instance::table
@@ -1390,9 +1401,15 @@ impl DataStore {
             }
         })?;
 
-        let disk = self.disk_get(&opctx, authz_disk.id()).await?;
-
-        Ok(disk)
+        // NB: As with `instance_attach_disk`, the detach mutation above is the
+        // *last fallible operation* in this method: we return the `model::Disk`
+        // produced by the CTE directly rather than re-fetching. A post-commit
+        // re-fetch here would be especially dangerous because `instance_detach_disk`
+        // is used as a saga *undo* action (`sic_attach_disk_to_instance_undo`);
+        // steno gives undo actions a single attempt with no retry, so a failure
+        // after the detach commits would permanently wedge the unwind even
+        // though the detach succeeded (see omicron#10695).
+        Ok(db_disk)
     }
 
     pub async fn disk_update_runtime(
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 9d7a34ddce6..b117652142a 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -1924,7 +1924,7 @@ impl super::Nexus {
         //   - Update the DB if the request succeeded (hopefully to "Attached").
         // - If the instance is not running...
         //   - Update the disk state in the DB to "Attached".
-        let (_instance, disk) = self
+        let (_instance, _db_disk) = self
             .db_datastore
             .instance_attach_disk(
                 &opctx,
@@ -1934,6 +1934,14 @@ impl super::Nexus {
             )
             .await?;
 
+        // `instance_attach_disk` returns a `model::Disk` and deliberately does
+        // not re-fetch a `datastore::Disk` internally (see omicron#10695): a
+        // re-fetch after the attach commits can fail (e.g. pool exhaustion) and
+        // would leak the committed attach if it ran inside a saga action. It is
+        // safe to upgrade to a `datastore::Disk` here, outside the mutation,
+        // because a failure in this API path is recoverable (the client can
+        // retry; attach is idempotent).
+        let disk = self.db_datastore.disk_get(&opctx, authz_disk.id()).await?;
         Ok(disk)
     }
 
@@ -1972,10 +1980,13 @@ impl super::Nexus {
         //   - Update the DB if the request succeeded (hopefully to "Detached").
         // - If the instance is not running...
         //   - Update the disk state in the DB to "Detached".
-        let disk = self
+        let _db_disk = self
             .db_datastore
             .instance_detach_disk(&opctx, &authz_instance, &authz_disk)
             .await?;
+        // As in `instance_attach_disk`, upgrade the returned `model::Disk` to a
+        // `datastore::Disk` outside the datastore mutation (see omicron#10695).
+        let disk = self.db_datastore.disk_get(&opctx, authz_disk.id()).await?;
         Ok(disk)
     }