From 9e122de283a07f39067d72d76591a0721394a43c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexandre=20Mondaini=20Calv=C3=A3o?=
Date: Tue, 9 Jun 2026 10:08:36 -0300
Subject: [PATCH] fix: exit cleanly on fatal startup errors instead of crash-looping (#4253)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After "Migration complete", a fatal error in the server startup window
(e.g. a rejected `app.prepare()`) did not terminate the process.
Background handles — the ioredis reconnect loop, open sockets — kept the
event loop alive, so instead of exiting, the process spun at high CPU,
never passed the healthcheck, and Docker Swarm crash-looped the container
showing only "ELIFECYCLE Command failed."

Reproduced with the real compiled bundle (node:24.4.0-slim + Postgres 16):
- before: fatal startup error -> process never exits, spins until killed
- after: fatal startup error -> logs the cause, exits 1 in ~3-5s

Changes in apps/dokploy/server/server.ts:
- Phase-gated process handlers: before the HTTP server is listening, an
  uncaught exception or unhandled rejection logs the cause and exit(1)s
  so the orchestrator restarts cleanly. After it is listening, a stray
  rejection is only logged, so a healthy serving instance is never killed
  (verified: real post-listen docker.sock ENOENT rejection is survived).
- try/catch around the synchronous directory/Traefik init.
- await the listen() bind so a bind failure (e.g. EADDRINUSE) exits
  instead of spinning; only mark the server ready once actually listening.
- .catch() on app.prepare() with a labeled diagnostic.
---
 apps/dokploy/server/server.ts | 131 ++++++++++++++++++++++++----------
 1 file changed, 92 insertions(+), 39 deletions(-)

diff --git a/apps/dokploy/server/server.ts b/apps/dokploy/server/server.ts
index 4de4d76897..529ac4b854 100644
--- a/apps/dokploy/server/server.ts
+++ b/apps/dokploy/server/server.ts
@@ -28,53 +28,106 @@ const PORT = Number.parseInt(process.env.PORT || "3000", 10);
 const HOST = process.env.HOST || "0.0.0.0";
 const dev = process.env.NODE_ENV !== "production";
 
+// Tracks whether the HTTP server has reached the "listening" state. Until then,
+// any fatal error means the process can never serve requests and MUST exit(1) so
+// the orchestrator restarts it cleanly. Without this, a startup failure (e.g. a
+// rejected `app.prepare()`) leaves background handles — the Redis reconnect loop,
+// open sockets — keeping the event loop alive, so the process never exits: it
+// spins at ~100% CPU, the healthcheck never passes, and Docker Swarm crash-loops
+// the container with only an opaque "ELIFECYCLE Command failed." See issue #4253.
+let isServerListening = false;
+
+// Node terminates the process on an uncaught exception by default; preserve that
+// but log the cause first so the failure point is never silent.
+process.on("uncaughtException", (error) => {
+	console.error("Uncaught exception:", error);
+	process.exit(1);
+});
+
+// A stray unhandled rejection BEFORE the server is listening means startup has
+// failed — exit so we don't spin forever (see above). Once we are serving
+// requests we only log, to avoid crashing an otherwise-healthy instance.
+process.on("unhandledRejection", (reason) => {
+	console.error("Unhandled rejection:", reason);
+	if (!isServerListening) {
+		process.exit(1);
+	}
+});
+
 // Initialize critical directories and Traefik config BEFORE Next.js starts
 // This prevents race conditions with the install script
 if (process.env.NODE_ENV === "production" && !IS_CLOUD) {
-	setupDirectories();
-	createDefaultTraefikConfig();
-	createDefaultServerTraefikConfig();
-	console.log("✅ initialization complete");
+	try {
+		setupDirectories();
+		createDefaultTraefikConfig();
+		createDefaultServerTraefikConfig();
+		console.log("✅ initialization complete");
+	} catch (error) {
+		console.error("Failed to initialize directories/Traefik config:", error);
+		process.exit(1);
+	}
 }
 
 const app = next({ dev, turbopack: process.env.TURBOPACK === "1" });
 const handle = app.getRequestHandler();
-void app.prepare().then(async () => {
-	try {
-		console.log("Running DokployVersion: ", packageInfo.version);
-		const server = http.createServer((req, res) => {
-			handle(req, res);
-		});
+void app
+	.prepare()
+	.then(async () => {
+		try {
+			console.log("Running DokployVersion: ", packageInfo.version);
+			const server = http.createServer((req, res) => {
+				handle(req, res);
+			});
 
-		// WEBSOCKET
-		setupDrawerLogsWebSocketServer(server);
-		setupDeploymentLogsWebSocketServer(server);
-		setupDockerContainerLogsWebSocketServer(server);
-		setupDockerContainerTerminalWebSocketServer(server);
-		setupTerminalWebSocketServer(server);
-		if (!IS_CLOUD) {
-			setupDockerStatsMonitoringSocketServer(server);
-		}
+			// WEBSOCKET
+			setupDrawerLogsWebSocketServer(server);
+			setupDeploymentLogsWebSocketServer(server);
+			setupDockerContainerLogsWebSocketServer(server);
+			setupDockerContainerTerminalWebSocketServer(server);
+			setupTerminalWebSocketServer(server);
+			if (!IS_CLOUD) {
+				setupDockerStatsMonitoringSocketServer(server);
+			}
 
-		server.listen(PORT, HOST);
-		console.log(`Server Started on: http://${HOST}:${PORT}`);
-		if (process.env.NODE_ENV === "production" && !IS_CLOUD) {
-			createDefaultMiddlewares();
-			await initializeNetwork();
-			await initCronJobs();
-			await initSchedules();
-			await initCancelDeployments();
-			await initVolumeBackupsCronJobs();
-			await sendDokployRestartNotifications();
-		}
-		await initEnterpriseBackupCronJobs();
+			// Wait for the bind to succeed (or fail, e.g. EADDRINUSE) before
+			// continuing, so a listen failure exits cleanly instead of spinning.
+			await new Promise<void>((resolve, reject) => {
+				const onError = (error: Error) => reject(error);
+				server.once("error", onError);
+				server.listen(PORT, HOST, () => {
+					server.removeListener("error", onError);
+					resolve();
+				});
+			});
+			isServerListening = true;
+			console.log(`Server Started on: http://${HOST}:${PORT}`);
 
-		if (!IS_CLOUD) {
-			console.log("Starting Deployment Worker");
-			const { deploymentWorker } = await import("./queues/deployments-queue");
-			await deploymentWorker.run();
+			if (process.env.NODE_ENV === "production" && !IS_CLOUD) {
+				createDefaultMiddlewares();
+				await initializeNetwork();
+				await initCronJobs();
+				await initSchedules();
+				await initCancelDeployments();
+				await initVolumeBackupsCronJobs();
+				await sendDokployRestartNotifications();
+			}
+			await initEnterpriseBackupCronJobs();
+
+			if (!IS_CLOUD) {
+				console.log("Starting Deployment Worker");
+				const { deploymentWorker } = await import("./queues/deployments-queue");
+				await deploymentWorker.run();
+			}
+		} catch (e) {
+			console.error("Main Server Error", e);
+			// If we failed before binding, the process can never serve traffic —
+			// exit so the orchestrator restarts us instead of leaving it spinning.
+			if (!isServerListening) {
+				process.exit(1);
+			}
 		}
-	} catch (e) {
-		console.error("Main Server Error", e);
-	}
-});
+	})
+	.catch((error) => {
+		console.error("Failed to prepare Next.js app server:", error);
+		process.exit(1);
+	});