From 25f868eb67d9cf3f9ed86741d58ddee06c0e33cb Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Wed, 10 Jun 2026 19:53:57 +0200 Subject: [PATCH 1/4] fix: retry transient instance create failures instead of abandoning the run ComputeWorkloadManager.create swallows gateway errors by design, so a cold start that fails placement (e.g. a netns slot with a busy tap, a full node disk) silently abandons the dequeued run until the run engine's PENDING_EXECUTING timeout redrives it minutes later. These failures are transient per placement - redriven runs virtually always succeed - so retry the create up to 3 times with short backoff before giving up. Gateway 5xx and network-level fetch failures are retried; 4xx responses (won't heal) and timeouts (the instance may still be provisioning) are not. --- .../src/workloadManager/compute.test.ts | 45 +++++++++ .../supervisor/src/workloadManager/compute.ts | 92 ++++++++++++++----- 2 files changed, 116 insertions(+), 21 deletions(-) create mode 100644 apps/supervisor/src/workloadManager/compute.test.ts diff --git a/apps/supervisor/src/workloadManager/compute.test.ts b/apps/supervisor/src/workloadManager/compute.test.ts new file mode 100644 index 0000000000..7598caf04f --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.test.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from "vitest"; +import { ComputeClientError } from "@internal/compute"; +import { isRetryableCreateError } from "./compute.js"; + +describe("isRetryableCreateError", () => { + it("retries statuses where the create definitely did not commit", () => { + expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe( + true + ); + expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe( + true + ); + }); + + it("does not retry lost-response statuses (create may have committed)", () => { + expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe( + false + ); + expect( + isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw")) + ).toBe(false); + }); + + it("does not retry 4xx responses", () => { + expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe( + false + ); + expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe( + false + ); + }); + + it("does not retry timeouts (instance may still be provisioning)", () => { + expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false); + }); + + it("retries network-level fetch failures", () => { + expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true); + }); + + it("does not retry unknown errors", () => { + expect(isRetryableCreateError(new Error("something else"))).toBe(false); + expect(isRetryableCreateError("string error")).toBe(false); + }); +}); diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 3efad7d407..2719712d91 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -6,12 +6,41 @@ import { type WorkloadManagerCreateOptions, type WorkloadManagerOptions, } from "./types.js"; -import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { ComputeClient, ComputeClientError, stripImageDigest } from "@internal/compute"; +import { setTimeout as sleep } from "node:timers/promises"; import { extractTraceparent, getRunnerId } from "../util.js"; import type { OtlpTraceService } from "../services/otlpTraceService.js"; import { tryCatch } from "@trigger.dev/core"; import { encodeBaggage, fromContext } from "../wideEvents/index.js"; +const CREATE_MAX_ATTEMPTS = 3; +const CREATE_RETRY_BASE_DELAY_MS = 250; + +/** + * Whether a failed instance create is worth retrying. Only statuses where + * the create definitely did NOT commit are retried: 500 means the agent or + * fcrun returned a create error (e.g. a netns slot holding the tap busy, a + * full node disk - placement may differ on retry), 503 means the gateway + * had nowhere to place it. 502/504 are excluded: the gateway emits those + * when it fails to reach the node or read its response, which can happen + * AFTER the agent committed the create - and the gateway only records the + * instance name on a clean 201, so a same-name retry would miss the + * collision check and could double-create the VM on another node. 4xx won't + * heal on retry, and timeouts may still be provisioning. Network-level + * fetch failures are safe: if the gateway processed the create, its name + * index is populated and the retry 409s harmlessly. + */ +export function isRetryableCreateError(error: unknown): boolean { + if (error instanceof ComputeClientError) { + return error.status === 500 || error.status === 503; + } + if (error instanceof DOMException && error.name === "TimeoutError") { + return false; + } + // Network-level fetch failures (gateway briefly unreachable) + return error instanceof TypeError; +} + type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { gateway: { url: string; @@ -165,27 +194,48 @@ export class ComputeWorkloadManager implements WorkloadManager { const startMs = performance.now(); try { - const [error, data] = await tryCatch( - this.compute.instances.create({ - name: runnerId, - image: imageRef, - env: envVars, - cpu: opts.machine.cpu, - memory_gb: opts.machine.memory, - metadata: { - runId: opts.runFriendlyId, - envId: opts.envId, - envType: opts.envType, - orgId: opts.orgId, - projectId: opts.projectId, - deploymentVersion: opts.deploymentVersion, - machine: opts.machine.name, - }, - ...(Object.keys(labels).length > 0 ? { labels } : {}), - }) - ); + const createRequest = { + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, + ...(Object.keys(labels).length > 0 ? { labels } : {}), + }; + + // Retry transient placement failures instead of abandoning the run: a + // swallowed create error leaves the run waiting for the run engine's + // PENDING_EXECUTING timeout (minutes) before it is redriven, while a + // retried create typically succeeds in under a second (TRI-10293). + let error: unknown; + let data: Awaited> | null | undefined; + let attempt = 1; + for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) { + [error, data] = await tryCatch(this.compute.instances.create(createRequest)); + + if (!error) break; + + this.logger.warn("create instance attempt failed", { + runnerId, + attempt, + error: error instanceof Error ? error.message : String(error), + }); + + if (!isRetryableCreateError(error) || attempt === CREATE_MAX_ATTEMPTS) break; + await sleep(CREATE_RETRY_BASE_DELAY_MS * attempt); + } + event.createAttempts = attempt; - if (error) { + if (error || !data) { event.error = error instanceof Error ? error.message : String(error); event.errorType = error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch"; From 461dc8944c33fe721d7018a26895a94853ec37c3 Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Wed, 10 Jun 2026 21:13:19 +0200 Subject: [PATCH 2/4] fix: suffix retried instance create names to dodge stale registrations A failed create can leave its instance name registered gateway/fcrun-side until async cleanup runs, so a same-name retry can 409 against our own residue (observed: tap-EBUSY 500 at 18:29Z followed by 409 name_conflict on the retry 2.7s later, costing the full redrive anyway). Give retry attempts a deterministic -rN suffix; attempt 1 keeps the unsuffixed name so the non-retry path is unchanged. The suffixed name flows into both the instance name and TRIGGER_RUNNER_ID from the same variable - every downstream flow (suspend scheduling, snapshot dispatch, cancel guards, run-engine fields) treats it as one opaque self-reported token, and restored VMs already carry deterministic name suffixes. Temporary measure (TRI-10293): the proper fix is gateway-side cleanup of failed-create registrations. --- .../src/workloadManager/compute.test.ts | 13 +++++- .../supervisor/src/workloadManager/compute.ts | 46 +++++++++++++++++-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/apps/supervisor/src/workloadManager/compute.test.ts b/apps/supervisor/src/workloadManager/compute.test.ts index 7598caf04f..ea5ddabf28 100644 --- a/apps/supervisor/src/workloadManager/compute.test.ts +++ b/apps/supervisor/src/workloadManager/compute.test.ts @@ -1,6 +1,17 @@ import { describe, expect, it } from "vitest"; import { ComputeClientError } from "@internal/compute"; -import { isRetryableCreateError } from "./compute.js"; +import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js"; + +describe("runnerNameForAttempt", () => { + it("keeps the unsuffixed name for the first attempt", () => { + expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123"); + }); + + it("suffixes retry attempts deterministically", () => { + expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2"); + expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3"); + }); +}); describe("isRetryableCreateError", () => { it("retries statuses where the create definitely did not commit", () => { diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 2719712d91..307dc3615a 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -16,6 +16,23 @@ import { encodeBaggage, fromContext } from "../wideEvents/index.js"; const CREATE_MAX_ATTEMPTS = 3; const CREATE_RETRY_BASE_DELAY_MS = 250; +/** + * TEMPORARY (TRI-10293): a failed create can leave its instance name + * registered gateway/fcrun-side until async cleanup runs, so a same-name + * retry can 409 against our own residue. Until the gateway cleans up + * failed-create registrations properly, retry attempts get a deterministic + * suffix. Attempt 1 keeps the unsuffixed name so the non-retry path is + * unchanged; the suffixed name flows into both the instance name and + * TRIGGER_RUNNER_ID, which downstream flows treat as one opaque + * self-reported token. Only attempts following a ComputeClientError are + * suffixed - network-failure retries keep the same name on purpose, because + * the gateway's name-collision 409 is their safety net against + * double-creating an instance whose create response was lost. + */ +export function runnerNameForAttempt(runnerId: string, attempt: number): string { + return attempt === 1 ? runnerId : `${runnerId}-r${attempt}`; +} + /** * Whether a failed instance create is worth retrying. Only statuses where * the create definitely did NOT commit are retried: 500 means the agent or @@ -219,13 +236,36 @@ export class ComputeWorkloadManager implements WorkloadManager { let error: unknown; let data: Awaited> | null | undefined; let attempt = 1; + // Set after a ComputeClientError: the failed create may have left its + // name registered, so subsequent attempts use a suffixed name. + let suffixAttempts = false; for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) { - [error, data] = await tryCatch(this.compute.instances.create(createRequest)); + const attemptRunnerId = suffixAttempts + ? runnerNameForAttempt(runnerId, attempt) + : runnerId; + [error, data] = await tryCatch( + this.compute.instances.create( + attemptRunnerId === runnerId + ? createRequest + : { + ...createRequest, + name: attemptRunnerId, + env: { ...envVars, TRIGGER_RUNNER_ID: attemptRunnerId }, + } + ) + ); + + if (!error) { + event.runnerId = attemptRunnerId; + break; + } - if (!error) break; + if (error instanceof ComputeClientError) { + suffixAttempts = true; + } this.logger.warn("create instance attempt failed", { - runnerId, + runnerId: attemptRunnerId, attempt, error: error instanceof Error ? error.message : String(error), }); From af4962b2e7bd62144e8f4f5b5a3b2354e5f24cb6 Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Thu, 11 Jun 2026 16:28:24 +0200 Subject: [PATCH 3/4] feat: make instance create retry attempts and backoff configurable COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS (default 3, 1 disables retries) and COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS (default 250) thread through ComputeWorkloadManagerOptions.createRetry; behavior is unchanged at the defaults. --- apps/supervisor/src/env.ts | 8 ++++++++ apps/supervisor/src/index.ts | 4 ++++ .../supervisor/src/workloadManager/compute.ts | 20 ++++++++++++++----- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index f3b0d4c023..3919e73a7e 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -114,6 +114,14 @@ const Env = z COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10), + // Instance create retries for transient placement failures (1 = no retries) + COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3), + COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce + .number() + .int() + .min(0) + .max(10_000) + .default(250), // Kubernetes settings KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 64f3959c2b..e97c4c7bb9 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -144,6 +144,10 @@ class ManagedSupervisor { otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT, prettyLogs: env.RUNNER_PRETTY_LOGS, }, + createRetry: { + maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS, + baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS, + }, }); this.computeManager = computeManager; this.workloadManager = computeManager; diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts index 307dc3615a..5af1c9b5c3 100644 --- a/apps/supervisor/src/workloadManager/compute.ts +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -13,8 +13,8 @@ import type { OtlpTraceService } from "../services/otlpTraceService.js"; import { tryCatch } from "@trigger.dev/core"; import { encodeBaggage, fromContext } from "../wideEvents/index.js"; -const CREATE_MAX_ATTEMPTS = 3; -const CREATE_RETRY_BASE_DELAY_MS = 250; +const DEFAULT_CREATE_MAX_ATTEMPTS = 3; +const DEFAULT_CREATE_RETRY_BASE_DELAY_MS = 250; /** * TEMPORARY (TRI-10293): a failed create can leave its instance name @@ -76,13 +76,23 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { otelEndpoint: string; prettyLogs: boolean; }; + createRetry?: { + maxAttempts: number; + baseDelayMs: number; + }; }; export class ComputeWorkloadManager implements WorkloadManager { private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); private readonly compute: ComputeClient; + private readonly createMaxAttempts: number; + private readonly createRetryBaseDelayMs: number; constructor(private opts: ComputeWorkloadManagerOptions) { + this.createMaxAttempts = opts.createRetry?.maxAttempts ?? DEFAULT_CREATE_MAX_ATTEMPTS; + this.createRetryBaseDelayMs = + opts.createRetry?.baseDelayMs ?? DEFAULT_CREATE_RETRY_BASE_DELAY_MS; + if (opts.workloadApiDomain) { this.logger.warn("⚠️ Custom workload API domain", { domain: opts.workloadApiDomain, @@ -239,7 +249,7 @@ export class ComputeWorkloadManager implements WorkloadManager { // Set after a ComputeClientError: the failed create may have left its // name registered, so subsequent attempts use a suffixed name. let suffixAttempts = false; - for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) { + for (; attempt <= this.createMaxAttempts; attempt++) { const attemptRunnerId = suffixAttempts ? runnerNameForAttempt(runnerId, attempt) : runnerId; @@ -270,8 +280,8 @@ export class ComputeWorkloadManager implements WorkloadManager { error: error instanceof Error ? error.message : String(error), }); - if (!isRetryableCreateError(error) || attempt === CREATE_MAX_ATTEMPTS) break; - await sleep(CREATE_RETRY_BASE_DELAY_MS * attempt); + if (!isRetryableCreateError(error) || attempt === this.createMaxAttempts) break; + await sleep(this.createRetryBaseDelayMs * attempt); } event.createAttempts = attempt; From ddbe4b7060d224483e7a9fe8467a4721620873a5 Mon Sep 17 00:00:00 2001 From: Saadi Myftija Date: Thu, 11 Jun 2026 17:04:56 +0200 Subject: [PATCH 4/4] chore: add server-changes entry --- .server-changes/retry-transient-instance-create-failures.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/retry-transient-instance-create-failures.md diff --git a/.server-changes/retry-transient-instance-create-failures.md b/.server-changes/retry-transient-instance-create-failures.md new file mode 100644 index 0000000000..f7b9c7afd1 --- /dev/null +++ b/.server-changes/retry-transient-instance-create-failures.md @@ -0,0 +1,6 @@ +--- +area: supervisor +type: fix +--- + +Retry transient instance create failures during cold starts instead of waiting minutes for the run to be requeued.