Skip to content

Commit bd79b92

Browse files
committed
feat: harden request reliability and runtime telemetry
# Conflicts: # index.ts # lib/codex-manager/commands/status.ts # lib/runtime/metrics.ts # test/runtime-metrics.test.ts
1 parent 6ac8e0b commit bd79b92

10 files changed

Lines changed: 540 additions & 53 deletions

File tree

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Codex CLI-first multi-account OAuth manager for the official `@openai/codex` CLI
1919
- Interactive dashboard for account actions and settings
2020
- Experimental settings tab for staged sync, backup, and refresh-guard controls
2121
- Forecast, report, fix, and doctor commands for operational safety
22+
- Runtime request budget, cooldown, and traffic observability in `codex auth status` / `codex auth report`
2223
- Flagged account verification and restore flow
2324
- Session affinity and live account sync controls
2425
- Proactive refresh and preemptive quota deferral controls
@@ -156,6 +157,14 @@ If browser launch is blocked, use the alternate login paths in [docs/getting-sta
156157
| `codex auth report --live --json` | How do I get the full machine-readable health report? |
157158
| `codex auth fix --live --model gpt-5-codex` | How do I run live repair probes with a chosen model? |
158159

160+
### Reliability behavior
161+
162+
- whole-pool replay is disabled by default when every account is rate-limited
163+
- active requests use a bounded outbound request budget so one prompt cannot walk the full pool indefinitely
164+
- repeated cross-account 5xx bursts trigger a short cooldown instead of continuing aggressive rotation
165+
- proactive refresh is staggered to reduce background refresh bursts
166+
- `codex auth status` and `codex auth report --json` surface recent runtime request metrics and active cooldown windows
167+
159168
---
160169

161170
## Dashboard Hotkeys
@@ -265,6 +274,8 @@ codex auth login
265274

266275
- `codex auth` unrecognized: run `where codex`, then follow `docs/troubleshooting.md` for routing fallback commands
267276
- Switch succeeds but wrong account appears active: run `codex auth switch <index>`, then restart session
277+
- Requests fail fast with a pool cooldown message: wait for the cooldown window or inspect `codex auth status`
278+
- Requests fail fast after repeated upstream 5xx errors: inspect `codex auth report --json` for runtime traffic and cooldown details
268279
- OAuth callback on port `1455` fails: free the port and re-run `codex auth login`
269280
- Browser launch is blocked or you are in a headless shell: re-run `codex auth login --manual` or set `CODEX_AUTH_NO_BROWSER=1`
270281
- `missing field id_token` / `token_expired` / `refresh_token_reused`: re-login affected account

index.ts

Lines changed: 156 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,18 @@ import {
152152
parseFailoverMode,
153153
} from "./lib/request/failover-config.js";
154154
import {
155-
buildStreamFailoverCandidateOrder,
156155
capStreamFailoverMax,
157156
computeOutboundRequestAttemptBudget,
158157
} from "./lib/request/request-attempt-budget.js";
158+
import {
159+
armPoolExhaustionCooldown,
160+
buildAdaptiveStreamFailoverCandidateOrder,
161+
clearPoolExhaustionCooldown,
162+
clearServerBurstCooldown,
163+
getPoolExhaustionCooldownRemaining,
164+
getServerBurstCooldownRemaining,
165+
recordServerBurstFailure,
166+
} from "./lib/request/request-resilience.js";
159167
import {
160168
evaluateFailurePolicy,
161169
type FailoverMode,
@@ -239,6 +247,7 @@ import {
239247
ensureRefreshGuardianState,
240248
ensureSessionAffinityState,
241249
} from "./lib/runtime/runtime-services.js";
250+
import { mutateRuntimeObservabilitySnapshot } from "./lib/runtime/runtime-observability.js";
242251
import { applyAccountStorageScopeFromConfig } from "./lib/runtime/storage-scope.js";
243252
import { showRuntimeToast } from "./lib/runtime/toast.js";
244253
import { createRuntimeSessionRecoveryHook } from "./lib/runtime/session-recovery.js";
@@ -353,9 +362,14 @@ let sessionAffinityWriteVersion = 0;
353362
totalRequests: number;
354363
successfulRequests: number;
355364
failedRequests: number;
365+
responsesRequests: number;
366+
authRefreshRequests: number;
367+
diagnosticProbeRequests: number;
356368
outboundRequestAttemptBudget: number | null;
357369
outboundRequestAttemptsConsumed: number;
358370
requestAttemptBudgetExhaustions: number;
371+
poolExhaustionFastFails: number;
372+
serverBurstFastFails: number;
359373
rateLimitedResponses: number;
360374
serverErrors: number;
361375
networkErrors: number;
@@ -379,9 +393,14 @@ let sessionAffinityWriteVersion = 0;
379393
totalRequests: 0,
380394
successfulRequests: 0,
381395
failedRequests: 0,
396+
responsesRequests: 0,
397+
authRefreshRequests: 0,
398+
diagnosticProbeRequests: 0,
382399
outboundRequestAttemptBudget: null,
383400
outboundRequestAttemptsConsumed: 0,
384401
requestAttemptBudgetExhaustions: 0,
402+
poolExhaustionFastFails: 0,
403+
serverBurstFastFails: 0,
385404
rateLimitedResponses: 0,
386405
serverErrors: 0,
387406
networkErrors: 0,
@@ -422,6 +441,23 @@ let sessionAffinityWriteVersion = 0;
422441
findMatchingAccountIndex,
423442
modelFamilies: MODEL_FAMILIES,
424443
});
444+
const syncRuntimeObservability = (requestId: string | null): void => {
445+
mutateRuntimeObservabilitySnapshot((snapshot) => {
446+
snapshot.currentRequestId = requestId;
447+
snapshot.poolExhaustionCooldownUntil =
448+
getPoolExhaustionCooldownRemaining() > 0
449+
? Date.now() + getPoolExhaustionCooldownRemaining()
450+
: null;
451+
snapshot.serverBurstCooldownUntil =
452+
getServerBurstCooldownRemaining() > 0
453+
? Date.now() + getServerBurstCooldownRemaining()
454+
: null;
455+
snapshot.responsesRequests = runtimeMetrics.responsesRequests;
456+
snapshot.authRefreshRequests = runtimeMetrics.authRefreshRequests;
457+
snapshot.diagnosticProbeRequests = runtimeMetrics.diagnosticProbeRequests;
458+
snapshot.runtimeMetrics = { ...runtimeMetrics };
459+
});
460+
};
425461
const persistAccountPoolAndFlagged = async (
426462
results: TokenSuccessWithAccount[],
427463
flaggedStorage: Parameters<typeof saveFlaggedAccounts>[0],
@@ -929,31 +965,65 @@ let sessionAffinityWriteVersion = 0;
929965
sessionAffinityKey,
930966
);
931967
sessionAffinityStore?.prune();
932-
const requestCorrelationId = setCorrelationId(
933-
threadIdCandidate
934-
? `${threadIdCandidate}:${Date.now()}`
935-
: undefined,
936-
);
937-
runtimeMetrics.lastRequestAt = Date.now();
968+
const requestCorrelationId = setCorrelationId(
969+
threadIdCandidate
970+
? `${threadIdCandidate}:${Date.now()}`
971+
: undefined,
972+
);
973+
const requestTraceId = requestCorrelationId;
974+
runtimeMetrics.lastRequestAt = Date.now();
975+
syncRuntimeObservability(requestTraceId);
938976

939977
const abortSignal = requestInit?.signal ?? init?.signal ?? null;
940978
const sleep = createAbortableSleep(abortSignal);
979+
const poolCooldownRemainingMs = getPoolExhaustionCooldownRemaining();
980+
if (poolCooldownRemainingMs > 0) {
981+
runtimeMetrics.failedRequests += 1;
982+
runtimeMetrics.poolExhaustionFastFails += 1;
983+
runtimeMetrics.lastError = "Pool exhaustion cooldown active";
984+
syncRuntimeObservability(requestTraceId);
985+
return new Response(
986+
JSON.stringify({
987+
error: {
988+
message: `The account pool is cooling down after recent rate-limit exhaustion. Try again in ${formatWaitTime(poolCooldownRemainingMs)} or inspect \`codex auth status\`.`,
989+
},
990+
}),
991+
{ status: 429, headers: { "content-type": "application/json; charset=utf-8" } },
992+
);
993+
}
994+
const serverBurstCooldownRemainingMs = getServerBurstCooldownRemaining();
995+
if (serverBurstCooldownRemainingMs > 0) {
996+
runtimeMetrics.failedRequests += 1;
997+
runtimeMetrics.serverBurstFastFails += 1;
998+
runtimeMetrics.lastError = "Server burst cooldown active";
999+
syncRuntimeObservability(requestTraceId);
1000+
return new Response(
1001+
JSON.stringify({
1002+
error: {
1003+
message: `Multiple accounts recently failed with upstream server errors. Try again in ${formatWaitTime(serverBurstCooldownRemainingMs)} or inspect \`codex auth report --json\`.`,
1004+
},
1005+
}),
1006+
{ status: 503, headers: { "content-type": "application/json; charset=utf-8" } },
1007+
);
1008+
}
9411009
const maxOutboundRequestAttempts =
9421010
computeOutboundRequestAttemptBudget({
9431011
accountCount: accountManager.getAccountCount(),
9441012
maxSameAccountRetries,
9451013
emptyResponseMaxRetries,
9461014
streamFailoverMax,
9471015
});
948-
runtimeMetrics.outboundRequestAttemptBudget =
949-
maxOutboundRequestAttempts;
1016+
runtimeMetrics.outboundRequestAttemptBudget ??=
1017+
maxOutboundRequestAttempts;
9501018
logDebug("Configured outbound request attempt budget.", {
1019+
requestTraceId,
9511020
budget: maxOutboundRequestAttempts,
9521021
accountCount: accountManager.getAccountCount(),
9531022
maxSameAccountRetries,
9541023
emptyResponseMaxRetries,
9551024
streamFailoverMax,
9561025
});
1026+
syncRuntimeObservability(requestTraceId);
9571027
let outboundRequestAttemptsRemaining =
9581028
maxOutboundRequestAttempts;
9591029
const tryConsumeOutboundRequestAttempt = (
@@ -964,17 +1034,18 @@ let sessionAffinityWriteVersion = 0;
9641034
runtimeMetrics.requestAttemptBudgetExhaustions += 1;
9651035
runtimeMetrics.lastError =
9661036
`Request attempt budget exhausted after ${maxOutboundRequestAttempts} outbound request(s)`;
967-
logWarn(
968-
"Request attempt budget exhausted.",
969-
{
970-
reason,
971-
accountIndex,
972-
budget: maxOutboundRequestAttempts,
973-
consumed:
974-
runtimeMetrics.outboundRequestAttemptsConsumed,
975-
},
976-
);
977-
return false;
1037+
logWarn(
1038+
"Request attempt budget exhausted.",
1039+
{
1040+
requestTraceId,
1041+
reason,
1042+
accountIndex,
1043+
budget: maxOutboundRequestAttempts,
1044+
consumed: maxOutboundRequestAttempts,
1045+
},
1046+
);
1047+
syncRuntimeObservability(requestTraceId);
1048+
return false;
9781049
}
9791050

9801051
runtimeMetrics.outboundRequestAttemptsConsumed += 1;
@@ -1096,6 +1167,8 @@ let sessionAffinityWriteVersion = 0;
10961167
) as OAuthAuthDetails;
10971168
try {
10981169
if (shouldRefreshToken(accountAuth, tokenRefreshSkewMs)) {
1170+
runtimeMetrics.authRefreshRequests += 1;
1171+
syncRuntimeObservability(requestTraceId);
10991172
accountAuth = (await refreshAndUpdateToken(
11001173
accountAuth,
11011174
client,
@@ -1409,6 +1482,8 @@ let sessionAffinityWriteVersion = 0;
14091482

14101483
try {
14111484
runtimeMetrics.totalRequests++;
1485+
runtimeMetrics.responsesRequests++;
1486+
syncRuntimeObservability(requestTraceId);
14121487
response = await fetch(
14131488
url,
14141489
applyProxyCompatibleInit(url, {
@@ -1978,7 +2053,7 @@ let sessionAffinityWriteVersion = 0;
19782053
break;
19792054
}
19802055

1981-
if (errorResponse.status === 429 && rateLimit) {
2056+
if (errorResponse.status === 429 && rateLimit) {
19822057
runtimeMetrics.rateLimitedResponses++;
19832058
const retryAfterMs =
19842059
rateLimit?.retryAfterMs ?? 60_000;
@@ -2085,11 +2160,14 @@ let sessionAffinityWriteVersion = 0;
20852160
let responseForSuccess = response;
20862161
if (isStreaming) {
20872162
const streamFallbackCandidateOrder =
2088-
buildStreamFailoverCandidateOrder(
2163+
buildAdaptiveStreamFailoverCandidateOrder(
20892164
account.index,
2090-
accountSnapshotList.map(
2091-
(candidate) => candidate.index,
2092-
),
2165+
accountSnapshotList as Array<
2166+
Pick<
2167+
import("./lib/accounts.js").ManagedAccount,
2168+
"index" | "lastUsed" | "enabled" | "coolingDownUntil" | "rateLimitResetTimes"
2169+
>
2170+
>,
20932171
);
20942172
runtimeMetrics.lastStreamFailoverCandidateCount =
20952173
streamFallbackCandidateOrder.length;
@@ -2098,12 +2176,14 @@ let sessionAffinityWriteVersion = 0;
20982176
logDebug(
20992177
"Prepared stream failover candidates.",
21002178
{
2179+
requestTraceId,
21012180
primaryAccountIndex: account.index,
21022181
candidateCount:
21032182
streamFallbackCandidateOrder.length,
21042183
candidateIndices: streamFallbackCandidateOrder,
21052184
},
21062185
);
2186+
syncRuntimeObservability(requestTraceId);
21072187
responseForSuccess = withStreamingFailover(
21082188
response,
21092189
async (failoverAttempt, emittedBytes) => {
@@ -2136,13 +2216,15 @@ let sessionAffinityWriteVersion = 0;
21362216
fallbackAccount,
21372217
) as OAuthAuthDetails;
21382218
try {
2139-
if (
2140-
shouldRefreshToken(
2141-
fallbackAuth,
2142-
tokenRefreshSkewMs,
2143-
)
2144-
) {
2145-
fallbackAuth = (await refreshAndUpdateToken(
2219+
if (
2220+
shouldRefreshToken(
2221+
fallbackAuth,
2222+
tokenRefreshSkewMs,
2223+
)
2224+
) {
2225+
runtimeMetrics.authRefreshRequests += 1;
2226+
syncRuntimeObservability(requestTraceId);
2227+
fallbackAuth = (await refreshAndUpdateToken(
21462228
fallbackAuth,
21472229
client,
21482230
)) as OAuthAuthDetails;
@@ -2278,6 +2360,8 @@ let sessionAffinityWriteVersion = 0;
22782360

22792361
try {
22802362
runtimeMetrics.totalRequests++;
2363+
runtimeMetrics.responsesRequests++;
2364+
syncRuntimeObservability(requestTraceId);
22812365
const fallbackResponse = await fetch(
22822366
url,
22832367
applyProxyCompatibleInit(url, {
@@ -2507,6 +2591,24 @@ let sessionAffinityWriteVersion = 0;
25072591
);
25082592
break;
25092593
}
2594+
if (response.status >= 500) {
2595+
const serverBurstCooldownUntil =
2596+
recordServerBurstFailure(account.index);
2597+
if (serverBurstCooldownUntil > 0) {
2598+
runtimeMetrics.serverBurstFastFails += 1;
2599+
runtimeMetrics.lastError =
2600+
"Repeated upstream server errors across the account pool";
2601+
syncRuntimeObservability(requestTraceId);
2602+
return new Response(
2603+
JSON.stringify({
2604+
error: {
2605+
message: `Upstream server failures were observed across multiple accounts. Pausing retries for ${formatWaitTime(serverBurstCooldownUntil - Date.now())}. Check \`codex auth report --json\` for runtime metrics.`,
2606+
},
2607+
}),
2608+
{ status: 503, headers: { "content-type": "application/json; charset=utf-8" } },
2609+
);
2610+
}
2611+
}
25102612
logWarn(
25112613
`Empty response after ${emptyResponseMaxRetries} retries. Returning as-is.`,
25122614
);
@@ -2558,6 +2660,9 @@ let sessionAffinityWriteVersion = 0;
25582660
lastCodexCliActiveSyncIndex =
25592661
successAccountForResponse.index;
25602662
}
2663+
clearPoolExhaustionCooldown();
2664+
clearServerBurstCooldown();
2665+
syncRuntimeObservability(requestTraceId);
25612666
return successResponse;
25622667
}
25632668
if (retryNextAccountBeforeFallback) {
@@ -2603,26 +2708,31 @@ let sessionAffinityWriteVersion = 0;
26032708
continue;
26042709
}
26052710

2606-
const waitLabel =
2607-
waitMs > 0 ? formatWaitTime(waitMs) : "a bit";
2608-
const message =
2609-
count === 0
2610-
? "No Codex accounts configured. Run `codex login`."
2611-
: waitMs > 0
2612-
? `All ${count} account(s) are rate-limited. Try again in ${waitLabel} or add another account with \`codex login\`.`
2613-
: `All ${count} account(s) failed (server errors or auth issues). Check account health with \`codex-health\`.`;
2614-
runtimeMetrics.failedRequests++;
2615-
runtimeMetrics.lastError = message;
2616-
return new Response(JSON.stringify({ error: { message } }), {
2711+
const waitLabel =
2712+
waitMs > 0 ? formatWaitTime(waitMs) : "a bit";
2713+
if (count > 0 && waitMs > 0) {
2714+
armPoolExhaustionCooldown(waitMs);
2715+
}
2716+
const message =
2717+
count === 0
2718+
? "No Codex accounts configured. Run `codex login`."
2719+
: waitMs > 0
2720+
? `All ${count} account(s) are rate-limited. A short pool cooldown is now active for ${waitLabel}. Try again later or inspect \`codex auth status\`.`
2721+
: `All ${count} account(s) failed (server errors or auth issues). Check account health with \`codex auth report --json\`.`;
2722+
runtimeMetrics.failedRequests++;
2723+
runtimeMetrics.lastError = message;
2724+
syncRuntimeObservability(requestTraceId);
2725+
return new Response(JSON.stringify({ error: { message } }), {
26172726
status: waitMs > 0 ? 429 : 503,
26182727
headers: {
26192728
"content-type": "application/json; charset=utf-8",
26202729
},
26212730
});
26222731
}
2623-
} finally {
2624-
clearCorrelationId();
2625-
}
2732+
} finally {
2733+
syncRuntimeObservability(null);
2734+
clearCorrelationId();
2735+
}
26262736
},
26272737
};
26282738
} finally {

0 commit comments

Comments
 (0)