@@ -152,10 +152,18 @@ import {
152152 parseFailoverMode ,
153153} from "./lib/request/failover-config.js" ;
154154import {
155- buildStreamFailoverCandidateOrder ,
156155 capStreamFailoverMax ,
157156 computeOutboundRequestAttemptBudget ,
158157} from "./lib/request/request-attempt-budget.js" ;
158+ import {
159+ armPoolExhaustionCooldown ,
160+ buildAdaptiveStreamFailoverCandidateOrder ,
161+ clearPoolExhaustionCooldown ,
162+ clearServerBurstCooldown ,
163+ getPoolExhaustionCooldownRemaining ,
164+ getServerBurstCooldownRemaining ,
165+ recordServerBurstFailure ,
166+ } from "./lib/request/request-resilience.js" ;
159167import {
160168 evaluateFailurePolicy ,
161169 type FailoverMode ,
@@ -239,6 +247,7 @@ import {
239247 ensureRefreshGuardianState ,
240248 ensureSessionAffinityState ,
241249} from "./lib/runtime/runtime-services.js" ;
250+ import { mutateRuntimeObservabilitySnapshot } from "./lib/runtime/runtime-observability.js" ;
242251import { applyAccountStorageScopeFromConfig } from "./lib/runtime/storage-scope.js" ;
243252import { showRuntimeToast } from "./lib/runtime/toast.js" ;
244253import { createRuntimeSessionRecoveryHook } from "./lib/runtime/session-recovery.js" ;
@@ -353,9 +362,14 @@ let sessionAffinityWriteVersion = 0;
353362 totalRequests : number ;
354363 successfulRequests : number ;
355364 failedRequests : number ;
365+ responsesRequests : number ;
366+ authRefreshRequests : number ;
367+ diagnosticProbeRequests : number ;
356368 outboundRequestAttemptBudget : number | null ;
357369 outboundRequestAttemptsConsumed : number ;
358370 requestAttemptBudgetExhaustions : number ;
371+ poolExhaustionFastFails : number ;
372+ serverBurstFastFails : number ;
359373 rateLimitedResponses : number ;
360374 serverErrors : number ;
361375 networkErrors : number ;
@@ -379,9 +393,14 @@ let sessionAffinityWriteVersion = 0;
379393 totalRequests : 0 ,
380394 successfulRequests : 0 ,
381395 failedRequests : 0 ,
396+ responsesRequests : 0 ,
397+ authRefreshRequests : 0 ,
398+ diagnosticProbeRequests : 0 ,
382399 outboundRequestAttemptBudget : null ,
383400 outboundRequestAttemptsConsumed : 0 ,
384401 requestAttemptBudgetExhaustions : 0 ,
402+ poolExhaustionFastFails : 0 ,
403+ serverBurstFastFails : 0 ,
385404 rateLimitedResponses : 0 ,
386405 serverErrors : 0 ,
387406 networkErrors : 0 ,
@@ -422,6 +441,23 @@ let sessionAffinityWriteVersion = 0;
422441 findMatchingAccountIndex,
423442 modelFamilies : MODEL_FAMILIES ,
424443 } ) ;
444+ const syncRuntimeObservability = ( requestId : string | null ) : void => {
445+ mutateRuntimeObservabilitySnapshot ( ( snapshot ) => {
446+ snapshot . currentRequestId = requestId ;
447+ snapshot . poolExhaustionCooldownUntil =
448+ getPoolExhaustionCooldownRemaining ( ) > 0
449+ ? Date . now ( ) + getPoolExhaustionCooldownRemaining ( )
450+ : null ;
451+ snapshot . serverBurstCooldownUntil =
452+ getServerBurstCooldownRemaining ( ) > 0
453+ ? Date . now ( ) + getServerBurstCooldownRemaining ( )
454+ : null ;
455+ snapshot . responsesRequests = runtimeMetrics . responsesRequests ;
456+ snapshot . authRefreshRequests = runtimeMetrics . authRefreshRequests ;
457+ snapshot . diagnosticProbeRequests = runtimeMetrics . diagnosticProbeRequests ;
458+ snapshot . runtimeMetrics = { ...runtimeMetrics } ;
459+ } ) ;
460+ } ;
425461 const persistAccountPoolAndFlagged = async (
426462 results : TokenSuccessWithAccount [ ] ,
427463 flaggedStorage : Parameters < typeof saveFlaggedAccounts > [ 0 ] ,
@@ -929,31 +965,65 @@ let sessionAffinityWriteVersion = 0;
929965 sessionAffinityKey ,
930966 ) ;
931967 sessionAffinityStore ?. prune ( ) ;
932- const requestCorrelationId = setCorrelationId (
933- threadIdCandidate
934- ? `${ threadIdCandidate } :${ Date . now ( ) } `
935- : undefined ,
936- ) ;
937- runtimeMetrics . lastRequestAt = Date . now ( ) ;
968+ const requestCorrelationId = setCorrelationId (
969+ threadIdCandidate
970+ ? `${ threadIdCandidate } :${ Date . now ( ) } `
971+ : undefined ,
972+ ) ;
973+ const requestTraceId = requestCorrelationId ;
974+ runtimeMetrics . lastRequestAt = Date . now ( ) ;
975+ syncRuntimeObservability ( requestTraceId ) ;
938976
939977 const abortSignal = requestInit ?. signal ?? init ?. signal ?? null ;
940978 const sleep = createAbortableSleep ( abortSignal ) ;
979+ const poolCooldownRemainingMs = getPoolExhaustionCooldownRemaining ( ) ;
980+ if ( poolCooldownRemainingMs > 0 ) {
981+ runtimeMetrics . failedRequests += 1 ;
982+ runtimeMetrics . poolExhaustionFastFails += 1 ;
983+ runtimeMetrics . lastError = "Pool exhaustion cooldown active" ;
984+ syncRuntimeObservability ( requestTraceId ) ;
985+ return new Response (
986+ JSON . stringify ( {
987+ error : {
988+ message : `The account pool is cooling down after recent rate-limit exhaustion. Try again in ${ formatWaitTime ( poolCooldownRemainingMs ) } or inspect \`codex auth status\`.` ,
989+ } ,
990+ } ) ,
991+ { status : 429 , headers : { "content-type" : "application/json; charset=utf-8" } } ,
992+ ) ;
993+ }
994+ const serverBurstCooldownRemainingMs = getServerBurstCooldownRemaining ( ) ;
995+ if ( serverBurstCooldownRemainingMs > 0 ) {
996+ runtimeMetrics . failedRequests += 1 ;
997+ runtimeMetrics . serverBurstFastFails += 1 ;
998+ runtimeMetrics . lastError = "Server burst cooldown active" ;
999+ syncRuntimeObservability ( requestTraceId ) ;
1000+ return new Response (
1001+ JSON . stringify ( {
1002+ error : {
1003+ message : `Multiple accounts recently failed with upstream server errors. Try again in ${ formatWaitTime ( serverBurstCooldownRemainingMs ) } or inspect \`codex auth report --json\`.` ,
1004+ } ,
1005+ } ) ,
1006+ { status : 503 , headers : { "content-type" : "application/json; charset=utf-8" } } ,
1007+ ) ;
1008+ }
9411009 const maxOutboundRequestAttempts =
9421010 computeOutboundRequestAttemptBudget ( {
9431011 accountCount : accountManager . getAccountCount ( ) ,
9441012 maxSameAccountRetries,
9451013 emptyResponseMaxRetries,
9461014 streamFailoverMax,
9471015 } ) ;
948- runtimeMetrics . outboundRequestAttemptBudget =
949- maxOutboundRequestAttempts ;
1016+ runtimeMetrics . outboundRequestAttemptBudget ?? =
1017+ maxOutboundRequestAttempts ;
9501018 logDebug ( "Configured outbound request attempt budget." , {
1019+ requestTraceId,
9511020 budget : maxOutboundRequestAttempts ,
9521021 accountCount : accountManager . getAccountCount ( ) ,
9531022 maxSameAccountRetries,
9541023 emptyResponseMaxRetries,
9551024 streamFailoverMax,
9561025 } ) ;
1026+ syncRuntimeObservability ( requestTraceId ) ;
9571027 let outboundRequestAttemptsRemaining =
9581028 maxOutboundRequestAttempts ;
9591029 const tryConsumeOutboundRequestAttempt = (
@@ -964,17 +1034,18 @@ let sessionAffinityWriteVersion = 0;
9641034 runtimeMetrics . requestAttemptBudgetExhaustions += 1 ;
9651035 runtimeMetrics . lastError =
9661036 `Request attempt budget exhausted after ${ maxOutboundRequestAttempts } outbound request(s)` ;
967- logWarn (
968- "Request attempt budget exhausted." ,
969- {
970- reason,
971- accountIndex,
972- budget : maxOutboundRequestAttempts ,
973- consumed :
974- runtimeMetrics . outboundRequestAttemptsConsumed ,
975- } ,
976- ) ;
977- return false ;
1037+ logWarn (
1038+ "Request attempt budget exhausted." ,
1039+ {
1040+ requestTraceId,
1041+ reason,
1042+ accountIndex,
1043+ budget : maxOutboundRequestAttempts ,
1044+ consumed : maxOutboundRequestAttempts ,
1045+ } ,
1046+ ) ;
1047+ syncRuntimeObservability ( requestTraceId ) ;
1048+ return false ;
9781049 }
9791050
9801051 runtimeMetrics . outboundRequestAttemptsConsumed += 1 ;
@@ -1096,6 +1167,8 @@ let sessionAffinityWriteVersion = 0;
10961167 ) as OAuthAuthDetails ;
10971168 try {
10981169 if ( shouldRefreshToken ( accountAuth , tokenRefreshSkewMs ) ) {
1170+ runtimeMetrics . authRefreshRequests += 1 ;
1171+ syncRuntimeObservability ( requestTraceId ) ;
10991172 accountAuth = ( await refreshAndUpdateToken (
11001173 accountAuth ,
11011174 client ,
@@ -1409,6 +1482,8 @@ let sessionAffinityWriteVersion = 0;
14091482
14101483 try {
14111484 runtimeMetrics . totalRequests ++ ;
1485+ runtimeMetrics . responsesRequests ++ ;
1486+ syncRuntimeObservability ( requestTraceId ) ;
14121487 response = await fetch (
14131488 url ,
14141489 applyProxyCompatibleInit ( url , {
@@ -1978,7 +2053,7 @@ let sessionAffinityWriteVersion = 0;
19782053 break ;
19792054 }
19802055
1981- if ( errorResponse . status === 429 && rateLimit ) {
2056+ if ( errorResponse . status === 429 && rateLimit ) {
19822057 runtimeMetrics . rateLimitedResponses ++ ;
19832058 const retryAfterMs =
19842059 rateLimit ?. retryAfterMs ?? 60_000 ;
@@ -2085,11 +2160,14 @@ let sessionAffinityWriteVersion = 0;
20852160 let responseForSuccess = response ;
20862161 if ( isStreaming ) {
20872162 const streamFallbackCandidateOrder =
2088- buildStreamFailoverCandidateOrder (
2163+ buildAdaptiveStreamFailoverCandidateOrder (
20892164 account . index ,
2090- accountSnapshotList . map (
2091- ( candidate ) => candidate . index ,
2092- ) ,
2165+ accountSnapshotList as Array <
2166+ Pick <
2167+ import ( "./lib/accounts.js" ) . ManagedAccount ,
2168+ "index" | "lastUsed" | "enabled" | "coolingDownUntil" | "rateLimitResetTimes"
2169+ >
2170+ > ,
20932171 ) ;
20942172 runtimeMetrics . lastStreamFailoverCandidateCount =
20952173 streamFallbackCandidateOrder . length ;
@@ -2098,12 +2176,14 @@ let sessionAffinityWriteVersion = 0;
20982176 logDebug (
20992177 "Prepared stream failover candidates." ,
21002178 {
2179+ requestTraceId,
21012180 primaryAccountIndex : account . index ,
21022181 candidateCount :
21032182 streamFallbackCandidateOrder . length ,
21042183 candidateIndices : streamFallbackCandidateOrder ,
21052184 } ,
21062185 ) ;
2186+ syncRuntimeObservability ( requestTraceId ) ;
21072187 responseForSuccess = withStreamingFailover (
21082188 response ,
21092189 async ( failoverAttempt , emittedBytes ) => {
@@ -2136,13 +2216,15 @@ let sessionAffinityWriteVersion = 0;
21362216 fallbackAccount ,
21372217 ) as OAuthAuthDetails ;
21382218 try {
2139- if (
2140- shouldRefreshToken (
2141- fallbackAuth ,
2142- tokenRefreshSkewMs ,
2143- )
2144- ) {
2145- fallbackAuth = ( await refreshAndUpdateToken (
2219+ if (
2220+ shouldRefreshToken (
2221+ fallbackAuth ,
2222+ tokenRefreshSkewMs ,
2223+ )
2224+ ) {
2225+ runtimeMetrics . authRefreshRequests += 1 ;
2226+ syncRuntimeObservability ( requestTraceId ) ;
2227+ fallbackAuth = ( await refreshAndUpdateToken (
21462228 fallbackAuth ,
21472229 client ,
21482230 ) ) as OAuthAuthDetails ;
@@ -2278,6 +2360,8 @@ let sessionAffinityWriteVersion = 0;
22782360
22792361 try {
22802362 runtimeMetrics . totalRequests ++ ;
2363+ runtimeMetrics . responsesRequests ++ ;
2364+ syncRuntimeObservability ( requestTraceId ) ;
22812365 const fallbackResponse = await fetch (
22822366 url ,
22832367 applyProxyCompatibleInit ( url , {
@@ -2507,6 +2591,24 @@ let sessionAffinityWriteVersion = 0;
25072591 ) ;
25082592 break ;
25092593 }
2594+ if ( response . status >= 500 ) {
2595+ const serverBurstCooldownUntil =
2596+ recordServerBurstFailure ( account . index ) ;
2597+ if ( serverBurstCooldownUntil > 0 ) {
2598+ runtimeMetrics . serverBurstFastFails += 1 ;
2599+ runtimeMetrics . lastError =
2600+ "Repeated upstream server errors across the account pool" ;
2601+ syncRuntimeObservability ( requestTraceId ) ;
2602+ return new Response (
2603+ JSON . stringify ( {
2604+ error : {
2605+ message : `Upstream server failures were observed across multiple accounts. Pausing retries for ${ formatWaitTime ( serverBurstCooldownUntil - Date . now ( ) ) } . Check \`codex auth report --json\` for runtime metrics.` ,
2606+ } ,
2607+ } ) ,
2608+ { status : 503 , headers : { "content-type" : "application/json; charset=utf-8" } } ,
2609+ ) ;
2610+ }
2611+ }
25102612 logWarn (
25112613 `Empty response after ${ emptyResponseMaxRetries } retries. Returning as-is.` ,
25122614 ) ;
@@ -2558,6 +2660,9 @@ let sessionAffinityWriteVersion = 0;
25582660 lastCodexCliActiveSyncIndex =
25592661 successAccountForResponse . index ;
25602662 }
2663+ clearPoolExhaustionCooldown ( ) ;
2664+ clearServerBurstCooldown ( ) ;
2665+ syncRuntimeObservability ( requestTraceId ) ;
25612666 return successResponse ;
25622667 }
25632668 if ( retryNextAccountBeforeFallback ) {
@@ -2603,26 +2708,31 @@ let sessionAffinityWriteVersion = 0;
26032708 continue ;
26042709 }
26052710
2606- const waitLabel =
2607- waitMs > 0 ? formatWaitTime ( waitMs ) : "a bit" ;
2608- const message =
2609- count === 0
2610- ? "No Codex accounts configured. Run `codex login`."
2611- : waitMs > 0
2612- ? `All ${ count } account(s) are rate-limited. Try again in ${ waitLabel } or add another account with \`codex login\`.`
2613- : `All ${ count } account(s) failed (server errors or auth issues). Check account health with \`codex-health\`.` ;
2614- runtimeMetrics . failedRequests ++ ;
2615- runtimeMetrics . lastError = message ;
2616- return new Response ( JSON . stringify ( { error : { message } } ) , {
2711+ const waitLabel =
2712+ waitMs > 0 ? formatWaitTime ( waitMs ) : "a bit" ;
2713+ if ( count > 0 && waitMs > 0 ) {
2714+ armPoolExhaustionCooldown ( waitMs ) ;
2715+ }
2716+ const message =
2717+ count === 0
2718+ ? "No Codex accounts configured. Run `codex login`."
2719+ : waitMs > 0
2720+ ? `All ${ count } account(s) are rate-limited. A short pool cooldown is now active for ${ waitLabel } . Try again later or inspect \`codex auth status\`.`
2721+ : `All ${ count } account(s) failed (server errors or auth issues). Check account health with \`codex auth report --json\`.` ;
2722+ runtimeMetrics . failedRequests ++ ;
2723+ runtimeMetrics . lastError = message ;
2724+ syncRuntimeObservability ( requestTraceId ) ;
2725+ return new Response ( JSON . stringify ( { error : { message } } ) , {
26172726 status : waitMs > 0 ? 429 : 503 ,
26182727 headers : {
26192728 "content-type" : "application/json; charset=utf-8" ,
26202729 } ,
26212730 } ) ;
26222731 }
2623- } finally {
2624- clearCorrelationId ( ) ;
2625- }
2732+ } finally {
2733+ syncRuntimeObservability ( null ) ;
2734+ clearCorrelationId ( ) ;
2735+ }
26262736 } ,
26272737 } ;
26282738 } finally {
0 commit comments