-
Notifications
You must be signed in to change notification settings - Fork 95
Retry mechanism with retry breaking system #797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
sosek108
wants to merge
9
commits into
Expensify:main
Choose a base branch
from
callstack-internal:feat/revised-retry-mechanism
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+788
−196
Open
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
02787c2
retry mechanism with retry breaking system
sosek108 624cd66
introduce storage circuit breaker
sosek108 1ffe796
updated api doc
sosek108 8bc993c
prettier fix
sosek108 406de55
Merge branch 'main' into feat/revised-retry-mechanism
sosek108 23bc519
move error classification to providers
sosek108 cd11e1c
record write success
sosek108 50a3ec6
prettier fixes
sosek108 edc6ec8
API docs
sosek108 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| import * as Logger from './Logger'; | ||
|
|
||
| /** | ||
| * Process-scoped circuit breaker for storage CAPACITY failures. | ||
| * | ||
| * The per-operation retry budget in `OnyxUtils.retryOperation` cannot stop a session-level storm: | ||
| * each evict -> OnyxDerived recompute -> new write starts its own fresh budget, so a full disk or | ||
| * exhausted quota can drive tens of thousands of evict+retry cycles that never make progress and | ||
| * freeze the app. This breaker is the session-level brake — `retryOperation` consults it before | ||
| * every eviction. | ||
| * | ||
| * It trips when EITHER: | ||
| * - capacity failures within {@link ROLLING_WINDOW_MS} exceed {@link FAILURE_THRESHOLD}, or | ||
| * - {@link NO_PROGRESS_CAP} consecutive evictions are each immediately followed by another capacity | ||
| * failure (the eviction freed nothing the next write could use — a no-progress cycle). This is a | ||
| * cheap proxy for `getDatabaseSize()`, which is costly and only reports origin-wide usage. | ||
| * | ||
| * On trip it emits exactly ONE alert and self-resets once the rolling window clears, so a persistent | ||
| * condition produces at most one alert per window instead of one log line per failed write. | ||
| */ | ||
|
|
||
| /** Rolling window over which capacity failures are counted, and how long a trip stays open. */ | ||
| const ROLLING_WINDOW_MS = 60 * 1000; | ||
|
|
||
| /** Capacity failures within the window above which the breaker trips (storm backstop). */ | ||
| const FAILURE_THRESHOLD = 50; | ||
|
|
||
| /** Consecutive no-progress evictions (evict -> still capacity failure) above which the breaker trips. */ | ||
| const NO_PROGRESS_CAP = 5; | ||
|
|
||
| let failureTimestamps: number[] = []; | ||
| let consecutiveNoProgressEvictions = 0; | ||
| let evictionAwaitingResult = false; | ||
| let trippedUntil = 0; | ||
|
|
||
| function reset(): void { | ||
| failureTimestamps = []; | ||
| consecutiveNoProgressEvictions = 0; | ||
| evictionAwaitingResult = false; | ||
| trippedUntil = 0; | ||
| } | ||
|
|
||
| /** Whether the breaker is currently open. Self-resets once the window since the trip has cleared. */ | ||
| function isTripped(): boolean { | ||
| if (trippedUntil === 0) { | ||
| return false; | ||
| } | ||
| if (Date.now() >= trippedUntil) { | ||
| reset(); | ||
| return false; | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| function trip(reason: string): void { | ||
| trippedUntil = Date.now() + ROLLING_WINDOW_MS; | ||
| Logger.logAlert(`Storage circuit breaker tripped: ${reason}. Halting eviction/retry for ${ROLLING_WINDOW_MS / 1000}s to stop a storage failure storm.`); | ||
| } | ||
|
|
||
| /** | ||
| * Record a CAPACITY failure. Call once per capacity failure in `retryOperation`, BEFORE deciding | ||
| * whether to evict; then check {@link isTripped} to decide whether to proceed. | ||
| */ | ||
| function recordCapacityFailure(): void { | ||
| // While open, recording is a no-op: no extra timestamps, no second alert, and nothing to keep the | ||
| // window from clearing. `isTripped()` self-resets here once the window has elapsed. | ||
| if (isTripped()) { | ||
| return; | ||
| } | ||
|
|
||
| const now = Date.now(); | ||
| failureTimestamps = failureTimestamps.filter((timestamp) => now - timestamp < ROLLING_WINDOW_MS); | ||
|
|
||
| // A fresh storm (nothing left in the window) resets the no-progress tracking so a stale eviction | ||
| // from an earlier, unrelated incident can't be miscounted as no-progress for this one. | ||
| if (failureTimestamps.length === 0) { | ||
| consecutiveNoProgressEvictions = 0; | ||
| evictionAwaitingResult = false; | ||
| } | ||
|
|
||
| // We evicted on the previous cycle and we're back here with another capacity failure, so that | ||
| // eviction freed no usable space. | ||
| if (evictionAwaitingResult) { | ||
| consecutiveNoProgressEvictions += 1; | ||
| evictionAwaitingResult = false; | ||
| } | ||
|
|
||
| failureTimestamps.push(now); | ||
|
|
||
| if (failureTimestamps.length > FAILURE_THRESHOLD) { | ||
| trip(`${failureTimestamps.length} capacity failures within ${ROLLING_WINDOW_MS / 1000}s`); | ||
| return; | ||
| } | ||
| if (consecutiveNoProgressEvictions >= NO_PROGRESS_CAP) { | ||
| trip(`${consecutiveNoProgressEvictions} consecutive evictions freed no usable space`); | ||
| } | ||
| } | ||
|
|
||
| /** Record that `retryOperation` just evicted a key, so the next capacity failure counts as no-progress. */ | ||
| function recordEviction(): void { | ||
| evictionAwaitingResult = true; | ||
| } | ||
|
|
||
| /** | ||
| * Record that a storage write SUCCEEDED. If an eviction was awaiting its verdict, the eviction freed | ||
| * usable space — so it must NOT later be miscounted as a no-progress cycle by the next capacity | ||
| * failure. Clear the pending flag and reset the consecutive no-progress streak (a success breaks the | ||
| * streak). No-op when no eviction is pending (the common case), so it's cheap to call on every write. | ||
| */ | ||
| function recordWriteSuccess(): void { | ||
| if (!evictionAwaitingResult) { | ||
| return; | ||
| } | ||
| evictionAwaitingResult = false; | ||
| consecutiveNoProgressEvictions = 0; | ||
| } | ||
|
|
||
| const StorageCircuitBreaker = {recordCapacityFailure, recordEviction, recordWriteSuccess, isTripped, reset, ROLLING_WINDOW_MS, FAILURE_THRESHOLD, NO_PROGRESS_CAP}; | ||
|
|
||
| export default StorageCircuitBreaker; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| import type {ValueOf} from 'type-fest'; | ||
|
|
||
| /** | ||
| * Shared vocabulary for storage write failures. The *classes* are engine-agnostic; the *matching* | ||
| * is not — each storage provider knows its own error dialect and owns its classifier (see each | ||
| * provider's `classifyError`). This module deliberately holds NO string matchers: it is the common | ||
| * taxonomy the two reacting layers agree on, while the per-engine knowledge lives with the engine. | ||
| * | ||
| * - the connection layer (`createStore`) recovers TRANSIENT and FATAL errors by reopening the DB, and | ||
| * - the operation layer (`OnyxUtils.retryOperation`) recovers CAPACITY by eviction and retries UNKNOWN. | ||
| * | ||
| * This module has no Onyx dependencies (and no engine dependencies) so it can live in the storage | ||
| * layer, and be imported by every provider, without creating an import cycle. | ||
| */ | ||
| const StorageErrorClass = { | ||
| /** Connection/transport failure (stale connection). Owner: connection layer — reopen + retry once. */ | ||
| TRANSIENT: 'transient', | ||
| /** Quota exceeded / disk full. Owner: operation layer — evict and retry. */ | ||
| CAPACITY: 'capacity', | ||
| /** Non-serializable payload. Never retriable — the same data will always fail. */ | ||
| INVALID_DATA: 'invalidData', | ||
| /** Backing-store corruption. Owner: connection layer — budgeted heal, then give up. */ | ||
| FATAL: 'fatal', | ||
| /** Unmatched by the active provider. Owner: operation layer — bounded retry, and log the shape so | ||
| * recurring cases can be promoted into one of the classes above. */ | ||
| UNKNOWN: 'unknown', | ||
| } as const; | ||
|
|
||
| type StorageErrorClassValue = ValueOf<typeof StorageErrorClass>; | ||
|
|
||
| /** | ||
| * Normalizes any thrown value into a lowercased `{name, message}` pair for matching. Shared by every | ||
| * provider's classifier so they all extract the error the same way. | ||
| */ | ||
| function getErrorParts(error: unknown): {name: string; message: string} { | ||
| if (error instanceof Error || error instanceof DOMException) { | ||
| return {name: (error.name ?? '').toLowerCase(), message: (error.message ?? '').toLowerCase()}; | ||
| } | ||
| return {name: '', message: String(error ?? '').toLowerCase()}; | ||
| } | ||
|
|
||
| export {StorageErrorClass, getErrorParts}; | ||
| export type {StorageErrorClassValue}; | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| import {StorageErrorClass, getErrorParts} from '../../errors'; | ||
| import type {StorageErrorClassValue} from '../../errors'; | ||
|
|
||
| /** | ||
| * Classifies an IndexedDB write failure into the shared storage taxonomy (lib/storage/errors.ts). | ||
| * Matching is done on the lowercased error name and message. This is the IndexedDB engine's own | ||
| * dialect — it is NOT shared with other engines. | ||
| */ | ||
| function classifyIDBError(error: unknown): StorageErrorClassValue { | ||
| const {name, message} = getErrorParts(error); | ||
|
|
||
| // Non-serializable data passed to IDBObjectStore.put — retrying is futile. | ||
| if (message.includes("failed to execute 'put' on 'idbobjectstore'")) { | ||
| return StorageErrorClass.INVALID_DATA; | ||
| } | ||
|
|
||
| // Browser quota exceeded. | ||
| if (name.includes('quotaexceedederror') || message.includes('quotaexceedederror')) { | ||
| return StorageErrorClass.CAPACITY; | ||
| } | ||
|
|
||
| // Backing-store corruption (Chromium LevelDB). Recoverable only via a budgeted reopen. | ||
| if (message.includes('internal error opening backing store')) { | ||
| return StorageErrorClass.FATAL; | ||
| } | ||
|
|
||
| // Transient connection/transport failures — the cached connection is stale and a reopen fixes it: | ||
| // - InvalidStateError: connection closed between getDB() resolving and db.transaction(). | ||
| // - AbortError: write transaction aborted (connection close / versionchange / sibling abort). | ||
| // - Safari/WebKit IDB server termination for backgrounded tabs. | ||
| if ( | ||
| name.includes('invalidstateerror') || | ||
| name.includes('aborterror') || | ||
| message.includes('connection to indexed database server lost') || | ||
| message.includes('connection is closing') || | ||
| // This is related to https://github.com/Expensify/react-native-onyx/pull/796 — remove this comment when #796 is merged. | ||
| message.includes('idb write transaction aborted without an error') | ||
| ) { | ||
| return StorageErrorClass.TRANSIENT; | ||
| } | ||
|
|
||
| return StorageErrorClass.UNKNOWN; | ||
| } | ||
|
|
||
| export default classifyIDBError; |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NAB: I think
ValueOf<typeof StorageErrorClass>is actually a touch clearer at the callsites than this named alias.