Skip to content

Commit 36c416f

Browse files
Rotate V8 isolate on heap growth or fragmentation (#4684)
# Description of Changes

While testing #4663, I discovered that the server would crash with a V8 out-of-memory error after processing many requests. Before #4663, this did not happen. My theory is that, because we now have a single JS worker that can process an unbounded number of reducer calls over its lifetime, any V8 heap retention that would previously have been spread across several pooled isolates now accumulates in one isolate.

This patch periodically collects heap statistics and either forces a GC or, if memory cannot be reclaimed, replaces the isolate. This greatly reduces the risk of hitting the V8 heap limit and crashing the server. It does not remove the risk entirely; however, the same risk was already present before we switched to the single-worker model in #4663. To remove the risk of crashing the server entirely, we would need to run V8 in a separate process.

# API and ABI breaking changes

None

# Expected complexity level and risk

3

# Testing

TBD
1 parent 5c6f308 commit 36c416f

12 files changed

Lines changed: 472 additions & 43 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ hex.workspace = true
6464
hostname.workspace = true
6565
http.workspace = true
6666
http-body-util.workspace = true
67+
humantime.workspace = true
6768
hyper.workspace = true
6869
imara-diff.workspace = true
6970
indexmap.workspace = true

crates/core/src/config.rs

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use std::path::Path;
2+
use std::time::Duration;
23
use std::{fmt, io};
34

5+
use serde::Deserialize;
46
use spacetimedb_lib::ConnectionId;
57
use spacetimedb_paths::cli::{ConfigDir, PrivKeyPath, PubKeyPath};
68
use spacetimedb_paths::server::{ConfigToml, MetadataTomlPath};
@@ -131,6 +133,8 @@ pub struct ConfigFile {
131133
pub certificate_authority: Option<CertificateAuthority>,
132134
#[serde(default)]
133135
pub logs: LogConfig,
136+
#[serde(default)]
137+
pub v8_heap_policy: V8HeapPolicyConfig,
134138
}
135139

136140
impl ConfigFile {
@@ -169,6 +173,139 @@ pub struct LogConfig {
169173
pub directives: Vec<String>,
170174
}
171175

176+
#[derive(Clone, Copy, Debug, serde::Deserialize)]
177+
#[serde(rename_all = "kebab-case")]
178+
pub struct V8HeapPolicyConfig {
179+
#[serde(default = "def_req_interval", deserialize_with = "de_nz_u64")]
180+
pub heap_check_request_interval: Option<u64>,
181+
#[serde(default = "def_time_interval", deserialize_with = "de_nz_duration")]
182+
pub heap_check_time_interval: Option<Duration>,
183+
#[serde(default = "def_gc_trigger", deserialize_with = "de_fraction")]
184+
pub heap_gc_trigger_fraction: f64,
185+
#[serde(default = "def_retire", deserialize_with = "de_fraction")]
186+
pub heap_retire_fraction: f64,
187+
#[serde(default, rename = "heap-limit-mb", deserialize_with = "de_limit_mb")]
188+
pub heap_limit_bytes: Option<usize>,
189+
}
190+
191+
impl Default for V8HeapPolicyConfig {
192+
fn default() -> Self {
193+
Self {
194+
heap_check_request_interval: def_req_interval(),
195+
heap_check_time_interval: def_time_interval(),
196+
heap_gc_trigger_fraction: def_gc_trigger(),
197+
heap_retire_fraction: def_retire(),
198+
heap_limit_bytes: None,
199+
}
200+
}
201+
}
202+
203+
impl V8HeapPolicyConfig {
204+
pub fn normalized(mut self) -> Self {
205+
if self.heap_retire_fraction < self.heap_gc_trigger_fraction {
206+
log::warn!(
207+
"v8-heap-policy.heap-retire-fraction ({}) is below \
208+
v8-heap-policy.heap-gc-trigger-fraction ({}); using the GC trigger fraction for both",
209+
self.heap_retire_fraction,
210+
self.heap_gc_trigger_fraction,
211+
);
212+
self.heap_retire_fraction = self.heap_gc_trigger_fraction;
213+
}
214+
215+
self
216+
}
217+
}
218+
219+
/// Request count between V8 heap checks used when the config omits the key.
fn def_req_interval() -> Option<u64> {
    const REQUESTS_BETWEEN_CHECKS: u64 = 65_536;
    Some(REQUESTS_BETWEEN_CHECKS)
}
223+
224+
/// Wall-clock period between V8 heap checks used when the config omits the key.
fn def_time_interval() -> Option<Duration> {
    const PERIOD_BETWEEN_CHECKS: Duration = Duration::from_secs(30);
    Some(PERIOD_BETWEEN_CHECKS)
}
228+
229+
/// Heap fill fraction that triggers a GC when the config omits the key.
fn def_gc_trigger() -> f64 {
    const GC_TRIGGER_FRACTION: f64 = 0.67;
    GC_TRIGGER_FRACTION
}
233+
234+
/// Heap fill fraction that retires the worker after a GC when the config omits the key.
fn def_retire() -> f64 {
    const RETIRE_FRACTION: f64 = 0.75;
    RETIRE_FRACTION
}
238+
239+
fn de_nz_u64<'de, D>(deserializer: D) -> Result<Option<u64>, D::Error>
240+
where
241+
D: serde::Deserializer<'de>,
242+
{
243+
let value = u64::deserialize(deserializer)?;
244+
Ok((value != 0).then_some(value))
245+
}
246+
247+
fn de_nz_duration<'de, D>(deserializer: D) -> Result<Option<Duration>, D::Error>
248+
where
249+
D: serde::Deserializer<'de>,
250+
{
251+
#[derive(serde::Deserialize)]
252+
#[serde(untagged)]
253+
enum DurationValue {
254+
String(String),
255+
Seconds(u64),
256+
}
257+
258+
let duration = match DurationValue::deserialize(deserializer)? {
259+
DurationValue::String(value) => humantime::parse_duration(&value).map_err(serde::de::Error::custom)?,
260+
DurationValue::Seconds(value) => Duration::from_secs(value),
261+
};
262+
263+
Ok((!duration.is_zero()).then_some(duration))
264+
}
265+
266+
fn de_fraction<'de, D>(deserializer: D) -> Result<f64, D::Error>
267+
where
268+
D: serde::Deserializer<'de>,
269+
{
270+
#[derive(serde::Deserialize)]
271+
#[serde(untagged)]
272+
enum FractionValue {
273+
Integer(u64),
274+
Float(f64),
275+
}
276+
277+
let value = match FractionValue::deserialize(deserializer)? {
278+
FractionValue::Integer(value) => value as f64,
279+
FractionValue::Float(value) => value,
280+
};
281+
282+
if value.is_finite() && (0.0..=1.0).contains(&value) {
283+
Ok(value)
284+
} else {
285+
Err(serde::de::Error::custom(format!(
286+
"expected a fraction between 0.0 and 1.0, got {value}"
287+
)))
288+
}
289+
}
290+
291+
fn de_limit_mb<'de, D>(deserializer: D) -> Result<Option<usize>, D::Error>
292+
where
293+
D: serde::Deserializer<'de>,
294+
{
295+
let value = u64::deserialize(deserializer)?;
296+
if value == 0 {
297+
return Ok(None);
298+
}
299+
300+
let bytes = value
301+
.checked_mul(1024 * 1024)
302+
.ok_or_else(|| serde::de::Error::custom("heap-limit-mb is too large"))?;
303+
304+
usize::try_from(bytes)
305+
.map(Some)
306+
.map_err(|_| serde::de::Error::custom("heap-limit-mb does not fit in usize"))
307+
}
308+
172309
#[cfg(test)]
173310
mod tests {
174311
use super::*;
@@ -270,4 +407,41 @@ mod tests {
270407
.check_compatibility_and_update(mkmeta_pre(2, 1, 0, "rc1"))
271408
.unwrap_err();
272409
}
410+
411+
#[test]
fn v8_heap_policy_defaults_when_omitted() {
    // An empty document leaves every `v8-heap-policy` key unspecified.
    let config: ConfigFile = toml::from_str("").unwrap();
    let policy = config.v8_heap_policy;

    assert_eq!(policy.heap_check_request_interval, Some(65_536));
    assert_eq!(policy.heap_check_time_interval, Some(Duration::from_secs(30)));
    assert_eq!(policy.heap_gc_trigger_fraction, 0.67);
    assert_eq!(policy.heap_retire_fraction, 0.75);
    assert_eq!(policy.heap_limit_bytes, None);
}
424+
425+
#[test]
fn v8_heap_policy_parses_from_toml() {
    // Sets every key explicitly, including a `0` request interval.
    let toml = r#"
        [v8-heap-policy]
        heap-check-request-interval = 0
        heap-check-time-interval = "45s"
        heap-gc-trigger-fraction = 0.6
        heap-retire-fraction = 0.8
        heap-limit-mb = 256
    "#;

    let config: ConfigFile = toml::from_str(toml).unwrap();
    let policy = config.v8_heap_policy;

    assert_eq!(policy.heap_check_request_interval, None);
    assert_eq!(policy.heap_check_time_interval, Some(Duration::from_secs(45)));
    assert_eq!(policy.heap_gc_trigger_fraction, 0.6);
    assert_eq!(policy.heap_retire_fraction, 0.8);
    assert_eq!(policy.heap_limit_bytes, Some(256 * 1024 * 1024));
}
273447
}

crates/core/src/host/host_controller.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use super::scheduler::SchedulerStarter;
33
use super::wasmtime::WasmtimeRuntime;
44
use super::{Scheduler, UpdateDatabaseResult};
55
use crate::client::{ClientActorId, ClientName};
6+
use crate::config::V8HeapPolicyConfig;
67
use crate::database_logger::DatabaseLogger;
78
use crate::db::persistence::PersistenceProvider;
89
use crate::db::relational_db::{self, spawn_view_cleanup_loop, DiskSizeFn, RelationalDB, Txdata};
@@ -124,9 +125,9 @@ pub(crate) struct HostRuntimes {
124125
}
125126

126127
impl HostRuntimes {
127-
fn new(data_dir: Option<&ServerDataDir>) -> Arc<Self> {
128+
fn new(data_dir: Option<&ServerDataDir>, v8_heap_policy: V8HeapPolicyConfig) -> Arc<Self> {
128129
let wasmtime = WasmtimeRuntime::new(data_dir);
129-
let v8 = V8Runtime::default();
130+
let v8 = V8Runtime::new(v8_heap_policy);
130131
Arc::new(Self { wasmtime, v8 })
131132
}
132133
}
@@ -210,6 +211,7 @@ impl HostController {
210211
pub fn new(
211212
data_dir: Arc<ServerDataDir>,
212213
default_config: db::Config,
214+
v8_heap_policy: V8HeapPolicyConfig,
213215
program_storage: ProgramStorage,
214216
energy_monitor: Arc<impl EnergyMonitor>,
215217
persistence: Arc<dyn PersistenceProvider>,
@@ -221,7 +223,7 @@ impl HostController {
221223
program_storage,
222224
energy_monitor,
223225
persistence,
224-
runtimes: HostRuntimes::new(Some(&data_dir)),
226+
runtimes: HostRuntimes::new(Some(&data_dir), v8_heap_policy),
225227
data_dir,
226228
page_pool: PagePool::new(default_config.page_pool_max_size),
227229
bsatn_rlb_pool: BsatnRowListBuilderPool::new(),
@@ -1387,7 +1389,7 @@ pub async fn extract_schema(program_bytes: Box<[u8]>, host_type: HostType) -> an
13871389
extract_schema_with_pools(
13881390
PagePool::new(None),
13891391
BsatnRowListBuilderPool::new(),
1390-
&HostRuntimes::new(None),
1392+
&HostRuntimes::new(None, V8HeapPolicyConfig::default()),
13911393
program_bytes,
13921394
host_type,
13931395
)
@@ -1421,4 +1423,12 @@ where
14211423
.data_size_blob_store_bytes_used_by_blobs
14221424
.remove_label_values(db);
14231425
let _ = WORKER_METRICS.wasm_memory_bytes.remove_label_values(db);
1426+
let _ = WORKER_METRICS.v8_total_heap_size_bytes.remove_label_values(db);
1427+
let _ = WORKER_METRICS.v8_total_physical_size_bytes.remove_label_values(db);
1428+
let _ = WORKER_METRICS.v8_used_global_handles_size_bytes.remove_label_values(db);
1429+
let _ = WORKER_METRICS.v8_used_heap_size_bytes.remove_label_values(db);
1430+
let _ = WORKER_METRICS.v8_heap_size_limit_bytes.remove_label_values(db);
1431+
let _ = WORKER_METRICS.v8_external_memory_bytes.remove_label_values(db);
1432+
let _ = WORKER_METRICS.v8_native_contexts.remove_label_values(db);
1433+
let _ = WORKER_METRICS.v8_detached_contexts.remove_label_values(db);
14241434
}

crates/core/src/host/module_host.rs

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -347,8 +347,8 @@ pub enum ModuleWithInstance {
347347
}
348348

349349
enum ModuleHostInner {
350-
Wasm(WasmtimeModuleHost),
351-
Js(V8ModuleHost),
350+
Wasm(Box<WasmtimeModuleHost>),
351+
Js(Box<V8ModuleHost>),
352352
}
353353

354354
struct WasmtimeModuleHost {
@@ -1070,20 +1070,20 @@ impl ModuleHost {
10701070
} => {
10711071
info = module.info();
10721072
let instance_manager = ModuleInstanceManager::new(module, Some(init_inst), database_identity);
1073-
Arc::new(ModuleHostInner::Wasm(WasmtimeModuleHost {
1073+
Arc::new(ModuleHostInner::Wasm(Box::new(WasmtimeModuleHost {
10741074
executor,
10751075
instance_manager,
1076-
}))
1076+
})))
10771077
}
10781078
ModuleWithInstance::Js { module, init_inst } => {
10791079
info = module.info();
10801080
let instance_lane = super::v8::JsInstanceLane::new(module.clone(), init_inst);
10811081
let procedure_instances = ModuleInstanceManager::new(module.clone(), None, database_identity);
1082-
Arc::new(ModuleHostInner::Js(V8ModuleHost {
1082+
Arc::new(ModuleHostInner::Js(Box::new(V8ModuleHost {
10831083
module,
10841084
instance_lane,
10851085
procedure_instances,
1086-
}))
1086+
})))
10871087
}
10881088
};
10891089
let on_panic = Arc::new(on_panic);
@@ -1143,15 +1143,17 @@ impl ModuleHost {
11431143
let timer_guard = self.start_call_timer(label);
11441144

11451145
Ok(match &*self.inner {
1146-
ModuleHostInner::Wasm(WasmtimeModuleHost { executor, .. }) => {
1146+
ModuleHostInner::Wasm(wasm) => {
1147+
let executor = &wasm.executor;
11471148
executor
11481149
.run_job(async move || {
11491150
drop(timer_guard);
11501151
f().await
11511152
})
11521153
.await
11531154
}
1154-
ModuleHostInner::Js(V8ModuleHost { instance_lane, .. }) => {
1155+
ModuleHostInner::Js(js) => {
1156+
let instance_lane = &js.instance_lane;
11551157
instance_lane
11561158
.run_on_thread(async move || {
11571159
drop(timer_guard);
@@ -1215,15 +1217,14 @@ impl ModuleHost {
12151217
});
12161218

12171219
Ok(match &*self.inner {
1218-
ModuleHostInner::Wasm(WasmtimeModuleHost {
1219-
executor,
1220-
instance_manager,
1221-
}) => {
1220+
ModuleHostInner::Wasm(wasm) => {
1221+
let executor = &wasm.executor;
1222+
let instance_manager = &wasm.instance_manager;
12221223
instance_manager
12231224
.with_instance(async |inst| work_wasm(timer_guard, executor, inst, arg).await)
12241225
.await
12251226
}
1226-
ModuleHostInner::Js(V8ModuleHost { instance_lane, .. }) => work_js(timer_guard, instance_lane, arg).await,
1227+
ModuleHostInner::Js(js) => work_js(timer_guard, &js.instance_lane, arg).await,
12271228
})
12281229
}
12291230

@@ -1294,13 +1295,10 @@ impl ModuleHost {
12941295
});
12951296

12961297
Ok(match &*self.inner {
1297-
ModuleHostInner::Wasm(WasmtimeModuleHost {
1298-
executor,
1299-
instance_manager,
1300-
}) => {
1301-
instance_manager
1298+
ModuleHostInner::Wasm(host) => {
1299+
host.instance_manager
13021300
.with_instance(async |mut inst| {
1303-
executor
1301+
host.executor
13041302
.run_job(async move || {
13051303
drop(timer_guard);
13061304
(wasm(arg, &mut inst).await, inst)
@@ -1309,10 +1307,8 @@ impl ModuleHost {
13091307
})
13101308
.await
13111309
}
1312-
ModuleHostInner::Js(V8ModuleHost {
1313-
procedure_instances, ..
1314-
}) => {
1315-
procedure_instances
1310+
ModuleHostInner::Js(host) => {
1311+
host.procedure_instances
13161312
.with_instance(async |inst| {
13171313
drop(timer_guard);
13181314
let res = js(arg, &inst).await;

0 commit comments

Comments
 (0)