Skip to content

Commit 05a4a7b

Browse files
Replace JsInstance pool with single worker and FIFO queue (#4663)
# Description of Changes

Before this change, JS reducer requests borrowed a `JsInstance` from a pool. If no idle instance was available, we created another instance, which meant another V8 worker thread. Under load, this meant reducers bouncing across multiple OS threads.

After this change, JS reducers go through a single long-lived `JsInstance` fed by a FIFO queue, which results in much better cache locality. More accurately, each module now allocates a single OS thread, on which reducers (and most operations) run. Modules do not share workers/threads, and modules do not create multiple threads for running reducers.

Note: the original instance pool is still used for procedures. It should probably be bounded, but I didn't make any changes to it. It's also used for executing views during initial subscription to avoid a reentrancy deadlock. The latter should be fixed and moved over to the JS worker thread at some point.

# API and ABI breaking changes

N/A

# Expected complexity level and risk

4

# Testing

```
NODE_OPTIONS="--max-old-space-size=8192" \
MAX_INFLIGHT_PER_WORKER=512 \
BENCH_PRECOMPUTED_TRANSFER_PAIRS=1000000 \
pnpm bench test-1 --seconds 10 --concurrency 50 --alpha 1.5 --connectors spacetimedb
```

```
50K TPS -> 85K TPS on m2 mac
```
1 parent ad71f4c commit 05a4a7b

4 files changed

Lines changed: 640 additions & 256 deletions

File tree

crates/client-api/src/routes/database.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ fn map_reducer_error(e: ReducerCallError, reducer: &str) -> (StatusCode, String)
9898
log::debug!("Attempt to call non-existent reducer {reducer}");
9999
StatusCode::NOT_FOUND
100100
}
101+
ReducerCallError::WorkerError(_) => StatusCode::INTERNAL_SERVER_ERROR,
101102
ReducerCallError::LifecycleReducer(lifecycle) => {
102103
log::debug!("Attempt to call {lifecycle:?} lifecycle reducer {reducer}");
103104
StatusCode::BAD_REQUEST

crates/core/src/host/module_host.rs

Lines changed: 111 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use crate::error::DBError;
1111
use crate::estimation::{check_row_limit, estimate_rows_scanned};
1212
use crate::hash::Hash;
1313
use crate::host::host_controller::CallProcedureReturn;
14-
use crate::host::scheduler::{CallScheduledFunctionResult, ScheduledFunctionParams};
14+
use crate::host::scheduler::{CallScheduledFunctionError, CallScheduledFunctionResult, ScheduledFunctionParams};
1515
use crate::host::v8::JsInstance;
1616
pub use crate::host::wasm_common::module_host_actor::{InstanceCommon, WasmInstance};
1717
use crate::host::wasmtime::ModuleInstance;
@@ -357,7 +357,9 @@ struct WasmtimeModuleHost {
357357
}
358358

359359
struct V8ModuleHost {
360-
instance_manager: ModuleInstanceManager<super::v8::JsModule>,
360+
module: super::v8::JsModule,
361+
instance_lane: super::v8::JsInstanceLane,
362+
procedure_instances: ModuleInstanceManager<super::v8::JsModule>,
361363
}
362364

363365
/// A module; used as a bound on `InstanceManager`.
@@ -813,7 +815,7 @@ impl CreateInstanceTimeMetric {
813815
}
814816

815817
impl<M: GenericModule> ModuleInstanceManager<M> {
816-
fn new(module: M, init_inst: M::Instance, database_identity: Identity) -> Self {
818+
fn new(module: M, init_inst: Option<M::Instance>, database_identity: Identity) -> Self {
817819
let host_type = module.host_type();
818820
let module_instances_metric = ModuleInstancesMetric {
819821
metric: WORKER_METRICS
@@ -832,9 +834,8 @@ impl<M: GenericModule> ModuleInstanceManager<M> {
832834
database_identity,
833835
};
834836

835-
// Add the first instance.
836837
let mut instances = VecDeque::new();
837-
instances.push_front(init_inst);
838+
instances.extend(init_inst);
838839

839840
Self {
840841
instances: Mutex::new(instances),
@@ -937,6 +938,8 @@ pub enum ReducerCallError {
937938
Args(#[from] InvalidReducerArguments),
938939
#[error(transparent)]
939940
NoSuchModule(#[from] NoSuchModule),
941+
#[error("The reducer worker encountered a fatal error: {0}")]
942+
WorkerError(String),
940943
#[error("no such reducer")]
941944
NoSuchReducer,
942945
#[error("no such scheduled reducer")]
@@ -1066,16 +1069,21 @@ impl ModuleHost {
10661069
init_inst,
10671070
} => {
10681071
info = module.info();
1069-
let instance_manager = ModuleInstanceManager::new(module, init_inst, database_identity);
1072+
let instance_manager = ModuleInstanceManager::new(module, Some(init_inst), database_identity);
10701073
Arc::new(ModuleHostInner::Wasm(WasmtimeModuleHost {
10711074
executor,
10721075
instance_manager,
10731076
}))
10741077
}
10751078
ModuleWithInstance::Js { module, init_inst } => {
10761079
info = module.info();
1077-
let instance_manager = ModuleInstanceManager::new(module, init_inst, database_identity);
1078-
Arc::new(ModuleHostInner::Js(V8ModuleHost { instance_manager }))
1080+
let instance_lane = super::v8::JsInstanceLane::new(module.clone(), init_inst);
1081+
let procedure_instances = ModuleInstanceManager::new(module.clone(), None, database_identity);
1082+
Arc::new(ModuleHostInner::Js(V8ModuleHost {
1083+
module,
1084+
instance_lane,
1085+
procedure_instances,
1086+
}))
10791087
}
10801088
};
10811089
let on_panic = Arc::new(on_panic);
@@ -1143,18 +1151,13 @@ impl ModuleHost {
11431151
})
11441152
.await
11451153
}
1146-
ModuleHostInner::Js(V8ModuleHost { instance_manager }) => {
1147-
instance_manager
1148-
.with_instance(async |mut inst| {
1149-
let res = inst
1150-
.run_on_thread(async move || {
1151-
drop(timer_guard);
1152-
f().await
1153-
})
1154-
.await;
1155-
(res, inst)
1154+
ModuleHostInner::Js(V8ModuleHost { instance_lane, .. }) => {
1155+
instance_lane
1156+
.run_on_thread(async move || {
1157+
drop(timer_guard);
1158+
f().await
11561159
})
1157-
.await
1160+
.await?
11581161
}
11591162
})
11601163
}
@@ -1193,7 +1196,7 @@ impl ModuleHost {
11931196
arg: A,
11941197
timer: impl FnOnce(&str) -> Guard,
11951198
work_wasm: impl AsyncFnOnce(Guard, &SingleCoreExecutor, Box<ModuleInstance>, A) -> (R, Box<ModuleInstance>),
1196-
work_js: impl AsyncFnOnce(Guard, &mut JsInstance, A) -> R,
1199+
work_js: impl AsyncFnOnce(Guard, &super::v8::JsInstanceLane, A) -> R,
11971200
) -> Result<R, NoSuchModule> {
11981201
self.guard_closed()?;
11991202
let timer_guard = timer(label);
@@ -1220,11 +1223,7 @@ impl ModuleHost {
12201223
.with_instance(async |inst| work_wasm(timer_guard, executor, inst, arg).await)
12211224
.await
12221225
}
1223-
ModuleHostInner::Js(V8ModuleHost { instance_manager }) => {
1224-
instance_manager
1225-
.with_instance(async |mut inst| (work_js(timer_guard, &mut inst, arg).await, inst))
1226-
.await
1227-
}
1226+
ModuleHostInner::Js(V8ModuleHost { instance_lane, .. }) => work_js(timer_guard, instance_lane, arg).await,
12281227
})
12291228
}
12301229

@@ -1237,7 +1236,7 @@ impl ModuleHost {
12371236
label: &str,
12381237
arg: A,
12391238
wasm: impl AsyncFnOnce(A, &mut ModuleInstance) -> R + Send + 'static,
1240-
js: impl AsyncFnOnce(A, &mut JsInstance) -> R,
1239+
js: impl AsyncFnOnce(A, &super::v8::JsInstanceLane) -> R,
12411240
) -> Result<R, NoSuchModule>
12421241
where
12431242
R: Send + 'static,
@@ -1262,13 +1261,78 @@ impl ModuleHost {
12621261
.await
12631262
},
12641263
async move |timer_guard, inst, arg| {
1264+
super::v8::assert_not_on_js_module_thread(label);
12651265
drop(timer_guard);
12661266
js(arg, inst).await
12671267
},
12681268
)
12691269
.await
12701270
}
12711271

1272+
/// Run a function for this module using pooled instances.
1273+
///
1274+
/// For WASM, this is identical to [`Self::call`].
1275+
/// For V8/JS, this uses the pooled procedure instances instead of the
1276+
/// single instance lane.
1277+
async fn call_pooled<A, R>(
1278+
&self,
1279+
label: &str,
1280+
arg: A,
1281+
wasm: impl AsyncFnOnce(A, &mut ModuleInstance) -> R + Send + 'static,
1282+
js: impl AsyncFnOnce(A, &JsInstance) -> R,
1283+
) -> Result<R, NoSuchModule>
1284+
where
1285+
R: Send + 'static,
1286+
A: Send + 'static,
1287+
{
1288+
self.guard_closed()?;
1289+
let timer_guard = self.start_call_timer(label);
1290+
1291+
scopeguard::defer_on_unwind!({
1292+
log::warn!("pooled operation {label} panicked");
1293+
(self.on_panic)();
1294+
});
1295+
1296+
Ok(match &*self.inner {
1297+
ModuleHostInner::Wasm(WasmtimeModuleHost {
1298+
executor,
1299+
instance_manager,
1300+
}) => {
1301+
instance_manager
1302+
.with_instance(async |mut inst| {
1303+
executor
1304+
.run_job(async move || {
1305+
drop(timer_guard);
1306+
(wasm(arg, &mut inst).await, inst)
1307+
})
1308+
.await
1309+
})
1310+
.await
1311+
}
1312+
ModuleHostInner::Js(V8ModuleHost {
1313+
procedure_instances, ..
1314+
}) => {
1315+
procedure_instances
1316+
.with_instance(async |inst| {
1317+
drop(timer_guard);
1318+
let res = js(arg, &inst).await;
1319+
(res, inst)
1320+
})
1321+
.await
1322+
}
1323+
})
1324+
}
1325+
1326+
async fn call_view_command(&self, label: &str, cmd: ViewCommand) -> Result<ViewCommandResult, ViewCallError> {
1327+
self.call_pooled(
1328+
label,
1329+
cmd,
1330+
async |cmd, inst| Ok::<_, ViewCallError>(inst.call_view(cmd)),
1331+
async |cmd, inst| Ok::<_, ViewCallError>(inst.call_view(cmd).await),
1332+
)
1333+
.await?
1334+
}
1335+
12721336
pub async fn disconnect_client(&self, client_id: ClientActorId) {
12731337
log::trace!("disconnecting client {client_id}");
12741338
if let Err(e) = self
@@ -1536,14 +1600,13 @@ impl ModuleHost {
15361600
args,
15371601
};
15381602

1539-
Ok(self
1540-
.call(
1541-
&reducer_def.name,
1542-
call_reducer_params,
1543-
async |p, inst| inst.call_reducer(p),
1544-
async |p, inst| inst.call_reducer(p).await,
1545-
)
1546-
.await?)
1603+
self.call(
1604+
&reducer_def.name,
1605+
call_reducer_params,
1606+
async |p, inst| Ok(inst.call_reducer(p)),
1607+
async |p, inst| inst.call_reducer(p).await,
1608+
)
1609+
.await?
15471610
}
15481611

15491612
pub async fn call_reducer(
@@ -1611,12 +1674,7 @@ impl ModuleHost {
16111674
};
16121675

16131676
let res = self
1614-
.call(
1615-
"call_view_add_single_subscription",
1616-
cmd,
1617-
async |cmd, inst| inst.call_view(cmd),
1618-
async |cmd, inst| inst.call_view(cmd).await,
1619-
)
1677+
.call_view_command("call_view_add_single_subscription", cmd)
16201678
.await
16211679
//TODO: handle error better
16221680
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
@@ -1644,12 +1702,7 @@ impl ModuleHost {
16441702
};
16451703

16461704
let res = self
1647-
.call(
1648-
"call_view_add_multi_subscription",
1649-
cmd,
1650-
async |cmd, inst| inst.call_view(cmd),
1651-
async |cmd, inst| inst.call_view(cmd).await,
1652-
)
1705+
.call_view_command("call_view_add_multi_subscription", cmd)
16531706
.await
16541707
//TODO: handle error better
16551708
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
@@ -1677,12 +1730,7 @@ impl ModuleHost {
16771730
};
16781731

16791732
let res = self
1680-
.call(
1681-
"call_view_remove_v2_subscription",
1682-
cmd,
1683-
async |cmd, inst| inst.call_view(cmd),
1684-
async |cmd, inst| inst.call_view(cmd).await,
1685-
)
1733+
.call_view_command("call_view_remove_v2_subscription", cmd)
16861734
.await
16871735
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
16881736

@@ -1709,12 +1757,7 @@ impl ModuleHost {
17091757
};
17101758

17111759
let res = self
1712-
.call(
1713-
"call_view_add_multi_subscription",
1714-
cmd,
1715-
async |cmd, inst| inst.call_view(cmd),
1716-
async |cmd, inst| inst.call_view(cmd).await,
1717-
)
1760+
.call_view_command("call_view_add_multi_subscription", cmd)
17181761
.await
17191762
//TODO: handle error better
17201763
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
@@ -1742,12 +1785,7 @@ impl ModuleHost {
17421785
};
17431786

17441787
let res = self
1745-
.call(
1746-
"call_view_add_legacy_subscription",
1747-
cmd,
1748-
async |cmd, inst| inst.call_view(cmd),
1749-
async |cmd, inst| inst.call_view(cmd).await,
1750-
)
1788+
.call_view_command("call_view_add_legacy_subscription", cmd)
17511789
.await
17521790
//TODO: handle error better
17531791
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
@@ -1776,12 +1814,7 @@ impl ModuleHost {
17761814
};
17771815

17781816
let res = self
1779-
.call(
1780-
"call_view_sql",
1781-
cmd,
1782-
async |cmd, inst| inst.call_view(cmd),
1783-
async |cmd, inst| inst.call_view(cmd).await,
1784-
)
1817+
.call_view_command("call_view_sql", cmd)
17851818
.await
17861819
//TODO: handle error better
17871820
.map_err(|e| DBError::Other(anyhow::anyhow!(e)))?;
@@ -1885,7 +1918,7 @@ impl ModuleHost {
18851918
name: &str,
18861919
params: CallProcedureParams,
18871920
) -> Result<CallProcedureReturn, NoSuchModule> {
1888-
self.call(
1921+
self.call_pooled(
18891922
name,
18901923
params,
18911924
async move |params, inst| inst.call_procedure(params).await,
@@ -1897,14 +1930,14 @@ impl ModuleHost {
18971930
pub(super) async fn call_scheduled_function(
18981931
&self,
18991932
params: ScheduledFunctionParams,
1900-
) -> Result<CallScheduledFunctionResult, NoSuchModule> {
1901-
self.call(
1933+
) -> Result<CallScheduledFunctionResult, CallScheduledFunctionError> {
1934+
self.call_pooled(
19021935
"unknown scheduled function",
19031936
params,
1904-
async move |params, inst| inst.call_scheduled_function(params).await,
1905-
async move |params, inst| inst.call_scheduled_function(params).await,
1937+
async move |params, inst| Ok(inst.call_scheduled_function(params).await),
1938+
async move |params, inst| Ok(inst.call_scheduled_function(params).await),
19061939
)
1907-
.await
1940+
.await?
19081941
}
19091942

19101943
/// Materializes the views returned by the `view_collector`, if not already materialized,
@@ -2467,14 +2500,14 @@ impl ModuleHost {
24672500
pub(crate) fn replica_ctx(&self) -> &ReplicaContext {
24682501
match &*self.inner {
24692502
ModuleHostInner::Wasm(wasm) => wasm.instance_manager.module.replica_ctx(),
2470-
ModuleHostInner::Js(js) => js.instance_manager.module.replica_ctx(),
2503+
ModuleHostInner::Js(js) => js.module.replica_ctx(),
24712504
}
24722505
}
24732506

24742507
fn scheduler(&self) -> &Scheduler {
24752508
match &*self.inner {
24762509
ModuleHostInner::Wasm(wasm) => wasm.instance_manager.module.scheduler(),
2477-
ModuleHostInner::Js(js) => js.instance_manager.module.scheduler(),
2510+
ModuleHostInner::Js(js) => js.module.scheduler(),
24782511
}
24792512
}
24802513
}

0 commit comments

Comments
 (0)