Skip to content

Commit cd50b1b

Browse files
committed
Be more careful about poisoning
Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
1 parent 9aebbcc commit cd50b1b

1 file changed

Lines changed: 48 additions & 13 deletions

File tree

src/hyperlight_host/src/sandbox/initialized_multi_use.rs

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -287,35 +287,64 @@ impl MultiUseSandbox {
287287
return Err(SnapshotSandboxMismatch);
288288
}
289289

290-
// Reset VM/vCPU state before memory restore.
291-
self.vm.reset_vm_state().map_err(|e| {
292-
self.poisoned = true;
293-
HyperlightVmError::Restore(e)
294-
})?;
290+
// A prior failed restore may have swapped mem_mgr.shared_mem
291+
// without updating the hypervisor's GPA mapping. Track this
292+
// so we can force a remap below if needed.
293+
let was_poisoned = self.poisoned;
294+
295+
// Assume poisoned: any failure below leaves the sandbox
296+
// inconsistent. Only cleared at the end on success.
297+
self.poisoned = true;
298+
299+
// Failure here may partially reset vCPU state. The sandbox is
300+
// poisoned, and the next restore() retry will call
301+
// reset_vm_state() again which is idempotent.
302+
self.vm
303+
.reset_vm_state()
304+
.map_err(HyperlightVmError::Restore)?;
295305

306+
// Failure here may leave mem_mgr with partially swapped
307+
// shared_mem/scratch_mem. The sandbox is poisoned, and if
308+
// shared_mem was swapped without updating the hypervisor, the
309+
// was_poisoned path below handles it on the retry.
296310
let (gsnapshot, gscratch) = self.mem_mgr.restore_snapshot(&snapshot)?;
297311
if let Some(gsnapshot) = gsnapshot {
312+
// Failure here leaves the hypervisor pointing at the old
313+
// (possibly freed) snapshot. The sandbox is poisoned, and
314+
// the next restore() will remap via the was_poisoned path.
315+
self.vm
316+
.update_snapshot_mapping(gsnapshot)
317+
.map_err(|e| HyperlightError::HyperlightVmError(e.into()))?;
318+
} else if was_poisoned {
319+
// restore_snapshot() skipped the swap because shared_mem
320+
// already matched, but the hypervisor may still point at
321+
// the old backing from before the failed restore. Force
322+
// a remap. Failure leaves us still poisoned for the same
323+
// reason as above.
324+
let snap_mem = snapshot.memory().to_mgr_snapshot_mem()?;
325+
let (_, gsnapshot) = snap_mem.build();
298326
self.vm
299327
.update_snapshot_mapping(gsnapshot)
300328
.map_err(|e| HyperlightError::HyperlightVmError(e.into()))?;
301329
}
302330
if let Some(gscratch) = gscratch {
331+
// Failure here leaves the hypervisor pointing at old
332+
// scratch. The sandbox is poisoned; scratch size is fixed
333+
// per sandbox so retry will zero the same allocation and
334+
// remap here again.
303335
self.vm
304336
.update_scratch_mapping(gscratch)
305337
.map_err(|e| HyperlightError::HyperlightVmError(e.into()))?;
306338
}
307339

340+
// Failure here means sregs (CR3, etc.) are not set. The
341+
// sandbox is poisoned; retry will call restore_sregs() again.
308342
let sregs = snapshot.sregs().ok_or_else(|| {
309343
HyperlightError::Error("snapshot from running sandbox should have sregs".to_string())
310344
})?;
311-
// TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function
312-
// and determine if they should also poison the sandbox.
313345
self.vm
314346
.restore_sregs(snapshot.root_pt_gpa(), sregs)
315-
.map_err(|e| {
316-
self.poisoned = true;
317-
HyperlightVmError::Restore(e)
318-
})?;
347+
.map_err(HyperlightVmError::Restore)?;
319348

320349
self.vm.set_stack_top(snapshot.stack_top_gva());
321350
self.vm.set_entrypoint(snapshot.entrypoint());
@@ -327,12 +356,19 @@ impl MultiUseSandbox {
327356
let regions_to_map = snapshot_regions.difference(&current_regions);
328357

329358
for region in regions_to_unmap {
359+
// Failure here leaves extra regions mapped. The sandbox is
360+
// poisoned; retry will attempt the unmap again since the
361+
// region will still be in the current set.
330362
self.vm
331363
.unmap_region(region)
332364
.map_err(HyperlightVmError::UnmapRegion)?;
333365
}
334366

335367
for region in regions_to_map {
368+
// Failure here leaves regions missing. The sandbox is
369+
// poisoned; retry will attempt the map again since the
370+
// region will still be in the snapshot set.
371+
//
336372
// Safety: The region has been mapped before, and at that point the caller promised that the memory region is valid
337373
// in their call to `MultiUseSandbox::map_region`
338374
unsafe { self.vm.map_region(region) }.map_err(HyperlightVmError::MapRegion)?;
@@ -341,9 +377,8 @@ impl MultiUseSandbox {
341377
// The restored snapshot is now our most current snapshot
342378
self.snapshot = Some(snapshot.clone());
343379

344-
// Clear poison state when successfully restoring from snapshot.
380+
// Clear poison state: restore completed successfully.
345381
//
346-
// # Safety:
347382
// This is safe because:
348383
// 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation)
349384
// 2. Restoration completely replaces all memory state, eliminating:

0 commit comments

Comments
 (0)