@@ -287,35 +287,64 @@ impl MultiUseSandbox {
287287 return Err ( SnapshotSandboxMismatch ) ;
288288 }
289289
290- // Reset VM/vCPU state before memory restore.
291- self . vm . reset_vm_state ( ) . map_err ( |e| {
292- self . poisoned = true ;
293- HyperlightVmError :: Restore ( e)
294- } ) ?;
290+ // A prior failed restore may have swapped mem_mgr.shared_mem
291+ // without updating the hypervisor's GPA mapping. Track this
292+ // so we can force a remap below if needed.
293+ let was_poisoned = self . poisoned ;
294+
295+ // Assume poisoned: any failure below leaves the sandbox
296+ // inconsistent. Only cleared at the end on success.
297+ self . poisoned = true ;
298+
299+ // Failure here may partially reset vCPU state. The sandbox is
300+ // poisoned, and the next restore() retry will call
301+ // reset_vm_state() again which is idempotent.
302+ self . vm
303+ . reset_vm_state ( )
304+ . map_err ( HyperlightVmError :: Restore ) ?;
295305
306+ // Failure here may leave mem_mgr with partially swapped
307+ // shared_mem/scratch_mem. The sandbox is poisoned, and if
308+ // shared_mem was swapped without updating the hypervisor, the
309+ // was_poisoned path below handles it on the retry.
296310 let ( gsnapshot, gscratch) = self . mem_mgr . restore_snapshot ( & snapshot) ?;
297311 if let Some ( gsnapshot) = gsnapshot {
312+ // Failure here leaves the hypervisor pointing at the old
313+ // (possibly freed) snapshot. The sandbox is poisoned, and
314+ // the next restore() will remap via the was_poisoned path.
315+ self . vm
316+ . update_snapshot_mapping ( gsnapshot)
317+ . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
318+ } else if was_poisoned {
319+ // restore_snapshot() skipped the swap because shared_mem
320+ // already matched, but the hypervisor may still point at
321+ // the old backing from before the failed restore. Force
322+ // a remap. Failure leaves us still poisoned for the same
323+ // reason as above.
324+ let snap_mem = snapshot. memory ( ) . to_mgr_snapshot_mem ( ) ?;
325+ let ( _, gsnapshot) = snap_mem. build ( ) ;
298326 self . vm
299327 . update_snapshot_mapping ( gsnapshot)
300328 . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
301329 }
302330 if let Some ( gscratch) = gscratch {
331+ // Failure here leaves the hypervisor pointing at old
332+ // scratch. The sandbox is poisoned; scratch size is fixed
333+ // per sandbox so retry will zero the same allocation and
334+ // remap here again.
303335 self . vm
304336 . update_scratch_mapping ( gscratch)
305337 . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
306338 }
307339
340+ // Failure here means sregs (CR3, etc.) are not set. The
341+ // sandbox is poisoned; retry will call restore_sregs() again.
308342 let sregs = snapshot. sregs ( ) . ok_or_else ( || {
309343 HyperlightError :: Error ( "snapshot from running sandbox should have sregs" . to_string ( ) )
310344 } ) ?;
311- // TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function
312- // and determine if they should also poison the sandbox.
313345 self . vm
314346 . restore_sregs ( snapshot. root_pt_gpa ( ) , sregs)
315- . map_err ( |e| {
316- self . poisoned = true ;
317- HyperlightVmError :: Restore ( e)
318- } ) ?;
347+ . map_err ( HyperlightVmError :: Restore ) ?;
319348
320349 self . vm . set_stack_top ( snapshot. stack_top_gva ( ) ) ;
321350 self . vm . set_entrypoint ( snapshot. entrypoint ( ) ) ;
@@ -327,12 +356,19 @@ impl MultiUseSandbox {
327356 let regions_to_map = snapshot_regions. difference ( & current_regions) ;
328357
329358 for region in regions_to_unmap {
359+ // Failure here leaves extra regions mapped. The sandbox is
360+ // poisoned; retry will attempt the unmap again since the
361+ // region will still be in the current set.
330362 self . vm
331363 . unmap_region ( region)
332364 . map_err ( HyperlightVmError :: UnmapRegion ) ?;
333365 }
334366
335367 for region in regions_to_map {
368+ // Failure here leaves regions missing. The sandbox is
369+ // poisoned; retry will attempt the map again since the
370+ // region will still be in the snapshot set.
371+ //
336372 // Safety: The region has been mapped before, and at that point the caller promised that the memory region is valid
337373 // in their call to `MultiUseSandbox::map_region`
338374 unsafe { self . vm . map_region ( region) } . map_err ( HyperlightVmError :: MapRegion ) ?;
@@ -341,9 +377,8 @@ impl MultiUseSandbox {
341377 // The restored snapshot is now our most current snapshot
342378 self . snapshot = Some ( snapshot. clone ( ) ) ;
343379
344- // Clear poison state when successfully restoring from snapshot .
380+ // Clear poison state: restore completed successfully .
345381 //
346- // # Safety:
347382 // This is safe because:
348383 // 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation)
349384 // 2. Restoration completely replaces all memory state, eliminating:
0 commit comments