@@ -287,64 +287,35 @@ impl MultiUseSandbox {
287287 return Err ( SnapshotSandboxMismatch ) ;
288288 }
289289
290- // A prior failed restore may have swapped mem_mgr.shared_mem
291- // without updating the hypervisor's GPA mapping. Track this
292- // so we can force a remap below if needed.
293- let was_poisoned = self . poisoned ;
294-
295- // Assume poisoned: any failure below leaves the sandbox
296- // inconsistent. Only cleared at the end on success.
297- self . poisoned = true ;
298-
299- // Failure here may partially reset vCPU state. The sandbox is
300- // poisoned, and the next restore() retry will call
301- // reset_vm_state() again which is idempotent.
302- self . vm
303- . reset_vm_state ( )
304- . map_err ( HyperlightVmError :: Restore ) ?;
290+ // Reset VM/vCPU state before memory restore.
291+ self . vm . reset_vm_state ( ) . map_err ( |e| {
292+ self . poisoned = true ;
293+ HyperlightVmError :: Restore ( e)
294+ } ) ?;
305295
306- // Failure here may leave mem_mgr with partially swapped
307- // shared_mem/scratch_mem. The sandbox is poisoned, and if
308- // shared_mem was swapped without updating the hypervisor, the
309- // was_poisoned path below handles it on the retry.
310296 let ( gsnapshot, gscratch) = self . mem_mgr . restore_snapshot ( & snapshot) ?;
311297 if let Some ( gsnapshot) = gsnapshot {
312- // Failure here leaves the hypervisor pointing at the old
313- // (possibly freed) snapshot. The sandbox is poisoned, and
314- // the next restore() will remap via the was_poisoned path.
315- self . vm
316- . update_snapshot_mapping ( gsnapshot)
317- . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
318- } else if was_poisoned {
319- // restore_snapshot() skipped the swap because shared_mem
320- // already matched, but the hypervisor may still point at
321- // the old backing from before the failed restore. Force
322- // a remap. Failure leaves us still poisoned for the same
323- // reason as above.
324- let snap_mem = snapshot. memory ( ) . to_mgr_snapshot_mem ( ) ?;
325- let ( _, gsnapshot) = snap_mem. build ( ) ;
326298 self . vm
327299 . update_snapshot_mapping ( gsnapshot)
328300 . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
329301 }
330302 if let Some ( gscratch) = gscratch {
331- // Failure here leaves the hypervisor pointing at old
332- // scratch. The sandbox is poisoned; scratch size is fixed
333- // per sandbox so retry will zero the same allocation and
334- // remap here again.
335303 self . vm
336304 . update_scratch_mapping ( gscratch)
337305 . map_err ( |e| HyperlightError :: HyperlightVmError ( e. into ( ) ) ) ?;
338306 }
339307
340- // Failure here means sregs (CR3, etc.) are not set. The
341- // sandbox is poisoned; retry will call restore_sregs() again.
342308 let sregs = snapshot. sregs ( ) . ok_or_else ( || {
343309 HyperlightError :: Error ( "snapshot from running sandbox should have sregs" . to_string ( ) )
344310 } ) ?;
311+ // TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function
312+ // and determine if they should also poison the sandbox.
345313 self . vm
346314 . restore_sregs ( snapshot. root_pt_gpa ( ) , sregs)
347- . map_err ( HyperlightVmError :: Restore ) ?;
315+ . map_err ( |e| {
316+ self . poisoned = true ;
317+ HyperlightVmError :: Restore ( e)
318+ } ) ?;
348319
349320 self . vm . set_stack_top ( snapshot. stack_top_gva ( ) ) ;
350321 self . vm . set_entrypoint ( snapshot. entrypoint ( ) ) ;
@@ -356,19 +327,12 @@ impl MultiUseSandbox {
356327 let regions_to_map = snapshot_regions. difference ( & current_regions) ;
357328
358329 for region in regions_to_unmap {
359- // Failure here leaves extra regions mapped. The sandbox is
360- // poisoned; retry will attempt the unmap again since the
361- // region will still be in the current set.
362330 self . vm
363331 . unmap_region ( region)
364332 . map_err ( HyperlightVmError :: UnmapRegion ) ?;
365333 }
366334
367335 for region in regions_to_map {
368- // Failure here leaves regions missing. The sandbox is
369- // poisoned; retry will attempt the map again since the
370- // region will still be in the snapshot set.
371- //
372336 // Safety: The region has been mapped before, and at that point the caller promised that the memory region is valid
373337 // in their call to `MultiUseSandbox::map_region`
374338 unsafe { self . vm . map_region ( region) } . map_err ( HyperlightVmError :: MapRegion ) ?;
@@ -377,8 +341,9 @@ impl MultiUseSandbox {
377341 // The restored snapshot is now our most current snapshot
378342 self . snapshot = Some ( snapshot. clone ( ) ) ;
379343
380- // Clear poison state: restore completed successfully.
344+ // Clear poison state when successfully restoring from snapshot .
381345 //
346+ // # Safety:
382347 // This is safe because:
383348 // 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation)
384349 // 2. Restoration completely replaces all memory state, eliminating:
0 commit comments