Skip to content

Commit 0ee9778

Browse files
committed
Address stale NPT bug
Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
1 parent 4e923d4 commit 0ee9778

2 files changed

Lines changed: 160 additions & 30 deletions

File tree

src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs

Lines changed: 159 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,19 @@ limitations under the License.
1616

1717
use std::os::raw::c_void;
1818

19+
use hyperlight_common::mem::PAGE_SIZE_USIZE;
1920
use hyperlight_common::outb::VmAction;
2021
#[cfg(feature = "trace_guest")]
2122
use tracing::Span;
2223
#[cfg(feature = "trace_guest")]
2324
use tracing_opentelemetry::OpenTelemetrySpanExt;
24-
use windows::Win32::Foundation::{CloseHandle, FreeLibrary, HANDLE};
25+
use windows::Win32::Foundation::{CloseHandle, FreeLibrary, HANDLE, INVALID_HANDLE_VALUE};
2526
use windows::Win32::System::Hypervisor::*;
2627
use windows::Win32::System::LibraryLoader::*;
27-
use windows::Win32::System::Memory::{MEMORY_MAPPED_VIEW_ADDRESS, UnmapViewOfFile};
28-
use windows::core::s;
28+
use windows::Win32::System::Memory::{
29+
CreateFileMappingA, MEMORY_MAPPED_VIEW_ADDRESS, PAGE_READWRITE, UnmapViewOfFile,
30+
};
31+
use windows::core::{PCSTR, s};
2932
use windows_result::HRESULT;
3033

3134
#[cfg(gdb)]
@@ -44,7 +47,10 @@ use crate::hypervisor::virtual_machine::{
4447
VirtualMachine, VmExit,
4548
};
4649
use crate::hypervisor::wrappers::HandleWrapper;
47-
use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags, MemoryRegionType};
50+
use crate::mem::layout::SandboxMemoryLayout;
51+
use crate::mem::memory_region::{
52+
MemoryRegion, MemoryRegionFlags, MemoryRegionType, SurrogateMapping,
53+
};
4854
#[cfg(feature = "trace_guest")]
4955
use crate::sandbox::trace::TraceContext as SandboxTraceContext;
5056

@@ -86,6 +92,134 @@ fn release_file_mapping(view_base: *mut c_void, mapping_handle: HandleWrapper) {
8692
}
8793
}
8894

95+
/// GPA for the NPT flush dummy page. Placed just past the maximum
96+
/// snapshot region so it can never collide with guest memory.
97+
const NPT_FLUSH_GPA: u64 =
98+
(SandboxMemoryLayout::BASE_ADDRESS + SandboxMemoryLayout::MAX_MEMORY_SIZE) as u64;
99+
100+
/// A single dummy page mapped at a fixed GPA whose permissions are
101+
/// toggled on each [`reset_partition`](VirtualMachine::reset_partition)
102+
/// call. On AMD SVM, `WHvResetPartition` does not flush the NPT
103+
/// (GPA→HPA) TLB. Changing a GPA mapping's permissions via
104+
/// `WHvMapGpaRange2` is the only WHP API path that triggers
105+
/// `ValFlushNestedTb` (an NPT TbGeneration bump), which causes
106+
/// `SVM_TLB_FLUSH_CURRENT_ASID` on the next VMRUN.
107+
#[derive(Debug)]
108+
struct NptFlushPage {
109+
/// Backing file-mapping handle.
110+
handle: HANDLE,
111+
/// Address of the page in the surrogate process.
112+
surrogate_addr: *mut c_void,
113+
/// Cached function pointer for `WHvMapGpaRange2`.
114+
map_gpa_range2: WHvMapGpaRange2Func,
115+
/// Toggled on each flush to alternate the GPA permission between
116+
/// READ and READ|WRITE.
117+
toggle: bool,
118+
}
119+
120+
impl NptFlushPage {
121+
const GPA_FLAGS_READ: WHV_MAP_GPA_RANGE_FLAGS = WHvMapGpaRangeFlagRead;
122+
const GPA_FLAGS_READWRITE: WHV_MAP_GPA_RANGE_FLAGS =
123+
WHV_MAP_GPA_RANGE_FLAGS(WHvMapGpaRangeFlagRead.0 | WHvMapGpaRangeFlagWrite.0);
124+
125+
/// Allocate a dummy page, map it into the surrogate process, and
126+
/// create the initial GPA mapping at [`NPT_FLUSH_GPA`].
127+
fn new(
128+
partition: WHV_PARTITION_HANDLE,
129+
surrogate_process: &mut SurrogateProcess,
130+
) -> Result<Self, CreateVmError> {
131+
let handle = unsafe {
132+
CreateFileMappingA(
133+
INVALID_HANDLE_VALUE,
134+
None,
135+
PAGE_READWRITE,
136+
0,
137+
PAGE_SIZE_USIZE as u32,
138+
PCSTR::null(),
139+
)
140+
.map_err(|e| CreateVmError::InitializeVm(e.into()))?
141+
};
142+
143+
let surrogate_addr = surrogate_process
144+
.map(
145+
handle.into(),
146+
0, // sentinel key; MapViewOfFile never returns 0
147+
PAGE_SIZE_USIZE,
148+
&SurrogateMapping::SandboxMemory,
149+
)
150+
.map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?;
151+
152+
let map_gpa_range2 = unsafe {
153+
try_load_whv_map_gpa_range2().map_err(|e| CreateVmError::InitializeVm(e.into()))?
154+
};
155+
156+
let res = unsafe {
157+
map_gpa_range2(
158+
partition,
159+
surrogate_process.process_handle.into(),
160+
surrogate_addr,
161+
NPT_FLUSH_GPA,
162+
PAGE_SIZE_USIZE as u64,
163+
Self::GPA_FLAGS_READ,
164+
)
165+
};
166+
if res.is_err() {
167+
return Err(CreateVmError::InitializeVm(
168+
windows_result::Error::from_hresult(res).into(),
169+
));
170+
}
171+
172+
Ok(Self {
173+
handle,
174+
surrogate_addr,
175+
map_gpa_range2,
176+
toggle: false,
177+
})
178+
}
179+
180+
/// Toggle the dummy page's GPA permission to force an NPT TLB
181+
/// flush. VID skips the hypercall when permissions are unchanged,
182+
/// so we alternate between READ and READ|WRITE.
183+
fn flush(
184+
&mut self,
185+
partition: WHV_PARTITION_HANDLE,
186+
surrogate_process: &SurrogateProcess,
187+
) -> std::result::Result<(), RegisterError> {
188+
self.toggle = !self.toggle;
189+
let flags = if self.toggle {
190+
Self::GPA_FLAGS_READWRITE
191+
} else {
192+
Self::GPA_FLAGS_READ
193+
};
194+
let res = unsafe {
195+
(self.map_gpa_range2)(
196+
partition,
197+
surrogate_process.process_handle.into(),
198+
self.surrogate_addr,
199+
NPT_FLUSH_GPA,
200+
PAGE_SIZE_USIZE as u64,
201+
flags,
202+
)
203+
};
204+
if res.is_err() {
205+
return Err(RegisterError::ResetPartition(
206+
windows_result::Error::from_hresult(res).into(),
207+
));
208+
}
209+
Ok(())
210+
}
211+
}
212+
213+
impl Drop for NptFlushPage {
214+
fn drop(&mut self) {
215+
// The surrogate mapping is freed when SurrogateProcess drops;
216+
// we only need to close the file-mapping handle.
217+
if let Err(e) = unsafe { CloseHandle(self.handle) } {
218+
tracing::error!("Failed to close NPT flush page handle: {:?}", e);
219+
}
220+
}
221+
}
222+
89223
/// A Windows Hypervisor Platform implementation of a single-vcpu VM
90224
#[derive(Debug)]
91225
pub(crate) struct WhpVm {
@@ -98,15 +232,18 @@ pub(crate) struct WhpVm {
98232
/// Handle to the background timer (if started).
99233
#[cfg(feature = "hw-interrupts")]
100234
timer: Option<TimerThread>,
235+
/// Dummy page whose GPA permissions are toggled to force NPT TLB
236+
/// flushes after `WHvResetPartition` on AMD SVM hosts.
237+
npt_flush: NptFlushPage,
101238
}
102239

103240
// Safety: `WhpVm` is !Send because it holds `SurrogateProcess` which contains a raw pointer
104241
// `allocated_address` (*mut c_void). This pointer represents a memory mapped view address
105242
// in the surrogate process. It is never dereferenced, only used for address arithmetic and
106243
// resource management (unmapping). This is a system resource that is not bound to the creating
107244
// thread and can be safely transferred between threads.
108-
// `file_mappings` contains raw pointers that are also kernel resource handles,
109-
// safe to use from any thread.
245+
// `file_mappings` and `NptFlushPage` contain raw pointers that are also kernel
246+
// resource handles, safe to use from any thread.
110247
unsafe impl Send for WhpVm {}
111248

112249
impl WhpVm {
@@ -144,16 +281,19 @@ impl WhpVm {
144281

145282
let mgr = get_surrogate_process_manager()
146283
.map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?;
147-
let surrogate_process = mgr
284+
let mut surrogate_process = mgr
148285
.get_surrogate_process()
149286
.map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?;
150287

288+
let npt_flush = NptFlushPage::new(partition, &mut surrogate_process)?;
289+
151290
Ok(WhpVm {
152291
partition,
153292
surrogate_process,
154293
file_mappings: Vec::new(),
155294
#[cfg(feature = "hw-interrupts")]
156295
timer: None,
296+
npt_flush,
157297
})
158298
}
159299

@@ -729,35 +869,25 @@ impl VirtualMachine for WhpVm {
729869
WHvResetPartition(self.partition)
730870
.map_err(|e| RegisterError::ResetPartition(e.into()))?;
731871

732-
// WHvResetPartition resets the VMCB but may not update the
733-
// hypervisor's internal register-change tracking used for
734-
// TLB flush decisions on AMD SVM. Explicitly re-announce
735-
// the post-reset CR0 (PG=0) via the WHP API so that the
736-
// subsequent restore_sregs() write of CR0 with PG=1 is
737-
// detected as a paging-mode change, triggering
738-
// SVM_TLB_FLUSH_CURRENT_ASID on the next VMRUN.
739-
let post_reset_sregs = self.sregs()?;
740-
self.set_registers(&[(
741-
WHvX64RegisterCr0,
742-
Align16(WHV_REGISTER_VALUE {
743-
Reg64: post_reset_sregs.cr0,
744-
}),
745-
)])
746-
.map_err(|e| RegisterError::ResetPartition(e.into()))?;
872+
// WHvResetPartition does not flush the NPT (GPA→HPA) TLB
873+
// on AMD SVM. Toggle the dummy page's GPA permission to
874+
// trigger a TbGeneration bump, forcing the next VMRUN to
875+
// issue SVM_TLB_FLUSH_CURRENT_ASID.
876+
self.npt_flush
877+
.flush(self.partition, &self.surrogate_process)?;
747878

748879
// WHvResetPartition resets LAPIC to power-on defaults.
749880
// Re-initialize it when LAPIC emulation is active.
750881
#[cfg(feature = "hw-interrupts")]
751882
Self::init_lapic_bulk(self.partition)
752883
.map_err(|e| RegisterError::ResetPartition(e.into()))?;
753884

754-
// With hw-interrupts, set_sregs filters out APIC_BASE to
755-
// avoid disabling the LAPIC (the snapshot sregs default it
756-
// to 0). On AMD/SVM this means SVM_CLEAN_FIELD_AVIC is
757-
// never dirtied after the VMCB rebuild, causing the CPU to
758-
// use stale cached AVIC state on VMRUN which corrupts guest
759-
// memory reads. Re-writing APIC_BASE to its post-reset
760-
// value forces the dirty bit via ValSetEmulatedApicBase.
885+
// set_sregs filters out APIC_BASE (the snapshot sregs
886+
// default it to 0, which would disable the LAPIC). This
887+
// means APIC_BASE is never written after the VMCB rebuild,
888+
// so the AVIC clean bit stays set and the CPU may use stale
889+
// cached AVIC state. Re-writing the post-reset value forces
890+
// the hypervisor to dirty SVM_CLEAN_FIELD_AVIC.
761891
#[cfg(feature = "hw-interrupts")]
762892
{
763893
let apic_base = self.sregs()?.apic_base;

src/hyperlight_host/src/mem/layout.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ impl SandboxMemoryLayout {
313313
/// Both the scratch region and the snapshot region are bounded by
314314
/// this size. The value is arbitrary but chosen to be large enough
315315
/// for most workloads while preventing accidental resource exhaustion.
316-
const MAX_MEMORY_SIZE: usize = (16 * 1024 * 1024 * 1024) - Self::BASE_ADDRESS; // 16 GiB - BASE_ADDRESS
316+
pub(crate) const MAX_MEMORY_SIZE: usize = (16 * 1024 * 1024 * 1024) - Self::BASE_ADDRESS; // 16 GiB - BASE_ADDRESS
317317

318318
/// The base address of the sandbox's memory.
319319
#[cfg(not(feature = "nanvix-unstable"))]

0 commit comments

Comments
 (0)