hyperlight-dev · simongdavies · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
@@ -123,6 +123,11 @@ jobs:
           # with hw-interrupts feature enabled (+ explicit driver on Linux)
           just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts' || 'kvm,hw-interrupts') || 'hw-interrupts' }}
 
+      - name: Run Rust tests with enable_guest_clock
+        run: |
+          # with enable_guest_clock + hw-interrupts (+ explicit driver on Linux)
+          just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts,enable_guest_clock' || 'kvm,hw-interrupts,enable_guest_clock') || 'hw-interrupts,enable_guest_clock' }}
+
       - name: Run Rust Gdb tests
         env:
           RUST_LOG: debug

@@ -104,6 +104,9 @@ test-like-ci config=default-target hypervisor="kvm":
     @# with hw-interrupts enabled (+ explicit driver on Linux)
     {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts" } else { "just test " + config + " kvm,hw-interrupts" } } else { "just test " + config + " hw-interrupts" } }}
 
+    @# with enable_guest_clock + hw-interrupts enabled (+ explicit driver on Linux)
+    {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts,enable_guest_clock" } else { "just test " + config + " kvm,hw-interrupts,enable_guest_clock" } } else { "just test " + config + " hw-interrupts,enable_guest_clock" } }}
+
     @# make sure certain cargo features compile
     just check
 

@@ -30,6 +30,7 @@ This project is composed internally of several components, depicted in the below
 * [How to build a Hyperlight guest binary](./how-to-build-a-hyperlight-guest-binary.md)
 * [Security considerations](./security.md)
 * [Technical requirements document](./technical-requirements-document.md)
+* [Paravirtualized guest clock](./guest-time.md)
 
 ## For developers
 

@@ -0,0 +1,139 @@
+# Paravirtualized Guest Clock
+
+Hyperlight's `enable_guest_clock` Cargo feature gives guests a cheap way to ask
+"what time is it?" without taking a VM exit. When the host is built with the
+feature, every sandbox exposes a paravirtualized clock that the guest can read
+using ordinary memory loads.
+
+## What the guest gets
+
+When the feature is enabled the host populates a single 4 KiB "clock page"
+inside the sandbox's scratch region. The page carries two pieces of
+information:
+
+- **A hypervisor-specific calibration block at offset `0x00`.** Written by
+  KVM (`kvm_clock`) or Hyper-V / MSHV (Reference TSC). Contains the TSC
+  frequency, scaling constants, and a sequence lock the guest uses to read it
+  atomically. The entire clock page is hypervisor-owned; Hyperlight does not
+  write to it.
+- **Hyperlight metadata in the scratch bookkeeping page** (separate from the
+  clock page): a `u64` [`ClockType`](../src/hyperlight_common/src/time.rs) tag
+  and `boot_time_ns`, the Unix-epoch origin of the monotonic clock computed
+  by the host as `wall_now - monotonic_now` (see below). These live at fixed
+  offsets from the top of scratch (`-0x28` and `-0x30`), NOT in the clock
+  page, so a future TLFS extension cannot clobber them.
+
+With those two pieces the guest can compute:
+
+- **Monotonic nanoseconds since boot** — read the TSC, apply the scaling
+  factors from the calibration block, giving you a `CLOCK_MONOTONIC`
+  equivalent.
+- **Wall-clock nanoseconds since the Unix epoch** — add `boot_time_ns` to the
+  monotonic value above, giving you a `CLOCK_REALTIME` / `gettimeofday`
+  equivalent. `boot_time_ns` is computed by the host as
+  `SystemTime::now() - KVM_GET_CLOCK` (on KVM) or
+  `SystemTime::now() - TIME_REF_COUNT` (on Hyper-V) after sandbox
+  initialisation. Hyper-V has no equivalent to KVM's
+  `MSR_KVM_WALL_CLOCK_NEW`, so we use this uniform host-computed approach
+  on all backends.
+
+> **Note (KVM only):** Wall-clock time returns `None` during
+> `hyperlight_main` (guest init). On KVM, `KVM_GET_CLOCK` is unreliable
+> until the "master clock" is established at first vCPU entry, so
+> `boot_time_ns` is stamped after init completes. Monotonic time works
+> fine during init. Wall-clock time becomes available on the first
+> dispatch call.
+
+Both reads are lock-free (well, seqlock-protected for the calibration block)
+and never leave the guest.
+
+## Using it in a Rust guest
+
+The guest-side API lives in `hyperlight_guest::time` for the low-level
+readers and `hyperlight_guest_bin::time` for a `std::time`-flavoured
+wrapper:
+
+```rust
+// Low-level, no_std readers.
+use hyperlight_guest::time;
+
+if time::is_available() {
+    let mono_ns: u64 = time::monotonic_time_ns().unwrap();
+    let wall_ns: u64 = time::wall_clock_time_ns().unwrap();
+}
+
+// std::time-flavoured wrapper (hyperlight_guest_bin only).
+use hyperlight_guest_bin::time::{Instant, SystemTime, UNIX_EPOCH};
+
+let t0 = Instant::now()?;
+// ... do work ...
+let elapsed = t0.elapsed()?;
+
+let now = SystemTime::now()?;
+let unix_ns = now.duration_since(UNIX_EPOCH)?.as_nanos();
+```
+
+C guests that use picolibc get paravirt time for free: `hyperlight_guest_bin`
+wires `clock_gettime(CLOCK_MONOTONIC|CLOCK_REALTIME)` and `gettimeofday` into
+the same reader, so existing C code continues to work unchanged.
+
+## Snapshot / restore semantics
+
+Both `boot_time_ns` and the hypervisor calibration block live inside scratch
+memory, which is not included in snapshots. On every
+`MultiUseSandbox::restore`, the host re-arms the clock page: it re-installs
+the pvclock MSR / Hyper-V register against the fresh vCPU state and stamps a
+new `boot_time_ns` captured at the moment of restore. As a result a restored
+guest observes wall-clock time reflecting the restore moment, not the
+original boot — which is what wall clocks are supposed to do.
+
+## Enabling the feature
+
+Turn it on in the host's `Cargo.toml`:
+
+```toml
+[dependencies]
+hyperlight-host = { version = "...", features = ["enable_guest_clock"] }
+```
+
+The feature is x86_64 only; on aarch64 it has no effect. It is off by default
+so existing sandboxes don't pay for a facility they don't use. When off, the
+clock page is still reserved in the layout (so memory maps are stable) but
+left un-mapped against any hypervisor clock source; `hyperlight_guest::time`
+readers then report "unavailable", and it is up to the guest to decide how to
+fall back (the picolibc wiring returns a synthetic 1-second-per-call
+counter).
+
+It is also a good stopgap for many other things that expect `gettimeofday` /
+`clock_gettime` to work (like StarlingMonkey and QuickJS).
+
+## Layout details
+
+The clock page is the second page from the very top of the scratch region.
+The top of scratch holds a fixed four-page reserved region (each offset below
+is the base, i.e. the lowest address, of its region):
+
+| Offset from top | Size  | Contents                                       |
+|-----------------|-------|------------------------------------------------|
+| `-0x1000`       | 4 KiB | Metadata / bookkeeping (size, allocator, ...)  |
+| `-0x2000`       | 4 KiB | Paravirtualized clock page                     |
+| `-0x4000`       | 8 KiB | Exception (IST1) stack (2 pages)               |
+
+The guest's IST1 (exception) stack starts at the clock-page base
+(`MAX_GVA + 1 - SCRATCH_TOP_EXN_STACK_OFFSET`) and grows downward through its
+two dedicated pages, so stack writes — including page-fault handlers running
+on IST1 — cannot clobber the clock page or the metadata page above. The
+allocator reserves the whole four-page region unconditionally so the memory
+map stays identical whether or not the feature is enabled.
+
+## Non-goals
+
+- **Sub-microsecond accuracy.** `boot_time_ns` is computed from two
+  back-to-back host reads (`SystemTime::now()` and `KVM_GET_CLOCK` /
+  `TIME_REF_COUNT`). On KVM, residual disagreement between `KVM_GET_CLOCK`
+  and the pvclock page can add up to ~13ms of constant offset (observed on
+  WSL2; root cause uncertain). On Hyper-V the offset should be negligible.
+- **`CLOCK_PROCESS_CPUTIME_ID` and friends.** The clock page exposes only
+  monotonic and wall-clock time; per-thread / per-process CPU time is out of
+  scope.
+- **Timers or sleeps.** The guest can read the clock but has no way to ask
+  the hypervisor to wake it up later — that is still done through the
+  existing guest-function call model.
@@ -109,8 +109,10 @@ The guest advances it atomically.
 ## The guest exception stack
 
 Similarly, the guest needs a stack that is always writable, in order
-to be able to take exceptions to it. The exception stack begins below
-the metadata at the top of the scratch region and grows downward.
+to be able to take exceptions to it. The exception stack occupies two
+dedicated pages within the reserved region at the top of the scratch
+region — directly below the metadata page and the paravirtualized clock
+page. It starts at the base of the clock page and grows downward.
 
 ## Taking a snapshot
 

@@ -36,9 +36,20 @@ pub const MAX_GPA: usize = 0x0000_000f_ffff_ffff;
 /// - (up to) 4 pages for the PTEs for mapping that (including CoW'ing the root PT)
 /// - A page for the smallest possible non-exception stack
 /// - (up to) 3 pages for mapping that
-/// - Two pages for the exception stack and metadata
 /// - A page-aligned amount of memory for I/O buffers (for now)
+/// - The reserved region at the very top of scratch
+///   ([`super::SCRATCH_TOP_RESERVED_SIZE`], 4 pages):
+///     - One page for the metadata / bookkeeping page (size, allocator,
+///       `clock_type`, `boot_time_ns`, …; only partially populated)
+///     - One page for the paravirtualized clock page
+///     - Two pages for the exception (IST1) stack
+///
+/// The reserved region is included here so `min_scratch_size` is the single,
+/// complete minimum — every caller uses it directly, without adding anything
+/// back. The guest allocator skips the same top pages unconditionally, so the
+/// memory map is stable regardless of feature flags.
 pub fn min_scratch_size(input_data_size: usize, output_data_size: usize) -> usize {
     (input_data_size + output_data_size).next_multiple_of(crate::vmem::PAGE_SIZE)
-        + 12 * crate::vmem::PAGE_SIZE
+        + 10 * crate::vmem::PAGE_SIZE
+        + super::SCRATCH_TOP_RESERVED_SIZE as usize
 }
@@ -20,12 +20,93 @@ mod arch;
 
 pub use arch::{MAX_GPA, MAX_GVA, SNAPSHOT_PT_GVA_MAX, SNAPSHOT_PT_GVA_MIN};
 
-// offsets down from the top of scratch memory for various things
+// The topmost page of scratch serves as a host→guest bookkeeping /
+// configuration page. Each constant below is an offset measured down
+// from the top of scratch memory. The host writes these fields before the
+// first vCPU run and on snapshot restore; the guest reads them at startup
+// and on each clock query. All fields are u64, little-endian, naturally
+// aligned.
 pub const SCRATCH_TOP_SIZE_OFFSET: u64 = 0x08;
 pub const SCRATCH_TOP_ALLOCATOR_OFFSET: u64 = 0x10;
 pub const SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET: u64 = 0x18;
 pub const SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET: u64 = 0x20;
-pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30;
+
+/// Offset from the top of scratch for the `clock_type` field (u64).
+///
+/// Identifies which paravirtualized clock the host configured
+/// ([`crate::time::ClockType`]). Lives in the bookkeeping page at the
+/// top of scratch — NOT in the clock page itself — so the hypervisor
+/// cannot clobber it if it extends the TLFS-reserved region.
+pub const SCRATCH_TOP_CLOCK_TYPE_OFFSET: u64 = 0x28;
+
+/// Offset from the top of scratch for the `boot_time_ns` field (u64).
+///
+/// The Unix-epoch origin of the monotonic clock, computed by the host
+/// as `SystemTime::now() - current_monotonic_ns()` and written in
+/// `arm_clock`. The guest recovers wall time as
+/// `boot_time_ns + monotonic_time_ns()`.
+///
+/// Hyper-V has no equivalent to KVM's `MSR_KVM_WALL_CLOCK_NEW`, so
+/// we use this uniform host-computed approach on all backends.
+pub const SCRATCH_TOP_BOOT_TIME_NS_OFFSET: u64 = 0x30;
+
+// ---- Next free offset in the bookkeeping page: 0x38 ----
+// When adding new host→guest shared fields, use the next multiple of
+// 8 after the last offset above. All fields in this page are u64,
+// little-endian, host-written and guest-read, and are excluded from
+// snapshots because they live in scratch memory.
+
+/// Offset from the top of scratch memory to the clock page's **high edge**
+/// (its top, exclusive).
+///
+/// The reserved region at the very top of scratch is, from the top down:
+///
+/// ```text
+///   [MAX_GPA + 1 - 0x1000, MAX_GPA + 1)            metadata / bookkeeping page
+///   [MAX_GPA + 1 - 0x2000, MAX_GPA + 1 - 0x1000)   clock page
+///   [MAX_GPA + 1 - 0x4000, MAX_GPA + 1 - 0x2000)   exception (IST1) stack (2 pages)
+/// ```
+///
+/// The clock page is therefore the **second page from the top**, one 4 KiB
+/// page below the metadata page, so this offset to its high edge is exactly
+/// one page. The clock page *base* is one page lower again — see
+/// [`SCRATCH_TOP_EXN_STACK_OFFSET`] and [`clock_page_gpa`].
+///
+/// Keeping the clock page on its own page — separate from the bookkeeping
+/// fields above it — guarantees the hypervisor, which owns the whole page
+/// (KVM pvclock or Hyper-V Reference TSC), cannot clobber Hyperlight's
+/// `clock_type` / `boot_time_ns` metadata even if a future TLFS extension
+/// grows the reserved region.
+///
+/// The page is always reserved regardless of the `enable_guest_clock`
+/// feature so that the memory layout (and therefore stack positions)
+/// is stable across feature-flag builds. The host only populates it
+/// when the feature is enabled; otherwise it stays zero-filled and
+/// the guest sees `ClockType::None`.
+pub const SCRATCH_TOP_CLOCK_PAGE_OFFSET: u64 = crate::mem::PAGE_SIZE;
+
+/// Offset from the top of scratch to the top of the exception (IST1) stack,
+/// which is also the **base** of the clock page (the boundary between the
+/// clock page and the exception stack below it).
+///
+/// Derived as one page below [`SCRATCH_TOP_CLOCK_PAGE_OFFSET`] so it can
+/// never drift from the clock page above it. The exception stack grows
+/// *downward* from here for `EXN_STACK_PAGES` pages; placing its top here
+/// means neither it nor any page-fault / COW handler running on it can
+/// clobber the clock page or the metadata page above.
+pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = SCRATCH_TOP_CLOCK_PAGE_OFFSET + crate::mem::PAGE_SIZE;
+
+/// Number of 4 KiB pages reserved for the IST1 exception stack at the top
+/// of scratch.
+const EXN_STACK_PAGES: u64 = 2;
+
+/// Total size of the reserved region at the very top of scratch: the
+/// metadata page, the clock page, and the `EXN_STACK_PAGES`-page exception
+/// stack. Everything below this is general scratch (heap, I/O buffers, …).
+///
+/// Both the guest physical allocator and the host minimum-size check use
+/// this single value, so the reservation and the size requirement can never
+/// disagree.
+pub const SCRATCH_TOP_RESERVED_SIZE: u64 =
+    SCRATCH_TOP_EXN_STACK_OFFSET + EXN_STACK_PAGES * crate::mem::PAGE_SIZE;
 
 pub fn scratch_base_gpa(size: usize) -> u64 {
     (MAX_GPA - size + 1) as u64
@@ -34,5 +115,28 @@ pub fn scratch_base_gva(size: usize) -> u64 {
     (MAX_GVA - size + 1) as u64
 }
 
+/// Guest physical address of the base of the paravirtualized clock page.
+///
+/// The clock page sits at a fixed offset from the top of the guest physical
+/// address space, independent of `scratch_size`: its base is always
+/// `MAX_GPA + 1 - SCRATCH_TOP_EXN_STACK_OFFSET` (the clock page is the second
+/// page from the top, and its base is the boundary with the exception stack
+/// below it).
+///
+/// Only meaningful when the host is built with the `enable_guest_clock`
+/// feature; otherwise the page is not populated.
+pub const fn clock_page_gpa() -> u64 {
+    (MAX_GPA as u64) + 1 - SCRATCH_TOP_EXN_STACK_OFFSET
+}
+
+/// Guest virtual address of the base of the paravirtualized clock page.
+///
+/// See [`clock_page_gpa`]. Scratch is mapped contiguously, with
+/// `scratch_base_gva` backed by `scratch_base_gpa`, so the clock page sits
+/// at the same offset from the top of the guest virtual address space.
+pub const fn clock_page_gva() -> u64 {
+    (MAX_GVA as u64) + 1 - SCRATCH_TOP_EXN_STACK_OFFSET
+}
+
 /// Compute the minimum scratch region size needed for a sandbox.
 pub use arch::min_scratch_size;
@@ -48,6 +48,10 @@ pub mod func;
 // cbindgen:ignore
 pub mod vmem;
 
+/// Paravirtualized clock structures shared between host and guest.
+/// cbindgen:ignore
+pub mod time;
+
 /// ELF note types for embedding hyperlight version metadata in guest binaries.
 pub mod version_note;