diff --git a/.github/workflows/dep_build_test.yml b/.github/workflows/dep_build_test.yml index 32ac2a306..8b5c4651e 100644 --- a/.github/workflows/dep_build_test.yml +++ b/.github/workflows/dep_build_test.yml @@ -114,6 +114,11 @@ jobs: # with hw-interrupts feature enabled (+ explicit driver on Linux) just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts' || 'kvm,hw-interrupts') || 'hw-interrupts' }} + - name: Run Rust tests with enable_guest_clock + run: | + # with enable_guest_clock + hw-interrupts (+ explicit driver on Linux) + just test ${{ inputs.config }} ${{ runner.os == 'Linux' && (inputs.hypervisor == 'mshv3' && 'mshv3,hw-interrupts,enable_guest_clock' || 'kvm,hw-interrupts,enable_guest_clock') || 'hw-interrupts,enable_guest_clock' }} + - name: Run Rust Gdb tests env: RUST_LOG: debug diff --git a/Justfile b/Justfile index 90b9ba61e..173e18f55 100644 --- a/Justfile +++ b/Justfile @@ -91,6 +91,9 @@ test-like-ci config=default-target hypervisor="kvm": @# with hw-interrupts enabled (+ explicit driver on Linux) {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts" } else { "just test " + config + " kvm,hw-interrupts" } } else { "just test " + config + " hw-interrupts" } }} + @# with enable_guest_clock (+ explicit driver + hw-interrupts on Linux) + {{ if os() == "linux" { if hypervisor == "mshv3" { "just test " + config + " mshv3,hw-interrupts,enable_guest_clock" } else { "just test " + config + " kvm,hw-interrupts,enable_guest_clock" } } else { "just test " + config + " hw-interrupts,enable_guest_clock" } }} + @# make sure certain cargo features compile just check diff --git a/docs/README.md b/docs/README.md index 3b36c5db1..a4f86d836 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,6 +29,7 @@ This project is composed internally of several components, depicted in the below * [How to build a Hyperlight guest 
binary](./how-to-build-a-hyperlight-guest-binary.md) * [Security considerations](./security.md) * [Technical requirements document](./technical-requirements-document.md) +* [Paravirtualized guest clock](./guest-time.md) ## For developers diff --git a/docs/guest-time.md b/docs/guest-time.md new file mode 100644 index 000000000..3828da584 --- /dev/null +++ b/docs/guest-time.md @@ -0,0 +1,135 @@ +# Paravirtualized Guest Clock + +Hyperlight's `enable_guest_clock` Cargo feature gives guests a cheap way to ask +"what time is it?" without taking a VM exit. When the host is built with the +feature, every sandbox exposes a paravirtualized clock that the guest can read +using ordinary memory loads. + +## What the guest gets + +When the feature is enabled the host populates a single 4 KiB "clock page" +inside the sandbox's scratch region. The guest reads two pieces of +information: + +- **A hypervisor-specific calibration block at offset `0x00` of the clock page.** Written by + KVM (`kvm_clock`) or Hyper-V / MSHV (Reference TSC). Contains the TSC + frequency, scaling constants, and a sequence lock the guest uses to read it + atomically. The entire clock page is hypervisor-owned; Hyperlight does not + write to it. +- **Hyperlight metadata in the scratch bookkeeping page** (separate from the + clock page): a `u64` [`ClockType`](../src/hyperlight_common/src/time.rs) tag + and `boot_time_ns`, the Unix-epoch origin of the monotonic clock computed + by the host as `wall_now - monotonic_now` (see below). These live at fixed + offsets from the top of scratch (`-0x28` and `-0x30`), NOT in the clock + page, so a future TLFS extension cannot clobber them. + +With those two pieces the guest can compute: + +- **Monotonic nanoseconds since boot** — read the TSC, apply the scaling + factors from the calibration block, giving you a `CLOCK_MONOTONIC` + equivalent.
+- **Wall-clock nanoseconds since the Unix epoch** — add `boot_time_ns` to the + monotonic value above, giving you a `CLOCK_REALTIME` / `gettimeofday` equivalent. `boot_time_ns` is computed by the host as + `SystemTime::now() - KVM_GET_CLOCK` (on KVM) or + `SystemTime::now() - TIME_REF_COUNT` (on Hyper-V) after sandbox + initialisation. Hyper-V has no equivalent to KVM's + `MSR_KVM_WALL_CLOCK_NEW`, so we use this uniform host-computed approach + on all backends. + +> **Note (KVM only):** Wall-clock time returns `None` during +> `hyperlight_main` (guest init). On KVM, `KVM_GET_CLOCK` is unreliable +> until the "master clock" is established at first vCPU entry, so +> `boot_time_ns` is stamped after init completes. Monotonic time works +> fine during init. Wall-clock time becomes available on the first +> dispatch call. + +Both reads are lock-free (well, seqlock-protected for the calibration block) +and never leave the guest. + +## Using it in a Rust guest + +The guest-side API lives in `hyperlight_guest::time` for the low-level +readers and `hyperlight_guest_bin::time` for a `std::time`-flavoured +wrapper: + +```rust +// Low-level, no_std readers. +use hyperlight_guest::time; + +if time::is_available() { + let mono_ns: u64 = time::monotonic_time_ns().unwrap(); + let wall_ns: Option<u64> = time::wall_clock_time_ns(); // `None` during guest init on KVM +} + +// std::time-flavoured wrapper (hyperlight_guest_bin only). +use hyperlight_guest_bin::time::{Instant, SystemTime, UNIX_EPOCH}; + +let t0 = Instant::now()?; +// ... do work ... +let elapsed = t0.elapsed()?; + +let now = SystemTime::now()?; +let unix_ns = now.duration_since(UNIX_EPOCH)?.as_nanos(); +``` + +C guests that use picolibc get paravirt time for free: `hyperlight_guest_bin` +wires `clock_gettime(CLOCK_MONOTONIC|CLOCK_REALTIME)` and `gettimeofday` into +the same reader, so existing C code continues to work unchanged.
+ +## Snapshot / restore semantics + +Both `boot_time_ns` and the hypervisor calibration block live inside scratch +memory, which is not included in snapshots. On every +`MultiUseSandbox::restore`, the host re-arms the clock page: it re-installs +the pvclock MSR / Hyper-V register against the fresh vCPU state and stamps a +new `boot_time_ns` captured at the moment of restore. As a result a restored +guest observes wall-clock time reflecting the restore moment, not the +original boot — which is what wall clocks are supposed to do. + +## Enabling the feature + +Turn it on in the host's `Cargo.toml`: + +```toml +[dependencies] +hyperlight-host = { version = "...", features = ["enable_guest_clock"] } +``` + +The feature is x86_64 only; on aarch64 it has no effect. It is off by default +so existing sandboxes don't pay for a facility they don't use. When off, the +clock page is still reserved in the layout (so memory maps are stable) but +left un-mapped against any hypervisor clock source; `hyperlight_guest::time` +readers then report "unavailable" and fall back to whatever the guest wants +to do about it (the picolibc wiring returns a synthetic 1-second-per-call +counter, which is enough to stop `strftime` crashing and not much else). + +## Layout details + +The clock page sits 3 pages below the very top of the scratch region: + +| Offset from top | Size | Contents | +|-----------------|-------|------------------------------------------------| +| `-0x1000` | 4 KiB | Bookkeeping (size, allocator counter, ...) | +| `-0x2000` | 4 KiB | Reserved for shared-state counter | +| `-0x3000` | 4 KiB | Paravirtualized clock page | + +Because the clock page is at the top of scratch, both the guest's main stack +and its IST1 (exception) stack are configured to start one page below the +clock page (at `MAX_GVA + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET`) so stack writes +— including page-fault handlers running on IST1 — cannot clobber the trailer. 
+The allocator reserves the top three pages unconditionally so the memory map +stays identical whether or not the feature is enabled. + +## Non-goals + +- **Sub-microsecond accuracy.** `boot_time_ns` is computed from two + back-to-back host reads (`SystemTime::now()` and `KVM_GET_CLOCK` / + `TIME_REF_COUNT`). On KVM, residual disagreement between `KVM_GET_CLOCK` + and the pvclock page can add up to ~13ms of constant offset (observed on + WSL2; root cause uncertain). On Hyper-V the offset should be negligible. +- **`CLOCK_PROCESS_CPUTIME_ID` and friends.** The clock page exposes only + monotonic and wall-clock time; per-thread / per-process CPU time is out of + scope. +- **Timers or sleeps.** The guest can read the clock but has no way to ask + the hypervisor to wake it up later — that is still done through the + existing guest-function call model. diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 1a7ca0880..bb7b57648 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -38,7 +38,31 @@ pub const SCRATCH_TOP_SIZE_OFFSET: u64 = 0x08; pub const SCRATCH_TOP_ALLOCATOR_OFFSET: u64 = 0x10; pub const SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET: u64 = 0x18; pub const SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET: u64 = 0x20; -pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; + +/// Offset from the top of scratch for the `clock_type` field (u64). +/// +/// Identifies which paravirtualized clock the host configured +/// ([`crate::time::ClockType`]). Lives in the bookkeeping page at the +/// top of scratch — NOT in the clock page itself — so the hypervisor +/// cannot clobber it if it extends the TLFS-reserved region. +pub const SCRATCH_TOP_CLOCK_TYPE_OFFSET: u64 = 0x28; + +/// Offset from the top of scratch for the `boot_time_ns` field (u64). +/// +/// The Unix-epoch origin of the monotonic clock, computed by the host +/// as `SystemTime::now() - current_monotonic_ns()` and written in +/// `arm_clock`. 
The guest recovers wall time as +/// `boot_time_ns + monotonic_time_ns()`. +/// +/// Hyper-V has no equivalent to KVM's `MSR_KVM_WALL_CLOCK_NEW`, so +/// we use this uniform host-computed approach on all backends. +pub const SCRATCH_TOP_BOOT_TIME_NS_OFFSET: u64 = 0x30; + +// ---- Next free offset in the bookkeeping page: 0x38 ---- +// When adding new host→guest shared fields, use the next multiple of +// 8 after the last offset above. All fields in this page are u64, +// little-endian, host-written and guest-read, and are excluded from +// snapshots because they live in scratch memory. /// Offset from the top of scratch memory for a shared host-guest u64 counter. /// @@ -49,6 +73,33 @@ pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; #[cfg(feature = "guest-counter")] pub const SCRATCH_TOP_GUEST_COUNTER_OFFSET: u64 = 0x1008; +/// Offset from the top of scratch memory for the start of the paravirtualized +/// clock page. +/// +/// The clock page is a single 4 KiB page occupying the scratch offsets +/// `[0x3000, 0x2000)` from the top — i.e. one page lower than the +/// guest-counter page, to avoid the i686 frame-number issue that forces the +/// counter off the very last page (see [`SCRATCH_TOP_GUEST_COUNTER_OFFSET`]). +/// +/// The constant is the distance from the top of scratch down to the page +/// *base* (the page's lowest address): subtract this value from +/// `MAX_GPA + 1` / `MAX_GVA + 1` to get the page base. +/// +/// The page is always reserved regardless of the `enable_guest_clock` +/// feature so that the memory layout (and therefore stack positions) +/// is stable across feature-flag builds. The host only populates it +/// when the feature is enabled; otherwise it stays zero-filled and +/// the guest sees `ClockType::None`. +pub const SCRATCH_TOP_CLOCK_PAGE_OFFSET: u64 = 0x3000; + +/// Size of the paravirtualized clock page in bytes (one 4 KiB page).
+/// The entire page is owned by the hypervisor (KVM pvclock or Hyper-V +/// Reference TSC). Hyperlight's own metadata (`clock_type`, +/// `boot_time_ns`) lives in the bookkeeping page at offsets +/// `SCRATCH_TOP_CLOCK_TYPE_OFFSET` / `SCRATCH_TOP_BOOT_TIME_NS_OFFSET`, +/// NOT in the clock page, so a future TLFS extension cannot clobber it. +pub const CLOCK_PAGE_SIZE: u64 = 0x1000; + pub fn scratch_base_gpa(size: usize) -> u64 { (MAX_GPA - size + 1) as u64 } @@ -56,5 +107,26 @@ pub fn scratch_base_gva(size: usize) -> u64 { (MAX_GVA - size + 1) as u64 } +/// Guest physical address of the base of the paravirtualized clock page. +/// +/// The clock page sits at a fixed offset from the top of the guest physical +/// address space, independent of `scratch_size`: it is always +/// `MAX_GPA + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET`. +/// +/// Only meaningful when the host is built with the `enable_guest_clock` +/// feature; otherwise the page is not populated. +pub const fn clock_page_gpa() -> u64 { + (MAX_GPA as u64) + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET +} + +/// Guest virtual address of the base of the paravirtualized clock page. +/// +/// See [`clock_page_gpa`]. Scratch is mapped identity-style from +/// `scratch_base_gva` to `scratch_base_gpa`, so the clock page sits at the +/// equivalent offset in the guest virtual address space. +pub const fn clock_page_gva() -> u64 { + (MAX_GVA as u64) + 1 - SCRATCH_TOP_CLOCK_PAGE_OFFSET +} + /// Compute the minimum scratch region size needed for a sandbox. pub use arch::min_scratch_size; diff --git a/src/hyperlight_common/src/lib.rs b/src/hyperlight_common/src/lib.rs index eb4be220c..0bbb70dba 100644 --- a/src/hyperlight_common/src/lib.rs +++ b/src/hyperlight_common/src/lib.rs @@ -48,5 +48,9 @@ pub mod func; // cbindgen:ignore pub mod vmem; +/// Paravirtualized clock structures shared between host and guest. +/// cbindgen:ignore +pub mod time; + /// ELF note types for embedding hyperlight version metadata in guest binaries. 
pub mod version_note; diff --git a/src/hyperlight_common/src/time.rs b/src/hyperlight_common/src/time.rs new file mode 100644 index 000000000..48cd0e660 --- /dev/null +++ b/src/hyperlight_common/src/time.rs @@ -0,0 +1,165 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Paravirtualized clock structures shared between host and guest. +//! +//! Guests can read time without a VM exit by consulting a shared memory page +//! that the hypervisor updates. The page is placed in the sandbox's scratch +//! region (see [`crate::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET`]), so it is +//! not included in sandbox snapshots. +//! +//! # Layout +//! +//! The clock page (4 KiB) is 100% hypervisor-owned: +//! +//! ```text +//! clock page (offset -0x3000 from scratch top): +//! 0x0000 .. : hypervisor calibration data +//! - KVM: KvmPvclockVcpuTimeInfo (32 bytes) +//! - Hyper-V: HvReferenceTscPage (4096 bytes) +//! ``` +//! +//! Hyperlight's own metadata lives in the bookkeeping page at the top +//! of scratch (separate from the clock page), so a future TLFS +//! extension of the reserved region cannot clobber it: +//! +//! ```text +//! bookkeeping page (top of scratch, offset -0x08..-0x30): +//! -0x28 : clock_type (u64, ClockType discriminant) +//! -0x30 : boot_time_ns (u64, Unix-epoch origin of monotonic clock) +//! ``` + +/// KVM pvclock flag: TSC is stable and synchronized across vCPUs. 
+/// +/// When this bit is set in [`KvmPvclockVcpuTimeInfo::flags`], the TSC is +/// guaranteed to be monotonic and synchronized across all vCPUs, even when +/// migrating between physical CPUs on the same host. +/// +/// Reference: Linux kernel `arch/x86/include/asm/pvclock-abi.h`. +pub const PVCLOCK_TSC_STABLE_BIT: u8 = 1 << 0; + +/// KVM pvclock structure (defined by KVM ABI). +/// +/// The host writes to this structure, and the guest reads it to compute the +/// current time in nanoseconds. +/// +/// Reference: Linux kernel `arch/x86/include/asm/pvclock-abi.h`. +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct KvmPvclockVcpuTimeInfo { + /// Version counter — odd means update in progress. Guest must re-read + /// if this changes during read. + pub version: u32, + pub pad0: u32, + /// TSC value when `system_time` was captured. + pub tsc_timestamp: u64, + /// System time in nanoseconds at `tsc_timestamp`. + pub system_time: u64, + /// Multiplier for TSC → nanoseconds conversion. + pub tsc_to_system_mul: u32, + /// Shift for TSC → nanoseconds conversion (can be negative). + pub tsc_shift: i8, + /// Flags (e.g. [`PVCLOCK_TSC_STABLE_BIT`]). + pub flags: u8, + pub pad: [u8; 2], +} + +/// Hyper-V Reference TSC page structure (defined by Hyper-V ABI). +/// +/// Used by both MSHV (Linux) and WHP (Windows). Time is in 100-nanosecond +/// intervals. +/// +/// Reference: Hyper-V TLFS §12.7. +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct HvReferenceTscPage { + /// Sequence counter. A value of 0 means the host is directing the guest + /// to fall back to an MSR read; the guest must also re-read if this + /// changes during a read. + pub tsc_sequence: u32, + pub reserved1: u32, + /// Scale factor for TSC → time conversion. + /// Formula: `time = ((tsc * tsc_scale) >> 64) + tsc_offset` (in 100 ns). + pub tsc_scale: u64, + /// Offset to add after scaling (in 100 ns units). + pub tsc_offset: i64, + /// Rest of the 4 KiB page is reserved by the TLFS.
+ pub reserved2: [u64; 509], +} + +/// Type of paravirtualized clock configured for the guest. +/// +/// This is the value written by the host at +/// [`crate::layout::SCRATCH_TOP_CLOCK_TYPE_OFFSET`] in the scratch +/// bookkeeping page. +/// The guest treats any value other than the two supported variants as +/// [`ClockType::None`] — this means a misbehaving host that writes garbage +/// to the bookkeeping page simply ends up advertising "no clock", rather than +/// causing the guest to misinterpret the calibration header. +#[repr(u64)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClockType { + /// No clock configured — time functions return `None` / zero. + None = 0, + /// KVM pvclock (Linux KVM hypervisor). + KvmPvclock = 1, + /// Hyper-V Reference TSC (MSHV on Linux, WHP on Windows). + HyperVReferenceTsc = 2, +} + +impl From<u64> for ClockType { + fn from(value: u64) -> Self { + match value { + 1 => ClockType::KvmPvclock, + 2 => ClockType::HyperVReferenceTsc, + _ => ClockType::None, + } + } +} + +impl From<ClockType> for u64 { + fn from(value: ClockType) -> Self { + value as u64 + } +} + +// Compile-time size invariants. These layouts are dictated by the hypervisor +// ABI (KVM pvclock, Hyper-V TLFS §12.7) — a size mismatch is a programming +// error that must surface at build time. +const _: () = { + assert!(core::mem::size_of::<KvmPvclockVcpuTimeInfo>() == 32); + assert!(core::mem::size_of::<HvReferenceTscPage>() == 4096); +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn clock_type_conversion_round_trips() { + assert_eq!(ClockType::from(0u64), ClockType::None); + assert_eq!(ClockType::from(1u64), ClockType::KvmPvclock); + assert_eq!(ClockType::from(2u64), ClockType::HyperVReferenceTsc); + } + + #[test] + fn clock_type_conversion_unknown_is_none() { + // A host that writes an unrecognised value must be treated as + // "clock unavailable", not as an opportunity to misinterpret.
+ assert_eq!(ClockType::from(3u64), ClockType::None); + assert_eq!(ClockType::from(u64::MAX), ClockType::None); + } +} diff --git a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs index cfaad9a0b..9cb26293e 100644 --- a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs +++ b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs @@ -31,9 +31,14 @@ pub unsafe fn alloc_phys_pages(n: u64) -> u64 { x = inout(reg) x ); } - // Set aside two pages at the top of the scratch region for the - // exception stack, shared state, etc - let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 2; + // Set aside three pages at the top of the scratch region: + // - top page: size/allocator/snapshot-PT/exn-stack bookkeeping + // - next page down: the reserved guest-counter / shared-state page + // - third page down: the paravirtualized guest clock page + // The clock page is always reserved even when the host is built + // without the `enable_guest_clock` feature, so that the physical + // allocator's footprint is independent of host-side features. + let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 3; if x.checked_add(nbytes) .is_none_or(|xx| xx >= max_avail as u64) { diff --git a/src/hyperlight_guest/src/lib.rs b/src/hyperlight_guest/src/lib.rs index 19e5ac5f2..a3811fa05 100644 --- a/src/hyperlight_guest/src/lib.rs +++ b/src/hyperlight_guest/src/lib.rs @@ -25,6 +25,7 @@ pub mod error; pub mod exit; pub mod layout; pub mod prim_alloc; +pub mod time; pub mod types; pub mod guest_handle { diff --git a/src/hyperlight_guest/src/time.rs b/src/hyperlight_guest/src/time.rs new file mode 100644 index 000000000..6a261eb96 --- /dev/null +++ b/src/hyperlight_guest/src/time.rs @@ -0,0 +1,404 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Low-level guest time functions using the paravirtualized clock. +//! +//! This module provides low-level functions to read time without VM exits by +//! consulting the shared clock page populated by the host. The page lives at +//! a fixed, compile-time-known guest-virtual address inside the scratch +//! region (see [`hyperlight_common::layout::clock_page_gva`]), so no +//! per-sandbox discovery data — such as a PEB field — is required. +//! +//! # For most users +//! +//! Use [`hyperlight_guest_bin::time`] instead, which provides a +//! `std::time`-compatible API (`SystemTime`, `Instant`) built on top of the +//! free functions here. +//! +//! # Supported clock sources +//! +//! - **KVM pvclock** — used when running under KVM. +//! - **Hyper-V Reference TSC** — used when running under MSHV or WHP. +//! +//! Which one is active is decided by the host and advertised by the +//! `clock_type` field in the scratch bookkeeping page. When the host is built +//! without the `enable_guest_clock` feature the field reads back as +//! [`ClockType::None`] and every function in this module returns `None`. +//! +//! # Concurrency invariant (current) +//! +//! In the current Hyperlight execution model the guest vCPU runs only +//! while the host thread is blocked inside the vCPU run call: the host +//! writes the clock page **before** entering the guest and cannot mutate +//! it while the guest reads. There is therefore no concurrent writer in +//! practice and the seqlock retry, the acquire fences, and the per-field +//! 
`read_volatile`s will never actually fire at runtime today. +//! +//! These primitives are kept anyway because: (1) they future-proof +//! against multi-vCPU sandboxes, async host-side clock updates, or +//! live migration; and (2) by never creating a `&T` over +//! hypervisor-mutable memory we satisfy Rust's aliasing rules +//! unconditionally. + +use core::sync::atomic::{AtomicU64, Ordering, fence}; + +use hyperlight_common::layout::{ + SCRATCH_TOP_BOOT_TIME_NS_OFFSET, SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gva, +}; +use hyperlight_common::time::{ + ClockType, HvReferenceTscPage, KvmPvclockVcpuTimeInfo, PVCLOCK_TSC_STABLE_BIT, +}; + +/// The guest-virtual address of the top of scratch memory. The +/// bookkeeping fields (`clock_type`, `boot_time_ns`, etc.) are stored +/// as negative offsets from this address. +const SCRATCH_TOP_GVA: u64 = hyperlight_common::layout::MAX_GVA as u64 + 1; + +/// Error type for clock validation failures. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ClockValidationError { + /// Clock is not configured. Either the host was built without the + /// `enable_guest_clock` feature, or the bookkeeping page contains an unknown + /// discriminant that we treat as "unavailable" out of caution. + NotConfigured, + /// KVM pvclock does not have `PVCLOCK_TSC_STABLE_BIT` set. This + /// indicates the TSC is not stable across vCPUs on this host. + KvmTscNotStable, + /// Hyper-V Reference TSC page has `tsc_sequence == 0`, which in the + /// TLFS is the host's "fall back to MSR" sentinel. MSR reads require a + /// VM exit which is not available from a Hyperlight guest, so this is + /// reported as an error rather than retried. + HyperVTscSequenceZero, +} + +/// Read the `clock_type` field from the scratch bookkeeping page. +#[inline] +fn read_clock_type() -> ClockType { + // SAFETY: the bookkeeping page at the top of scratch is always mapped + // RW; reads of any 8-byte aligned u64 inside it are well-defined. 
+ // Zero-initialised memory decodes to `ClockType::None`. + let ptr = (SCRATCH_TOP_GVA - SCRATCH_TOP_CLOCK_TYPE_OFFSET) as *const u64; + let raw = unsafe { core::ptr::read_volatile(ptr) }; + ClockType::from(raw) +} + +/// Read the `boot_time_ns` field from the scratch bookkeeping page. +#[inline] +fn read_boot_time_ns() -> u64 { + // SAFETY: see `read_clock_type`. + let ptr = (SCRATCH_TOP_GVA - SCRATCH_TOP_BOOT_TIME_NS_OFFSET) as *const u64; + unsafe { core::ptr::read_volatile(ptr) } +} + +/// Returns `true` when the host has armed a paravirtualized clock for this +/// sandbox. Cheap - just a single read of the bookkeeping field. +#[inline] +pub fn is_available() -> bool { + !matches!(read_clock_type(), ClockType::None) +} + +/// Validate that the paravirtualized clock is properly configured and stable. +/// +/// This is an optional defense-in-depth check a guest can make once during +/// initialisation. The host should have already verified invariant TSC +/// support when enabling the feature; this catches accidental +/// misconfiguration. +pub fn validate_clock() -> Result<(), ClockValidationError> { + match read_clock_type() { + ClockType::KvmPvclock => { + // SAFETY: the clock page is mapped read/write into the guest's + // scratch region for the lifetime of the sandbox, and a + // `KvmPvclockVcpuTimeInfo` (32 bytes) fits at offset 0. We use + // raw-pointer `read_volatile` instead of materialising a + // `&KvmPvclockVcpuTimeInfo` so the reader stays sound under + // Rust's aliasing rules even if a future Hyperlight execution + // model lets the host mutate this page concurrently with the + // guest. See module-level "Concurrency invariant" note. + let ptr = clock_page_gva() as *const KvmPvclockVcpuTimeInfo; + let flags = unsafe { core::ptr::read_volatile(&raw const (*ptr).flags) }; + if (flags & PVCLOCK_TSC_STABLE_BIT) == 0 { + return Err(ClockValidationError::KvmTscNotStable); + } + Ok(()) + } + ClockType::HyperVReferenceTsc => { + // SAFETY: as above. 
`HvReferenceTscPage` fills the full 4 KiB + // page; we only read the `tsc_sequence` header field here. + let ptr = clock_page_gva() as *const HvReferenceTscPage; + let seq = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq == 0 { + return Err(ClockValidationError::HyperVTscSequenceZero); + } + Ok(()) + } + ClockType::None => Err(ClockValidationError::NotConfigured), + } +} + +/// Read the CPU's Time Stamp Counter. +#[inline] +fn rdtsc() -> u64 { + #[cfg(target_arch = "x86_64")] + { + // SAFETY: RDTSC is unprivileged on x86_64 and always present on + // CPUs that support the paravirtualized clock (host-verified + // invariant TSC). + unsafe { core::arch::x86_64::_rdtsc() } + } + #[cfg(not(target_arch = "x86_64"))] + { + 0 // TSC not available on non-x86_64 architectures. + } +} + +/// Maximum number of retries when the hypervisor is concurrently updating +/// the paravirtualized clock page. +/// +/// Both the KVM pvclock and Hyper-V Reference TSC protocols use a +/// seqlock-style mechanism: the hypervisor bumps a sequence/version counter +/// before and after mutating the page, and readers must retry if they +/// observe an in-progress or changed counter. Mutations are extremely +/// short, so a small retry cap is plenty; the hypervisor's design assumes +/// the client spin-retries rather than falling back to an MSR (which would +/// force a VM exit and defeat the whole point of the paravirtualized +/// clock). +const CLOCK_SEQLOCK_MAX_RETRIES: u32 = 100; + +/// Read time from the KVM pvclock structure. +/// +/// Uses the seqlock-style protocol described in +/// the Linux kernel's `Documentation/virt/kvm/x86/msr.rst`: the host sets +/// `version` to an odd value before mutating and to a new even value +/// afterwards; readers retry while `version` is odd or changes across the +/// read. We cap retries with [`CLOCK_SEQLOCK_MAX_RETRIES`] so that a +/// pathologically churning host can't make us spin forever.
+fn read_kvm_pvclock() -> Option<u64> { + // SAFETY: see `validate_clock` for the mapping invariant. Today the + // host cannot mutate this page while the guest is running (single + // vCPU, host-then-guest scheduling), so the seqlock loop and the + // volatile loads are not strictly required for correctness right now. + // We keep the upstream pvclock contract verbatim so that: + // (a) the reader is sound under Rust's aliasing rules regardless of + // what the host is doing — no `&T` is ever taken over this + // memory; and + // (b) no behavioural change is needed when Hyperlight gains + // multi-vCPU sandboxes or async host-side clock updates. + let ptr = clock_page_gva() as *const KvmPvclockVcpuTimeInfo; + + for _ in 0..CLOCK_SEQLOCK_MAX_RETRIES { + let version1 = unsafe { core::ptr::read_volatile(&raw const (*ptr).version) }; + if version1 & 1 != 0 { + core::hint::spin_loop(); + continue; // Update in progress. + } + + // Pair with the hypervisor's write barrier between the version bump + // and the payload write. On x86_64 an Acquire fence is free (no + // instruction emitted), but we keep it for correctness under the + // memory model. + fence(Ordering::Acquire); + + let tsc_timestamp = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_timestamp) }; + let system_time = unsafe { core::ptr::read_volatile(&raw const (*ptr).system_time) }; + let tsc_to_system_mul = + unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_to_system_mul) }; + let tsc_shift = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_shift) }; + + fence(Ordering::Acquire); + + let version2 = unsafe { core::ptr::read_volatile(&raw const (*ptr).version) }; + if version1 != version2 { + core::hint::spin_loop(); + continue; // Data changed mid-read. + } + + let tsc_now = rdtsc(); + let tsc_delta = tsc_now.wrapping_sub(tsc_timestamp); + + // KVM pvclock scaler, per + // the Linux kernel's `Documentation/virt/kvm/x86/msr.rst`: + // `ns = (tsc_delta * tsc_to_system_mul) >> (32 - tsc_shift)`.
+ // We clamp the right-shift count to `[0, 63]` so a + // buggy host cannot induce UB / panic via an out-of-range shift; + // values outside the documented `tsc_shift ∈ [-31, 31]` band + // produce non-meaningful timings, but the reader stays sound. + let raw_shift = 32i32 - tsc_shift as i32; + let shift = raw_shift.clamp(0, 63) as u32; + let ns_delta = ((tsc_delta as u128 * tsc_to_system_mul as u128) >> shift) as u64; + + return Some(system_time.wrapping_add(ns_delta)); + } + + None +} + +/// Read time from the Hyper-V Reference TSC page. +/// +/// Uses the seqlock-style protocol described in TLFS §12.7. A sequence of +/// 0 is a persistent "fall back to MSR" signal from the host; we return +/// `None` without retrying because MSR reads require a VM exit that is +/// unavailable inside a Hyperlight guest. +fn read_hv_reference_tsc() -> Option<u64> { + // SAFETY: see `read_kvm_pvclock` for the aliasing / volatile rationale. + let ptr = clock_page_gva() as *const HvReferenceTscPage; + + for _ in 0..CLOCK_SEQLOCK_MAX_RETRIES { + let seq1 = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq1 == 0 { + return None; // Persistent MSR-fallback sentinel. + } + + fence(Ordering::Acquire); + + let tsc_scale = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_scale) }; + let tsc_offset = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_offset) }; + + fence(Ordering::Acquire); + + let seq2 = unsafe { core::ptr::read_volatile(&raw const (*ptr).tsc_sequence) }; + if seq1 != seq2 { + core::hint::spin_loop(); + continue; // Host updated the page mid-read. + } + + let tsc_now = rdtsc(); + + // Hyper-V Reference TSC formula (TLFS §12.7): + // `time_100ns = ((tsc * scale) >> 64) + offset` + // The high 64 bits of a 128-bit multiply give the scaled value.
+ // We use `checked_add_signed` on the offset addition: an overflow + // here would mean the host's `tsc_offset` is so far out of band + // that `time_100ns` cannot be represented, which we treat as + // "clock unavailable" rather than retrying — the offset is + // host-written and stable, so retrying cannot rescue it. + let scaled = ((tsc_now as u128 * tsc_scale as u128) >> 64) as u64; + let time_100ns = scaled.checked_add_signed(tsc_offset)?; + + return time_100ns.checked_mul(100); + } + + None +} + +/// Highest raw pvclock value ever returned. Lives in BSS so it +/// survives snapshot/restore. Used to detect backward jumps when a +/// snapshot is restored into a new partition whose monotonic clock +/// starts from a lower value. +static RAW_HIGH_WATER: AtomicU64 = AtomicU64::new(0); + +/// Cumulative offset added to raw pvclock reads to maintain the +/// monotonic guarantee across cross-partition restores. On each +/// backward jump, the previous high-water mark is added so that all +/// future returns are >= any previously returned value. +static MONO_OFFSET: AtomicU64 = AtomicU64::new(0); + +/// Read the raw monotonic value from the hypervisor without any +/// offset adjustment. +fn raw_monotonic_ns() -> Option { + match read_clock_type() { + ClockType::KvmPvclock => read_kvm_pvclock(), + ClockType::HyperVReferenceTsc => read_hv_reference_tsc(), + ClockType::None => None, + } +} + +/// Monotonic time in nanoseconds. +/// +/// The value is an absolute counter derived from the hypervisor's time +/// base (kvmclock on KVM, partition reference time on Hyper-V). It is +/// monotonically increasing and suitable for measuring elapsed time +/// between two reads. +/// +/// If a snapshot is restored into a **new** partition whose raw clock +/// starts from a lower value, an offset is applied so the returned +/// value never goes backward. Within a single partition epoch, diffs +/// between consecutive reads reflect real elapsed time. 
Across a +/// cross-partition restore the diff includes a synthetic gap (the +/// high-water mark from the old partition) — safe for timeouts and +/// deadlines, but not an accurate measure of freeze duration (use +/// wall-clock time for that). +/// +/// Returns `None` if the clock is not configured, or if the retry cap +/// was exhausted (the caller may retry). +pub fn monotonic_time_ns() -> Option { + let raw = raw_monotonic_ns()?; + + let high = RAW_HIGH_WATER.load(Ordering::Relaxed); + if raw < high { + // Raw clock went backward — snapshot was restored into a new + // partition. Bump the offset by the old high-water mark so all + // future reads are >= any previously returned value. + MONO_OFFSET.fetch_add(high, Ordering::Relaxed); + RAW_HIGH_WATER.store(raw, Ordering::Relaxed); + } else { + RAW_HIGH_WATER.store(raw, Ordering::Relaxed); + } + + Some(raw.wrapping_add(MONO_OFFSET.load(Ordering::Relaxed))) +} + +/// Wall-clock time in nanoseconds since the Unix epoch. +/// +/// Returns `None` if: +/// - The clock is not configured (`clock_type == None`). +/// - `boot_time_ns` has not been stamped yet (it is zero before +/// `arm_clock` runs). On some backends the host's monotonic clock +/// source is unreliable until after the first vCPU run, so +/// wall clock is unavailable during `hyperlight_main` (init). +/// Monotonic time works fine during init. Wall clock becomes +/// available on the first dispatch call. +/// - The underlying monotonic read fails. +/// +/// The host computes `boot_time_ns` as the Unix-epoch origin of the +/// monotonic clock (`wall_now - monotonic_now`, sampled back-to-back +/// in `arm_clock`) and stamps it into the scratch bookkeeping page. The +/// guest simply adds its live monotonic reading to recover wall time. +/// +/// This host-side computation is necessary because Hyper-V has no +/// guest-accessible wall-clock register (unlike KVM's +/// `MSR_KVM_WALL_CLOCK_NEW`). 
We use the same host-computed approach +/// on all backends for uniformity. +pub fn wall_clock_time_ns() -> Option { + // Use the raw monotonic value (no cross-partition offset) because + // boot_time_ns is calibrated by the host against the raw clock. + // Applying the monotonic offset here would shift wall time into + // the future after a cross-partition restore. + let monotonic = raw_monotonic_ns()?; + let boot_time = read_boot_time_ns(); + // boot_time_ns == 0 means the host hasn't stamped it yet + // (scratch memory is zero-initialised). Return None rather + // than returning a nonsense value. + if boot_time == 0 { + return None; + } + Some(boot_time.wrapping_add(monotonic)) +} + +/// Monotonic time in microseconds. +/// +/// See [`monotonic_time_ns`] for details on the time base. +pub fn monotonic_time_us() -> Option { + monotonic_time_ns().map(|ns| ns / 1_000) +} + +/// Wall-clock time as `(seconds, sub-second nanoseconds)` since the Unix +/// epoch. Shape matches a POSIX `timespec`. +pub fn wall_clock_time() -> Option<(u64, u32)> { + let ns = wall_clock_time_ns()?; + let secs = ns / 1_000_000_000; + let nsecs = (ns % 1_000_000_000) as u32; + Some((secs, nsecs)) +} diff --git a/src/hyperlight_guest_bin/src/arch/amd64/init.rs b/src/hyperlight_guest_bin/src/arch/amd64/init.rs index 073bd3a2f..8fc5d9911 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/init.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/init.rs @@ -92,8 +92,12 @@ unsafe fn init_tss(pc: *mut ProcCtrl) { let tss_ptr = &raw mut (*pc).tss; // copy byte by byte to avoid alignment issues let ist1_ptr = &raw mut (*tss_ptr).ist1 as *mut [u8; 8]; + // The exception stack (IST1) grows downward. Place it below + // the reserved clock page so page-fault / COW handlers never + // clobber the hypervisor-owned clock page or the bookkeeping + // data at the top of scratch. 
let exn_stack = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; ist1_ptr.write_volatile(exn_stack.to_ne_bytes()); asm!( diff --git a/src/hyperlight_guest_bin/src/lib.rs b/src/hyperlight_guest_bin/src/lib.rs index 84e328892..f5fd827c2 100644 --- a/src/hyperlight_guest_bin/src/lib.rs +++ b/src/hyperlight_guest_bin/src/lib.rs @@ -53,6 +53,7 @@ pub mod host_comm; pub mod memory; #[cfg(target_arch = "x86_64")] pub mod paging; +pub mod time; /// Bridge between picolibc's POSIX expectations and the Hyperlight host. #[cfg(feature = "libc")] diff --git a/src/hyperlight_guest_bin/src/libc.rs b/src/hyperlight_guest_bin/src/libc.rs index dbb0d4cab..06089d8ae 100644 --- a/src/hyperlight_guest_bin/src/libc.rs +++ b/src/hyperlight_guest_bin/src/libc.rs @@ -20,6 +20,7 @@ use core::ffi::*; use core::sync::atomic::{AtomicU64, Ordering}; use hyperlight_common::flatbuffer_wrappers::function_types::{ParameterValue, ReturnType}; +use hyperlight_guest::time; use crate::host_comm::call_host_function; @@ -58,13 +59,32 @@ pub(crate) struct Timeval { tv_usec: c_long, } -/// Returns a synthetic monotonically-increasing time starting at Unix epoch -/// increasing 1s each call. -fn current_time() -> (u64, u64) { +/// Fallback clock used when the host has not armed a paravirtualized +/// clock. Returns a synthetic `(secs, nsecs)` pair that advances by one +/// second per call, preserving long-standing guest behaviour for hosts +/// built without the `enable_guest_clock` feature. +fn fallback_time() -> (u64, u64) { let call_count = CURRENT_TIME.fetch_add(1, Ordering::Relaxed) + 1; (call_count, 0) } +/// Returns `(secs, nsecs)` for `CLOCK_REALTIME` (wall-clock). 
+fn realtime() -> (u64, u64) { + match time::wall_clock_time() { + Some((secs, nsecs)) => (secs, nsecs as u64), + None => fallback_time(), + } +} + +/// Returns `(secs, nsecs)` for `CLOCK_MONOTONIC` (time since sandbox +/// creation). +fn monotonic() -> (u64, u64) { + match time::monotonic_time_ns() { + Some(ns) => (ns / 1_000_000_000, ns % 1_000_000_000), + None => fallback_time(), + } +} + #[unsafe(no_mangle)] pub extern "C" fn read(fd: c_int, buf: *mut c_void, count: usize) -> isize { if buf.is_null() && count > 0 { @@ -115,8 +135,16 @@ pub extern "C" fn clock_gettime(clk_id: c_ulong, tp: *mut Timespec) -> c_int { } match clk_id { - CLOCK_REALTIME | CLOCK_MONOTONIC => { - let (secs, nanos) = current_time(); + CLOCK_REALTIME => { + let (secs, nanos) = realtime(); + unsafe { + (*tp).tv_sec = secs as c_long; + (*tp).tv_nsec = nanos as c_long; + } + 0 + } + CLOCK_MONOTONIC => { + let (secs, nanos) = monotonic(); unsafe { (*tp).tv_sec = secs as c_long; (*tp).tv_nsec = nanos as c_long; @@ -137,7 +165,7 @@ pub extern "C" fn gettimeofday(tv: *mut Timeval, _tz: *mut c_void) -> c_int { return -1; } - let (secs, nanos) = current_time(); + let (secs, nanos) = realtime(); unsafe { (*tv).tv_sec = secs as c_long; (*tv).tv_usec = (nanos / 1000) as c_long; diff --git a/src/hyperlight_guest_bin/src/time.rs b/src/hyperlight_guest_bin/src/time.rs new file mode 100644 index 000000000..ed9f5031d --- /dev/null +++ b/src/hyperlight_guest_bin/src/time.rs @@ -0,0 +1,222 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! A `std::time`-compatible API built on the paravirtualized guest clock. +//! +//! This module provides [`Instant`] and [`SystemTime`] types that mirror the +//! shape of `std::time::Instant` and `std::time::SystemTime`, so guest code +//! that ordinarily uses the standard library's time APIs can be ported with +//! minimal changes. +//! +//! # Clock source +//! +//! Both types read from the shared paravirtualized clock page armed by the +//! host. See [`hyperlight_guest::time`] for the low-level details. +//! +//! # Availability +//! +//! If the host was built without the `enable_guest_clock` feature, every +//! constructor in this module returns [`TimeError::Unavailable`]. A guest +//! that wants to gracefully degrade should probe [`is_available`] once at +//! start-up rather than relying on `Instant::now()` to fail later. +//! +//! # Example +//! +//! ```no_run +//! use hyperlight_guest_bin::time::{Instant, SystemTime, UNIX_EPOCH}; +//! +//! if let Ok(start) = Instant::now() { +//! do_some_work(); +//! if let Ok(elapsed) = start.elapsed() { +//! log::info!("work took {} us", elapsed.as_micros()); +//! } +//! } +//! +//! if let Ok(now) = SystemTime::now() +//! && let Ok(since_epoch) = now.duration_since(UNIX_EPOCH) +//! { +//! log::info!("wall-clock seconds since epoch: {}", since_epoch.as_secs()); +//! } +//! # fn do_some_work() {} +//! ``` + +use core::time::Duration; + +use hyperlight_guest::time; + +/// Errors returned by the time API. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TimeError { + /// The host did not arm a paravirtualized clock for this sandbox (the + /// host was built without the `enable_guest_clock` feature, or clock + /// setup failed). + Unavailable, + /// The seqlock retry cap was exhausted. The caller may simply retry. 
+ Retry, + /// `SystemTime::duration_since` was called with an argument that lies + /// in the future relative to `self`. + NegativeDuration(Duration), +} + +impl core::fmt::Display for TimeError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::Unavailable => f.write_str("guest clock is not available"), + Self::Retry => f.write_str("guest clock read retry cap exhausted"), + Self::NegativeDuration(_) => f.write_str("second time is later than self"), + } + } +} + +/// Returns `true` if the host has armed a paravirtualized clock. +#[inline] +pub fn is_available() -> bool { + time::is_available() +} + +/// Read monotonic nanoseconds, or convert a [`time`] read failure into a +/// [`TimeError`]. Applies the same `Unavailable` / `Retry` classification +/// that `SystemTime::now` uses for wall-clock reads. +#[inline] +fn read_monotonic_ns() -> Result { + if !time::is_available() { + return Err(TimeError::Unavailable); + } + time::monotonic_time_ns().ok_or(TimeError::Retry) +} + +/// A measurement of a monotonically non-decreasing clock, analogous to +/// [`std::time::Instant`]. +/// +/// Unlike `std::time::Instant`, construction is fallible: it returns +/// `TimeError::Unavailable` when the host has no guest-clock feature +/// enabled, and `TimeError::Retry` on a (vanishingly rare) seqlock retry +/// storm. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Instant { + /// Nanoseconds since sandbox creation, as reported by the paravirt + /// clock. + ns: u64, +} + +impl Instant { + /// Returns an instant corresponding to "now". + pub fn now() -> Result { + Ok(Self { + ns: read_monotonic_ns()?, + }) + } + + /// Returns the amount of time elapsed from another instant to this one, + /// or `None` if that instant is later than this one. 
+ pub fn checked_duration_since(&self, earlier: Instant) -> Option { + self.ns.checked_sub(earlier.ns).map(Duration::from_nanos) + } + + /// Returns the amount of time elapsed from another instant to this one, + /// saturating at zero when the other instant is later. + pub fn saturating_duration_since(&self, earlier: Instant) -> Duration { + self.checked_duration_since(earlier) + .unwrap_or(Duration::ZERO) + } + + /// Returns the amount of time elapsed since this instant. + pub fn elapsed(&self) -> Result { + let now = Self::now()?; + Ok(now.saturating_duration_since(*self)) + } +} + +impl core::ops::Sub for Instant { + type Output = Duration; + + /// Panics if `rhs` is later than `self`. Mirrors the behaviour of + /// `std::time::Instant::sub`. + fn sub(self, rhs: Instant) -> Duration { + self.checked_duration_since(rhs) + .expect("supplied instant is later than self") + } +} + +/// A measurement of the system clock, analogous to +/// [`std::time::SystemTime`]. +/// +/// Represents wall-clock time, using the host's boot-time stamp combined +/// with the paravirtualized monotonic clock. Snapshot-restore preserves +/// the freshly re-stamped boot time, so `SystemTime::now()` will jump +/// forward by real elapsed wall-clock time across a restore — exactly the +/// behaviour a guest using `std::time::SystemTime` would expect. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SystemTime { + /// Nanoseconds since the Unix epoch. + ns: u64, +} + +/// An anchor point corresponding to 1970-01-01 00:00:00 UTC. Subtract from +/// a `SystemTime` to get the wall-clock duration since the epoch. +pub const UNIX_EPOCH: SystemTime = SystemTime { ns: 0 }; + +impl SystemTime { + /// Returns the current wall-clock time. 
+ pub fn now() -> Result { + if !time::is_available() { + return Err(TimeError::Unavailable); + } + let ns = time::wall_clock_time_ns().ok_or(TimeError::Retry)?; + Ok(Self { ns }) + } + + /// Returns the duration from `earlier` to `self`, or + /// `TimeError::NegativeDuration(d)` — where `d` is the magnitude of the + /// difference — if `earlier` is later than `self`. Mirrors + /// `std::time::SystemTime::duration_since`. + pub fn duration_since(&self, earlier: SystemTime) -> Result { + if self.ns >= earlier.ns { + Ok(Duration::from_nanos(self.ns - earlier.ns)) + } else { + Err(TimeError::NegativeDuration(Duration::from_nanos( + earlier.ns - self.ns, + ))) + } + } + + /// Returns the amount of time elapsed since `self`. + pub fn elapsed(&self) -> Result { + let now = Self::now()?; + now.duration_since(*self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // The error type is both `Debug` and `Display` so it plays nicely with + // `?` in guest code and with `log::error!("{err}")` style logging. + #[test] + fn time_error_display() { + extern crate std; + use std::format; + assert_eq!( + format!("{}", TimeError::Unavailable), + "guest clock is not available" + ); + assert_eq!( + format!("{}", TimeError::NegativeDuration(Duration::from_secs(1))), + "second time is later than self" + ); + } +} diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index abc24fab8..9c8dd41a1 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -140,6 +140,8 @@ build-metadata = ["dep:built"] i686-guest = ["hyperlight-common/i686-guest"] nanvix-unstable = ["i686-guest", "hyperlight-common/nanvix-unstable"] guest-counter = ["hyperlight-common/guest-counter"] +# Populate the paravirtualized clock page so guests can read time without VM exits. x86_64 only. 
+enable_guest_clock = [] [[bench]] name = "benchmarks" diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs index 830b856c0..eb0f9384f 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs @@ -183,6 +183,9 @@ pub enum InitializeError { SetupRegs(#[from] RegisterError), #[error("Guest initialised stack pointer to architecturally invalid value: {0}")] InvalidStackPointer(u64), + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + #[error("Failed to arm paravirtualized guest clock: {0}")] + ArmClock(#[source] Box), } /// Errors that can occur during VM execution in the run loop @@ -492,6 +495,124 @@ impl HyperlightVm { Ok(()) } + /// Set up the pvclock / Reference TSC MSR and stamp `clock_type` + /// into the scratch bookkeeping page so the guest can read monotonic + /// time during `hyperlight_main` (init). + /// + /// Does NOT stamp `boot_time_ns` — on some backends (KVM) the + /// monotonic clock source is unreliable until after the first + /// vCPU run (see [`arm_clock`]). Wall-clock time returns `None` + /// until `arm_clock` is called. + /// + /// Must be called before the first vCPU run. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + pub(crate) fn setup_clock( + &mut self, + scratch: &crate::mem::shared_mem::HostSharedMemory, + ) -> crate::Result<()> { + use hyperlight_common::layout::{SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gpa}; + + use crate::mem::shared_mem::SharedMemory; + + let gpa = clock_page_gpa(); + let clock_type = self.vm.setup_pvclock(gpa)?; + + // Write clock_type to the bookkeeping page (top of scratch), + // NOT into the clock page itself — the clock page is 100% + // hypervisor-owned. 
+ let scratch_size = scratch.mem_size(); + let clock_type_offset = scratch_size + .checked_sub(SCRATCH_TOP_CLOCK_TYPE_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + + scratch.write::(clock_type_offset, u64::from(clock_type))?; + + tracing::debug!( + target: "hyperlight::pvclock", + ?clock_type, + "clock MSR configured, boot_time_ns deferred until after first vCPU run" + ); + Ok(()) + } + + /// Arm the paravirtualized clock: set up the MSR and stamp + /// `clock_type` + `boot_time_ns` into the scratch bookkeeping page. + /// + /// Computes `boot_time_ns = wall_now - monotonic_now` where + /// `monotonic_now` comes from `VirtualMachine::current_monotonic_ns()`. + /// The guest recovers wall time as + /// `boot_time_ns + monotonic_time_ns()`. + /// + /// # Call sites + /// + /// - **Initial sandbox creation**: called after `initialise()` + /// returns (i.e. after the first vCPU run). On some backends + /// (KVM) the monotonic clock source is unreliable until the + /// first vCPU entry. Monotonic time is available during + /// `hyperlight_main` via the pvclock page (set up by + /// [`setup_clock`] before the first vCPU run), but wall-clock + /// time returns `None` until this method stamps `boot_time_ns`. + /// + /// - **Snapshot restore**: called directly by the restore path. + /// Re-stamps fresh `boot_time_ns` so the restored guest sees + /// wall time reflecting the restore moment. + /// + /// Must be called while `scratch_memory` is `Some`. 
+ #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + pub(crate) fn arm_clock( + &mut self, + scratch: &crate::mem::shared_mem::HostSharedMemory, + ) -> crate::Result<()> { + use std::time::{SystemTime, UNIX_EPOCH}; + + use hyperlight_common::layout::{ + SCRATCH_TOP_BOOT_TIME_NS_OFFSET, SCRATCH_TOP_CLOCK_TYPE_OFFSET, clock_page_gpa, + }; + + use crate::mem::shared_mem::SharedMemory; + + let gpa = clock_page_gpa(); + let clock_type = self.vm.setup_pvclock(gpa)?; + + let scratch_size = scratch.mem_size(); + + // Sample monotonic first, then wall clock. If preempted between + // the two reads, boot_time_ns shifts forward (guest wall clock + // runs slightly ahead of host) rather than backward — "slightly + // in the future" is more benign than "slightly in the past" for + // most use cases. The gap is bounded by the 20ms test tolerance. + let mono_ns = self.vm.current_monotonic_ns()?; + let wall_ns = u64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| crate::new_error!("system time before Unix epoch: {}", e))? + .as_nanos(), + ) + .map_err(|_| crate::new_error!("wall_ns overflowed u64"))?; + let boot_time_ns = wall_ns.wrapping_sub(mono_ns); + + // Write metadata to the bookkeeping page (top of scratch), + // NOT into the clock page — the clock page is 100% + // hypervisor-owned. 
+ let clock_type_offset = scratch_size + .checked_sub(SCRATCH_TOP_CLOCK_TYPE_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + let boot_time_offset = scratch_size + .checked_sub(SCRATCH_TOP_BOOT_TIME_NS_OFFSET as usize) + .ok_or_else(|| crate::new_error!("scratch region too small for clock metadata"))?; + + scratch.write::(clock_type_offset, u64::from(clock_type))?; + scratch.write::(boot_time_offset, boot_time_ns)?; + + tracing::debug!( + target: "hyperlight::pvclock", + ?clock_type, + boot_time_ns, + "guest clock armed" + ); + Ok(()) + } + /// Get the current stack top virtual address pub(crate) fn get_stack_top(&mut self) -> u64 { self.rsp_gva diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index f06c94964..27f23c721 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -212,6 +212,13 @@ impl HyperlightVm { return Ok(()); }; + // Set up the pvclock MSR so monotonic time works during init. + // boot_time_ns (wall clock) is deferred until after the first + // vCPU run — see arm_clock below. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.setup_clock(&mem_mgr.scratch_mem) + .map_err(|e| InitializeError::ArmClock(Box::new(e)))?; + let regs = CommonRegisters { rip: initialise, // We usually keep the top of the stack 16-byte @@ -241,6 +248,18 @@ impl HyperlightVm { ) .map_err(InitializeError::Run)?; + // Arm the paravirtualized clock after the first vCPU run. + // On some backends the monotonic clock source is unreliable + // until after the first vCPU entry, so wall-clock calibration + // is deferred to here. Wall clock is not available to the + // guest during hyperlight_main (init), but monotonic time + // works fine since the pvclock page is populated before the + // first vCPU entry. 
Wall clock becomes available on + // subsequent dispatch calls. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.arm_clock(&mem_mgr.scratch_mem) + .map_err(|e| InitializeError::ArmClock(Box::new(e)))?; + let regs = self.vm.regs()?; // todo(portability): this is architecture-specific if !regs.rsp.is_multiple_of(16) { @@ -1491,7 +1510,7 @@ mod tests { let peb_address = gshm.layout.peb_address; let stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let mut vm = set_up_hypervisor_partition( gshm, @@ -2104,7 +2123,7 @@ mod tests { /// Get the stack top GVA, same as the regular codepath. fn stack_top_gva(&self) -> u64 { hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1 } } diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index be1a15c22..d87752e1b 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -488,7 +488,7 @@ pub(crate) mod tests { UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?; let (mut mem_mgr, gshm) = sandbox.mgr.build().unwrap(); let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let mut vm = set_up_hypervisor_partition( gshm, diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs index db68dfdd0..7e78291ea 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs @@ -468,6 +468,55 @@ impl VirtualMachine for KvmVm { Ok(()) } + + 
#[cfg(feature = "enable_guest_clock")] + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result { + // KVM pvclock: write `MSR_KVM_SYSTEM_TIME_NEW` with `gpa | 1`. + // Bit 0 is the "enable" flag; clearing it disables pvclock for this + // vCPU. + // + // Reference: https://docs.kernel.org/virt/kvm/x86/msr.html#pvclock + use kvm_bindings::{Msrs, kvm_msr_entry}; + + const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; + const PVCLOCK_ENABLE_BIT: u64 = 1; + + let mut msrs = Msrs::new(1) + .map_err(|e| crate::new_error!("Failed to allocate MSR list for pvclock: {}", e))?; + msrs.as_mut_slice()[0] = kvm_msr_entry { + index: MSR_KVM_SYSTEM_TIME_NEW, + data: clock_page_gpa | PVCLOCK_ENABLE_BIT, + ..Default::default() + }; + + self.vcpu_fd + .set_msrs(&msrs) + .map_err(|e| crate::new_error!("Failed to set pvclock MSR: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "KVM pvclock armed" + ); + Ok(hyperlight_common::time::ClockType::KvmPvclock) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result { + // KVM_GET_CLOCK returns kvmclock nanoseconds — the same time base + // the guest reads through the pvclock page. We cannot use + // clock_gettime(CLOCK_MONOTONIC) here because kvmclock has its + // own epoch (which can be shifted via KVM_SET_CLOCK) and does not + // necessarily match host CLOCK_MONOTONIC. 
+ let clock = self + .vm_fd + .get_clock() + .map_err(|e| crate::new_error!("KVM_GET_CLOCK failed: {}", e))?; + Ok(clock.clock) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index ecb19a09f..ce2518d07 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -353,6 +353,32 @@ pub(crate) trait VirtualMachine: Debug + Send { #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError>; + /// Arm the hypervisor's paravirtualized clock for this vCPU, pointing it + /// at the guest physical address of the sandbox's clock page. + /// + /// Must be called before the first `run_vcpu`, and again on snapshot + /// restore since the register lives in vCPU state. + #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> std::result::Result; + + /// Read the host's monotonic clock for the time base that backs the + /// guest's paravirtualized clock page, in nanoseconds. + /// + /// Each hypervisor has its own monotonic epoch that may differ from + /// `CLOCK_MONOTONIC`, so we cannot use a single host clock. This + /// value is used to derive `boot_time_ns = wall_now - monotonic_now`, + /// giving guests a uniform wall-clock origin across all backends. + /// + /// KVM does offer `MSR_KVM_WALL_CLOCK_NEW` for this, but Hyper-V's + /// TLFS explicitly states its reference time is "not intended to be + /// used as a source of wall clock time". Rather than diverge per + /// backend, we use the same host-computed approach everywhere. 
+ #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + fn current_monotonic_ns(&self) -> std::result::Result; + /// Get partition handle #[cfg(target_os = "windows")] fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE; diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs index 27f024ca6..9fb2b3166 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs @@ -469,6 +469,57 @@ impl VirtualMachine for MshvVm { .map_err(|e| RegisterError::SetXsave(e.into()))?; Ok(()) } + + #[cfg(feature = "enable_guest_clock")] + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result { + // Hyper-V Reference TSC page: write `HV_REGISTER_REFERENCE_TSC` with + // `gpa | 1`. Bit 0 is the "enable" flag. + // + // Reference: Hyper-V TLFS section 12.7 (Reference TSC Page). + use mshv_bindings::hv_register_name_HV_REGISTER_REFERENCE_TSC; + + const REFERENCE_TSC_ENABLE_BIT: u64 = 1; + + self.vcpu_fd + .set_reg(&[hv_register_assoc { + name: hv_register_name_HV_REGISTER_REFERENCE_TSC, + value: hv_register_value { + reg64: clock_page_gpa | REFERENCE_TSC_ENABLE_BIT, + }, + ..Default::default() + }]) + .map_err(|e| crate::new_error!("Failed to set HV_REGISTER_REFERENCE_TSC: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "MSHV Reference TSC armed" + ); + Ok(hyperlight_common::time::ClockType::HyperVReferenceTsc) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result { + // HV Reference TSC is partition reference time in 100 ns units; + // the host reads the same time base via HV_REGISTER_TIME_REF_COUNT. 
+ use mshv_bindings::hv_register_name_HV_REGISTER_TIME_REF_COUNT; + let mut reg = [hv_register_assoc { + name: hv_register_name_HV_REGISTER_TIME_REF_COUNT, + value: hv_register_value { reg64: 0 }, + ..Default::default() + }]; + self.vcpu_fd + .get_reg(&mut reg) + .map_err(|e| crate::new_error!("Failed to read HV_REGISTER_TIME_REF_COUNT: {}", e))?; + // SAFETY: the union holds reg64 because we set the register name to + // a 64-bit register, and `get_reg` writes through the same union + // shape we provided. + let ticks_100ns = unsafe { reg[0].value.reg64 }; + Ok(ticks_100ns.wrapping_mul(100)) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index 18e366835..d2bbe73d0 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -803,6 +803,56 @@ impl VirtualMachine for WhpVm { fn partition_handle(&self) -> WHV_PARTITION_HANDLE { self.partition } + + #[cfg(feature = "enable_guest_clock")] + fn setup_pvclock( + &mut self, + clock_page_gpa: u64, + ) -> crate::Result { + // Hyper-V Reference TSC page via WHP: write `WHvRegisterReferenceTsc` + // with `gpa | 1`. Bit 0 is the "enable" flag. + // + // Reference: Hyper-V TLFS section 12.7 (Reference TSC Page). 
+ const REFERENCE_TSC_ENABLE_BIT: u64 = 1; + + let reg_value = WHV_REGISTER_VALUE { + Reg64: clock_page_gpa | REFERENCE_TSC_ENABLE_BIT, + }; + self.set_registers(&[(WHvRegisterReferenceTsc, Align16(reg_value))]) + .map_err(|e| crate::new_error!("Failed to set WHvRegisterReferenceTsc: {}", e))?; + + tracing::debug!( + target: "hyperlight::pvclock", + clock_page_gpa, + "WHP Reference TSC armed" + ); + Ok(hyperlight_common::time::ClockType::HyperVReferenceTsc) + } + + #[cfg(feature = "enable_guest_clock")] + fn current_monotonic_ns(&self) -> crate::Result { + // WHP exposes the partition reference time (same time base as the + // Reference TSC page) via a partition property, NOT a vCPU register. + // The value is in 100 ns units. + // + // Note: WHP does not expose `WHvRegisterTimeRefCount` — that is a + // Hyper-V/MSHV register name. The WHP equivalent is + // `WHvPartitionPropertyCodeReferenceTime`. + let mut property: WHV_PARTITION_PROPERTY = unsafe { std::mem::zeroed() }; + let mut written_size = 0u32; + unsafe { + WHvGetPartitionProperty( + self.partition, + WHvPartitionPropertyCodeReferenceTime, + &mut property as *mut WHV_PARTITION_PROPERTY as *mut c_void, + std::mem::size_of::() as u32, + Some(&mut written_size), + ) + .map_err(|e| crate::new_error!("Failed to read WHP ReferenceTime: {}", e))?; + } + let ticks_100ns = unsafe { property.ReferenceTime }; + Ok(ticks_100ns.wrapping_mul(100)) + } } #[cfg(gdb)] diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index 26615d579..7395f3908 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -341,6 +341,11 @@ impl SandboxMemoryLayout { cfg.get_input_data_size(), cfg.get_output_data_size(), ); + // The guest allocator unconditionally reserves the clock page at + // the top of scratch (so its footprint is feature-independent), + // so the host minimum must always account for it. 
+ let min_scratch_size = + min_scratch_size + hyperlight_common::layout::CLOCK_PAGE_SIZE as usize; if scratch_size < min_scratch_size { return Err(MemoryRequestTooSmall(scratch_size, min_scratch_size)); } @@ -595,6 +600,9 @@ impl SandboxMemoryLayout { self.sandbox_memory_config.get_input_data_size(), self.sandbox_memory_config.get_output_data_size(), ); + // Must match the unconditional clock page reservation in the guest allocator. + let min_fixed_scratch = + min_fixed_scratch + hyperlight_common::layout::CLOCK_PAGE_SIZE as usize; let min_scratch = min_fixed_scratch + size; if self.scratch_size < min_scratch { return Err(MemoryRequestTooSmall(self.scratch_size, min_scratch)); diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 241622cab..51f8dd8ec 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -348,6 +348,14 @@ impl MultiUseSandbox { HyperlightVmError::Restore(e) })?; + // Re-arm the paravirtualized clock for the freshly-reset vCPU. The + // MSR / HV register that pvclock rides on lives in vCPU state and + // has been clobbered by reset_vcpu above, and `boot_time_ns` must + // be re-stamped so the restored guest sees wall-clock reflecting + // the restore moment, not the original boot. 
+ #[cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + self.vm.arm_clock(&self.mem_mgr.scratch_mem)?; + self.vm.set_stack_top(snapshot.stack_top_gva()); self.vm.set_entrypoint(snapshot.entrypoint()); diff --git a/src/hyperlight_host/src/sandbox/snapshot.rs b/src/hyperlight_host/src/sandbox/snapshot.rs index e4c7b1133..3cdf65b4f 100644 --- a/src/hyperlight_host/src/sandbox/snapshot.rs +++ b/src/hyperlight_host/src/sandbox/snapshot.rs @@ -405,8 +405,13 @@ impl Snapshot { layout.set_pt_size(pt_bytes.len())?; memory.extend(&pt_bytes); + // The main/init stack top must live below the reserved clock page + // at the top of scratch; otherwise the guest's first stack writes + // clobber the paravirtualized clock page. The clock page is + // always reserved independent of the host's `enable_guest_clock` + // feature. let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + - hyperlight_common::layout::SCRATCH_TOP_CLOCK_PAGE_OFFSET + 1; let extra_regions = Vec::new(); diff --git a/src/hyperlight_host/tests/guest_clock_test.rs b/src/hyperlight_host/tests/guest_clock_test.rs new file mode 100644 index 000000000..24d466e39 --- /dev/null +++ b/src/hyperlight_host/tests/guest_clock_test.rs @@ -0,0 +1,186 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//! Integration tests for the paravirtualized guest clock, only compiled +//! 
when the `enable_guest_clock` feature is enabled on `hyperlight-host`. +#![cfg(all(feature = "enable_guest_clock", target_arch = "x86_64"))] + +use std::thread; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub mod common; +use crate::common::with_rust_sandbox; +/// Minimum real wait used by the monotonic advance test. Chosen large +/// enough to dwarf any plausible CI scheduling jitter but small enough +/// not to slow the test suite down noticeably. +const MONOTONIC_ADVANCE_SLEEP: Duration = Duration::from_millis(50); + +/// Minimum real wait used by the restore-doesn't-freeze-wall-clock test. +/// That test requires the guest wall clock to advance by at least half of it. +const RESTORE_SLEEP: Duration = Duration::from_millis(200); + +/// Allowed "play" when comparing the guest's wall-clock time against the +/// host's notion of now. Accounts for the guest call round-trip plus any +/// CI jitter. Kept generous because the test only needs to prove that +/// the guest tracks the host clock, not that it is sub-millisecond accurate. +const WALL_CLOCK_ADVANCE_TOLERANCE: Duration = Duration::from_millis(500); + +/// How long to sit idle after sandbox creation in the no-drift test below. +/// Long enough that any constant offset between guest and host wall +/// clocks (e.g. from a stale `boot_time_ns` calibration) dominates over +/// scheduling jitter. +const IDLE_BEFORE_FIRST_CALL: Duration = Duration::from_secs(2); + +/// Tight tolerance used by the no-drift test. +/// +/// The host computes `boot_time_ns = wall_now - monotonic_now` +/// back-to-back in `arm_clock` (where `monotonic_now` comes from +/// `KVM_GET_CLOCK` on KVM, or `HV_REGISTER_TIME_REF_COUNT` on +/// Hyper-V). On KVM, `KVM_GET_CLOCK` can disagree with the live +/// pvclock page by up to ~13ms (observed on WSL2; root cause +/// uncertain — may be smaller on bare metal). The 20ms tolerance +/// accommodates this while still catching formula bugs (e.g. 
+/// omitting the monotonic subtraction produces ~100ms+ drift). +const WALL_CLOCK_TIGHT_TOLERANCE: Duration = Duration::from_millis(20); + +#[test] +fn clock_is_available_under_enable_guest_clock() { + with_rust_sandbox(|mut sbox| { + let available: i32 = sbox.call("ClockIsAvailable", ()).unwrap(); + assert_eq!(available, 1, "guest clock should be armed by the host"); + }); +} + +#[test] +fn monotonic_time_advances_across_calls() { + with_rust_sandbox(|mut sbox| { + let first: i64 = sbox.call("GetMonotonicTimeNs", ()).unwrap(); + assert!(first >= 0, "guest reported clock unavailable: {first}"); + + thread::sleep(MONOTONIC_ADVANCE_SLEEP); + + let second: i64 = sbox.call("GetMonotonicTimeNs", ()).unwrap(); + assert!(second >= 0, "guest reported clock unavailable: {second}"); + + let delta_ns = second - first; + assert!( + delta_ns >= MONOTONIC_ADVANCE_SLEEP.as_nanos() as i64 / 2, + "monotonic clock did not advance enough: first={first} second={second} \ + delta_ns={delta_ns}" + ); + }); +} + +#[test] +fn wall_clock_tracks_host_wall_clock() { + with_rust_sandbox(|mut sbox| { + let guest_ns: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + guest_ns >= 0, + "guest reported wall-clock unavailable: {guest_ns}" + ); + + let host_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as i64; + + let skew_ns = (host_ns - guest_ns).abs(); + assert!( + skew_ns < WALL_CLOCK_ADVANCE_TOLERANCE.as_nanos() as i64, + "guest wall-clock differs from host by {skew_ns} ns \ + (guest={guest_ns}, host={host_ns})" + ); + }); +} + +/// Snapshot / restore must re-stamp the host's `boot_time_ns` so the guest +/// sees real elapsed wall-clock time across the restore rather than a +/// frozen instant from when the snapshot was taken. 
+#[test] +fn wall_clock_advances_across_snapshot_restore() { + with_rust_sandbox(|mut sbox| { + let snapshot = sbox.snapshot().unwrap(); + + let before: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + before >= 0, + "guest reported wall-clock unavailable: {before}" + ); + + thread::sleep(RESTORE_SLEEP); + sbox.restore(snapshot).unwrap(); + + let after: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!(after >= 0, "guest reported wall-clock unavailable: {after}"); + + let advance_ns = after - before; + // Allow half the sleep to cover scheduling jitter on the low end; + // on the high end, real elapsed time plus the guest-call overhead + // is fine. + assert!( + advance_ns >= RESTORE_SLEEP.as_nanos() as i64 / 2, + "wall-clock did not advance across snapshot/restore: \ + before={before} after={after} advance_ns={advance_ns}" + ); + }); +} + +/// Diagnostic for the `boot_time_ns` calibration formula. +/// +/// `arm_clock` stamps `boot_time_ns` and the guest computes +/// `wall = boot_time_ns + monotonic_time_ns()`. For that to match the +/// host's wall clock, `boot_time_ns` must be `wall_at_arm - monotonic_at_arm` +/// — i.e. the Unix-epoch origin of the monotonic clock — not just +/// `wall_at_arm`. If the host stamps the latter, the guest's wall clock +/// is offset ahead of the host by exactly the value of the underlying +/// paravirt counter at arm time, which on a host with non-trivial +/// uptime (or any KVM partition where `system_time` is host-wide) can +/// be arbitrarily large. +/// +/// This test waits for a real interval after sandbox creation before +/// the first guest call, then requires the guest's reported wall clock +/// to match the host's within a tight tolerance. The existing +/// [`wall_clock_tracks_host_wall_clock`] test uses a 500 ms tolerance +/// and reads immediately, both of which can mask a small constant +/// offset. This one will not. 
+#[test] +fn wall_clock_does_not_drift_after_idle() { + with_rust_sandbox(|mut sbox| { + thread::sleep(IDLE_BEFORE_FIRST_CALL); + + let guest_ns: i64 = sbox.call("GetWallClockTimeNs", ()).unwrap(); + assert!( + guest_ns >= 0, + "guest reported wall-clock unavailable: {guest_ns}" + ); + + let host_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as i64; + + let skew_ns = (host_ns - guest_ns).abs(); + assert!( + skew_ns < WALL_CLOCK_TIGHT_TOLERANCE.as_nanos() as i64, + "guest wall-clock skew of {skew_ns} ns exceeds tolerance of {tol} ns \ + after {idle:?} idle — likely a `boot_time_ns` calibration bug \ + (guest={guest_ns}, host={host_ns})", + tol = WALL_CLOCK_TIGHT_TOLERANCE.as_nanos(), + idle = IDLE_BEFORE_FIRST_CALL, + ); + }); +} diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index b6844a716..fb08b80fc 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -725,6 +725,30 @@ fn add(a: i32, b: i32) -> Result { host_add(a, b) } +// ===== Paravirtualized guest clock test surface ===== + +#[guest_function("ClockIsAvailable")] +fn clock_is_available() -> i32 { + hyperlight_guest::time::is_available() as i32 +} + +/// Returns monotonic nanoseconds, or `-1` if the clock is unavailable or the value does not fit in an `i64`. +#[guest_function("GetMonotonicTimeNs")] +fn get_monotonic_time_ns() -> i64 { + hyperlight_guest::time::monotonic_time_ns() + .and_then(|ns| i64::try_from(ns).ok()) + .unwrap_or(-1) +} + +/// Returns wall-clock nanoseconds since the Unix epoch, or `-1` if the +/// clock is unavailable or the value does not fit in an `i64`. +#[guest_function("GetWallClockTimeNs")] +fn get_wall_clock_time_ns() -> i64 { + hyperlight_guest::time::wall_clock_time_ns() + .and_then(|ns| i64::try_from(ns).ok()) + .unwrap_or(-1) +} + // Does nothing, but used for testing large parameters #[guest_function("LargeParameters")] fn large_parameters(v: Vec, s: String) {