From 27c0ea06ccb26a793047b970cd20f895ec38df27 Mon Sep 17 00:00:00 2001 From: Hans Christian Schmitz Date: Thu, 16 Oct 2025 15:24:26 +0200 Subject: [PATCH 1/2] tmp: trace log messages for SplitVq --- src/drivers/virtio/virtqueue/split.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/drivers/virtio/virtqueue/split.rs b/src/drivers/virtio/virtqueue/split.rs index cc584053bb..c08f3d8d74 100644 --- a/src/drivers/virtio/virtqueue/split.rs +++ b/src/drivers/virtio/virtqueue/split.rs @@ -57,19 +57,30 @@ impl DescrRing { fn push(&mut self, tkn: TransferToken) -> Result { let mut index; if let Some(ctrl_desc) = tkn.ctrl_desc.as_ref() { + trace!(" Creating indirect descriptor"); let descriptor = SplitVq::indirect_desc(ctrl_desc.as_ref()); + trace!(" Attempting to assign descriptor to free slot in table"); index = self.indexes.allocate().ok_or(VirtqError::NoDescrAvail)?; + trace!(" Assigned one descriptor (indirect)"); self.descr_table_mut()[index] = MaybeUninit::new(descriptor); } else { + trace!(" Creating direct descriptor iterator"); let mut rev_all_desc_iter = SplitVq::descriptor_iter(&tkn.buff_tkn)?.rev(); + trace!( + " Attempting to assign descriptors to free slots in table in reverse order" + ); + + let mut num_descriptors_assigned = 0; + // We need to handle the last descriptor (the first for the reversed iterator) specially to not set the next flag. 
{ // If the [AvailBufferToken] is empty, we panic let descriptor = rev_all_desc_iter.next().unwrap(); index = self.indexes.allocate().ok_or(VirtqError::NoDescrAvail)?; + num_descriptors_assigned += 1; self.descr_table_mut()[index] = MaybeUninit::new(descriptor); } for mut descriptor in rev_all_desc_iter { @@ -77,14 +88,20 @@ impl DescrRing { descriptor.next = le16::from_ne(index.try_into().unwrap()); index = self.indexes.allocate().ok_or(VirtqError::NoDescrAvail)?; + num_descriptors_assigned += 1; self.descr_table_mut()[index] = MaybeUninit::new(descriptor); } // At this point, `index` is the index of the last element of the reversed iterator, // thus the head of the descriptor chain. + trace!(" Assigned {num_descriptors_assigned} descriptors (direct)"); } + trace!(" Inserting transfer token into token ring at index {index}"); + self.token_ring[index] = Some(tkn); + trace!(" Updating available ring"); + let len = self.token_ring.len(); let idx = self.avail_ring_mut().idx.to_ne(); self.avail_ring_mut().ring_mut(true)[idx as usize % len] = @@ -111,12 +128,16 @@ impl DescrRing { "The buff_id is incorrect or the reference to the TransferToken was misplaced.", ); + let mut num_descriptors_freed = 0; // We return the indices of the now freed ring slots back to `mem_pool.` let mut id_ret_idx = u16::try_from(used_elem.id.to_ne()).unwrap(); loop { unsafe { self.indexes.deallocate(id_ret_idx.into()); } + + num_descriptors_freed += 1; + let cur_chain_elem = unsafe { self.descr_table_mut()[usize::from(id_ret_idx)].assume_init() }; if cur_chain_elem.flags.contains(virtq::DescF::NEXT) { @@ -125,6 +146,7 @@ impl DescrRing { break; } } + trace!(" freed {num_descriptors_freed} descriptors"); self.read_idx = self.read_idx.wrapping_add(1); Ok(UsedBufferToken::from_avail_buffer_token( @@ -194,7 +216,9 @@ impl Virtq for SplitVq { notif: bool, buffer_type: BufferType, ) -> Result<(), VirtqError> { + trace!(" Creating transfer token"); let transfer_tkn = 
Self::transfer_token_from_buffer_token(buffer_tkn, buffer_type); + trace!(" Pushing to descriptor ring transfer token"); let next_idx = self.ring.push(transfer_tkn)?; if notif { From e0b56899647d99f07c54a46ee9523ef559b1a7f8 Mon Sep 17 00:00:00 2001 From: Hans Christian Schmitz Date: Mon, 20 Oct 2025 12:12:34 +0200 Subject: [PATCH 2/2] draft: feat(balloon): add balloon device driver TODO: explanatory commit message --- Cargo.toml | 3 + src/drivers/balloon/mod.rs | 1245 ++++++++++++++++++++++++ src/drivers/balloon/oom.rs | 71 ++ src/drivers/balloon/pci.rs | 120 +++ src/drivers/mod.rs | 2 + src/drivers/pci.rs | 40 +- src/drivers/virtio/mod.rs | 6 + src/drivers/virtio/transport/pci.rs | 24 + src/drivers/virtio/virtqueue/mod.rs | 20 +- src/drivers/virtio/virtqueue/packed.rs | 4 + src/drivers/virtio/virtqueue/split.rs | 8 + src/executor/balloon.rs | 41 + src/executor/mod.rs | 4 + src/lib.rs | 4 + src/mm/mod.rs | 14 +- 15 files changed, 1602 insertions(+), 4 deletions(-) create mode 100644 src/drivers/balloon/mod.rs create mode 100644 src/drivers/balloon/oom.rs create mode 100644 src/drivers/balloon/pci.rs create mode 100644 src/executor/balloon.rs diff --git a/Cargo.toml b/Cargo.toml index 82f10deb87..08a65a7f13 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -191,6 +191,9 @@ virtio-fs = ["virtio", "dep:fuse-abi", "fuse-abi/num_enum"] ## Enables the virtio-vsock driver. virtio-vsock = ["virtio"] +## Enables the virtio-balloon driver. +virtio-balloon = ["virtio", "pci"] + #! ### Other Drivers ## Enables the _Video Graphics Array_ (VGA) driver. 
diff --git a/src/drivers/balloon/mod.rs b/src/drivers/balloon/mod.rs new file mode 100644 index 0000000000..4b86888241 --- /dev/null +++ b/src/drivers/balloon/mod.rs @@ -0,0 +1,1245 @@ +#[warn(missing_docs, rustdoc::broken_intra_doc_links)] +use alloc::vec::Vec; +use core::alloc::Layout; +use core::fmt::Debug; +use core::marker::PhantomData; +use core::num::{NonZeroU32, NonZeroUsize}; +use core::ptr::NonNull; +use core::time::Duration; + +use memory_addresses::VirtAddr; +use pci_types::InterruptLine; +use smallvec::{SmallVec, smallvec}; +use talc::base::Talc; +use talc::base::binning::Binning; +use thiserror::Error; +use virtio::FeatureBits; +use virtio::balloon::{ConfigVolatileFieldAccess as _, F}; +use volatile::VolatileRef; + +use super::Driver; +use super::virtio::virtqueue::error::VirtqError; +use super::virtio::virtqueue::split::SplitVq; +use super::virtio::virtqueue::{AvailBufferToken, BufferElem, BufferType, VirtQueue, Virtq as _}; +use crate::VIRTIO_MAX_QUEUE_SIZE; +use crate::drivers::balloon::oom::DeflateBalloonOnOom; +use crate::drivers::virtio::ControlRegisters; +#[cfg(not(feature = "pci"))] +use crate::drivers::virtio::transport::mmio::{ComCfg, IsrStatus, NotifCfg}; +#[cfg(feature = "pci")] +use crate::drivers::virtio::transport::pci::{ComCfg, IsrStatus, NotifCfg}; +use crate::executor::WakerRegistration; +use crate::mm::device_alloc::DeviceAlloc; +use crate::mm::{ALLOCATOR, virtual_to_physical}; + +#[cfg(feature = "pci")] +pub mod oom; +#[cfg(feature = "pci")] +mod pci; + +const KIBI: u32 = 1024; +const MEBI: u32 = 1024 * KIBI; +const GIBI: u32 = 1024 * MEBI; + +/// Fixed size of pages as handled by the basic balloon device interface. +/// The basic interface only deals with 4 KiB pages. Optional features can support +/// larger page sizes, e.g. [`F::PAGE_REPORTING`]. +const BALLOON_PAGE_SIZE: usize = 4 * KIBI as usize; + +/// Minimum interval between voluntary inflation attempts in microseconds. 
+/// Actual interval may be longer as inflation is only attempted in +/// [`VirtioBalloonDriver::poll_events`]. This is called by the balloon executor +/// task which is cooperatively scheduled, so it may miss the exact interval while +/// other tasks are executing. +const VOLUNTARY_INFLATE_INTERVAL_MICROS: u64 = 1_000_000; + +/// Maximum number of 4 KiB pages voluntarily inflated per voluntary inflation +/// attempt, i.e. per call of [`VirtioBalloonDriver::poll_events`]. +const VOLUNTARY_INFLATE_MAX_NUM_PAGES: u32 = 2 * GIBI / BALLOON_PAGE_SIZE as u32; + +// TODO: prevent possible deflate of not yet acknowledged inflated pages. See VIRTIO v1.2 5.5.6.1 + +/// A wrapper struct for the raw configuration structure. +/// Handling the right access to fields, as some are read-only +/// for the driver. +#[derive(Debug)] +struct BalloonDevCfg { + pub raw: VolatileRef<'static, virtio::balloon::Config>, + pub dev_id: u16, + pub features: virtio::balloon::F, +} + +impl BalloonDevCfg { + fn num_pages(&self) -> u32 { + self.raw.as_ptr().num_pages().read().into() + } + + fn actual(&mut self) -> u32 { + self.raw.as_ptr().actual().read().into() + } + + fn set_actual(&mut self, num_pages: u32) { + self.raw.as_mut_ptr().actual().write(num_pages.into()); + } +} + +/// Virtio traditional memory balloon driver. +/// +/// Supports host requested inflation and voluntary inflation (above what the +/// host has requested). When the host decreases the requested balloon size again +/// (i.e. increasing permissible guest size again), the driver does not deflate +/// the balloon proactively. +/// +/// Voluntary inflation occurs when [`VirtioBalloonDriver::poll_events`] is called, +/// but at most every [`VOLUNTARY_INFLATE_INTERVAL_MICROS`] microseconds. +/// +/// The balloon is deflated again (making memory available to other Hermit tasks) +/// when an out of memory event occurs and the allocator's out of memory handler +/// calls [`VirtioBalloonDriver::deflate_for_oom`]. 
This way memory previously +/// returned to the host can be reused to ensure system stability. See also +/// [`oom::DeflateBalloonOnOom`]. +pub(crate) struct VirtioBalloonDriver { + dev_cfg: BalloonDevCfg, + com_cfg: ComCfg, + isr_stat: IsrStatus, + notif_cfg: NotifCfg, + irq: InterruptLine, + + inflateq: BalloonVq, + deflateq: BalloonVq, + + num_in_balloon: u32, + num_pending_inflation: u32, + num_pending_deflation: u32, + num_targeted: u32, + + balloon_storage: BalloonStorage, + last_voluntary_inflate: u64, + + waker: WakerRegistration, +} + +impl VirtioBalloonDriver { + /// Negotiates a subset of features, understood and wanted by both the OS + /// and the device. + fn negotiate_features( + &mut self, + driver_features: virtio::balloon::F, + ) -> Result<(), VirtioBalloonError> { + if driver_features.requirements_satisfied() { + debug!( + " Feature set requested by device driver are in conformance with specification." + ); + } else { + return Err(VirtioBalloonError::FeatureRequirementsNotMet { driver_features }); + } + + let common_features = self + .com_cfg + .control_registers() + .negotiate_features(driver_features); + + if common_features.contains(driver_features) { + // The device supports all our required features + Ok(()) + } else { + Err(VirtioBalloonError::IncompatibleFeatureSets { driver_features }) + } + } + + /// Initializes the device in adherence to specification. + /// + /// See Virtio specification v1.2. - 3.1.1 + /// and v1.2. - 5.5.5 + pub fn init_dev(&mut self) -> Result<(), VirtioBalloonError> { + // Reset + self.com_cfg.reset_dev(); + + // Indicate device, that OS noticed it + self.com_cfg.ack_dev(); + + // Indicate device, that driver is able to handle it + self.com_cfg.set_drv(); + + // TODO: add support for free page hinting and reporting + + let features = F::VERSION_1; + self.negotiate_features(features)?; + + // Indicates the device, that the current feature set is final for the driver + // and will not be changed. 
+ self.com_cfg.features_ok(); + + // Checks if the device has accepted final set. This finishes feature negotiation. + if self.com_cfg.check_features() { + info!( + " Features have been negotiated between device {:x} and driver: {features:?}", + self.dev_cfg.dev_id + ); + // Set feature set in device config fur future use. + self.dev_cfg.features = features; + } else { + return Err(VirtioBalloonError::FeatureNegotiationFailed { + device_id: self.dev_cfg.dev_id, + }); + } + + self.inflateq.init(VirtQueue::Split( + SplitVq::new( + &mut self.com_cfg, + &self.notif_cfg, + VIRTIO_MAX_QUEUE_SIZE, + 0u16, + self.dev_cfg.features.into(), + ) + .expect("Failed to create SplitVq for inflateq due to invalid parameters (bug)"), + )); + + self.deflateq.init(VirtQueue::Split( + SplitVq::new( + &mut self.com_cfg, + &self.notif_cfg, + VIRTIO_MAX_QUEUE_SIZE, + 1u16, + self.dev_cfg.features.into(), + ) + .expect("Failed to create SplitVq for deflateq due to invalid parameters (bug)"), + )); + + // At this point the device is "live" + self.com_cfg.drv_ok(); + + info!(" Finished initialization"); + + self.adjust_balloon_size(); + + Ok(()) + } + + fn num_pages_changed(&mut self) -> Option { + let new_num_pages = self.dev_cfg.num_pages(); + + if new_num_pages == self.num_targeted { + None + } else { + self.num_targeted = new_num_pages; + Some(new_num_pages) + } + } + + pub(crate) fn poll_events(&mut self, cx: &mut core::task::Context<'_>) { + trace!(" Driver is being polled..."); + + self.waker.register(cx.waker()); + + trace!(" Processing acknowledgements for inflation/deflation"); + + let mut changed = false; + + { + let num_new_acknowledged_deflated = self.deflateq.discard_new_used(); + + if num_new_acknowledged_deflated > 0 { + debug!( + " Deflation acknowledged for {num_new_acknowledged_deflated} pages" + ); + + self.num_pending_deflation -= num_new_acknowledged_deflated as u32; + self.num_in_balloon -= num_new_acknowledged_deflated as u32; + changed = true; + } + } + + { + let 
num_new_acknowledged_inflated = self.inflateq.discard_new_used(); + + if num_new_acknowledged_inflated > 0 { + debug!( + " Inflation acknowledged for {num_new_acknowledged_inflated} pages" + ); + + self.num_pending_inflation -= num_new_acknowledged_inflated as u32; + self.num_in_balloon += num_new_acknowledged_inflated as u32; + changed = true; + } + } + + if changed { + debug!( + " Setting new actual balloon size of {} pages", + self.num_in_balloon + ); + self.dev_cfg.set_actual(self.num_in_balloon); + } + + self.adjust_balloon_size(); + } + + fn wake(&mut self) { + self.waker.wake(); + } + + /// Deflate the balloon by the given number of pages. + /// + /// # Panics + /// When `num_pages_to_deflate` is larger than the number of pages currently + /// deflatable in the balloon. That is all pages currently in the balloon, + /// minus the number of pages already queued for deflation. + /// + /// # Safety + /// Must be called with the same instance of [`Talc`] that was provided to + /// [`Self::inflate`] to inflate the balloon. + unsafe fn deflate( + &mut self, + talc: &mut Talc, + num_pages_to_deflate: u32, + ) { + assert!( + num_pages_to_deflate <= self.num_in_balloon - self.num_pending_deflation, + "Can't deflate more pages than there are in the balloon" + ); + + trace!(" Attempting to deflate by {num_pages_to_deflate} pages"); + + let page_indices = self + .balloon_storage + .mark_pages_for_deflation(num_pages_to_deflate); + + trace!( + " Marked {} pages for deflation, sending them into the deflateq: {page_indices:?}", + page_indices.len() + ); + + for chunk_page_indices in &page_indices { + // SAFETY: We ensure with our balloon storage that we only deflate pages + // that we have previously inflated into the balloon. + // Deflating also does not give the host ownership over + // additional memory of ours. Merely sending the indices into + // the queue does not yet deallocate the pages on our side. 
+ unsafe { + self.deflateq + .send_pages(chunk_page_indices.iter().copied(), false) + .expect("Failed to send pages into the deflateq"); + } + } + + // SAFETY: For now we don't have [`F::MUST_TELL_HOST`] support, so + // we can deallocate all pages immediately once we have sent + // them into the deflateq. See VIRTIO v1.2 5.5.6 3. + // We pass on the upholding of the requirements on the `Talc` + // instance used to our caller. + unsafe { + self.balloon_storage.shrink_chunks(talc, page_indices); + } + + self.num_pending_deflation += num_pages_to_deflate; + } + + fn inflate( + &mut self, + talc: &mut Talc, + num_pages_to_inflate: u32, + voluntary: bool, + ) -> usize { + trace!(" Attempting to inflate as much as possible"); + + let page_indices = + self.balloon_storage + .allocate_chunks(talc, num_pages_to_inflate, voluntary); + let num_pages_inflated = page_indices.len(); + + trace!(" Sending page indices into inflateq: {page_indices:?}"); + + // SAFETY: We ensure with our balloon storage that we only inflate pages + // that we have allocated via the global allocator. Inflating + // a page hands ownership over to the host, but we ensure that + // the contents of the page are not used until the page has + // been deflated again by keeping our allocation in the balloon storage. 
+ unsafe { + self.inflateq + .send_pages(page_indices, false) + .expect("Failed to send pages into the inflateq"); + } + + self.num_pending_inflation += num_pages_inflated as u32; + + num_pages_inflated + } + + fn adjust_balloon_size(&mut self) { + trace!(" Adjusting balloon size"); + + if let Some(new_target_num_pages) = self.num_pages_changed() { + if new_target_num_pages < self.num_in_balloon - self.num_pending_deflation { + let num_to_deflate = + (self.num_in_balloon - self.num_pending_deflation) - new_target_num_pages; + + debug!( + " Size change requested: deflate of {num_to_deflate}, from {} (with pending: inflation={} deflation={}) to {new_target_num_pages}", + self.num_in_balloon, self.num_pending_inflation, self.num_pending_deflation + ); + + trace!(" Ignoring, we only deflate on OOM"); + } else if new_target_num_pages > self.num_in_balloon + self.num_pending_inflation { + let num_to_inflate = + new_target_num_pages - (self.num_in_balloon + self.num_pending_inflation); + + debug!( + " Size change requested: inflate of {num_to_inflate}, from {} (with pending: inflation={} deflation={}) to {new_target_num_pages}", + self.num_in_balloon, self.num_pending_inflation, self.num_pending_deflation + ); + + self.inflate(&mut *ALLOCATOR.lock(), num_to_inflate, false); + trace!(" Done inflating"); + } + }; + + let now = crate::arch::processor::get_timestamp(); + + if now + >= self.last_voluntary_inflate + + u64::from(crate::arch::processor::get_frequency()) + * VOLUNTARY_INFLATE_INTERVAL_MICROS + { + debug!(" Voluntarily inflating balloon as much as we can"); + let num_inflated = self.inflate( + &mut *ALLOCATOR.lock(), + VOLUNTARY_INFLATE_MAX_NUM_PAGES, + true, + ); + debug!( + " Voluntarily inflated {num_inflated} pages. 
Next voluntary inflate in {:?}", + Duration::from_micros(VOLUNTARY_INFLATE_INTERVAL_MICROS) + ); + self.last_voluntary_inflate = now; + } + } + + pub fn disable_interrupts(&mut self) { + self.inflateq.disable_notifs(); + self.deflateq.disable_notifs(); + } + + pub fn enable_interrupts(&mut self) { + self.inflateq.enable_notifs(); + self.deflateq.enable_notifs(); + } + + pub fn num_deflatable_for_oom(&self) -> u32 { + self.num_in_balloon + .saturating_sub(self.dev_cfg.num_pages()) + .saturating_sub(self.num_pending_deflation) + } + + /// Deflate the balloon in case of an out-of-memory (OOM) event. + /// This is meant to be called from a [`talc::OomHandler`] registered to Hermit's + /// global instance of [`Talc`]. + /// + /// # Safety + /// May only be called with the one [`Talc`] instance registered as the global + /// allocator for Hermit. + pub unsafe fn deflate_for_oom( + &mut self, + talc: &mut Talc, + failed_alloc_num_pages: u32, + ) -> Result<(), ()> { + // We don't really know how much space Talc has left. + // The allocation might have failed only by a short margin, or by a lot. + + let num_deflatable = self.num_deflatable_for_oom(); + + if num_deflatable > 0 { + // Deflate as many pages as we can up to the amount needed for the allocation. + // We don't have to wait for host acknowledgement, because for now + // we don't support [`F::MUST_TELL_HOST`]. + + let num_to_deflate = num_deflatable.min(failed_alloc_num_pages); + + info!( + " Deflating {num_to_deflate} pages in an attempt to recover from an OOM condition" + ); + + // SAFETY: We pass on the requirement of using the correct `Talc` + // instance to our caller. 
+ unsafe { + self.deflate(talc, num_to_deflate); + } + Ok(()) + } else { + error!(" Unable to deflate balloon further"); + // Nothing more we can do + Err(()) + } + } +} + +impl Driver for VirtioBalloonDriver { + fn get_interrupt_number(&self) -> InterruptLine { + self.irq + } + + fn get_name(&self) -> &'static str { + "virtio-balloon" + } +} + +struct BalloonVq { + vq: Option, +} + +impl BalloonVq { + pub fn new() -> Self { + Self { vq: None } + } + + fn init(&mut self, vq: VirtQueue) { + self.vq = Some(vq); + } + + pub fn enable_notifs(&mut self) { + let Some(vq) = &mut self.vq else { + debug!(" BalloonVq::enable_notifs called on uninitialized vq"); + return; + }; + + vq.enable_notifs(); + } + + pub fn disable_notifs(&mut self) { + let Some(vq) = &mut self.vq else { + debug!(" BalloonVq::disable_notifs called on uninitialized vq"); + return; + }; + + vq.disable_notifs(); + } + + fn is_empty(&self) -> bool { + let Some(vq) = &self.vq else { + debug!(" BalloonVq::is_empty called on uninitialized vq"); + return true; + }; + + vq.is_empty() + } + + fn used_send_buff_to_page_indices( + used_send_buff: SmallVec<[BufferElem; 2]>, + ) -> impl Iterator { + used_send_buff.into_iter().flat_map(|buffer_elem| { + match buffer_elem { + BufferElem::Sized(_any) => + panic!("Unexpected used `BufferElem::Sized` encountered, BalloonVq should only have sent `BufferElem::Vector`s"), + BufferElem::Vector(items) => { + assert!(items.len() % 4 == 0, "Unexpected size of used `BufferElem::Vector`, BalloonVq should only have sent lengths that are multiples of 4"); + + items + .into_iter() + .array_chunks() + .map(|bytes: [u8; 4]| u32::from_le_bytes(bytes)) + }, + } + }) + } + + /// Receive all new page indices marked used by the host. + /// These are the page indices we have previously sent into the queue in available buffers. 
+ pub fn recv_new_used(&mut self) -> impl Iterator { + let Some(vq) = &mut self.vq else { + debug!(" BalloonVq::recv_new_used called on uninitialized vq"); + panic!("BalloonVq must be initialized before calling recv_new_used"); + }; + + let mut current_used_page_indices_iter = None; + + core::iter::from_fn(move || { + match current_used_page_indices_iter.as_mut() { + // Must appear in the code before `current_used_page_indices_iter.next()` for an existing iterator (see below). + // Otherwise Rust is unable to infer the contents of the `Option` (and the type can't be named explicitly). + // If this inference failure gets fixed, this match can be converted to an `if let Some(iter) = ...` + None => match vq.try_recv() { + Ok(new_used) => { + let mut new_used_page_indices_iter = + Self::used_send_buff_to_page_indices(new_used.send_buff); + + let used = new_used_page_indices_iter.next()?; + + current_used_page_indices_iter = Some(new_used_page_indices_iter); + + Some(used) + } + + Err(VirtqError::NoNewUsed) => None, + + Err(error) => { + panic!( + "Failed to receive new used virtqueue descriptors with unexpected error: {error:?}" + ) + } + }, + + Some(current_used_page_indices_iter) => current_used_page_indices_iter.next(), + } + }) + } + + /// Discard all new page indices marked used by the host. + /// These are the page indices we have previously sent into the queue in available buffers. 
+ pub fn discard_new_used(&mut self) -> usize { + let Some(vq) = &mut self.vq else { + debug!(" BalloonVq::discard_new_used called on uninitialized vq"); + panic!("BalloonVq must be initialized before calling discard_new_used"); + }; + + let mut num_discarded = 0; + + loop { + match vq.try_recv() { + Ok(new_used) => { + let num_page_indices = + Self::used_send_buff_to_page_indices(new_used.send_buff).count(); + trace!( + " Discarded used buffer received from host with {num_page_indices} page indices" + ); + num_discarded += num_page_indices; + } + + Err(VirtqError::NoNewUsed) => break, + + Err(error) => { + panic!( + "Failed to receive new used virtqueue descriptors with unexpected error: {error:?}" + ) + } + } + } + + num_discarded + } + + pub fn discard_blocking_until_empty(&mut self) -> usize { + self.disable_notifs(); + + trace!( + " trying to empty the virtqueue, blocking until all elements have been discarded" + ); + + let mut num_discarded = 0; + while !self.is_empty() { + num_discarded += self.discard_new_used(); + } + + trace!(" done emptying the virtqueue"); + + self.enable_notifs(); + + num_discarded + } + + /// Send specified pages into the balloon virtqueue. + /// + /// To ensure that there is enough space in the queue, call [`Self::recv_new_used`] + /// or [`Self::discard_new_used`] before sending. + /// + /// The page indices are of 4096B (4K) pages and are submitted as `u32`s, + /// i.e. only pages up to (2³² - 1) * 4096 B = 16 TiB in our physical memory + /// can be submitted here. + /// + /// # Safety + /// The caller must ensure that the pages of which the indices are sent into + /// the inflate queue are not used by the kernel or the application until they + /// have been deflated again via the deflate queue + /// (with or without acknowledgement by the host depending on [`F::MUST_TELL_HOST`]). 
+ pub unsafe fn send_pages>( + &mut self, + page_indices: I, + notif: bool, + ) -> Result<(), VirtqError> { + trace!(" Sending page indices into queue"); + + let Some(vq) = &mut self.vq else { + error!(" BalloonVq::send_pages called on uninitialized vq"); + panic!("BalloonVq must be initialized before calling send_pages"); + }; + + trace!(" Allocating new Vec (DeviceAlloc) for page indices"); + + let mut page_indices_bytes = Vec::new_in(DeviceAlloc); + page_indices + .into_iter() + // Not specified as little-endian by the spec? Linux does it little-endian for VIRTIO 1.0 + .flat_map(|index| index.to_le_bytes()) + .collect_into(&mut page_indices_bytes); + + if page_indices_bytes.is_empty() { + debug!(" Vec of page indices is empty, doing nothing"); + return Ok(()); + } + + let buff_tkn = AvailBufferToken::new( + smallvec![BufferElem::Vector(page_indices_bytes)], + smallvec![], + ) + .expect("We have specified a send_buff so AvailBufferToken::new should succeed"); + + trace!(" Dispatching buffer to the queue"); + + vq.dispatch(buff_tkn, notif, BufferType::Direct)?; + + Ok(()) + } +} + +/// Errors that can occur during the lifetime and initialization of the [`VirtioBalloonDriver`] +#[derive(Debug, Copy, Clone, Error)] +pub enum VirtioBalloonError { + #[cfg(feature = "pci")] + #[error( + "Virtio traditional memory balloon device driver failed, for device {device_id:x}, due to a missing or malformed device config!" + )] + NoDevCfg { device_id: u16 }, + /// The device did not accept the negotiated features at the last step of negotiation. + #[error( + "Virtio traditional memory balloon device driver failed, for device {device_id:x}, device did not acknowledge negotiated feature set!" 
+ )] + FeatureNegotiationFailed { device_id: u16 }, + /// Set of features requested by driver does not adhere to the requirements of features + /// indicated by the specification + #[error( + "Virtio traditional memory balloon device driver tried to set feature bit without setting dependency feature. Feat set: {driver_features:?}" + )] + FeatureRequirementsNotMet { driver_features: virtio::balloon::F }, + /// The first u64 contains the feature bits wanted by the driver. + /// but which are incompatible with the device feature set, second u64. + #[error("Feature set: {driver_features:?}, is incompatible with the device's features")] + IncompatibleFeatureSets { driver_features: virtio::balloon::F }, +} + +#[derive(Debug)] +struct BalloonStorage { + /// A stack of chunks of pages allocated for the balloon. + chunks: Vec, +} + +impl BalloonStorage { + pub fn new() -> Self { + Self { + chunks: Vec::new_in(DeviceAlloc), + } + } + + fn allocate_chunk( + &mut self, + talc: &mut Talc, + num_pages: NonZeroU32, + ) -> Option> { + let page = BalloonAllocation::try_allocate(talc, num_pages)?; + + self.chunks.push(page); + + // Only now get the iterator over physical indices, so it lives as long + // as chunks, instead of referencing the now moved page variable. 
+ let mut page_indices = self + .chunks + .last() + .expect("We just pushed one chunk") + .phys_page_indices() + .peekable(); + let first_page_index = *page_indices + .peek() + .expect("If the allocation didn't fail, we should have at least one page index"); + + trace!( + " Allocated ballon page chunk starting at page index {first_page_index} with {num_pages} pages" + ); + + Some(page_indices) + } + + pub fn allocate_chunks( + &mut self, + talc: &mut Talc, + target_num_pages: u32, + voluntary: bool, + ) -> Vec { + let mut page_indices = Vec::new_in(DeviceAlloc); + let mut current_exponent = target_num_pages.ilog2(); + let mut num_remaining = target_num_pages; + + trace!(" Attempting to allocate {target_num_pages} pages"); + + while num_remaining > 0 { + trace!( + " Attempting to allocate chunk of {} pages (pages remaining: {num_remaining})", + 1u32 << current_exponent + ); + match self.allocate_chunk( + talc, + NonZeroU32::new(1 << current_exponent) + .expect("One shifted left by any number is always at least one"), + ) { + Some(chunk_page_indices) => { + num_remaining -= 1 << current_exponent; + page_indices.extend(chunk_page_indices); + } + None => { + if current_exponent == 0 { + log!( + if voluntary { + log::Level::Debug + } else { + log::Level::Warn + }, + " Failed to allocate as many pages as requested to fill the balloon with, continuing with as many as possible ({})", + target_num_pages - num_remaining + ); + break; + } + + let old_exponent = current_exponent; + current_exponent -= 1; + trace!( + " Failed to allocate new chunk of 2^{old_exponent} ({}) pages to fill the balloon with, reducing chunk size to 2^{current_exponent} ({})", + 1u32 << old_exponent, + 1u32 << current_exponent, + ); + + continue; + } + } + } + + trace!(" Done allocating {} chunks", page_indices.len()); + + page_indices + } + + pub fn mark_pages_for_deflation( + &mut self, + target_num_pages: u32, + ) -> Vec, DeviceAlloc> { + trace!(" Attempting to mark {target_num_pages} pages as 
queued for deflation"); + + let mut num_remaining = target_num_pages; + let mut per_chunk_page_indices = Vec::new_in(DeviceAlloc); + + // Go through chunks from small/recent to large/old, mark as much as requested if possible. + // Collect the page indices of marked pages for submission to the deflate queue. + + for chunk in self.chunks.iter_mut().rev() { + let num_to_mark = chunk.num_available_for_deflation().min(num_remaining); + + let mut page_indices = Vec::new_in(DeviceAlloc); + chunk + .mark_queued_for_deflation(num_to_mark) + .collect_into(&mut page_indices); + + per_chunk_page_indices.push(page_indices); + + num_remaining -= num_to_mark; + + if num_remaining == 0 { + break; + } + } + + if num_remaining > 0 { + warn!( + " Attempted to deflate more pages than were in the balloon: no more allocation chunks left to deflate" + ); + } + + per_chunk_page_indices + } + + /// Shrink chunks previously marked partially or fully as queued for deflation previously. + /// The chunks will be shrunk only by the pages the indices of which are provided + /// in `acknowledged_deflated_pages`. The indices should be provided in the + /// groups and order they were returned by [`Self::allocate_chunks`]. + /// + /// # Safety + /// Must be called with the same instance of [`Talc`] that was provided to + /// [`Self::allocate_chunks`] to allocate the chunks. This should be the same + /// [`Talc`] instance for all chunks. + /// + /// Must not be called with page indices that the host still has ownership of. + /// That is, only page indices to pages that are already deflated may be passed + /// to this function. Otherwise pages still owned by the host may be freed, + /// leading to unsound future allocations. 
+ pub unsafe fn shrink_chunks( + &mut self, + talc: &mut Talc, + acknowledged_deflated_pages: Vec, DeviceAlloc>, + ) { + let mut next_chunk_index = self.chunks.len().checked_sub(1); + + for chunk_deflated_pages in acknowledged_deflated_pages.into_iter() { + let Some(mut current_chunk_index) = next_chunk_index else { + error!( + " Was unable to use all page indices acknowledged for deflation to shrink allocation chunks" + ); + return; + }; + + loop { + if self.chunks[current_chunk_index].can_shrink_by_pages(&chunk_deflated_pages) { + break; + } + + trace!( + " Skipped one chunk, because it cannot be shrunk by the current block of deflated pages" + ); + + let Some(new_chunk_index) = current_chunk_index.checked_sub(1) else { + error!( + " Was unable to use all page indices acknowledged for deflation to shrink allocation chunks" + ); + return; + }; + + current_chunk_index = new_chunk_index; + } + + // SAFETY: We pass on the upholding of the requirements on the `Talc` + // instance passed and the page indices provided to our caller. + let shrink_res = + unsafe { self.chunks[current_chunk_index].shrink(talc, chunk_deflated_pages) }; + + match shrink_res { + ShrinkResult::PagesRemain => (), + ShrinkResult::Deallocated => { + self.chunks.remove(current_chunk_index); + } + } + + next_chunk_index = current_chunk_index.checked_sub(1); + } + } +} + +/// Represents a chunk of consecutive 4K pages allocated for the balloon. +/// +/// This ensures via encapsulation, that inflated pages, pages released to the host, +/// are not read from / written to while they are in the balloon. +/// +/// The allocation represented by this type must be manually deallocated via [`Self:deallocate`]. +/// If the type is dropped, the allocation is leaked. +/// This is not unsafe, but undesirable. +#[derive(Debug)] +struct BalloonAllocation { + /// Pointer to the allocation or `None` if fully deallocated. 
+ allocation_ptr: Option>, + /// Indices of the pages currently allocated and owned by this struct. + page_indices: Vec, + /// Index of the first index that is queued for deflation, with all following + /// also being queued for deflation. + /// This is an index into [`Self::page_indices`]. + /// When there are no pages queued for deflation, this index is the one after + /// the last element of [`Self::page_indices`], i.e. the length of [`Self::page_indices`]. + queued_for_deflation_start: usize, +} + +// SAFETY: `BalloonAllocation` does not implement `Clone` (or any other cloning mechanism) +// and implies exclusive ownership of an allocation, with the exception of host interactions. +// Sending it across threads cannot create a situation where we can access +// mutable state across two threads. The host interactions are guarded by +// unsafe functions and in general we don't dereference pointers into our allocation. +unsafe impl Send for BalloonAllocation {} + +// SAFETY: We don't allow for any interior mutability as `allocation_ptr` is never +// dereferenced by us and is not exposed outside of our type. Other than that we +// only have plain integer types that are `Sync` themselves. +unsafe impl Sync for BalloonAllocation {} + +impl BalloonAllocation { + /// Get the memory layout for an allocation of `num_pages` 4K pages + fn layout(num_pages: NonZeroUsize) -> Layout { + Layout::from_size_align(num_pages.get() * BALLOON_PAGE_SIZE, BALLOON_PAGE_SIZE).expect( + "Layout of a non-zero amount of 4K pages aligned to 4K page boundaries should be valid", + ) + } + + /// The current layout of our allocation if we have any pages allocated, + /// `None` otherwise. + fn current_layout(&self) -> Option { + self.num_pages_allocated().map(Self::layout) + } + + /// The total number of pages allocated for this chunk. + /// This also includes pages marked for deflation that haven't been shrunk away yet. 
+ fn num_pages_allocated(&self) -> Option { + NonZeroUsize::new(self.page_indices.len()) + } + + /// The number of pages of this chunk that can be queued for deflation. + fn num_available_for_deflation(&self) -> u32 { + (0..self.queued_for_deflation_start) + .len() + .try_into() + .expect( + "We only deal with 32-bit indexed pages, so our number of pages has to fit in a u32", + ) + } + + pub fn is_empty(&self) -> bool { + self.allocation_ptr.is_none() + } + + pub fn phys_page_indices(&self) -> impl Iterator { + self.page_indices.iter().copied() + } + + #[must_use = "this returns an object representing the allocation, unless stored, it is leaked"] + pub fn try_allocate( + talc: &mut Talc, + num_pages: NonZeroU32, + ) -> Option { + // SAFETY: We require a non-zero number of pages, from which we construct + // a non-zero-sized layout of this many 4K pages. + let allocation_ptr = unsafe { + talc.try_allocate(Self::layout(num_pages.try_into().expect( + "We don't support 16-bit or narrower platforms so a u32 should fit into a usize", + ))) + }?; + + let num_pages = num_pages.get() as usize; + + let mut page_indices = Vec::with_capacity_in(num_pages, DeviceAlloc); + (0..num_pages) + .map(|offset| VirtAddr::from_ptr(allocation_ptr.as_ptr()) + offset * BALLOON_PAGE_SIZE) + .map(|virt_addr| { + virtual_to_physical(virt_addr) + .expect("We only deal with virtual addresses that are mapped") + }) + .map(|phys_addr| { + u32::try_from(phys_addr.as_u64() / BALLOON_PAGE_SIZE as u64) + .expect("Balloon cannot handle physical pages above 16TiB") + }) + .collect_into(&mut page_indices); + + Some(Self { + allocation_ptr: Some(allocation_ptr), + page_indices, + queued_for_deflation_start: num_pages, + }) + } + + fn pages_queued_for_deflation(&self) -> &[u32] { + &self.page_indices[self.queued_for_deflation_start..] 
+ } + + pub fn mark_queued_for_deflation( + &mut self, + num_pages_to_mark: u32, + ) -> impl Iterator { + let num_previously_marked = self.pages_queued_for_deflation().len(); + + assert!( + num_pages_to_mark as usize <= self.page_indices.len() - num_previously_marked, + "Cannot mark more pages for deflation than are still contained and unmarked in the chunk" + ); + + let num_allocated = self.page_indices.len(); + + trace!( + " Marking {num_pages_to_mark} pages for chunk: {num_allocated} (of that {num_previously_marked} marked for deflation) -> {num_allocated} (of that {} marked for deflation)", + num_previously_marked + num_pages_to_mark as usize + ); + + self.queued_for_deflation_start -= num_pages_to_mark as usize; + + self.pages_queued_for_deflation()[..num_pages_to_mark as usize] + .iter() + .copied() + } + + pub fn can_shrink_by_pages(&self, page_indices: &[u32]) -> bool { + self.pages_queued_for_deflation() + .iter() + .rev() + .zip(page_indices.iter().rev()) + .all(|(marked, deflated)| *marked == *deflated) + } + + /// Shrinks the allocated chunk by `pages_to_shrink`. + /// Takes `self` by value and if there are remaining pages in the chunks after + /// shrinking, returns it via [`ShrinkResult::PagesRemain`]. Otherwise `self` + /// is consumed with the chunk having been emptied. + /// + /// `pages_to_shrink` should be a list of page indices previously returned by + /// [`Self::mark_queued_for_deflation`]. They should be submitted in the order + /// they were returned by [`Self::mark_queued_for_deflation`] both within such + /// a list and across multiple calls of this function with different lists. + /// This ensures we can actually shrink our allocation. + /// + /// # Safety + /// Must be called with the same instance of [`Talc`] that was provided to + /// [`Self::try_allocate`] to create this instance of [`BalloonAllocation`]. 
+ /// + /// Must not be called while the host still has ownership of any of the pages + /// that are a part of the allocation represented by this struct. + /// I.e. deallocation may only take place once the host has returned ownership + /// back to us for all pages of this allocation. + /// + /// # Panics + /// If `pages_to_shrink` contains page indices of pages not marked queued for deflation + #[must_use = "If pages remain after shrinking, remaining BalloonAllocation is returned. Dropping it would leak the allocation"] + pub unsafe fn shrink( + &mut self, + talc: &mut Talc, + pages_to_shrink: Vec, + ) -> ShrinkResult { + let num_previously_marked = self.pages_queued_for_deflation().len(); + assert!( + pages_to_shrink.len() <= num_previously_marked, + "Must mark the amount of the allocation chunk to be shrunk for deflation before shrinking" + ); + + if self.is_empty() { + warn!(" Attempted to shrink already empty balloon allocation chunk"); + return ShrinkResult::Deallocated; + } + + if pages_to_shrink.is_empty() { + return ShrinkResult::PagesRemain; + } + + trace!( + " Shrinking chunk by {} pages: {} (of that {} marked for deflation) -> {} (of that {} marked for deflation)", + pages_to_shrink.len(), + self.page_indices.len(), + num_previously_marked, + self.page_indices.len() - pages_to_shrink.len(), + num_previously_marked - pages_to_shrink.len(), + ); + + let old_layout = self + .current_layout() + .expect("We checked above that we have at least one page still allocated"); + + // Find the position in `self.page_indices` from which we want to start shrinking. + // Only look through the sub-slice of it that is actually marked queued for deflation + // to find the index. 
+ let Some(first_to_shrink) = self + .pages_queued_for_deflation() + .iter() + .position(|page_index| *page_index == pages_to_shrink[0]) + .map(|index| self.queued_for_deflation_start + index) + else { + error!( + " First page to shrink ({}) was not found inside balloon allocation chunk, can't shrink", + pages_to_shrink[0] + ); + panic!("Attempted to shrink balloon allocation chunk by page not inside the chunk") + }; + + if !self + .pages_queued_for_deflation() + .iter() + .last() + .is_some_and(|page_index| { + page_index + == pages_to_shrink + .last() + .expect("We checked for non-emptiness above") + }) { + error!( + " Last page to shrink {} was not found inside balloon allocation chunk, can't shrink", + pages_to_shrink + .last() + .expect("We checked for non-emptiness above") + ); + panic!( + "Attempted to shrink balloon allocation chunk by pages not consecutively at the end of the chunk" + ) + } + + for (page_index_to_shrink, page_index_marked) in pages_to_shrink + .into_iter() + .zip(self.page_indices.drain(first_to_shrink..)) + { + assert!( + page_index_to_shrink == page_index_marked, + "Attempted to shrink balloon allocation chunk by page not inside the chunk" + ); + } + + let new_num_pages = self.page_indices.len(); + + let res = if new_num_pages == 0 { + trace!( + " Deallocating balloon chunk as all its pages were shrunk away after acknowledged deflation" + ); + + trace!( + " Freeing ptr={:x?}, layout={old_layout:?}", + self.allocation_ptr + ); + // SAFETY: We require that our caller ensures that the same `Talc` + // instance is passed here as the one passed to allocate our + // `BalloonAllocation`. As we don't expose our pointer, or + // allow other modification from outside, it must have been + // allocated with the given `Talc` instance. + // We track the size of our allocation beginning with the initial + // allocation and also during shrinking operations. Our alignment + // is always to 4K page boundaries. We thus ensure the correct + // layout is passed here. + unsafe { + talc.deallocate( + self.allocation_ptr + .take() + .expect("We checked above that we still have at least one page allocated") + .as_ptr(), + old_layout, + ); + } + + ShrinkResult::Deallocated + } else { + trace!( + " Shrinking chunk with {} pages still remaining of which {} pages marked queued for deflation", + self.page_indices.len(), + self.pages_queued_for_deflation().len() + ); + + trace!( + " shrinking ptr={:x?}, old_layout={old_layout:?}, len={new_num_pages}", + self.allocation_ptr + ); + // SAFETY: We require that our caller ensures that the same `Talc` + // instance is passed here as the one passed to allocate our + // `BalloonAllocation`. As we don't expose our pointer, or + // allow other modification from outside, it must have been + // allocated with the given `Talc` instance. + // We track the size of our allocation beginning with the initial + // allocation and also during shrinking operations. Our alignment + // is always to 4K page boundaries. We thus ensure the correct + // old layout is passed here. + // This branch cannot be reached if the new size is zero. + // The size can also not be larger than the old size, as we + // take a non-negative amount to shrink by as our parameter, + // not a new size. 
+ unsafe { + talc.shrink( + self.allocation_ptr + .expect("We checked above that we still have at least one page allocated") + .as_ptr(), + old_layout, + new_num_pages * BALLOON_PAGE_SIZE, + ); + } + + ShrinkResult::PagesRemain + }; + + trace!(" Done shrinking"); + + res + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ShrinkResult { + PagesRemain, + Deallocated, +} diff --git a/src/drivers/balloon/oom.rs b/src/drivers/balloon/oom.rs new file mode 100644 index 0000000000..d151ee3ec2 --- /dev/null +++ b/src/drivers/balloon/oom.rs @@ -0,0 +1,71 @@ +use core::alloc::Layout; + +use talc::base::Talc; +use talc::base::binning::Binning; +use talc::source::Source; + +use crate::drivers::pci::get_balloon_driver; + +/// [`Talc`] out of memory handler that attempts to recover memory previously +/// returned to the host via the VIRTIO Traditional Memory Balloon device. +/// +/// It attempts to deflate the balloon (re-acquiring memory from the host, and +/// freeing the allocations made by the balloon driver in the host's stead) by +/// the amount required for the allocation that would have failed. If the balloon +/// is filled with fewer pages than would be required to cover the allocation's +/// size, this handler attempts to recover as many as possible still. +/// +/// Memory freed across chunks of pages allocated for the balloon may not be +/// contiguous. This means that even if we free as many bytes as required for the +/// allocation, we may not have freed enough _contiguous_ memory for it. This is +/// ok however and [`Talc`] will simply call our handler again until we've either +/// exhausted the memory available for recovery from the host, or the allocation +/// succeeds. +#[derive(Debug)] +pub struct DeflateBalloonOnOom { + /// Dummy field to prevent construction of the struct except through [`Self::new`] + /// which is marked `unsafe` and documents our requirements for safety. 
+ #[doc(hidden)] + _private: (), +} + +impl DeflateBalloonOnOom { + /// Construct a new instance of the balloon deflating [`Source`] for [`Talc`]. + /// + /// # Safety + /// May only be used with the one instance of [`Talc`] registered as Hermit's + /// global allocator. + pub const unsafe fn new() -> Self { + Self { _private: () } + } +} + +// SAFETY: We only use the Talc reference passed in via the trait functions and +// thus avoid deadlocks or panics. The balloon driver operations used in +// this implementation are passed this reference to Talc and also don't +// access the global Talc themselves. Hermit's logging implementation +// (via the log::Log implementation on crate::logging::KernelLogger, +// via hermit_macros::println!, via crate::console::_print) +// does not allocate while printing, so using such logging here is safe. +unsafe impl Source for DeflateBalloonOnOom { + fn acquire(talc: &mut Talc, layout: Layout) -> Result<(), ()> { + warn!(" Encountered OOM, attempting to deflate balloon to recover..."); + + let Some(balloon_driver) = get_balloon_driver() else { + return Err(()); + }; + + let Some(mut ballon_driver_guard) = balloon_driver.try_lock() else { + error!( + " Driver was locked while attempting to allocate more than available. Unable to deflate balloon" + ); + return Err(()); + }; + + // For Talc's tag adjacent to the allocation, just always free one page more. + // Divide rounding up so the allocation always fits even if it's not a multiple of 4K pages large. 
+ unsafe { + ballon_driver_guard.deflate_for_oom(talc, (layout.size().div_ceil(4096)) as u32 + 1) + } + } +} diff --git a/src/drivers/balloon/pci.rs b/src/drivers/balloon/pci.rs new file mode 100644 index 0000000000..9cae7dc4fe --- /dev/null +++ b/src/drivers/balloon/pci.rs @@ -0,0 +1,120 @@ +use core::marker::PhantomData; + +use virtio::pci::IsrStatus; +use volatile::VolatileRef; + +use super::{BalloonDevCfg, BalloonStorage, BalloonVq, VirtioBalloonDriver, VirtioBalloonError}; +use crate::drivers::pci::PciDevice; +use crate::drivers::virtio::error::VirtioError; +use crate::drivers::virtio::transport::pci::{self as virtio_pci, PciCap, UniCapsColl}; +use crate::executor::WakerRegistration; +use crate::pci::PciConfigRegion; + +impl VirtioBalloonDriver { + pub fn get_dev_id(&self) -> u16 { + self.dev_cfg.dev_id + } + + pub fn set_failed(&mut self) { + self.com_cfg.set_failed(); + } + + fn map_cfg(cap: &PciCap) -> Option { + let dev_cfg = virtio_pci::map_dev_cfg::(cap)?; + + let dev_cfg = VolatileRef::from_mut_ref(dev_cfg); + + Some(BalloonDevCfg { + raw: dev_cfg, + dev_id: cap.dev_id(), + features: virtio::balloon::F::empty(), + }) + } + + /// Instantiates a new [`VirtioBalloonDriver`] struct, by checking the available + /// configuration structures and moving them into the struct. + fn new( + caps_coll: UniCapsColl, + device: &PciDevice, + ) -> Result { + let device_id = device.device_id(); + + let UniCapsColl { + com_cfg, + notif_cfg, + isr_cfg, + dev_cfg_list, + .. + } = caps_coll; + + let Some(dev_cfg) = dev_cfg_list.iter().find_map(VirtioBalloonDriver::map_cfg) else { + error!(" No dev config. 
Aborting!"); + return Err(VirtioBalloonError::NoDevCfg { device_id }); + }; + + Ok(VirtioBalloonDriver { + dev_cfg, + com_cfg, + isr_stat: isr_cfg, + notif_cfg, + irq: device.get_irq().unwrap(), + + inflateq: BalloonVq::new(), + deflateq: BalloonVq::new(), + + num_in_balloon: 0, + num_pending_inflation: 0, + num_pending_deflation: 0, + num_targeted: 0, + + balloon_storage: BalloonStorage::new(), + last_voluntary_inflate: 0, + + waker: WakerRegistration::new(), + }) + } + + /// Initialize a new VIRTIO Traditional Memory Balloon device based on the given PCI device + pub fn from_pci_device( + device: &PciDevice, + ) -> Result { + let caps = virtio_pci::map_caps(device).inspect_err(|_| { + error!(" Mapping capabilities failed. Aborting!"); + })?; + + let mut driver = VirtioBalloonDriver::new(caps, device) + .inspect_err(|_| { + error!(" Initializing new driver failed. Aborting!"); + }) + .map_err(VirtioError::BalloonDriver)?; + + driver + .init_dev() + .inspect_err(|_| driver.set_failed()) + .map_err(VirtioError::BalloonDriver)?; + + info!( + " device with id {:x}, has been initialized by driver!", + driver.get_dev_id() + ); + + Ok(driver) + } + + pub fn handle_interrupt(&mut self) { + let status = self.isr_stat.acknowledge(); + + if status.contains(IsrStatus::DEVICE_CONFIGURATION_INTERRUPT) { + debug!( + " Received config interrupt, new config: {:?}", + self.dev_cfg + ); + } + + if status.contains(IsrStatus::QUEUE_INTERRUPT) { + debug!(" Received queue interrupt"); + } + + self.wake(); + } +} diff --git a/src/drivers/mod.rs b/src/drivers/mod.rs index 66cee6aef5..e6c5729b73 100644 --- a/src/drivers/mod.rs +++ b/src/drivers/mod.rs @@ -1,5 +1,7 @@ //! A module containing hermit-rs driver, hermit-rs driver trait and driver specific errors. 
+#[cfg(feature = "virtio-balloon")] +pub mod balloon; #[cfg(feature = "virtio-console")] pub mod console; #[cfg(feature = "virtio-fs")] diff --git a/src/drivers/pci.rs b/src/drivers/pci.rs index 935ed380cf..6b89f990e1 100644 --- a/src/drivers/pci.rs +++ b/src/drivers/pci.rs @@ -8,7 +8,8 @@ use hashbrown::HashMap; #[cfg(any( feature = "virtio-fs", feature = "virtio-vsock", - feature = "virtio-console" + feature = "virtio-console", + feature = "virtio-balloon", ))] use hermit_sync::InterruptTicketMutex; use hermit_sync::without_interrupts; @@ -22,6 +23,8 @@ use pci_types::{ use crate::arch::pci::PciConfigRegion; #[cfg(feature = "virtio-console")] use crate::console::IoDevice; +#[cfg(feature = "virtio-balloon")] +use crate::drivers::balloon::VirtioBalloonDriver; #[cfg(feature = "virtio-console")] use crate::drivers::console::{VirtioConsoleDriver, VirtioUART}; #[cfg(feature = "virtio-fs")] @@ -336,6 +339,8 @@ pub(crate) enum PciDriver { VirtioConsole(InterruptTicketMutex), #[cfg(feature = "virtio-vsock")] VirtioVsock(InterruptTicketMutex), + #[cfg(feature = "virtio-balloon")] + VirtioBalloon(InterruptTicketMutex), } impl PciDriver { @@ -366,6 +371,15 @@ impl PciDriver { } } + #[cfg(feature = "virtio-balloon")] + fn get_balloon_driver(&self) -> Option<&InterruptTicketMutex> { + #[allow(unreachable_patterns)] + match self { + Self::VirtioBalloon(drv) => Some(drv), + _ => None, + } + } + fn get_interrupt_handler(&self) -> (InterruptLine, fn()) { #[allow(unreachable_patterns)] match self { @@ -410,6 +424,18 @@ impl PciDriver { let irq_number = drv.lock().get_interrupt_number(); (irq_number, console_handler) } + #[cfg(feature = "virtio-balloon")] + Self::VirtioBalloon(drv) => { + fn balloon_handler() { + if let Some(driver) = get_balloon_driver() { + driver.lock().handle_interrupt(); + } + } + + let irq_number = drv.lock().get_interrupt_number(); + + (irq_number, balloon_handler) + } _ => todo!(), } } @@ -479,6 +505,14 @@ pub(crate) fn get_filesystem_driver() -> 
Option<&'static InterruptTicketMutex Option<&'static InterruptTicketMutex> { + PCI_DRIVERS + .get()? + .iter() + .find_map(|drv| drv.get_balloon_driver()) +} + pub(crate) fn init() { // virtio: 4.1.2 PCI Device Discovery without_interrupts(|| { @@ -514,6 +548,10 @@ pub(crate) fn init() { Ok(VirtioDriver::Vsock(drv)) => { register_driver(PciDriver::VirtioVsock(InterruptTicketMutex::new(*drv))); } + #[cfg(feature = "virtio-balloon")] + Ok(VirtioDriver::Balloon(drv)) => { + register_driver(PciDriver::VirtioBalloon(InterruptTicketMutex::new(*drv))); + } Err(err) => error!("Could not initialize virtio-pci device: {err}"), } } diff --git a/src/drivers/virtio/mod.rs b/src/drivers/virtio/mod.rs index caaae147f7..c28115458f 100644 --- a/src/drivers/virtio/mod.rs +++ b/src/drivers/virtio/mod.rs @@ -165,6 +165,8 @@ where pub mod error { use thiserror::Error; + #[cfg(feature = "virtio-balloon")] + pub use crate::drivers::balloon::VirtioBalloonError; #[cfg(feature = "virtio-console")] pub use crate::drivers::console::error::VirtioConsoleError; #[cfg(feature = "virtio-fs")] @@ -226,5 +228,9 @@ pub mod error { #[cfg(feature = "virtio-console")] #[error(transparent)] ConsoleDriver(VirtioConsoleError), + + #[cfg(feature = "virtio-balloon")] + #[error(transparent)] + BalloonDriver(VirtioBalloonError), } } diff --git a/src/drivers/virtio/transport/pci.rs b/src/drivers/virtio/transport/pci.rs index a16aba68a6..4036e021f3 100644 --- a/src/drivers/virtio/transport/pci.rs +++ b/src/drivers/virtio/transport/pci.rs @@ -20,6 +20,8 @@ use volatile::access::ReadOnly; use volatile::{VolatilePtr, VolatileRef}; use crate::arch::pci::PciConfigRegion; +#[cfg(feature = "virtio-balloon")] +use crate::drivers::balloon::VirtioBalloonDriver; #[cfg(feature = "virtio-console")] use crate::drivers::console::VirtioConsoleDriver; use crate::drivers::error::DriverError; @@ -717,6 +719,26 @@ pub(crate) fn init_device( Err(DriverError::InitVirtioDevFail(virtio_error)) } }, + #[cfg(feature = "virtio-balloon")] 
+ virtio::Id::Balloon => match VirtioBalloonDriver::from_pci_device(device) { + Ok(virtio_balloon_driver) => { + info!("Virtio traditional memory balloon driver initialized."); + + let irq = device.get_irq().unwrap(); + crate::arch::interrupts::add_irq_name(irq, "virtio-balloon"); + info!("Virtio balloon interrupt handler at line {irq}"); + + Ok(VirtioDriver::Balloon(alloc::boxed::Box::new( + virtio_balloon_driver, + ))) + } + Err(virtio_error) => { + error!( + "Virtio traditional memory balloon driver could not be initialized with device id {device_id:x}: {virtio_error}" + ); + Err(DriverError::InitVirtioDevFail(virtio_error)) + } + }, id => { if let Some(feature) = id.as_feature() { error!("Virtio driver {id:?} is currently not active."); @@ -746,4 +768,6 @@ pub(crate) enum VirtioDriver { Net(alloc::boxed::Box), #[cfg(feature = "virtio-vsock")] Vsock(alloc::boxed::Box), + #[cfg(feature = "virtio-balloon")] + Balloon(alloc::boxed::Box), } diff --git a/src/drivers/virtio/virtqueue/mod.rs b/src/drivers/virtio/virtqueue/mod.rs index b02c35f7e3..9daf34abef 100644 --- a/src/drivers/virtio/virtqueue/mod.rs +++ b/src/drivers/virtio/virtqueue/mod.rs @@ -101,6 +101,9 @@ pub trait Virtq: Send { /// Disables interrupts for this virtqueue upon receiving a transfer fn disable_notifs(&mut self); + /// Check if there are no more descriptors left in the queue. + fn is_empty(&self) -> bool; + /// Checks if new used descriptors have been written by the device. /// This activates the queue and polls the descriptor ring of the queue. fn try_recv(&mut self) -> Result; @@ -539,6 +542,7 @@ mod index_alloc { pub struct IndexAlloc { /// Zero bits are available. 
bits: Box<[usize]>, + extra_bits: usize, } const USIZE_BITS: usize = usize::BITS as usize; @@ -555,7 +559,7 @@ mod index_alloc { *bits.last_mut().unwrap() = usize::MAX >> extra_bits; } - Self { bits } + Self { bits, extra_bits } } #[inline] @@ -587,6 +591,20 @@ mod index_alloc { *self.bits.get_unchecked_mut(word_index) &= !mask; } } + + pub fn all_used(&self) -> bool { + // all words all ones + self.bits.iter().all(|word| !*word == 0) + } + + pub fn all_available(&self) -> bool { + // all words all zeros, except for the extra bits in the last one + self.bits.iter().rev().skip(1).all(|word| *word == 0) + && self + .bits + .last() + .map_or(false, |word| *word & !(usize::MAX >> self.extra_bits) == 0) + } } #[cfg(all(test, not(target_os = "none")))] diff --git a/src/drivers/virtio/virtqueue/packed.rs b/src/drivers/virtio/virtqueue/packed.rs index d9d11de87d..2ef12158e5 100644 --- a/src/drivers/virtio/virtqueue/packed.rs +++ b/src/drivers/virtio/virtqueue/packed.rs @@ -514,6 +514,10 @@ impl Virtq for PackedVq { self.drv_event.disable_notif(); } + fn is_empty(&self) -> bool { + todo!() + } + fn try_recv(&mut self) -> Result { self.descr_ring.try_recv() } diff --git a/src/drivers/virtio/virtqueue/split.rs b/src/drivers/virtio/virtqueue/split.rs index c08f3d8d74..e99e879c0d 100644 --- a/src/drivers/virtio/virtqueue/split.rs +++ b/src/drivers/virtio/virtqueue/split.rs @@ -54,6 +54,10 @@ impl DescrRing { unsafe { &*self.used_ring_cell.get() } } + fn is_empty(&self) -> bool { + self.indexes.all_available() + } + fn push(&mut self, tkn: TransferToken) -> Result { let mut index; if let Some(ctrl_desc) = tkn.ctrl_desc.as_ref() { @@ -194,6 +198,10 @@ impl Virtq for SplitVq { self.ring.try_recv() } + fn is_empty(&self) -> bool { + self.ring.is_empty() + } + fn dispatch_batch( &mut self, _tkns: Vec<(AvailBufferToken, BufferType)>, diff --git a/src/executor/balloon.rs b/src/executor/balloon.rs new file mode 100644 index 0000000000..f23853135d --- /dev/null +++ 
b/src/executor/balloon.rs @@ -0,0 +1,41 @@ +use core::future; +use core::task::Poll; + +use crate::drivers::pci; +use crate::executor::spawn; + +async fn balloon_run() { + future::poll_fn(|cx| { + if let Some(driver) = pci::get_balloon_driver() { + let Some(mut driver_guard) = driver.try_lock() else { + debug!( + "Balloon driver was polled while the driver was locked elsewhere, doing nothing" + ); + // This should only happen when polling while another core is deflating due to an OOM event, + // or an interrupt is being handled, otherwise we only lock the driver here. + + // Interrupt handling should wake the registered waker and deflation as a result of OOM + // handling should cause items to be submitted to the deflateq which should lead to a + // future interrupt. + return Poll::Pending; + }; + + driver_guard.poll_events(cx); + + Poll::Pending + } else { + Poll::Ready(()) + } + }) + .await; +} + +pub(crate) fn init() { + info!("Try to initialize balloon interface!"); + + if let Some(driver) = pci::get_balloon_driver() { + driver.lock().enable_interrupts(); + } + + spawn(balloon_run()); +} diff --git a/src/executor/mod.rs b/src/executor/mod.rs index 3a7ff58ce8..e606fc8df0 100644 --- a/src/executor/mod.rs +++ b/src/executor/mod.rs @@ -1,5 +1,7 @@ #[cfg(feature = "alloc-stats")] mod alloc_stats; +#[cfg(feature = "virtio-balloon")] +mod balloon; #[cfg(feature = "net")] pub(crate) mod device; #[cfg(feature = "net")] @@ -130,6 +132,8 @@ pub fn init() { crate::executor::vsock::init(); #[cfg(feature = "alloc-stats")] crate::executor::alloc_stats::init(); + #[cfg(feature = "virtio-balloon")] + crate::executor::balloon::init(); } /// Blocks the current thread on `f`, running the executor when idling. 
diff --git a/src/lib.rs b/src/lib.rs index a8edaa6250..87efc84f8b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,6 +74,10 @@ reexport_test_harness_main = "test_main" )] #![cfg_attr(all(target_os = "none", test), no_main)] +#![cfg_attr( + feature = "virtio-balloon", + feature(iter_collect_into, iter_array_chunks) +)] // FIXME: move this to `Cargo.toml` once stable #![feature(strict_provenance_lints)] #![warn(fuzzy_provenance_casts)] diff --git a/src/mm/mod.rs b/src/mm/mod.rs index 43541c58db..587c58d61e 100644 --- a/src/mm/mod.rs +++ b/src/mm/mod.rs @@ -54,7 +54,7 @@ use hermit_sync::{Lazy, RawInterruptTicketMutex}; pub use memory_addresses::{PhysAddr, VirtAddr}; #[cfg(target_os = "none")] use talc::TalcLock; -#[cfg(target_os = "none")] +#[cfg(all(target_os = "none", not(feature = "virtio-balloon")))] use talc::source::Manual; pub use self::page_range_alloc::{PageRangeAllocator, PageRangeBox}; @@ -64,12 +64,22 @@ pub use self::virtualmem::{PageAlloc, PageBox}; use crate::arch::mm::paging::HugePageSize; pub use crate::arch::mm::paging::virtual_to_physical; use crate::arch::mm::paging::{BasePageSize, LargePageSize, PageSize}; +#[cfg(all(target_os = "none", feature = "virtio-balloon"))] +use crate::drivers::balloon::oom::DeflateBalloonOnOom; use crate::{arch, env}; -#[cfg(target_os = "none")] +#[cfg(all(target_os = "none", not(feature = "virtio-balloon")))] #[global_allocator] pub(crate) static ALLOCATOR: TalcLock = TalcLock::new(Manual); +#[cfg(all(target_os = "none", feature = "virtio-balloon"))] +#[global_allocator] +pub(crate) static ALLOCATOR: TalcLock = TalcLock::new( + // SAFETY: We are using this Talc Source with the one and only global instance + // of Talc that Hermit uses as its global allocator. + unsafe { DeflateBalloonOnOom::new() }, +); + /// Physical and virtual address range of the 2 MiB pages that map the kernel. static KERNEL_ADDR_RANGE: Lazy> = Lazy::new(|| { if cfg!(target_os = "none") {