diff --git a/Cargo.toml b/Cargo.toml
index 16e2bc5..1ae791e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,25 @@
 version = "0.1.0"
 authors = ["Samuel Ortiz "]
 repository = "https://github.com/rust-vmm/vm-device"
 license = "Apache-2.0"
+edition = "2018"
 [dependencies]
-vm-memory = { git = "https://github.com/rust-vmm/vm-memory" }
+libc = ">=0.2.39"
+kvm-bindings = { version = "~0", optional = true }
+kvm-ioctls = { version = "~0", optional = true }
+vfio-ioctls = { git = "https://github.com/cloud-hypervisor/vfio-ioctls.git", branch = "dragonball", optional = true }
+vmm-sys-util = "~0"
+
+[dev-dependencies]
+byteorder = ">=1.2.1"
+
+[features]
+legacy-irq = []
+msi-irq = []
+vfio-msi-irq = []
+
+kvm-irq = ["kvm-ioctls", "kvm-bindings"]
+kvm-msi-generic = ["msi-irq", "kvm-irq"]
+kvm-legacy-irq = ["legacy-irq", "kvm-irq"]
+kvm-msi-irq = ["kvm-msi-generic"]
+kvm-vfio-msi-irq = ["kvm-msi-generic", "vfio-ioctls", "vfio-msi-irq"]
diff --git a/coverage_config.json b/coverage_config.json
index 03bbeba..2e04917 100644
--- a/coverage_config.json
+++ b/coverage_config.json
@@ -1,5 +1,5 @@
 {
- "coverage_score": 75.8,
+ "coverage_score": 86.1,
 "exclude_path": "",
 "crate_features": ""
 }
diff --git a/rust-vmm-ci b/rust-vmm-ci
index bb1cd14..c309d06 160000
--- a/rust-vmm-ci
+++ b/rust-vmm-ci
@@ -1 +1 @@
-Subproject commit bb1cd14d2c164b4f699b08c885c06a02fbe3f7b0
+Subproject commit c309d0627bde6b07db91201dd8b47007841c100a
diff --git a/src/device_manager.rs b/src/device_manager.rs
new file mode 100644
index 0000000..f368036
--- /dev/null
+++ b/src/device_manager.rs
@@ -0,0 +1,511 @@
+// Copyright © 2019 Intel Corporation. All Rights Reserved.
+// SPDX-License-Identifier: (Apache-2.0 OR BSD-3-Clause)
+
+//! System level device management.
+//!
+//! [IoManager](struct.IoManager.html) is responsible for managing
+//! all devices of a virtual machine: registering IO resource callbacks,
+//! unregistering devices, and helping with VM IO exit handling.
+//!
+//! The VMM is responsible for getting the device resource requests, asking
+//! the vm_allocator to allocate the resources, asking vm_device to register the
+//! device IO ranges, and finally setting the resources on the virtual device.
+
+use crate::resources::Resource;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use crate::PioAddress;
+use crate::{DeviceIo, IoAddress, IoSize};
+
+use std::cmp::{Ord, Ordering, PartialEq, PartialOrd};
+use std::collections::btree_map::BTreeMap;
+use std::ops::Deref;
+use std::result;
+use std::sync::Arc;
+
+/// Error type for `IoManager` usage.
+#[derive(Debug)]
+pub enum Error {
+    /// The inserted device overlaps with an existing device.
+    DeviceOverlap,
+    /// The device doesn't exist.
+    NoDevice,
+}
+
+/// Simplify the `Result` type.
+pub type Result<T> = result::Result<T, Error>;
+
+/// Structure describing an IO range.
+#[derive(Debug, Copy, Clone, Eq)] +pub struct IoRange { + base: IoAddress, + size: IoSize, +} + +impl IoRange { + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn new_pio_range(base: u16, size: u16) -> Self { + IoRange { + base: IoAddress(base as u64), + size: IoSize(size as u64), + } + } + + fn new_mmio_range(base: u64, size: u64) -> Self { + IoRange { + base: IoAddress(base), + size: IoSize(size), + } + } +} + +impl PartialEq for IoRange { + fn eq(&self, other: &IoRange) -> bool { + self.base == other.base + } +} + +impl Ord for IoRange { + fn cmp(&self, other: &IoRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for IoRange { + fn partial_cmp(&self, other: &IoRange) -> Option { + self.base.partial_cmp(&other.base) + } +} + +/// System IO manager serving for all devices management and VM exit handling. +#[derive(Clone, Default)] +pub struct IoManager { + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + /// Range mapping for VM exit pio operations. + pio_bus: BTreeMap>, + /// Range mapping for VM exit mmio operations. + mmio_bus: BTreeMap>, +} + +impl IoManager { + /// Create an default IoManager with empty IO member. + pub fn new() -> Self { + IoManager::default() + } + + /// Register a new device IO with its allocated resources. + /// VMM is responsible for providing the allocated resources to virtual device. + /// + /// # Arguments + /// + /// * `device`: device instance object to be registered + /// * `resources`: resources that this device owns, might include + /// port I/O and memory-mapped I/O ranges, irq number, etc. + pub fn register_device_io( + &mut self, + device: Arc, + resources: &[Resource], + ) -> Result<()> { + // Register and mark device resources + // The resources addresses being registered are sucessfully allocated before. + for (idx, res) in resources.iter().enumerate() { + match *res { + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + Resource::PioAddressRange { base, size } => { + if self + .pio_bus + .insert(IoRange::new_pio_range(base, size), device.clone()) + .is_some() + { + // Unregister registered resources. + self.unregister_device_io(&resources[0..idx]) + .expect("failed to unregister devices"); + + return Err(Error::DeviceOverlap); + } + } + Resource::MmioAddressRange { base, size } => { + if self + .mmio_bus + .insert(IoRange::new_mmio_range(base, size), device.clone()) + .is_some() + { + // Unregister registered resources. + self.unregister_device_io(&resources[0..idx]) + .expect("failed to unregister devices"); + + return Err(Error::DeviceOverlap); + } + } + _ => continue, + } + } + Ok(()) + } + + /// Unregister a device from `IoManager`, e.g. users specified removing. + /// VMM pre-fetches the resources e.g. dev.get_assigned_resources() + /// VMM is responsible for freeing the resources. + /// + /// # Arguments + /// + /// * `resources`: resources that this device owns, might include + /// port I/O and memory-mapped I/O ranges, irq number, etc. + pub fn unregister_device_io(&mut self, resources: &[Resource]) -> Result<()> { + for res in resources.iter() { + match *res { + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + Resource::PioAddressRange { base, size } => { + self.pio_bus.remove(&IoRange::new_pio_range(base, size)); + } + Resource::MmioAddressRange { base, size } => { + self.mmio_bus.remove(&IoRange::new_mmio_range(base, size)); + } + _ => continue, + } + } + Ok(()) + } + + /// A helper function handling MMIO read command during VM exit. 
+ /// The virtual device itself provides mutable ability and thead-safe protection. + /// + /// Return error if failed to get the device. + pub fn mmio_read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + self.get_device(IoAddress(addr)) + .map(|(device, base)| device.read(base, IoAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// A helper function handling MMIO write command during VM exit. + /// The virtual device itself provides mutable ability and thead-safe protection. + /// + /// Return error if failed to get the device. + pub fn mmio_write(&self, addr: u64, data: &[u8]) -> Result<()> { + self.get_device(IoAddress(addr)) + .map(|(device, base)| device.write(base, IoAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + // Return the Device mapped `addr` and the base address. + fn get_device(&self, addr: IoAddress) -> Option<(&Arc, IoAddress)> { + let range = IoRange::new_mmio_range(addr.raw_value(), 0); + if let Some((range, dev)) = self.mmio_bus.range(..=&range).nth_back(0) { + if (addr.raw_value() - range.base.raw_value()) < range.size.raw_value() { + return Some((dev, range.base)); + } + } + None + } +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +impl IoManager { + /// A helper function handling PIO read command during VM exit. + /// The virtual device itself provides mutable ability and thead-safe protection. + /// + /// Return error if failed to get the device. + pub fn pio_read(&self, addr: u16, data: &mut [u8]) -> Result<()> { + self.get_pio_device(PioAddress(addr)) + .map(|(device, base)| device.pio_read(base, PioAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// A helper function handling PIO write command during VM exit. + /// The virtual device itself provides mutable ability and thead-safe protection. + /// + /// Return error if failed to get the device. + pub fn pio_write(&self, addr: u16, data: &[u8]) -> Result<()> { + self.get_pio_device(PioAddress(addr)) + .map(|(device, base)| device.pio_write(base, PioAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + // Return the Device mapped `addr` and the base address. + fn get_pio_device(&self, addr: PioAddress) -> Option<(&Arc, PioAddress)> { + let range = IoRange::new_pio_range(addr.raw_value(), 0); + if let Some((range, dev)) = self.pio_bus.range(..=&range).nth_back(0) { + if (addr.raw_value() as u64 - range.base.raw_value()) < range.size.raw_value() { + return Some((dev, PioAddress(range.base.0 as u16))); + } + } + None + } +} + +/// Io manager transaction context to register/unregister devices. +pub trait IoManagerContext { + /// Type of context object. + type Context; + + /// Begin a transaction and return a context object. + /// + /// The returned context object must be passed to commit_tx() or cancel_tx() later. + fn begin_tx(&self) -> Self::Context; + + /// Commit the transaction. + fn commit_tx(&self, ctx: Self::Context); + + /// Cancel the transaction. + fn cancel_tx(&self, ctx: Self::Context); + + /// Register a new device IO with its allocated resources. + /// + /// # Arguments + /// + /// * `ctx`: context object returned by begin_tx(). + /// * `device`: device instance object to be registered + /// * `resources`: resources that this device owns, might include + /// port I/O and memory-mapped I/O ranges, irq number, etc. + fn register_device_io( + &self, + ctx: &mut Self::Context, + device: Arc, + resources: &[Resource], + ) -> Result<()>; + + /// Unregister a device from `IoManager`, e.g. 
users specified removing. + /// VMM pre-fetches the resources e.g. dev.get_assigned_resources() + /// VMM is responsible for freeing the resources. + /// + /// # Arguments + /// + /// * `ctx`: context object returned by begin_tx(). + /// * `resources`: resources that this device owns, might include + /// port I/O and memory-mapped I/O ranges, irq number, etc. + fn unregister_device_io(&self, ctx: &mut Self::Context, resources: &[Resource]) -> Result<()>; +} + +impl IoManagerContext for Arc { + type Context = T::Context; + + fn begin_tx(&self) -> Self::Context { + self.deref().begin_tx() + } + + fn commit_tx(&self, ctx: Self::Context) { + self.deref().commit_tx(ctx) + } + + fn cancel_tx(&self, ctx: Self::Context) { + self.deref().cancel_tx(ctx) + } + + fn register_device_io( + &self, + ctx: &mut Self::Context, + device: Arc, + resources: &[Resource], + ) -> std::result::Result<(), Error> { + self.deref().register_device_io(ctx, device, resources) + } + + fn unregister_device_io( + &self, + ctx: &mut Self::Context, + resources: &[Resource], + ) -> std::result::Result<(), Error> { + self.deref().unregister_device_io(ctx, resources) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Mutex; + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + const PIO_ADDRESS_SIZE: u16 = 4; + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + const PIO_ADDRESS_BASE: u16 = 0x40; + const MMIO_ADDRESS_SIZE: u64 = 0x8765_4321; + const MMIO_ADDRESS_BASE: u64 = 0x1234_5678; + const LEGACY_IRQ: u32 = 4; + const CONFIG_DATA: u32 = 0x1234; + + struct DummyDevice { + config: Mutex, + } + + impl DummyDevice { + fn new(config: u32) -> Self { + DummyDevice { + config: Mutex::new(config), + } + } + } + + impl DeviceIo for DummyDevice { + fn read(&self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) { + if data.len() > 4 { + return; + } + for (idx, iter) in data.iter_mut().enumerate() { + let config = self.config.lock().expect("failed to acquire lock"); + *iter = (*config >> (idx * 8) & 0xff) as u8; + } + } + + fn write(&self, _base: IoAddress, _offset: IoAddress, data: &[u8]) { + let mut config = self.config.lock().expect("failed to acquire lock"); + *config = u32::from(data[0]) & 0xff; + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_read(&self, _base: PioAddress, _offset: PioAddress, data: &mut [u8]) { + if data.len() > 4 { + return; + } + for (idx, iter) in data.iter_mut().enumerate() { + let config = self.config.lock().expect("failed to acquire lock"); + *iter = (*config >> (idx * 8) & 0xff) as u8; + } + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_write(&self, _base: PioAddress, _offset: PioAddress, data: &[u8]) { + let mut config = self.config.lock().expect("failed to acquire lock"); + *config = u32::from(data[0]) & 0xff; + } + } + + #[test] + fn test_clone_io_manager() { + let mut io_mgr = IoManager::new(); + let dummy = DummyDevice::new(0); + let dum = Arc::new(dummy); + + let mut resource: Vec = Vec::new(); + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let irq = Resource::LegacyIrq(LEGACY_IRQ); + + resource.push(mmio); + resource.push(irq); + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + resource.push(pio); + } + + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let io_mgr2 = io_mgr.clone(); + assert_eq!(io_mgr2.mmio_bus.len(), 
1); + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + assert_eq!(io_mgr2.pio_bus.len(), 1); + + let (dev, addr) = io_mgr2 + .get_device(IoAddress(MMIO_ADDRESS_BASE + 1)) + .unwrap(); + assert_eq!(Arc::strong_count(dev), 5); + + assert_eq!(addr, IoAddress(MMIO_ADDRESS_BASE)); + + drop(io_mgr); + assert_eq!(Arc::strong_count(dev), 3); + + drop(io_mgr2); + assert_eq!(Arc::strong_count(&dum), 1); + } + } + + #[test] + fn test_register_unregister_device_io() { + let mut io_mgr = IoManager::new(); + let dummy = DummyDevice::new(0); + let dum = Arc::new(dummy); + + let mut resource: Vec = Vec::new(); + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let irq = Resource::LegacyIrq(LEGACY_IRQ); + + resource.push(mmio); + resource.push(irq); + + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + assert!(io_mgr.unregister_device_io(&resource).is_ok()) + } + + #[test] + fn test_mmio_read_write() { + let mut io_mgr: IoManager = Default::default(); + let dum = Arc::new(DummyDevice::new(CONFIG_DATA)); + let mut resource: Vec = Vec::new(); + + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + resource.push(mmio); + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let mut data = [0; 4]; + assert!(io_mgr.mmio_read(MMIO_ADDRESS_BASE, &mut data).is_ok()); + assert_eq!(data, [0x34, 0x12, 0, 0]); + + assert!(io_mgr + .mmio_read(MMIO_ADDRESS_BASE + MMIO_ADDRESS_SIZE, &mut data) + .is_err()); + + data = [0; 4]; + assert!(io_mgr.mmio_write(MMIO_ADDRESS_BASE, &data).is_ok()); + assert_eq!(*dum.config.lock().unwrap(), 0); + + assert!(io_mgr + .mmio_write(MMIO_ADDRESS_BASE + MMIO_ADDRESS_SIZE, &data) + .is_err()); + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[test] + fn test_pio_read_write() { + let mut io_mgr: IoManager = Default::default(); + let dum = Arc::new(DummyDevice::new(CONFIG_DATA)); + let mut resource: Vec = Vec::new(); + + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + resource.push(pio); + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let mut data = [0; 4]; + assert!(io_mgr.pio_read(PIO_ADDRESS_BASE, &mut data).is_ok()); + assert_eq!(data, [0x34, 0x12, 0, 0]); + + assert!(io_mgr + .pio_read(PIO_ADDRESS_BASE + PIO_ADDRESS_SIZE, &mut data) + .is_err()); + + data = [0; 4]; + assert!(io_mgr.pio_write(PIO_ADDRESS_BASE, &data).is_ok()); + assert_eq!(*dum.config.lock().unwrap(), 0); + + assert!(io_mgr + .pio_write(PIO_ADDRESS_BASE + PIO_ADDRESS_SIZE, &data) + .is_err()); + } + + #[test] + fn test_device_manager_data_structs() { + let range1 = IoRange::new_mmio_range(0x1000, 0x1000); + let range2 = IoRange::new_mmio_range(0x1000, 0x2000); + let range3 = IoRange::new_mmio_range(0x2000, 0x1000); + + assert_eq!(range1, range1.clone()); + assert_eq!(range1, range2); + assert!(range1 < range3); + } +} diff --git a/src/interrupt/kvm/legacy_irq.rs b/src/interrupt/kvm/legacy_irq.rs new file mode 100644 index 0000000..b7290cd --- /dev/null +++ b/src/interrupt/kvm/legacy_irq.rs @@ -0,0 +1,256 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's legacy interrupts based on Linux KVM framework. +//! +//! On x86 platforms, legacy interrupts are those managed by the Master PIC, the slave PIC and +//! IOAPICs. 
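+//!
+//! A minimal usage sketch (illustrative only, not exercised by the tests in this patch): it
+//! assumes the `kvm-legacy-irq` feature, a working `/dev/kvm`, and that the types below are
+//! re-exported from the crate's `interrupt` module.
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//! use kvm_ioctls::Kvm;
+//! use vm_device::interrupt::{
+//!     InterruptManager, InterruptSourceConfig, InterruptSourceType, KvmIrqManager,
+//!     LegacyIrqSourceConfig,
+//! };
+//!
+//! let kvm = Kvm::new().unwrap();
+//! let vmfd = Arc::new(kvm.create_vm().unwrap());
+//! vmfd.create_irq_chip().unwrap();
+//!
+//! // Build the global routing table, then create a one-IRQ legacy group (e.g. IRQ 4).
+//! let mgr = KvmIrqManager::new(vmfd.clone());
+//! mgr.initialize().unwrap();
+//! let group = mgr.create_group(InterruptSourceType::LegacyIrq, 4, 1).unwrap();
+//!
+//! // Bind the group's irqfd to the GSI, then inject one interrupt.
+//! group.enable(&[InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})]).unwrap();
+//! group.trigger(0).unwrap();
+//! ```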
+ +use super::*; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use kvm_bindings::{ + KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_IRQ_ROUTING_IRQCHIP, +}; +use vmm_sys_util::eventfd::EFD_NONBLOCK; + +/// Maximum number of legacy interrupts supported. +pub const MAX_LEGACY_IRQS: u32 = 24; + +pub(super) struct LegacyIrq { + base: u32, + vmfd: Arc, + irqfd: EventFd, +} + +impl LegacyIrq { + #[allow(clippy::new_ret_no_self)] + pub(super) fn new( + base: InterruptIndex, + count: InterruptIndex, + vmfd: Arc, + _routes: Arc, + ) -> Result { + if count != 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + if base >= MAX_LEGACY_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + Ok(LegacyIrq { + base, + vmfd, + irqfd: EventFd::new(EFD_NONBLOCK)?, + }) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn add_legacy_entry( + gsi: u32, + chip: u32, + pin: u32, + routes: &mut HashMap, + ) -> Result<()> { + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + // Safe because we are initializing all fields of the `irqchip` struct. + entry.u.irqchip.irqchip = chip; + entry.u.irqchip.pin = pin; + routes.insert(hash_key(&entry), entry); + + Ok(()) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + /// Build routings for IRQs connected to the master PIC, the slave PIC or the first IOAPIC. + pub(super) fn initialize_legacy( + routes: &mut HashMap, + ) -> Result<()> { + // Build routings for the master PIC + for i in 0..8 { + if i != 2 { + Self::add_legacy_entry(i, KVM_IRQCHIP_PIC_MASTER, i, routes)?; + } + } + + // Build routings for the slave PIC + for i in 8..16 { + Self::add_legacy_entry(i, KVM_IRQCHIP_PIC_SLAVE, i - 8, routes)?; + } + + // Build routings for the first IOAPIC + for i in 0..MAX_LEGACY_IRQS { + if i == 0 { + Self::add_legacy_entry(i, KVM_IRQCHIP_IOAPIC, 2, routes)?; + } else if i != 2 { + Self::add_legacy_entry(i, KVM_IRQCHIP_IOAPIC, i, routes)?; + }; + } + + Ok(()) + } + + #[cfg(any(target_arch = "aarch", target_arch = "aarch64"))] + pub(super) fn initialize_legacy( + _routes: &mut HashMap, + ) -> Result<()> { + //TODO + Ok(()) + } +} + +impl InterruptSourceGroup for LegacyIrq { + fn interrupt_type(&self) -> InterruptSourceType { + InterruptSourceType::LegacyIrq + } + + fn len(&self) -> u32 { + 1 + } + + fn base(&self) -> u32 { + self.base + } + + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()> { + if configs.len() != 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + // The IRQ routings for legacy IRQs have been configured during + // KvmIrqManager::initialize(), so only need to register irqfd to the KVM driver. + self.vmfd + .register_irqfd(&self.irqfd, self.base) + .map_err(from_sys_util_errno) + } + + fn disable(&self) -> Result<()> { + self.vmfd + .unregister_irqfd(&self.irqfd, self.base) + .map_err(from_sys_util_errno) + } + + fn update(&self, index: InterruptIndex, _config: &InterruptSourceConfig) -> Result<()> { + // For legacy interrupts, the routing configuration is managed by the PIC/IOAPIC interrupt + // controller drivers, so nothing to do here. 
+ if index != 0 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + Ok(()) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + if index != 0 { + None + } else { + Some(&self.irqfd) + } + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + if index != 0 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + self.irqfd.write(1) + } + + fn mask(&self, index: InterruptIndex) -> Result<()> { + if index > 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn unmask(&self, index: InterruptIndex) -> Result<()> { + if index > 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn get_pending_state(&self, index: InterruptIndex) -> bool { + if index > 1 { + return false; + } + + // Peak the EventFd.count by reading and writing back. + // The irqfd must be in NON-BLOCKING mode. + let irqfd = &self.irqfd; + match irqfd.read() { + Err(_) => false, + Ok(count) => { + if count != 0 { + if let Err(_) = irqfd.write(count) { + // Hope the caller will handle the pending state corrrectly, + // then no interrupt will be lost. + //panic!("really no way to recover here!!!!"); + } + } + count != 0 + } + } + } +} + +#[cfg(test)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod test { + use super::*; + use kvm_ioctls::{Kvm, VmFd}; + + fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + #[test] + #[allow(unreachable_patterns)] + fn test_legacy_interrupt_group() { + let vmfd = Arc::new(create_vm_fd()); + let rounting = Arc::new(KvmIrqRouting::new(vmfd.clone())); + let base = 0; + let count = 1; + let group = LegacyIrq::new(base, count, vmfd.clone(), rounting.clone()).unwrap(); + + let mut legacy_fds = Vec::with_capacity(1); + legacy_fds.push(InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})); + + match group.interrupt_type() { + InterruptSourceType::LegacyIrq => {} + _ => { + panic!(); + } + } + assert_eq!(group.len(), 1); + assert_eq!(group.base(), base); + assert!(group.enable(&legacy_fds).is_ok()); + assert!(group.notifier(0).unwrap().write(1).is_ok()); + assert!(group.trigger(0).is_ok()); + assert!(group.trigger(1).is_err()); + assert!(group + .update( + 0, + &InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {}) + ) + .is_ok()); + assert!(group.disable().is_ok()); + + assert!(LegacyIrq::new(base, 2, vmfd.clone(), rounting.clone()).is_err()); + assert!(LegacyIrq::new(110, 1, vmfd.clone(), rounting.clone()).is_err()); + } +} diff --git a/src/interrupt/kvm/mod.rs b/src/interrupt/kvm/mod.rs new file mode 100644 index 0000000..c02c93d --- /dev/null +++ b/src/interrupt/kvm/mod.rs @@ -0,0 +1,478 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's interrupts based on the Linux KVM framework. +//! +//! When updaing KVM IRQ routing by ioctl(KVM_SET_GSI_ROUTING), all interrupts of the virtual +//! machine must be updated all together. The [KvmIrqRouting](struct.KvmIrqRouting.html) +//! structure is to maintain the global interrupt routing table. +//! +//! It deserves a good documentation about the way that KVM based vmms manages interrupts. +//! 
From the KVM hypervisor side, it provides three mechanisms to support injecting interrupts into
+//! guests:
+//! 1) Irqfd. When data is written to an irqfd, it triggers KVM to inject an interrupt into the guest.
+//! 2) Irq routing. Irq routing determines the way to inject an irq into the guest.
+//! 3) Signal MSI. The VMM can inject an MSI interrupt into the guest by issuing the KVM_SIGNAL_MSI ioctl.
+//!
+//! Most VMMs use irqfd + irq routing to support interrupt injection, so we will focus on this mode.
+//! The flow to enable interrupt injection is:
+//! 1) the VMM creates an irqfd
+//! 2) the VMM invokes KVM_IRQFD to bind the irqfd to an interrupt source
+//! 3) the VMM invokes KVM_SET_GSI_ROUTING to configure the way to inject the interrupt into the guest
+//! 4) the device backend driver writes to the irqfd
+//! 5) an interrupt is injected into the guest
+//!
+//! So far so good, right? Let's move on to mask/unmask/get_pending_state. That's the really tough
+//! part. To support mask/unmask/get_pending_state, we must have a way to break the interrupt
+//! delivery chain and maintain the pending state. Let's see how it's implemented by each VMM.
+//! - Firecracker. It's very simple: it doesn't support mask/unmask/get_pending_state at all.
+//! - Cloud Hypervisor. It builds the interrupt delivery path as:
+//!   vhost-backend-driver -> EventFd -> CLH -> Irqfd -> Irqrouting -> Guest OS
+//!   It also maintains a masked/pending pair for each interrupt. When masking an interrupt, it
+//!   sets the masked flag and removes the IRQ routing entry for the interrupt.
+//!   The CLH design has two shortcomings:
+//!   - it's inefficient for the hot interrupt delivery path.
+//!   - it may lose in-flight interrupts after removing the IRQ routing entry for an interrupt, due
+//!     to irqfd implementation details. Buy me a cup of coffee if you want to know the details.
+//! - Qemu. Qemu has a smart design, which supports:
+//!   - A fast path: driver -> irqfd -> Irqrouting -> Guest OS
+//!   - A slow path: driver -> eventfd -> qemu -> irqfd -> Irqrouting -> Guest OS
+//!   When masking an interrupt, it switches from the fast path to the slow path, and vice versa
+//!   when unmasking an interrupt.
+//! - Dragonball V1. It doesn't support mask/unmask/get_pending_state at all, but since we have also
+//!   enhanced the Virtio MMIO spec, we can use the fast path: driver -> irqfd -> Irqrouting -> Guest OS.
+//! - Dragonball V2. When enabling PCI device passthrough, mask/unmask/get_pending_state is a must
+//!   to support PCI MSI/MSIx. Unlike Qemu's fast path/slow path design, Dragonball V2 implements
+//!   mask/unmask/get_pending_state with the fast path only. It works as follows:
+//!   1) When masking an interrupt, unbind the irqfd from the interrupt by KVM_IRQFD. After that,
+//!      writes to the irqfd won't trigger injection anymore, and the irqfd maintains a count of
+//!      the subsequent write operations.
+//!   2) When unmasking an interrupt, bind the irqfd to the interrupt again by KVM_IRQFD. When
+//!      rebinding, an interrupt will be injected into the guest if the irqfd has a non-zero count.
+//!   3) When getting the pending state, peek the count of the irqfd. The irqfd doesn't support
+//!      peeking, so simulate a peek by reading the count and writing it back.
+//!   By this design, we use the irqfd count to maintain the interrupt pending state, and pending
+//!   interrupts are auto-injected when rebinding. So we don't need to maintain a pending status bit.
+//!
+//! Why does Qemu need a slow path while Dragonball V2 doesn't?
+//! Qemu needs to support a broad range of guest OSes and all kinds of device drivers. And some
+//! 
legacy device drivers mask/unmask interrupt when handling each interrupt. +//! For Dragonball, we don't expect guest device driver exhibits such behaviors, and treat +//! mask/unmask/get_pending_state as cold path. We optimize for the hot interrupt delivery path +//! and avoid the complexity to introduce a slow path. The penalty is that get_pending_state() +//! will be much more expensive. + +use std::collections::HashMap; +use std::io::{Error, ErrorKind}; +use std::sync::{Arc, Mutex}; + +use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry}; +use kvm_ioctls::VmFd; + +use super::*; + +#[cfg(feature = "kvm-legacy-irq")] +mod legacy_irq; +#[cfg(feature = "kvm-legacy-irq")] +use self::legacy_irq::LegacyIrq; +#[cfg(feature = "kvm-msi-generic")] +mod msi_generic; +#[cfg(feature = "kvm-msi-irq")] +mod msi_irq; +#[cfg(feature = "kvm-msi-irq")] +use self::msi_irq::MsiIrq; + +#[cfg(feature = "kvm-vfio-msi-irq")] +mod vfio_msi_irq; +#[cfg(feature = "kvm-vfio-msi-irq")] +use self::vfio_msi_irq::VfioMsiIrq; + +/// Maximum number of global interrupt sources. +pub const MAX_IRQS: InterruptIndex = 1024; + +/// Default maximum number of Message Signaled Interrupts per device. +pub const DEFAULT_MAX_MSI_IRQS_PER_DEVICE: InterruptIndex = 128; + +/// Structure to manage interrupt sources for a virtual machine based on the Linux KVM framework. +/// +/// The KVM framework provides methods to inject interrupts into the target virtual machines, +/// which uses irqfd to notity the KVM kernel module for injecting interrupts. When the interrupt +/// source, usually a virtual device backend in userspace, writes to the irqfd file descriptor, +/// the KVM kernel module will inject a corresponding interrupt into the target VM according to +/// the IRQ routing configuration. +pub struct KvmIrqManager { + mgr: Mutex, +} + +impl KvmIrqManager { + /// Create a new interrupt manager based on the Linux KVM framework. + /// + /// # Arguments + /// * `vmfd`: The KVM VM file descriptor, which will be used to access the KVM subsystem. + pub fn new(vmfd: Arc) -> Self { + let vmfd2 = vmfd.clone(); + KvmIrqManager { + mgr: Mutex::new(KvmIrqManagerObj { + vmfd, + groups: HashMap::new(), + routes: Arc::new(KvmIrqRouting::new(vmfd2)), + max_msi_irqs: DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + }), + } + } + + /// Prepare the interrupt manager for generating interrupts into the target VM. + pub fn initialize(&self) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mgr = self.mgr.lock().unwrap(); + mgr.initialize() + } + + /// Set maximum supported MSI interrupts per device. + pub fn set_max_msi_irqs(&self, max_msi_irqs: InterruptIndex) { + let mut mgr = self.mgr.lock().unwrap(); + mgr.max_msi_irqs = max_msi_irqs; + } +} + +impl InterruptManager for KvmIrqManager { + fn create_group( + &self, + ty: InterruptSourceType, + base: InterruptIndex, + count: u32, + ) -> Result>> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut mgr = self.mgr.lock().unwrap(); + mgr.create_group(ty, base, count) + } + + fn destroy_group(&self, group: Arc>) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let mut mgr = self.mgr.lock().unwrap(); + mgr.destroy_group(group) + } +} + +struct KvmIrqManagerObj { + vmfd: Arc, + routes: Arc, + groups: HashMap>>, + max_msi_irqs: InterruptIndex, +} + +impl KvmIrqManagerObj { + fn initialize(&self) -> Result<()> { + self.routes.initialize()?; + Ok(()) + } + + fn create_group( + &mut self, + ty: InterruptSourceType, + base: InterruptIndex, + count: u32, + ) -> Result>> { + #[allow(unreachable_patterns)] + let group: Arc> = match ty { + #[cfg(feature = "kvm-legacy-irq")] + InterruptSourceType::LegacyIrq => Arc::new(Box::new(LegacyIrq::new( + base, + count, + self.vmfd.clone(), + self.routes.clone(), + )?)), + #[cfg(feature = "kvm-msi-irq")] + InterruptSourceType::MsiIrq => Arc::new(Box::new(MsiIrq::new( + base, + count, + self.max_msi_irqs, + self.vmfd.clone(), + self.routes.clone(), + )?)), + #[cfg(feature = "kvm-vfio-msi-irq")] + InterruptSourceType::VfioMsiIrq(vfio_device, vfio_index) => { + Arc::new(Box::new(VfioMsiIrq::new( + base, + count, + self.max_msi_irqs, + self.vmfd.clone(), + self.routes.clone(), + vfio_device, + vfio_index, + )?)) + } + _ => return Err(Error::from(ErrorKind::InvalidInput)), + }; + + self.groups.insert(base, group.clone()); + + Ok(group) + } + + fn destroy_group(&mut self, group: Arc>) -> Result<()> { + self.groups.remove(&group.base()); + Ok(()) + } +} + +// Use (entry.type, entry.gsi) as the hash key because entry.gsi can't uniquely identify an +// interrupt source on x86 platforms. The PIC and IOAPIC may share the same GSI on x86 platforms. +fn hash_key(entry: &kvm_irq_routing_entry) -> u64 { + let type1 = match entry.type_ { + #[cfg(feature = "kvm-legacy-irq")] + kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP => unsafe { entry.u.irqchip.irqchip }, + _ => 0u32, + }; + (u64::from(type1) << 48 | u64::from(entry.type_) << 32) | u64::from(entry.gsi) +} + +pub(super) struct KvmIrqRouting { + vm_fd: Arc, + routes: Mutex>, +} + +impl KvmIrqRouting { + pub(super) fn new(vm_fd: Arc) -> Self { + KvmIrqRouting { + vm_fd, + routes: Mutex::new(HashMap::new()), + } + } + + pub(super) fn initialize(&self) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + #[allow(unused_mut)] + let mut routes = self.routes.lock().unwrap(); + + #[cfg(feature = "kvm-legacy-irq")] + LegacyIrq::initialize_legacy(&mut *routes)?; + + self.set_routing(&*routes) + } + + fn set_routing(&self, routes: &HashMap) -> Result<()> { + // Allocate enough buffer memory. + let elem_sz = std::mem::size_of::(); + let total_sz = std::mem::size_of::() * routes.len() + elem_sz; + let elem_cnt = (total_sz + elem_sz - 1) / elem_sz; + let mut irq_routings = Vec::::with_capacity(elem_cnt); + irq_routings.resize_with(elem_cnt, Default::default); + + // Prepare the irq_routing header. + let mut irq_routing = &mut irq_routings[0]; + irq_routing.nr = routes.len() as u32; + irq_routing.flags = 0; + + // Safe because we have just allocated enough memory above. + let irq_routing_entries = unsafe { irq_routing.entries.as_mut_slice(routes.len()) }; + for (idx, entry) in routes.values().enumerate() { + irq_routing_entries[idx] = *entry; + } + + self.vm_fd + .set_gsi_routing(irq_routing) + .map_err(from_sys_util_errno)?; + + Ok(()) + } +} + +#[cfg(feature = "kvm-msi-generic")] +impl KvmIrqRouting { + pub(super) fn add(&self, entries: &[kvm_irq_routing_entry]) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let mut routes = self.routes.lock().unwrap(); + for entry in entries { + if entry.gsi >= MAX_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } else if routes.contains_key(&hash_key(entry)) { + return Err(std::io::Error::from_raw_os_error(libc::EEXIST)); + } + } + + for entry in entries { + let _ = routes.insert(hash_key(entry), *entry); + } + self.set_routing(&routes) + } + + pub(super) fn remove(&self, entries: &[kvm_irq_routing_entry]) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut routes = self.routes.lock().unwrap(); + for entry in entries { + let _ = routes.remove(&hash_key(entry)); + } + self.set_routing(&routes) + } + + pub(super) fn modify(&self, entry: &kvm_irq_routing_entry) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut routes = self.routes.lock().unwrap(); + if !routes.contains_key(&hash_key(entry)) { + return Err(std::io::Error::from_raw_os_error(libc::ENOENT)); + } + + let _ = routes.insert(hash_key(entry), *entry); + self.set_routing(&routes) + } +} + +/// Helper function convert from vmm_sys_util::errno::Error to std::io::Error. +pub fn from_sys_util_errno(e: vmm_sys_util::errno::Error) -> std::io::Error { + std::io::Error::from_raw_os_error(e.errno()) +} + +#[cfg(any(target = "x86", target = "x86_64"))] +#[cfg(test)] +mod test { + use super::*; + use kvm_ioctls::{Kvm, VmFd}; + + //const VFIO_PCI_MSI_IRQ_INDEX: u32 = 1; + + fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + fn create_irq_group( + manager: Arc, + _vmfd: Arc, + ) -> Arc> { + let base = 0; + let count = 1; + + manager + .create_group(InterruptSourceType::LegacyIrq, base, count) + .unwrap() + } + + fn create_msi_group( + manager: Arc, + _vmfd: Arc, + ) -> Arc> { + let base = 168; + let count = 32; + + manager + .create_group(InterruptSourceType::MsiIrq, base, count) + .unwrap() + } + + const MASTER_PIC: usize = 7; + const SLAVE_PIC: usize = 8; + const IOAPIC: usize = 23; + + #[test] + fn test_create_kvmirqmanager() { + let vmfd = Arc::new(create_vm_fd()); + let manager = KvmIrqManager::new(vmfd.clone()); + assert!(vmfd.create_irq_chip().is_ok()); + assert!(manager.initialize().is_ok()); + } + + #[test] + fn test_kvmirqmanager_opt() { + let vmfd = Arc::new(create_vm_fd()); + assert!(vmfd.create_irq_chip().is_ok()); + let manager = Arc::new(KvmIrqManager::new(vmfd.clone())); + assert!(manager.initialize().is_ok()); + //irq + let group = create_irq_group(manager.clone(), vmfd.clone()); + let _ = group.clone(); + assert!(manager.destroy_group(group).is_ok()); + //msi + let group = create_msi_group(manager.clone(), vmfd.clone()); + let _ = group.clone(); + assert!(manager.destroy_group(group).is_ok()); + } + + #[test] + fn test_irqrouting_initialize_legacy() { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + assert!(routing.initialize().is_err()); + assert!(vmfd.create_irq_chip().is_ok()); + assert!(routing.initialize().is_ok()); + let routes = &routing.routes.lock().unwrap(); + assert_eq!(routes.len(), MASTER_PIC + SLAVE_PIC + IOAPIC); + } + + #[test] + fn test_routing_opt() { + // pub(super) fn modify(&self, entry: &kvm_irq_routing_entry) -> Result<()> { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + assert!(routing.initialize().is_err()); + assert!(vmfd.create_irq_chip().is_ok()); + assert!(routing.initialize().is_ok()); + + let mut entry = kvm_irq_routing_entry { 
+ gsi: 8, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + + // Safe because we are initializing all fields of the `irqchip` struct. + unsafe { + entry.u.irqchip.irqchip = 0; + entry.u.irqchip.pin = 3; + } + + let entrys = vec![entry.clone()]; + + assert!(routing.modify(&entry).is_err()); + assert!(routing.add(&entrys).is_ok()); + unsafe { + entry.u.irqchip.pin = 4; + } + assert!(routing.modify(&entry).is_ok()); + assert!(routing.remove(&entrys).is_ok()); + assert!(routing.modify(&entry).is_err()); + } + + #[test] + fn test_routing_commit() { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + + assert!(routing.initialize().is_err()); + assert!(vmfd.create_irq_chip().is_ok()); + assert!(routing.initialize().is_ok()); + + let mut entry = kvm_irq_routing_entry { + gsi: 8, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + unsafe { + entry.u.irqchip.irqchip = 0; + entry.u.irqchip.pin = 3; + } + + routing + .routes + .lock() + .unwrap() + .insert(hash_key(&entry), entry); + let routes = routing.routes.lock().unwrap(); + assert!(routing.commit(&routes).is_ok()); + } + + #[test] + fn test_has_key() { + let gsi = 4; + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + // Safe because we are initializing all fields of the `irqchip` struct. + unsafe { + entry.u.irqchip.irqchip = KVM_IRQCHIP_PIC_MASTER; + entry.u.irqchip.pin = gsi; + } + assert_eq!(hash_key(&entry), 0x0001_0000_0004); + } +} diff --git a/src/interrupt/kvm/msi_generic.rs b/src/interrupt/kvm/msi_generic.rs new file mode 100644 index 0000000..116bd6e --- /dev/null +++ b/src/interrupt/kvm/msi_generic.rs @@ -0,0 +1,126 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Helper utilities for handling MSI interrupts. 
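+//!
+//! For illustration, a crate-internal sketch of how an `MsiIrqSourceConfig` supplied by the VMM
+//! maps onto a KVM MSI routing entry (this mirrors the unit tests below; the address and data
+//! values are made up):
+//!
+//! ```ignore
+//! let cfg = MsiIrqSourceConfig {
+//!     high_addr: 0x0,        // upper 32 bits of the MSI address
+//!     low_addr: 0xfee0_0000, // lower 32 bits of the MSI address
+//!     data: 0x20,            // MSI data payload
+//! };
+//! // Assume GSI 40 has been allocated for this vector; convert the config into a routing entry.
+//! let entry = new_msi_routing_entry(40, &cfg);
+//! assert_eq!(entry.gsi, 40);
+//! assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI);
+//! ```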
+ +use kvm_bindings::{kvm_irq_routing_entry, KVM_IRQ_ROUTING_MSI}; +use vmm_sys_util::eventfd::EFD_NONBLOCK; + +use super::*; + +pub(super) struct MsiConfig { + pub(super) irqfd: EventFd, + pub(super) config: Mutex, +} + +impl MsiConfig { + pub(super) fn new() -> Self { + MsiConfig { + irqfd: EventFd::new(EFD_NONBLOCK).unwrap(), + config: Mutex::new(Default::default()), + } + } +} + +pub(super) fn new_msi_routing_entry( + gsi: InterruptIndex, + msicfg: &MsiIrqSourceConfig, +) -> kvm_irq_routing_entry { + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_MSI, + flags: 0, + ..Default::default() + }; + entry.u.msi.address_hi = msicfg.high_addr; + entry.u.msi.address_lo = msicfg.low_addr; + entry.u.msi.data = msicfg.data; + entry +} + +#[allow(irrefutable_let_patterns)] +pub(super) fn create_msi_routing_entries( + base: InterruptIndex, + configs: &[InterruptSourceConfig], +) -> Result> { + let _ = base + .checked_add(configs.len() as u32) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::EINVAL))?; + let mut entries = Vec::with_capacity(configs.len()); + for (i, ref val) in configs.iter().enumerate() { + if let InterruptSourceConfig::MsiIrq(msicfg) = val { + let entry = new_msi_routing_entry(base + i as u32, msicfg); + entries.push(entry); + } else { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + } + Ok(entries) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_create_msiconfig() { + let config = MsiConfig::new(); + config.irqfd.write(1).unwrap(); + } + + #[test] + fn test_new_msi_routing_single() { + let test_gsi = 4; + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + }; + let entry = new_msi_routing_entry(test_gsi, &msi_source_config); + assert_eq!(entry.gsi, test_gsi); + assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI); + unsafe { + assert_eq!(entry.u.msi.address_hi, msi_source_config.high_addr); + assert_eq!(entry.u.msi.address_lo, msi_source_config.low_addr); + assert_eq!(entry.u.msi.data, msi_source_config.data); + } + } + + #[cfg(all( + feature = "legacy_irq", + any(target_arch = "x86", target_arch = "x86_64") + ))] + #[test] + fn test_new_msi_routing_multi() { + let mut msi_fds = Vec::with_capacity(16); + for _ in 0..16 { + msi_fds.push(InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + })); + } + let mut legacy_fds = Vec::with_capacity(16); + for _ in 0..16 { + legacy_fds.push(InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})); + } + + let base = 0; + let entrys = create_msi_routing_entries(0, &msi_fds).unwrap(); + + for (i, entry) in entrys.iter().enumerate() { + assert_eq!(entry.gsi, (base + i) as u32); + assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI); + if let InterruptSourceConfig::MsiIrq(config) = &msi_fds[i] { + unsafe { + assert_eq!(entry.u.msi.address_hi, config.high_addr); + assert_eq!(entry.u.msi.address_lo, config.low_addr); + assert_eq!(entry.u.msi.data, config.data); + } + } + } + + assert!(create_msi_routing_entries(0, &legacy_fds).is_err()); + assert!(create_msi_routing_entries(!0, &msi_fds).is_err()); + } +} diff --git a/src/interrupt/kvm/msi_irq.rs b/src/interrupt/kvm/msi_irq.rs new file mode 100644 index 0000000..6283232 --- /dev/null +++ b/src/interrupt/kvm/msi_irq.rs @@ -0,0 +1,279 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's PCI MSI/PCI MSIx interrupts based on Linux KVM framework. +//! +//! 
To optimize for performance by avoiding unnecessary locking and state checking, we assume that +//! the caller will take the responsibility to maintain the interrupt states and only issue valid +//! requests to this driver. If the caller doesn't obey the contract, only the current virtual +//! machine will be affected, it shouldn't break the host or other virtual machines. + +use super::msi_generic::{create_msi_routing_entries, new_msi_routing_entry, MsiConfig}; +use super::*; + +pub(super) struct MsiIrq { + base: InterruptIndex, + count: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + msi_configs: Vec, +} + +impl MsiIrq { + #[allow(clippy::new_ret_no_self)] + pub(super) fn new( + base: InterruptIndex, + count: InterruptIndex, + max_msi_irqs: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + ) -> Result { + if count > max_msi_irqs || base >= MAX_IRQS || base + count > MAX_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let mut msi_configs = Vec::with_capacity(count as usize); + for _ in 0..count { + msi_configs.push(MsiConfig::new()); + } + + Ok(MsiIrq { + base, + count, + vmfd, + irq_routing, + msi_configs, + }) + } +} + +impl InterruptSourceGroup for MsiIrq { + fn interrupt_type(&self) -> InterruptSourceType { + InterruptSourceType::MsiIrq + } + + fn len(&self) -> u32 { + self.count + } + + fn base(&self) -> u32 { + self.base + } + + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()> { + if configs.len() != self.count as usize { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + // First add IRQ routings for all the MSI interrupts. + let entries = create_msi_routing_entries(self.base, configs)?; + self.irq_routing.add(&entries)?; + + // Then register irqfds to the KVM module. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + Ok(()) + } + + fn disable(&self) -> Result<()> { + // First unregister all irqfds, so it won't trigger anymore. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + // Then tear down the IRQ routings for all the MSI interrupts. + let mut entries = Vec::with_capacity(self.count as usize); + for i in 0..self.count { + // Safe to unwrap because there's no legal way to break the mutex. + let msicfg = self.msi_configs[i as usize].config.lock().unwrap(); + let entry = new_msi_routing_entry(self.base + i, &*msicfg); + entries.push(entry); + } + self.irq_routing.remove(&entries)?; + + Ok(()) + } + + #[allow(irrefutable_let_patterns)] + fn update(&self, index: InterruptIndex, config: &InterruptSourceConfig) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + if let InterruptSourceConfig::MsiIrq(ref cfg) = config { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let entry = { + let mut msicfg = self.msi_configs[index as usize].config.lock().unwrap(); + msicfg.high_addr = cfg.high_addr; + msicfg.low_addr = cfg.low_addr; + msicfg.data = cfg.data; + new_msi_routing_entry(self.base + index, &*msicfg) + }; + self.irq_routing.modify(&entry) + } else { + Err(std::io::Error::from_raw_os_error(libc::EINVAL)) + } + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + if index >= self.count { + None + } else { + let msi_config = &self.msi_configs[index as usize]; + Some(&msi_config.irqfd) + } + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + // Assume that the caller will maintain the interrupt states and only call this function + // when suitable. + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + let msi_config = &self.msi_configs[index as usize]; + msi_config.irqfd.write(1) + } + + fn mask(&self, index: InterruptIndex) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.msi_configs[index as usize].irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn unmask(&self, index: InterruptIndex) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.msi_configs[index as usize].irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn get_pending_state(&self, index: InterruptIndex) -> bool { + if index >= self.count { + return false; + } + + // Peak the EventFd.count by reading and writing back. + // The irqfd must be in NON-BLOCKING mode. + let irqfd = &self.msi_configs[index as usize].irqfd; + match irqfd.read() { + Err(_) => false, + Ok(count) => { + if count != 0 { + if let Err(_) = irqfd.write(count) { + // Hope the caller will handle the pending state corrrectly, + // then no interrupt will be lost. 
+ //panic!("really no way to recover here!!!!"); + } + } + count != 0 + } + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(test)] +mod test { + use super::*; + use kvm_ioctls::{Kvm, VmFd}; + + fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + #[test] + #[allow(unreachable_patterns)] + fn test_msi_interrupt_group() { + let vmfd = Arc::new(create_vm_fd()); + assert!(vmfd.create_irq_chip().is_ok()); + + let rounting = Arc::new(KvmIrqRouting::new(vmfd.clone())); + assert!(rounting.initialize().is_ok()); + + let base = 168; + let count = 32; + let group = MsiIrq::new( + base, + count, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + vmfd.clone(), + rounting.clone(), + ) + .unwrap(); + let mut msi_fds = Vec::with_capacity(count as usize); + + match group.interrupt_type() { + InterruptSourceType::MsiIrq => {} + _ => { + panic!(); + } + } + + for _ in 0..count { + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + }; + msi_fds.push(InterruptSourceConfig::MsiIrq(msi_source_config)); + } + + assert!(group.enable(&msi_fds).is_ok()); + assert_eq!(group.len(), count); + assert_eq!(group.base(), base); + + for i in 0..count { + let msi_source_config = MsiIrqSourceConfig { + high_addr: i + 0x1234, + low_addr: i + 0x5678, + data: i + 0x9876, + }; + assert!(group.notifier(i).unwrap().write(1).is_ok()); + assert!(group.trigger(i).is_ok()); + assert!(group + .update(0, &InterruptSourceConfig::MsiIrq(msi_source_config)) + .is_ok()); + } + assert!(group.trigger(33).is_err()); + assert!(group.disable().is_ok()); + + assert!(MsiIrq::new( + base, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE + 1, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + vmfd.clone(), + rounting.clone() + ) + .is_err()); + assert!(MsiIrq::new( + 1100, + 1, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + vmfd.clone(), + rounting.clone() + ) + .is_err()); + } +} diff --git a/src/interrupt/kvm/vfio_msi_irq.rs b/src/interrupt/kvm/vfio_msi_irq.rs new file mode 100644 index 0000000..c2cf87c --- /dev/null +++ b/src/interrupt/kvm/vfio_msi_irq.rs @@ -0,0 +1,394 @@ +// Copyright (C) 2019-2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause + +//! Manage virtual device's PCI MSIx/Generic MSI interrupts based on Linux KVM and VFIO framework. +//! +//! The InterruptSourceGroup trait provides methods to inject virtual device interrupts into the +//! target virtual machine, so it's a type of interrupt event sink and doesn't handle the way to +//! generate interrupt events. On the other hand, a VFIO device may generate interrupt events, so +//! it's a type interrupt event source. +//! There are special optimizations to deliver an interrupt from a VFIO device to a virutal machine. +//! - Basic Mode. The virtual device driver register and eventfd to the VFIO driver, register +//! another irqfd to the KVM driver, and relays events from the eventfd to the irqfd. This is +//! not optimal for performance because every interrupt will cause a round-trip into the +//! userspace. +//! - Better Mode. The virtual device driver creates an irqfd, and register the irqfd to both the +//! VFIO driver and KVM driver. So an interrupt event will be relayed but the host kernel, but +//! it still causes VMExit for each interrupt. +//! - Best Mode. On x86 platforms with Posted Interrupt capability, the hardware could help to +//! deliver an hardware interrupt to a specific virtual machine, bypass the host kernel. 
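+//!
+//! A rough sketch of the "Better Mode" wiring performed by `VfioMsiIrq::enable()` below
+//! (illustrative only; `gsi`, `msi_index`, `msi_cfg` and the surrounding setup are placeholders):
+//!
+//! ```ignore
+//! // One irqfd per MSI vector, shared by KVM and the VFIO device.
+//! let irqfd = EventFd::new(EFD_NONBLOCK).unwrap();
+//!
+//! // KVM side: route the GSI as an MSI and bind the irqfd to it.
+//! irq_routing.add(&[new_msi_routing_entry(gsi, &msi_cfg)]).unwrap();
+//! vmfd.register_irqfd(&irqfd, gsi).unwrap();
+//!
+//! // VFIO side: ask the device to signal its MSI vectors through the same irqfd, so interrupts
+//! // flow device -> irqfd -> KVM -> guest without a userspace round-trip.
+//! vfio_device.enable_irq(msi_index, vec![&irqfd]).unwrap();
+//! ```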
+ +use vfio_ioctls::VfioError; + +use super::msi_generic::{create_msi_routing_entries, new_msi_routing_entry, MsiConfig}; +use super::*; + +pub(super) struct VfioMsiIrq { + base: InterruptIndex, + count: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + vfio_device: Arc, + vfio_index: u32, + msi_configs: Vec, +} + +impl VfioMsiIrq { + #[allow(clippy::new_ret_no_self)] + pub(super) fn new( + base: InterruptIndex, + count: InterruptIndex, + max_msi_irqs: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + vfio_device: Arc, + vfio_index: u32, + ) -> Result { + if count > max_msi_irqs || base >= MAX_IRQS || base + count > MAX_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + match vfio_device.get_irq_info(vfio_index) { + Some(ref info) => { + if info.count < count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + } + None => return Err(std::io::Error::from_raw_os_error(libc::EINVAL)), + } + + let mut msi_configs = Vec::with_capacity(count as usize); + for _ in 0..count { + msi_configs.push(MsiConfig::new()); + } + + Ok(VfioMsiIrq { + base, + count, + vmfd, + irq_routing, + vfio_device, + vfio_index, + msi_configs, + }) + } +} + +impl InterruptSourceGroup for VfioMsiIrq { + fn interrupt_type(&self) -> InterruptSourceType { + InterruptSourceType::VfioMsiIrq(self.vfio_device.clone(), self.vfio_index) + } + + fn len(&self) -> u32 { + self.count + } + + fn base(&self) -> u32 { + self.base + } + + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()> { + if configs.len() != self.count as usize { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + // First add IRQ routings for all the MSI interrupts. + let entries = create_msi_routing_entries(self.base, configs)?; + self.irq_routing.add(&entries)?; + + // Then register irqfds to the KVM module. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + // At last configure the VFIO hardware device. + let mut fds = Vec::with_capacity(self.count as usize); + for i in 0..self.count { + fds.push(&self.msi_configs[i as usize].irqfd); + } + self.vfio_device + .enable_irq(self.vfio_index, fds) + .map_err(map_vfio_error)?; + + Ok(()) + } + + fn disable(&self) -> Result<()> { + // First disable interrupts from the VFIO hardware device + self.vfio_device + .disable_irq(self.vfio_index) + .map_err(map_vfio_error)?; + + // Then unregister all irqfds, so it won't trigger anymore. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + // At last tear down the IRQ routings for all the MSI interrupts. + let mut entries = Vec::with_capacity(self.count as usize); + for i in 0..self.count { + // Safe to unwrap because there's no legal way to break the mutex. + let msicfg = self.msi_configs[i as usize].config.lock().unwrap(); + let entry = new_msi_routing_entry(self.base + i, &*msicfg); + entries.push(entry); + } + self.irq_routing.remove(&entries)?; + + Ok(()) + } + + fn update(&self, index: InterruptIndex, config: &InterruptSourceConfig) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + if let InterruptSourceConfig::MsiIrq(ref cfg) = config { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let entry = { + let mut msicfg = self.msi_configs[index as usize].config.lock().unwrap(); + msicfg.high_addr = cfg.high_addr; + msicfg.low_addr = cfg.low_addr; + msicfg.data = cfg.data; + + // Only need to update the KVM IRQ routings, no need to touch the VFIO device. + new_msi_routing_entry(self.base + index, &*msicfg) + }; + self.irq_routing.modify(&entry) + } else { + Err(std::io::Error::from_raw_os_error(libc::EINVAL)) + } + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + if index >= self.count { + None + } else { + let msi_config = &self.msi_configs[index as usize]; + Some(&msi_config.irqfd) + } + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + // Assume that the caller will maintain the interrupt states and only call this function + // when suitable. + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + self.vfio_device + .trigger_irq(self.vfio_index, index) + .map_err(map_vfio_error) + } +} + +impl std::fmt::Debug for VfioMsiIrq { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "VFIO MSI Irq, base {}, vfio_index {} ", + self.base, self.vfio_index + ) + } +} + +fn map_vfio_error(err: VfioError) -> std::io::Error { + match err { + VfioError::OpenContainer(e) => e, + VfioError::OpenGroup(e, _f) => e, + VfioError::KvmSetDeviceAttr(e) => from_sys_util_errno(e), + _ => std::io::Error::from_raw_os_error(libc::EIO), + } +} + +// Following unit test cases depend on hardware configuration, disabled by default. +#[cfg(test_disabled)] +mod test { + use super::*; + use kvm_ioctls::{DeviceFd, Kvm, VmFd}; + use std::path::Path; + use vfio_ioctls::{VfioContainer, VfioDevice}; + + const VFIO_PCI_INTX_IRQ_INDEX: u32 = 0; + const VFIO_PCI_MSI_IRQ_INDEX: u32 = 1; + const VFIO_PCI_MSIX_IRQ_INDEX: u32 = 2; + + const BASE: u32 = 0; + + fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + fn create_kvm_device(vm: Arc) -> DeviceFd { + let mut vfio_dev = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_VFIO, + fd: 0, + flags: 0, + }; + + vm.create_device(&mut vfio_dev).unwrap() + } + + fn vfio_msi_group_prepare( + vfio_index: u32, + pic_sys_path: &str, + ) -> (Arc, u32) { + let vmfd = Arc::new(create_vm_fd()); + assert!(vmfd.create_irq_chip().is_ok()); + let kvm_device = Arc::new(create_kvm_device(vmfd.clone())); + let sysfspath_eth1: &Path = Path::new(pic_sys_path); + let container = Arc::new(VfioContainer::new(kvm_device).unwrap()); + let vfio_device = Arc::new( + VfioDevice::new(sysfspath_eth1, container) + .map_err(|err| println!("{}", err)) + .unwrap(), + ); + + let count = match vfio_device.get_irq_info(vfio_index) { + Some(ref info) => info.count, + None => 0, + }; + + let rounting = Arc::new(KvmIrqRouting::new(vmfd.clone())); + + assert!(VfioMsiIrq::new( + BASE, + 33, + 32, + vmfd.clone(), + rounting.clone(), + vfio_device.clone(), + vfio_index + ) + .is_err()); + assert!(VfioMsiIrq::new( + 1100, + 1, + 32, + vmfd.clone(), + rounting.clone(), + vfio_device.clone(), + vfio_index + ) + .is_err()); + ( + Arc::new( + VfioMsiIrq::new( + BASE, + count, + 32, + vmfd.clone(), + rounting.clone(), + vfio_device.clone(), + vfio_index, + ) + .unwrap(), + ), + count, + ) + } + + fn vfio_msi_interrupt_group_opt(group: Arc, count: u32, index: u32) { + let mmio_base: u32 = 0xd000_0000; + let mut msi_fds: Vec = Vec::with_capacity(count as usize); + if index == VFIO_PCI_INTX_IRQ_INDEX { + 
msi_fds.push(InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})); + } else { + for i in 0..count { + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0, + low_addr: mmio_base + i * 0x1000, + data: 0x1000, + }; + msi_fds.push(InterruptSourceConfig::MsiIrq(msi_source_config)); + } + } + assert!(group.enable(&msi_fds).is_ok()); + assert_eq!(group.len(), count); + assert_eq!(group.base(), BASE); + + for i in 0..count { + assert!(group.irqfd(i).unwrap().write(1).is_ok()); + assert!(group.trigger(i, 0x168).is_err()); + assert!(group.trigger(i, 0).is_ok()); + assert!(group.ack(i, 0x168).is_err()); + assert!(group.ack(i, 0).is_ok()); + + if index == VFIO_PCI_INTX_IRQ_INDEX { + assert!(group + .update( + 0, + &InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {}) + ) + .is_ok()); + } else { + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0, + low_addr: mmio_base + i * 0x1000, + data: i + 0x1000, + }; + assert!(group + .update(i, &InterruptSourceConfig::MsiIrq(msi_source_config)) + .is_ok()); + } + } + assert!(group.trigger(33, 0x168).is_err()); + assert!(group.ack(33, 0x168).is_err()); + assert!(group.disable().is_ok()); + } + + #[test] + fn test_vfio_msi_interrupt_group_intx() { + let (group0, count) = vfio_msi_group_prepare( + VFIO_PCI_INTX_IRQ_INDEX, + "/sys/bus/pci/devices/0000:5c:00.0/", + ); + if count != 0 { + vfio_msi_interrupt_group_opt(group0, count, VFIO_PCI_INTX_IRQ_INDEX); + } + let (group1, count) = vfio_msi_group_prepare( + VFIO_PCI_INTX_IRQ_INDEX, + "/sys/bus/pci/devices/0000:5d:00.0/", + ); + if count != 0 { + vfio_msi_interrupt_group_opt(group1, count, VFIO_PCI_INTX_IRQ_INDEX); + } + } + + #[test] + fn test_vfio_msi_interrupt_group_msi() { + let (group0, count) = + vfio_msi_group_prepare(VFIO_PCI_MSI_IRQ_INDEX, "/sys/bus/pci/devices/0000:5c:00.0/"); + if count != 0 { + vfio_msi_interrupt_group_opt(group0, count, VFIO_PCI_MSI_IRQ_INDEX); + } + let (group1, count) = + vfio_msi_group_prepare(VFIO_PCI_MSI_IRQ_INDEX, "/sys/bus/pci/devices/0000:5d:00.0/"); + if count != 0 { + vfio_msi_interrupt_group_opt(group1, count, VFIO_PCI_MSI_IRQ_INDEX); + } + } + + #[test] + #[ignore] + fn test_vfio_msi_interrupt_group_msix() { + let (group0, count) = vfio_msi_group_prepare( + VFIO_PCI_MSIX_IRQ_INDEX, + "/sys/bus/pci/devices/0000:5c:00.0/", + ); + if count != 0 { + vfio_msi_interrupt_group_opt(group0, count, VFIO_PCI_MSIX_IRQ_INDEX); + } + let (group1, count) = vfio_msi_group_prepare( + VFIO_PCI_MSIX_IRQ_INDEX, + "/sys/bus/pci/devices/0000:5d:00.0/", + ); + if count != 0 { + vfio_msi_interrupt_group_opt(group1, count, VFIO_PCI_MSIX_IRQ_INDEX); + } + } +} diff --git a/src/interrupt/manager.rs b/src/interrupt/manager.rs new file mode 100644 index 0000000..73d42ea --- /dev/null +++ b/src/interrupt/manager.rs @@ -0,0 +1,644 @@ +// Copyright (C) 2019-2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +/// Interrupt manager to manage and switch device interrupt modes. +/// +/// A device may support multiple interrupt modes. For example, a PCI device may support legacy, +/// PCI MSI and PCI MSIx interrupts. This interrupt manager helps a device backend driver to manage +/// its interrupts and provides interfaces to switch interrupt working modes. 
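As an editorial aside (not part of this patch), the comment above translates into roughly the following usage flow for the `DeviceInterruptManager` defined in this file. The sketch assumes the `msi-irq` feature, an `InterruptManager` implementation such as `Arc<KvmIrqManager>`, and device resources that include generic MSI interrupt numbers; the function name and the MSI address/data values are purely illustrative.

```rust
use vm_device::interrupt::{DeviceInterruptManager, DeviceInterruptMode, InterruptManager};
use vm_device::resources::DeviceResources;

// Drive a device's interrupts through the configuration stage and into the runtime stage.
fn configure_device_interrupts<T: InterruptManager>(
    intr_mgr: T,
    resources: &DeviceResources,
) -> std::io::Result<()> {
    let mut mgr = DeviceInterruptManager::new(intr_mgr, resources)?;

    // Configuration stage: choose a working mode and fill in the MSI messages.
    mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq)?;
    mgr.set_msi_low_address(0, 0xfee0_0000)?;
    mgr.set_msi_data(0, 0x20)?;

    // Runtime stage: mode switching is rejected from here on.
    mgr.enable()?;
    if let Some(group) = mgr.get_group() {
        // The group is Send + Sync, so I/O threads may trigger interrupts concurrently.
        group.trigger(0)?;
    }

    // Disable the interrupts and return to the configuration stage.
    mgr.reset()
}
```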
+use std::io::{Error, Result}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; +use std::usize; + +#[cfg(feature = "legacy-irq")] +use super::LegacyIrqSourceConfig; +#[cfg(feature = "msi-irq")] +use super::MsiIrqSourceConfig; +use super::{InterruptManager, InterruptSourceConfig, InterruptSourceGroup, InterruptSourceType}; +use crate::resources::DeviceResources; + +#[cfg(feature = "legacy-irq")] +const LEGACY_CONFIGS: [InterruptSourceConfig; 1] = + [InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})]; + +/// Device interrupt working modes. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum DeviceInterruptMode { + /// The device interrupt manager has been disabled. + Disabled = 0, + /// The device interrupt manager works in legacy irq mode. + LegacyIrq = 1, + /// The device interrupt manager works in generic MSI mode. + GenericMsiIrq = 2, + /// The device interrupt manager works in PCI MSI mode. + PciMsiIrq = 3, + /// The device interrupt manager works in PCI MSI-x mode. + PciMsixIrq = 4, +} + +/// A struct to manage interrupts and interrupt modes for a device. +/// +/// The interrupt manager may support multiple working mode. For example, an interrupt manager +/// for a PCI device may work in legacy mode, PCI MSI mode or PCI MSIx mode. Under certain +/// conditions, the interrupt manager may switch between interrupt working modes. To simplify +/// implementation, switching working mode is only supported at configuration stage and will be +/// disabled at runtime stage. The DeviceInterruptManager::enable() switches the interrupt manager +/// from configuration stage into runtime stage. And DeviceInterruptManager::reset() switches +/// from runtime stage back to initial configuration stage. +pub struct DeviceInterruptManager { + mode: DeviceInterruptMode, + activated: bool, + current_idx: usize, + mode2idx: [usize; 5], + intr_mgr: T, + intr_groups: Vec>>, + #[cfg(feature = "msi-irq")] + msi_config: Vec, +} + +impl DeviceInterruptManager { + /// Create an interrupt manager for a device. + /// + /// # Arguments + /// * `intr_mgr`: underline interrupt manager to allocate/free interrupt groups. + /// * `resources`: resources assigned to the device, including assigned interrupt resources. 
+    pub fn new(intr_mgr: T, resources: &DeviceResources) -> Result<Self> {
+        let mut mgr = DeviceInterruptManager {
+            mode: DeviceInterruptMode::Disabled,
+            activated: false,
+            current_idx: usize::MAX,
+            mode2idx: [usize::MAX; 5],
+            intr_mgr,
+            intr_groups: Vec::new(),
+            #[cfg(feature = "msi-irq")]
+            msi_config: Vec::new(),
+        };
+
+        #[cfg(feature = "legacy-irq")]
+        {
+            if let Some(irq) = resources.get_legacy_irq() {
+                let group = mgr
+                    .intr_mgr
+                    .create_group(InterruptSourceType::LegacyIrq, irq, 1)?;
+                mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize] = mgr.intr_groups.len();
+                mgr.intr_groups.push(group);
+            }
+        }
+
+        #[cfg(feature = "msi-irq")]
+        {
+            if let Some(msi) = resources.get_generic_msi_irqs() {
+                let group = mgr
+                    .intr_mgr
+                    .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?;
+                mgr.resize_msi_config_space(group.len());
+                mgr.mode2idx[DeviceInterruptMode::GenericMsiIrq as usize] = mgr.intr_groups.len();
+                mgr.intr_groups.push(group);
+            }
+
+            if let Some(msi) = resources.get_pci_msi_irqs() {
+                let group = mgr
+                    .intr_mgr
+                    .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?;
+                mgr.resize_msi_config_space(group.len());
+                mgr.mode2idx[DeviceInterruptMode::PciMsiIrq as usize] = mgr.intr_groups.len();
+                mgr.intr_groups.push(group);
+            }
+
+            if let Some(msi) = resources.get_pci_msix_irqs() {
+                let group = mgr
+                    .intr_mgr
+                    .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?;
+                mgr.resize_msi_config_space(group.len());
+                mgr.mode2idx[DeviceInterruptMode::PciMsixIrq as usize] = mgr.intr_groups.len();
+                mgr.intr_groups.push(group);
+            }
+        }
+
+        Ok(mgr)
+    }
+
+    /// Check whether the interrupt manager has been activated.
+    pub fn is_enabled(&self) -> bool {
+        self.activated
+    }
+
+    /// Switch the interrupt manager from configuration stage into runtime stage.
+    ///
+    /// The working mode can only be changed at the configuration stage, and all requests to
+    /// change the working mode at the runtime stage will be rejected.
+    /// If the interrupt manager is still in DISABLED mode when DeviceInterruptManager::enable()
+    /// is called, it will be put into LEGACY mode if LEGACY mode is supported.
+    pub fn enable(&mut self) -> Result<()> {
+        if self.activated {
+            return Ok(());
+        }
+
+        // Enter Legacy mode by default if Legacy mode is supported.
+        if self.mode == DeviceInterruptMode::Disabled
+            && self.mode2idx[DeviceInterruptMode::LegacyIrq as usize] != usize::MAX
+        {
+            self.set_working_mode(DeviceInterruptMode::LegacyIrq)?;
+        }
+        if self.mode == DeviceInterruptMode::Disabled {
+            return Err(Error::from_raw_os_error(libc::EINVAL));
+        }
+
+        self.intr_groups[self.current_idx].enable(self.get_configs(self.mode))?;
+        self.activated = true;
+
+        Ok(())
+    }
+
+    /// Switch the interrupt manager from runtime stage back into initial configuration stage.
+    ///
+    /// Currently we don't track the usage of the interrupt group objects given out by
+    /// `get_group()`, so the caller is responsible for releasing all interrupt group object
+    /// references before calling DeviceInterruptManager::reset().
+    pub fn reset(&mut self) -> Result<()> {
+        if self.activated {
+            self.activated = false;
+            self.intr_groups[self.current_idx].disable()?;
+        }
+        self.set_working_mode(DeviceInterruptMode::Disabled)?;
+
+        Ok(())
+    }
+
+    /// Get the current interrupt working mode.
+    pub fn get_working_mode(&mut self) -> DeviceInterruptMode {
+        self.mode
+    }
+
+    /// Switch interrupt working mode.
+    ///
+    /// Currently switching working mode is only supported during the device configuration stage
+    /// and will always return failure if called during the device runtime stage. The device
+    /// switches from configuration stage to runtime stage by invoking
+    /// `DeviceInterruptManager::enable()`. With this constraint, device drivers may call
+    /// `DeviceInterruptManager::get_group()` to get the underlying active interrupt group object,
+    /// and directly call the interrupt group object's methods to trigger/acknowledge interrupts.
+    ///
+    /// This is a key design decision for optimizing performance. Though the DeviceInterruptManager
+    /// object itself is not multi-thread safe and must be protected from concurrent access by the
+    /// caller, the interrupt source group object is multi-thread safe and can be called
+    /// concurrently to trigger/acknowledge interrupts. This design helps to improve performance
+    /// for MSI interrupts.
+    ///
+    /// # Arguments
+    /// * `mode`: target working mode.
+    pub fn set_working_mode(&mut self, mode: DeviceInterruptMode) -> Result<()> {
+        // Can't switch mode again once enabled.
+        if self.activated {
+            return Err(Error::from_raw_os_error(libc::EINVAL));
+        }
+
+        if mode != self.mode {
+            // Supported state transitions:
+            // - other state -> DISABLED
+            // - DISABLED -> other
+            // - non-legacy -> legacy
+            // - legacy -> non-legacy
+            if self.mode != DeviceInterruptMode::Disabled
+                && self.mode != DeviceInterruptMode::LegacyIrq
+                && mode != DeviceInterruptMode::LegacyIrq
+                && mode != DeviceInterruptMode::Disabled
+            {
+                return Err(Error::from_raw_os_error(libc::EINVAL));
+            }
+
+            // Then enter the new state.
+            if mode != DeviceInterruptMode::Disabled {
+                self.reset_configs(mode);
+                self.current_idx = self.mode2idx[mode as usize];
+            }
+            self.mode = mode;
+        }
+
+        Ok(())
+    }
+
+    /// Get the underlying interrupt source group object, so the device driver can concurrently
+    /// trigger/acknowledge interrupts by using the returned group object.
+    pub fn get_group(&self) -> Option<Arc<Box<dyn InterruptSourceGroup>>> {
+        if !self.activated || self.mode == DeviceInterruptMode::Disabled {
+            None
+        } else {
+            Some(self.intr_groups[self.current_idx].clone())
+        }
+    }
+
+    /// Reconfigure a specific interrupt in the current working mode at configuration or runtime stage.
+    ///
+    /// It's mainly used to reconfigure Generic MSI/PCI MSI/PCI MSIx interrupts. Legacy
+    /// interrupts don't support reconfiguration yet.
+ #[allow(unused_variables)] + pub fn update(&mut self, index: u32) -> Result<()> { + if !self.activated { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + + match self.mode { + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + let group = &self.intr_groups[self.current_idx as usize]; + if index >= group.len() || index >= self.msi_config.len() as u32 { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + group.update(index, &self.msi_config[index as usize])?; + Ok(()) + } + _ => Err(Error::from_raw_os_error(libc::EINVAL)), + } + } + + fn get_configs(&self, mode: DeviceInterruptMode) -> &[InterruptSourceConfig] { + match mode { + #[cfg(feature = "legacy-irq")] + DeviceInterruptMode::LegacyIrq => &LEGACY_CONFIGS[..], + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + let idx = self.mode2idx[mode as usize]; + let group_len = self.intr_groups[idx].len() as usize; + &self.msi_config[0..group_len] + } + _ => panic!("unhandled interrupt type in get_configs()"), + } + } + + fn reset_configs(&mut self, mode: DeviceInterruptMode) { + match mode { + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + self.msi_config = vec![ + InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig::default()); + self.msi_config.len() + ]; + } + _ => {} + } + } +} + +#[cfg(feature = "msi-irq")] +impl DeviceInterruptManager { + /// Set the high address for a MSI message. + #[allow(irrefutable_let_patterns)] + pub fn set_msi_high_address(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.high_addr = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Set the low address for a MSI message. + #[allow(irrefutable_let_patterns)] + pub fn set_msi_low_address(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.low_addr = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Set the data for a MSI message. + #[allow(irrefutable_let_patterns)] + pub fn set_msi_data(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.data = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + fn resize_msi_config_space(&mut self, size: u32) { + if self.msi_config.len() < size as usize { + self.msi_config = + vec![InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig::default()); size as usize]; + } + } +} + +/// Struct to implement a 32-bit interrupt status register. +pub struct InterruptStatusRegister32 { + status: AtomicU32, +} + +impl InterruptStatusRegister32 { + /// Create a status register instance. + pub fn new() -> Self { + InterruptStatusRegister32 { + status: AtomicU32::new(0), + } + } + + /// Read current value of the status register. + pub fn read(&self) -> u32 { + self.status.load(Ordering::SeqCst) + } + + /// Write value to the status register. 
+ pub fn write(&self, value: u32) { + self.status.store(value, Ordering::SeqCst); + } + + /// Read current value and reset the status register to 0. + pub fn read_and_clear(&self) -> u32 { + self.status.swap(0, Ordering::SeqCst) + } + + /// Set bits into `value`. + pub fn set_bits(&self, value: u32) { + self.status.fetch_or(value, Ordering::SeqCst); + } + + /// Clear bits present in `value`. + pub fn clear_bits(&self, value: u32) { + self.status.fetch_and(!value, Ordering::SeqCst); + } +} + +#[cfg(all(test, feature = "kvm-legacy-irq", feature = "kvm-msi-irq"))] +mod tests { + use super::*; + use crate::interrupt::KvmIrqManager; + use crate::resources::{DeviceResources, MsiIrqType, Resource}; + use kvm_ioctls::{Kvm, VmFd}; + use std::sync::Arc; + + fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + fn create_init_resources() -> DeviceResources { + let mut resources = DeviceResources::new(); + + resources.append(Resource::MmioAddressRange { + base: 0xd000_0000, + size: 0x10_0000, + }); + resources.append(Resource::LegacyIrq(0)); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::GenericMsi, + base: 0x200, + size: 0x10, + }); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::PciMsi, + base: 0x100, + size: 0x20, + }); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::PciMsix, + base: 0x300, + size: 0x20, + }); + + resources + } + + fn create_interrupt_manager() -> DeviceInterruptManager> { + let vmfd = Arc::new(create_vm_fd()); + assert!(vmfd.create_irq_chip().is_ok()); + let intr_mgr = Arc::new(KvmIrqManager::new(vmfd.clone())); + + let resource = create_init_resources(); + assert!(intr_mgr.initialize().is_ok()); + DeviceInterruptManager::new(intr_mgr.clone(), &resource).unwrap() + } + + #[test] + fn test_create_device_interrupt_manager() { + let mut mgr = create_interrupt_manager(); + + assert_eq!(mgr.mode, DeviceInterruptMode::Disabled); + assert_eq!(mgr.activated, false); + assert_eq!(mgr.current_idx, usize::MAX); + assert_eq!(mgr.intr_groups.len(), 4); + assert_eq!(mgr.is_enabled(), false); + assert!(mgr.get_group().is_none()); + + // Enter legacy mode by default + mgr.enable().unwrap(); + assert_eq!(mgr.is_enabled(), true); + assert_eq!( + mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize], + mgr.current_idx + ); + assert!(mgr.get_group().is_some()); + + // Disable interrupt manager + mgr.reset().unwrap(); + assert_eq!(mgr.is_enabled(), false); + assert_eq!( + mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize], + mgr.current_idx + ); + assert_eq!(mgr.get_working_mode(), DeviceInterruptMode::Disabled); + assert!(mgr.get_group().is_none()); + } + + #[test] + fn test_device_interrupt_manager_switch_mode() { + let mut mgr = create_interrupt_manager(); + + // Can't switch working mode in enabled state. 
+ mgr.enable().unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + mgr.reset().unwrap(); + + // Switch from LEGACY to PciMsi mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from LEGACY to PciMsix mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from LEGACY to GenericMsi mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + + // Switch from DISABLED to PciMsi mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from DISABLED to PciMsix mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from DISABLED to GenericMsi mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + } + + #[test] + fn test_error() { + let mut interrupt_manager = create_interrupt_manager(); + + assert!(interrupt_manager.set_msi_data(512, 0).is_err()); + assert!(interrupt_manager.set_msi_data(0, 0).is_ok()); + assert!(interrupt_manager.set_msi_high_address(512, 0).is_err()); + assert!(interrupt_manager.set_msi_high_address(0, 0).is_ok()); + assert!(interrupt_manager.set_msi_low_address(512, 0).is_err()); + assert!(interrupt_manager.set_msi_low_address(0, 0).is_ok()); + + interrupt_manager.activated = true; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::Disabled) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_err()); + assert!(interrupt_manager + 
.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::PciMsixIrq) + .is_err()); + } + + #[test] + fn test_disable2legacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_ok()); + } + + #[test] + fn test_disable2nonlegacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_ok()); + } + + #[test] + fn test_legacy2nonlegacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_ok()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_ok()); + } + + #[test] + fn test_nonlegacy2legacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_ok()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_ok()); + } + + #[test] + fn test_update() { + let mut interrupt_manager = create_interrupt_manager(); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_ok()); + assert!(interrupt_manager.enable().is_ok()); + assert!(interrupt_manager.update(0x10).is_err()); + assert!(interrupt_manager.update(0x01).is_ok()); + assert!(interrupt_manager.reset().is_ok()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_ok()); + assert!(interrupt_manager.update(0x10).is_err()); + } + + #[test] + fn test_interrupt_status_register() { + let status = InterruptStatusRegister32::new(); + + assert_eq!(status.read(), 0); + status.write(0x13); + assert_eq!(status.read(), 0x13); + status.clear_bits(0x11); + assert_eq!(status.read(), 0x2); + status.set_bits(0x100); + assert_eq!(status.read_and_clear(), 0x102); + assert_eq!(status.read(), 0); + } +} diff --git a/src/interrupt/mod.rs b/src/interrupt/mod.rs new file mode 100644 index 0000000..28f9921 --- /dev/null +++ b/src/interrupt/mod.rs @@ -0,0 +1,244 @@ +// Copyright (C) 2019-2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! 
Hardware interrupts are used by devices to communicate that they require attention from the
+//! operating system, or from a bare-metal program running on the CPU if there is no OS. The act of
+//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices
+//! are usually associated with different interrupts using a unique value associated with each
+//! interrupt. This makes it possible to know which hardware device caused which interrupt.
+//! These interrupt values are often called IRQ lines, or just interrupt lines.
+//!
+//! Nowadays, IRQ lines are not the only mechanism to deliver device interrupts to processors.
+//! MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts)
+//! is a commonly used alternative in-band method of signaling an interrupt, using special
+//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines.
+//! While more complex to implement in a device, message signaled interrupts have some significant
+//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are
+//! supported by the PCI bus since version 2.2, and by the later PCI Express bus. Some non-PCI
+//! architectures also use message signaled interrupts.
+//!
+//! While IRQ is a term commonly used by operating systems when dealing with hardware
+//! interrupts, the IRQ numbers managed by OSes are independent of the ones managed by the VMM.
+//! For simplicity's sake, the term `Interrupt Source` is used instead of IRQ to represent both
+//! pin-based interrupts and MSI interrupts.
+//!
+//! A device may support multiple types of interrupts, and each type of interrupt may support one
+//! or multiple interrupt sources. For example, a PCI device may support:
+//! * Legacy Irq: exactly one interrupt source.
+//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources.
+//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources.
+//!
+//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source.
+//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices.
+//! To decouple the vm-device crate from the ID allocator, the vm-device crate does not itself
+//! allocate or free Interrupt Source IDs; it only makes use of assigned IDs.
+//!
+//! The overall flow to deal with interrupts is:
+//! * the VMM creates an interrupt manager
+//! * the VMM creates a device manager, passing on a reference to the interrupt manager
+//! * the device manager passes on a reference to the interrupt manager to all registered devices
+//! * the guest kernel loads drivers for the virtual devices
+//! * the guest device driver determines the type and number of interrupts needed, and updates the
+//! device configuration
+//! * the virtual device backend requests the interrupt manager to create an interrupt group
+//! according to guest configuration information
+
+use std::io::Error;
+use std::ops::Deref;
+use std::sync::Arc;
+
+#[cfg(feature = "vfio-msi-irq")]
+use vfio_ioctls::VfioDevice;
+use vmm_sys_util::eventfd::EventFd;
+
+mod manager;
+pub use manager::{DeviceInterruptManager, DeviceInterruptMode, InterruptStatusRegister32};
+
+/// Reuse std::io::Result to simplify interoperability among crates.
+pub type Result<T> = std::io::Result<T>;
+
+/// Data type to store an interrupt source identifier.
+pub type InterruptIndex = u32;
+
+/// Type of interrupt source.
+#[derive(Clone)] +pub enum InterruptSourceType { + #[cfg(feature = "legacy-irq")] + /// Legacy Pin-based Interrupt. + /// On x86 platforms, legacy interrupts are routed through 8259 PICs and/or IOAPICs. + LegacyIrq, + #[cfg(feature = "msi-irq")] + /// Message Signaled Interrupt (PCI MSI/PCI MSIx etc). + /// Some non-PCI devices (like HPET on x86) make use of generic MSI in platform specific ways. + MsiIrq, + #[cfg(feature = "vfio-msi-irq")] + /// Message Signalled Interrupt for PCI MSI/PCI MSIx based VFIO devices. + VfioMsiIrq(Arc, u32), +} + +/// Configuration data for an interrupt source. +#[derive(Clone, Debug)] +pub enum InterruptSourceConfig { + #[cfg(feature = "legacy-irq")] + /// Configuration data for Legacy interrupts. + LegacyIrq(LegacyIrqSourceConfig), + #[cfg(feature = "msi-irq")] + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts means those interrupts routed through PICs or IOAPICs. +#[cfg(feature = "legacy-irq")] +#[derive(Clone, Debug)] +pub struct LegacyIrqSourceConfig {} + +/// Configuration data for GenericMsi, PciMsi, PciMsix interrupts. +#[cfg(feature = "msi-irq")] +#[derive(Copy, Clone, Debug, Default)] +pub struct MsiIrqSourceConfig { + /// High address to deliver message signaled interrupt. + pub high_addr: u32, + /// Low address to deliver message signaled interrupt. + pub low_addr: u32, + /// Data to write to deliver message signaled interrupt. + pub data: u32, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// The InterruptManager implementations should protect itself from concurrent accesses internally, +/// so it could be invoked from multi-threaded context. +pub trait InterruptManager { + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// # Arguments + /// * type_: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group( + &self, + type_: InterruptSourceType, + base: InterruptIndex, + count: InterruptIndex, + ) -> Result>>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. + fn destroy_group(&self, group: Arc>) -> Result<()>; +} + +impl InterruptManager for Arc { + fn create_group( + &self, + type_: InterruptSourceType, + base: u32, + count: u32, + ) -> std::result::Result>, Error> { + self.deref().create_group(type_, base, count) + } + + fn destroy_group( + &self, + group: Arc>, + ) -> std::result::Result<(), Error> { + self.deref().destroy_group(group) + } +} + +/// Trait to manage a group of interrupt sources for a device. +/// +/// A device may support several types of interrupts, and each type of interrupt may contain one or +/// multiple continuous interrupt sources. For example, a PCI device may concurrently support: +/// * Legacy Irq: exactly one interrupt source. 
+/// * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +/// * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +/// +/// PCI MSI interrupts of a device may not be configured individually, and must configured as a +/// whole block. So all interrupts of the same type of a device are abstracted as an +/// [InterruptSourceGroup](trait.InterruptSourceGroup.html) object, instead of abstracting each +/// interrupt source as a distinct InterruptSource. +#[allow(clippy::len_without_is_empty)] +#[allow(clippy::trivially_copy_pass_by_ref)] +pub trait InterruptSourceGroup: Send + Sync { + /// Get type of interrupt sources managed by the group. + fn interrupt_type(&self) -> InterruptSourceType; + + /// Get number of interrupt sources managed by the group. + fn len(&self) -> InterruptIndex; + + /// Get base of the assigned Interrupt Source Identifiers. + fn base(&self) -> InterruptIndex; + + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()>; + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()>; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + fn update(&self, index: InterruptIndex, config: &InterruptSourceConfig) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. + fn notifier(&self, _index: InterruptIndex) -> Option<&EventFd> { + None + } + + /// Inject an interrupt from this interrupt source into the guest. + /// + /// If the interrupt has an associated `interrupt_status` register, all bits set in `flag` + /// will be atomically ORed into the `interrupt_status` register. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Mask an interrupt from this interrupt source. + fn mask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Unmask an interrupt from this interrupt source. + fn unmask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Check whether there's pending interrupt. + fn get_pending_state(&self, _index: InterruptIndex) -> bool { + false + } +} + +#[cfg(feature = "kvm-irq")] +mod kvm; +#[cfg(feature = "kvm-irq")] +pub use self::kvm::KvmIrqManager; diff --git a/src/lib.rs b/src/lib.rs index 9ef255d..ce2b872 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,33 +1,377 @@ // Copyright © 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +#![deny(missing_docs)] + //! rust-vmm device model. -extern crate vm_memory; +#[cfg(feature = "kvm-vfio-msi-irq")] +extern crate vfio_ioctls; -use vm_memory::GuestAddress; +use std::cmp::{Ord, PartialOrd}; +use std::sync::Mutex; +pub mod device_manager; +pub mod interrupt; pub mod resources; +use self::resources::DeviceResources; + +/// IO Size. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct IoSize(pub u64); + +impl IoSize { + /// Get the raw value as u64 to make operation simple. 
+ #[inline] + pub fn raw_value(self) -> u64 { + self.0 + } +} + +impl From for IoSize { + #[inline] + fn from(size: u64) -> Self { + IoSize(size) + } +} + +impl From for u64 { + #[inline] + fn from(size: IoSize) -> Self { + size.0 + } +} + /// IO Addresses. -#[derive(Debug, Copy, Clone)] -pub enum IoAddress { +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct IoAddress(pub u64); + +impl IoAddress { + /// Get the raw value of IO Address to make operation simple. + #[inline] + pub fn raw_value(self) -> u64 { + self.0 + } +} + +impl From for IoAddress { + #[inline] + fn from(addr: u64) -> Self { + IoAddress(addr) + } +} + +impl From for u64 { + #[inline] + fn from(addr: IoAddress) -> Self { + addr.0 + } +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod x86 { + use super::{IoAddress, IoSize}; + use std::convert::TryFrom; + + type PioAddressType = u16; + + #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] + /// Port I/O size. + pub struct PioSize(pub PioAddressType); + + impl PioSize { + /// Get the raw value as u64 to make operation simple. + #[inline] + pub fn raw_value(self) -> PioAddressType { + self.0 + } + } + + impl From for PioSize { + #[inline] + fn from(size: PioAddressType) -> Self { + PioSize(size) + } + } + + impl From for PioAddressType { + #[inline] + fn from(size: PioSize) -> Self { + size.0 + } + } + + impl TryFrom for PioSize { + type Error = IoSize; + + #[inline] + fn try_from(size: IoSize) -> Result { + if size.raw_value() <= std::u16::MAX as u64 { + Ok(PioSize(size.raw_value() as PioAddressType)) + } else { + Err(size) + } + } + } + + impl From for IoSize { + #[inline] + fn from(size: PioSize) -> Self { + IoSize(size.raw_value() as u64) + } + } + /// Port I/O address. - Pio(u16), + #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] + pub struct PioAddress(pub PioAddressType); - /// Memory mapped I/O address. - Mmio(GuestAddress), + impl PioAddress { + /// Get the raw value of IO Address to make operation simple. + #[inline] + pub fn raw_value(self) -> PioAddressType { + self.0 + } + } + + impl From for PioAddress { + #[inline] + fn from(addr: PioAddressType) -> Self { + PioAddress(addr) + } + } + + impl From for PioAddressType { + #[inline] + fn from(addr: PioAddress) -> Self { + addr.0 + } + } + + impl TryFrom for PioAddress { + type Error = IoAddress; + + #[inline] + fn try_from(addr: IoAddress) -> Result { + if addr.0 <= std::u16::MAX as u64 { + Ok(PioAddress(addr.raw_value() as PioAddressType)) + } else { + Err(addr) + } + } + } + + impl From for IoAddress { + #[inline] + fn from(addr: PioAddress) -> Self { + IoAddress(addr.raw_value() as u64) + } + } } -/// Device IO trait. +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub use self::x86::{PioAddress, PioSize}; + +/// IO Addresses. +/// Device IO trait adopting interior mutability pattern. +/// /// A device supporting memory based I/O should implement this trait, then /// register itself against the different IO type ranges it handles. /// The VMM will then dispatch IO (PIO or MMIO) VM exits by calling into the /// registered devices read or write method from this trait. -pub trait DeviceIo: Send { - /// Read from the guest physical address `addr` to `data`. - fn read(&mut self, addr: IoAddress, data: &mut [u8]); +/// +/// The DeviceIo trait adopts the interior mutability pattern so we can get a +/// real concurrent multiple threads handling. 
For device backend drivers not +/// focusing on high performance, they may use the Mutex +/// adapter to simplify implementation. +#[allow(unused_variables)] +pub trait DeviceIo: Send + Sync { + /// Read from the guest physical address `base`, starting at `offset`. + /// Result is placed in `data`. + fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) {} + + /// Write `data` to the guest physical address `base`, starting from `offset`. + fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) {} + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + /// Read from the guest physical address `base`, starting at `offset`. + /// Result is placed in `data`. + fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) {} + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + /// Write `data` to the guest physical address `base`, starting from `offset`. + fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) {} + + /// Get resources assigned to the device. + fn get_assigned_resources(&self) -> DeviceResources { + DeviceResources::new() + } + + /// Get the IO resources which will be trapped by the DeviceManager. + /// + /// All none Mmio/Pio resources in the returned resource list will be ignored. + fn get_trapped_io_resources(&self) -> DeviceResources { + self.get_assigned_resources() + } +} + +/// Device IO trait without interior mutability. +/// +/// Many device backend drivers will mutate itself when handling IO requests. +/// The DeviceIo trait assumes interior mutability, but it's a little complex +/// to support interior mutability. So the Mutex adapter may be +/// used to ease device backend driver implementations. +/// +/// The Mutex adapter is an zero overhead abstraction without +/// performance penalty. +#[allow(unused_variables)] +pub trait DeviceIoMut: Send { + /// Read from the guest physical address `base`, starting at `offset`. + /// Result is placed in `data`. + fn read(&mut self, base: IoAddress, offset: IoAddress, data: &mut [u8]) {} + + /// Write `data` to the guest physical address `base`, starting from `offset`. + fn write(&mut self, base: IoAddress, offset: IoAddress, data: &[u8]) {} + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + /// Read from the guest physical address `base`, starting at `offset`. + /// Result is placed in `data`. + fn pio_read(&mut self, base: PioAddress, offset: PioAddress, data: &mut [u8]) {} + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + /// Write `data` to the guest physical address `base`, starting from `offset`. + fn pio_write(&mut self, base: PioAddress, offset: PioAddress, data: &[u8]) {} + + /// Get resources assigned to the device. + fn get_assigned_resources(&self) -> DeviceResources { + DeviceResources::new() + } + + /// Get the IO resources which will be trapped by the DeviceManager. + /// + /// All none Mmio/Pio resources in the returned resource list will be ignored. + fn get_trapped_io_resources(&self) -> DeviceResources { + self.get_assigned_resources() + } +} + +impl DeviceIo for Mutex { + fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().read(base, offset, data) + } + + fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. 
+ self.lock().unwrap().write(base, offset, data) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().pio_read(base, offset, data) + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().pio_write(base, offset, data) + } + + fn get_assigned_resources(&self) -> DeviceResources { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().get_assigned_resources() + } + + fn get_trapped_io_resources(&self) -> DeviceResources { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().get_trapped_io_resources() + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + use std::convert::TryFrom; + use std::sync::Arc; + + #[derive(Default)] + struct MockDevice { + data: u8, + } + + impl DeviceIoMut for MockDevice { + fn read(&mut self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) { + data[0] = self.data; + } + + fn write(&mut self, _base: IoAddress, _offset: IoAddress, data: &[u8]) { + self.data = data[0]; + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_read(&mut self, _base: PioAddress, _offset: PioAddress, data: &mut [u8]) { + data[0] = self.data; + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + fn pio_write(&mut self, _base: PioAddress, _offset: PioAddress, data: &[u8]) { + self.data = data[0]; + } + } + + fn register_device(device: Arc) { + device.write(IoAddress(0), IoAddress(0), &[0x10u8]); + let mut buf = [0x0u8]; + device.read(IoAddress(0), IoAddress(0), &mut buf); + assert_eq!(buf[0], 0x10); + } + + #[test] + fn test_device_io_mut_adapter() { + let device_mut = Arc::new(Mutex::new(MockDevice::default())); + + register_device(device_mut.clone()); + assert_eq!(device_mut.lock().unwrap().data, 0x010); + } + + #[test] + fn test_io_data_struct() { + let io_size = IoSize::from(0x1111u64); + assert_eq!(io_size.raw_value(), 0x1111u64); + assert_eq!(u64::from(io_size), 0x1111u64); + assert_eq!(io_size, io_size.clone()); + let io_size1 = IoSize::from(0x1112u64); + assert!(io_size < io_size1); + + let io_addr = IoAddress::from(0x1234u64); + assert_eq!(io_addr.raw_value(), 0x1234u64); + assert_eq!(u64::from(io_addr), 0x1234u64); + assert_eq!(io_addr, io_addr.clone()); + let io_addr1 = IoAddress::from(0x1235u64); + assert!(io_addr < io_addr1); + } + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[test] + fn test_pio_data_struct() { + let pio_size = PioSize::from(0x1111u16); + assert_eq!(pio_size.raw_value(), 0x1111u16); + assert_eq!(u16::from(pio_size), 0x1111u16); + assert_eq!(pio_size, pio_size.clone()); + let pio_size1 = PioSize::from(0x1112u16); + assert!(pio_size < pio_size1); + + let pio_addr = PioAddress::from(0x1234u16); + assert_eq!(pio_addr.raw_value(), 0x1234u16); + assert_eq!(u16::from(pio_addr), 0x1234u16); + assert_eq!(pio_addr, pio_addr.clone()); + let pio_addr1 = PioAddress::from(0x1235u16); + assert!(pio_addr < pio_addr1); - /// Write `data` to the guest physical address `addr`. 
- fn write(&mut self, addr: IoAddress, data: &[u8]); + assert!(PioAddress::try_from(IoAddress::from(0x123456u64)).is_err()); + assert!(PioAddress::try_from(IoAddress::from(0x1234u64)).is_ok()); + assert_eq!(IoAddress::from(pio_addr).raw_value(), 0x1234u64); + } } diff --git a/src/resources.rs b/src/resources.rs index 5ae37dd..89f383d 100644 --- a/src/resources.rs +++ b/src/resources.rs @@ -12,9 +12,11 @@ //! 5) the VMM registers the new device onto corresponding device managers according the allocated //! resources. +use std::ops::Deref; use std::{u16, u32, u64}; /// Enumeration describing a device's resource constraints. +#[derive(Copy, Clone, Debug, PartialEq)] pub enum ResourceConstraint { /// Constraint for an IO Port address range. PioAddress { @@ -108,7 +110,7 @@ impl ResourceConstraint { } /// Type of Message Singaled Interrupt -#[derive(Copy, Clone, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq)] pub enum MsiIrqType { /// PCI MSI IRQ numbers. PciMsi, @@ -120,7 +122,7 @@ pub enum MsiIrqType { /// Enumeration for device resources. #[allow(missing_docs)] -#[derive(Clone)] +#[derive(Clone, Debug, PartialEq)] pub enum Resource { /// IO Port address range. PioAddressRange { base: u16, size: u16 }, @@ -141,7 +143,7 @@ pub enum Resource { } /// Newtype to store a set of device resources. -#[derive(Default, Clone)] +#[derive(Clone, Debug, Default)] pub struct DeviceResources(Vec); impl DeviceResources { @@ -245,6 +247,14 @@ impl DeviceResources { } } +impl Deref for DeviceResources { + type Target = [Resource]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[cfg(test)] mod tests { use super::*; @@ -269,12 +279,16 @@ mod tests { size: PIO_ADDRESS_SIZE, }; let mut resource = DeviceResources::new(); - resource.append(entry); + resource.append(entry.clone()); + assert_eq!(entry, resource[0]); + let entry = Resource::MmioAddressRange { base: MMIO_ADDRESS_BASE, size: MMIO_ADDRESS_SIZE, }; - resource.append(entry); + resource.append(entry.clone()); + assert_eq!(entry, resource[1]); + let entry = Resource::LegacyIrq(LEGACY_IRQ); resource.append(entry); let entry = Resource::MsiIrq { @@ -310,6 +324,25 @@ mod tests { resources.get_pio_address_ranges()[0].0 == PIO_ADDRESS_BASE && resources.get_pio_address_ranges()[0].1 == PIO_ADDRESS_SIZE ); + assert_eq!( + resources[0], + Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + } + ); + assert_ne!(resources[0], resources[1]); + + let resources2 = resources.clone(); + assert_eq!(resources.len(), resources2.len()); + drop(resources); + assert_eq!( + resources2[0], + Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + } + ); } #[test] @@ -374,6 +407,13 @@ mod tests { #[test] fn test_resource_constraint() { + let pio = ResourceConstraint::new_pio(2); + let pio2 = pio.clone(); + let mmio = ResourceConstraint::new_mmio(0x1000); + assert_eq!(pio, pio2); + drop(pio2); + assert_ne!(pio, mmio); + if let ResourceConstraint::PioAddress { range, align, size } = ResourceConstraint::new_pio(2) { @@ -401,7 +441,7 @@ mod tests { assert_eq!(align, 0x1000); assert_eq!(size, 0x2000); } else { - panic!("Pio resource constraint is invalid."); + panic!("Mmio resource constraint is invalid."); } if let ResourceConstraint::MmioAddress { range, align, size } = @@ -411,7 +451,34 @@ mod tests { assert_eq!(align, 0x2000); assert_eq!(size, 0x2000); } else { - panic!("Pio resource constraint is invalid."); + panic!("Mmio resource constraint is invalid."); + } + + if let ResourceConstraint::LegacyIrq { irq } = 
+ ResourceConstraint::new_legacy_irq(Some(0x123)) + { + assert_eq!(irq, Some(0x123)); + } else { + panic!("IRQ resource constraint is invalid."); + } + + if let ResourceConstraint::KvmMemSlot { slot, size } = + ResourceConstraint::new_kvm_mem_slot(0x1000, Some(0x2000)) + { + assert_eq!(slot, Some(0x2000)); + assert_eq!(size, 0x1000); + } else { + panic!("KVM slot resource constraint is invalid."); + } + } + + #[test] + fn test_resources_deref() { + let resources = get_device_resource(); + let mut count = 0; + for _res in resources.iter() { + count += 1; } + assert_eq!(count, resources.0.len()); } }
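As a closing editorial sketch (not part of this patch), the pieces above are intended to compose as follows: a device backend implements `DeviceIoMut`, the `Mutex` adapter turns it into a `DeviceIo`, and `IoManager` traps its MMIO range so VM-exit accesses can be routed to it. The `Scratch` device, the MMIO address, and the `main` wrapper are made up for illustration.

```rust
use std::sync::{Arc, Mutex};

use vm_device::device_manager::IoManager;
use vm_device::resources::Resource;
use vm_device::{DeviceIoMut, IoAddress};

// A trivial MMIO device that remembers the last byte written to it,
// mirroring the MockDevice used in the src/lib.rs unit tests.
#[derive(Default)]
struct Scratch {
    value: u8,
}

impl DeviceIoMut for Scratch {
    fn read(&mut self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) {
        data[0] = self.value;
    }

    fn write(&mut self, _base: IoAddress, _offset: IoAddress, data: &[u8]) {
        self.value = data[0];
    }
}

fn main() -> Result<(), vm_device::device_manager::Error> {
    let device = Arc::new(Mutex::new(Scratch::default()));
    let resources = [Resource::MmioAddressRange {
        base: 0xd000_0000,
        size: 0x1000,
    }];

    // The Mutex adapter turns a DeviceIoMut into a DeviceIo, so the device can
    // be registered on the MMIO bus and dispatched from VM-exit handlers.
    let mut mgr = IoManager::new();
    mgr.register_device_io(device, &resources)?;
    Ok(())
}
```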