xref: /aosp_15_r20/external/crosvm/devices/src/vfio.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::collections::HashMap;
6 use std::ffi::CString;
7 use std::fs::File;
8 use std::fs::OpenOptions;
9 use std::io;
10 use std::mem;
11 use std::os::raw::c_ulong;
12 use std::os::unix::prelude::FileExt;
13 use std::path::Path;
14 use std::path::PathBuf;
15 #[cfg(all(target_os = "android", target_arch = "aarch64"))]
16 use std::ptr::addr_of_mut;
17 use std::slice;
18 use std::sync::Arc;
19 
20 use base::error;
21 use base::ioctl;
22 use base::ioctl_with_mut_ptr;
23 use base::ioctl_with_mut_ref;
24 use base::ioctl_with_ptr;
25 use base::ioctl_with_ref;
26 use base::ioctl_with_val;
27 use base::warn;
28 use base::AsRawDescriptor;
29 use base::Error;
30 use base::Event;
31 use base::FromRawDescriptor;
32 use base::RawDescriptor;
33 use base::SafeDescriptor;
34 use cfg_if::cfg_if;
35 use data_model::vec_with_array_field;
36 use hypervisor::DeviceKind;
37 use hypervisor::Vm;
38 use once_cell::sync::OnceCell;
39 use rand::seq::index::sample;
40 use rand::thread_rng;
41 use remain::sorted;
42 use resources::address_allocator::AddressAllocator;
43 use resources::AddressRange;
44 use resources::Alloc;
45 use resources::Error as ResourcesError;
46 use sync::Mutex;
47 use thiserror::Error;
48 use vfio_sys::vfio::vfio_acpi_dsm;
49 use vfio_sys::vfio::VFIO_IRQ_SET_DATA_BOOL;
50 use vfio_sys::*;
51 use zerocopy::AsBytes;
52 use zerocopy::FromBytes;
53 
54 use crate::IommuDevType;
55 
56 #[sorted]
57 #[derive(Error, Debug)]
58 pub enum VfioError {
59     #[error("failed to duplicate VfioContainer")]
60     ContainerDupError,
61     #[error("failed to set container's IOMMU driver type as {0:?}: {1}")]
62     ContainerSetIOMMU(IommuType, Error),
63     #[error("failed to create KVM vfio device: {0}")]
64     CreateVfioKvmDevice(Error),
65     #[error("failed to get Group Status: {0}")]
66     GetGroupStatus(Error),
67     #[error("failed to get vfio device fd: {0}")]
68     GroupGetDeviceFD(Error),
69     #[error("failed to add vfio group into vfio container: {0}")]
70     GroupSetContainer(Error),
71     #[error("group is inviable")]
72     GroupViable,
73     #[error("invalid region index: {0}")]
74     InvalidIndex(usize),
75     #[error("invalid operation")]
76     InvalidOperation,
77     #[error("invalid file path")]
78     InvalidPath,
79     #[error("failed to add guest memory map into iommu table: {0}")]
80     IommuDmaMap(Error),
81     #[error("failed to remove guest memory map from iommu table: {0}")]
82     IommuDmaUnmap(Error),
83     #[error("failed to get IOMMU cap info from host")]
84     IommuGetCapInfo,
85     #[error("failed to get IOMMU info from host: {0}")]
86     IommuGetInfo(Error),
87     #[error("failed to attach device to pKVM pvIOMMU: {0}")]
88     KvmPviommuSetConfig(Error),
89     #[error("failed to set KVM vfio device's attribute: {0}")]
90     KvmSetDeviceAttr(Error),
91     #[error("AddressAllocator is unavailable")]
92     NoRescAlloc,
93     #[error("failed to open /dev/vfio/vfio container: {0}")]
94     OpenContainer(io::Error),
95     #[error("failed to open {1} group: {0}")]
96     OpenGroup(io::Error, String),
97     #[error("failed to read {1} link: {0}")]
98     ReadLink(io::Error, PathBuf),
99     #[error("resources error: {0}")]
100     Resources(ResourcesError),
101     #[error("unknown vfio device type (flags: {0:#x})")]
102     UnknownDeviceType(u32),
103     #[error("failed to call vfio device's ACPI _DSM: {0}")]
104     VfioAcpiDsm(Error),
105     #[error("failed to disable vfio deviece's acpi notification: {0}")]
106     VfioAcpiNotificationDisable(Error),
107     #[error("failed to enable vfio deviece's acpi notification: {0}")]
108     VfioAcpiNotificationEnable(Error),
109     #[error("failed to test vfio deviece's acpi notification: {0}")]
110     VfioAcpiNotificationTest(Error),
111     #[error(
112         "vfio API version doesn't match with VFIO_API_VERSION defined in vfio_sys/src/vfio.rs"
113     )]
114     VfioApiVersion,
115     #[error("failed to get vfio device's info or info doesn't match: {0}")]
116     VfioDeviceGetInfo(Error),
117     #[error("failed to get vfio device's region info: {0}")]
118     VfioDeviceGetRegionInfo(Error),
119     #[error("container doesn't support IOMMU driver type {0:?}")]
120     VfioIommuSupport(IommuType),
121     #[error("failed to disable vfio deviece's irq: {0}")]
122     VfioIrqDisable(Error),
123     #[error("failed to enable vfio deviece's irq: {0}")]
124     VfioIrqEnable(Error),
125     #[error("failed to mask vfio deviece's irq: {0}")]
126     VfioIrqMask(Error),
127     #[error("failed to unmask vfio deviece's irq: {0}")]
128     VfioIrqUnmask(Error),
129     #[error("failed to enter vfio deviece's low power state: {0}")]
130     VfioPmLowPowerEnter(Error),
131     #[error("failed to exit vfio deviece's low power state: {0}")]
132     VfioPmLowPowerExit(Error),
133 }
134 
135 type Result<T> = std::result::Result<T, VfioError>;
136 
get_error() -> Error137 fn get_error() -> Error {
138     Error::last()
139 }
140 
// Process-wide KVM VFIO pseudo-device FD. Created lazily (via
// `get_or_try_init`) the first time a group or pvIOMMU needs it, then shared
// by all subsequent callers.
static KVM_VFIO_FILE: OnceCell<SafeDescriptor> = OnceCell::new();
142 
/// Kind of device exposed through VFIO: a PCI function or a platform device.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VfioDeviceType {
    Pci,
    Platform,
}
148 
/// Whether `kvm_device_set_group` should add the group to, or delete it from,
/// the KVM VFIO device (KVM_DEV_VFIO_GROUP_ADD / _DEL).
enum KvmVfioGroupOps {
    Add,
    Delete,
}
153 
/// Handle to a pKVM paravirtualized IOMMU (pvIOMMU) instance.
#[derive(Debug)]
pub struct KvmVfioPviommu {
    // FD returned by the KVM_DEV_VFIO_PVIOMMU_ATTACH ioctl; identifies this
    // pvIOMMU to the hypervisor.
    file: File,
}
158 
impl KvmVfioPviommu {
    /// Creates a new pvIOMMU instance for `vm` via the KVM VFIO device.
    ///
    /// Only implemented on Android/aarch64 (pKVM); panics with
    /// `unimplemented!()` on every other target.
    pub fn new(vm: &impl Vm) -> Result<Self> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let file = Self::ioctl_kvm_dev_vfio_pviommu_attach(vm)?;

                Ok(Self { file })
            } else {
                let _ = vm;
                unimplemented!()
            }
        }
    }

    /// Configures this pvIOMMU so that `device`'s stream ID at index `sid_idx`
    /// is exposed to the guest as virtual stream ID `vsid`.
    ///
    /// Android/aarch64 only; `unimplemented!()` elsewhere.
    pub fn attach<T: AsRawDescriptor>(&self, device: &T, sid_idx: u32, vsid: u32) -> Result<()> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                self.ioctl_kvm_pviommu_set_config(device, sid_idx, vsid)
            } else {
                let _ = device;
                let _ = sid_idx;
                let _ = vsid;
                unimplemented!()
            }
        }
    }

    /// Returns the identifier the guest uses to name this pvIOMMU.
    pub fn id(&self) -> u32 {
        let fd = self.as_raw_descriptor();
        // Guests identify pvIOMMUs to the hypervisor using the corresponding VMM FDs.
        fd.try_into().unwrap()
    }

    /// Queries how many stream IDs (SIDs) `device` exposes, as reported by
    /// KVM_DEV_VFIO_PVIOMMU_GET_INFO.
    ///
    /// Android/aarch64 only; `unimplemented!()` elsewhere.
    pub fn get_sid_count<T: AsRawDescriptor>(vm: &impl Vm, device: &T) -> Result<u32> {
        cfg_if! {
            if #[cfg(all(target_os = "android", target_arch = "aarch64"))] {
                let info = Self::ioctl_kvm_dev_vfio_pviommu_get_info(vm, device)?;

                Ok(info.nr_sids)
            } else {
                let _ = vm;
                let _ = device;
                unimplemented!()
            }
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_ATTACH; on success the ioctl's return
    /// value is a new pvIOMMU file descriptor, which we take ownership of.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_attach(vm: &impl Vm) -> Result<File> {
        // Lazily create the process-wide KVM VFIO device on first use.
        let kvm_vfio_file = KVM_VFIO_FILE
            .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
            .map_err(VfioError::CreateVfioKvmDevice)?;

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_ATTACH as u64,
            addr: 0,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            // SAFETY: Safe as we verify the return value.
            Ok(unsafe { File::from_raw_descriptor(ret) })
        }
    }

    /// Issues KVM_PVIOMMU_SET_CONFIG on this pvIOMMU's FD to map `device`'s
    /// `sid_idx`-th stream ID to `vsid`.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_pviommu_set_config<T: AsRawDescriptor>(
        &self,
        device: &T,
        sid_idx: u32,
        vsid: u32,
    ) -> Result<()> {
        let config = kvm_sys::kvm_vfio_iommu_config {
            device_fd: device.as_raw_descriptor(),
            sid_idx,
            vsid,
        };

        // SAFETY:
        // Safe as we are the owner of device and config which are valid, and we verify the return
        // value.
        let ret = unsafe { ioctl_with_ref(self, kvm_sys::KVM_PVIOMMU_SET_CONFIG, &config) };

        if ret < 0 {
            Err(VfioError::KvmPviommuSetConfig(get_error()))
        } else {
            Ok(())
        }
    }

    /// Issues KVM_DEV_VFIO_PVIOMMU_GET_INFO for `device`; the kernel fills in
    /// `info` (notably `nr_sids`) through the `addr` pointer of the attr.
    #[cfg(all(target_os = "android", target_arch = "aarch64"))]
    fn ioctl_kvm_dev_vfio_pviommu_get_info<T: AsRawDescriptor>(
        vm: &impl Vm,
        device: &T,
    ) -> Result<kvm_sys::kvm_vfio_iommu_info> {
        // Lazily create the process-wide KVM VFIO device on first use.
        let kvm_vfio_file = KVM_VFIO_FILE
            .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
            .map_err(VfioError::CreateVfioKvmDevice)?;

        let mut info = kvm_sys::kvm_vfio_iommu_info {
            device_fd: device.as_raw_descriptor(),
            nr_sids: 0,
        };

        let vfio_dev_attr = kvm_sys::kvm_device_attr {
            flags: 0,
            group: kvm_sys::KVM_DEV_VFIO_PVIOMMU,
            attr: kvm_sys::KVM_DEV_VFIO_PVIOMMU_GET_INFO as u64,
            // The kernel writes the result back through this pointer, so
            // `info` must stay alive (and mutable) across the ioctl.
            addr: addr_of_mut!(info) as usize as u64,
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_attr, which is valid.
        let ret =
            unsafe { ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr) };

        if ret < 0 {
            Err(VfioError::KvmSetDeviceAttr(get_error()))
        } else {
            Ok(info)
        }
    }
}
290 
impl AsRawDescriptor for KvmVfioPviommu {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // Expose the pvIOMMU FD obtained from the attach ioctl.
        self.file.as_raw_descriptor()
    }
}
296 
/// IOMMU backend driver selected for a VFIO container via VFIO_SET_IOMMU.
///
/// Discriminants are the raw values passed to the kernel (`repr(u32)`).
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum IommuType {
    Type1V2 = VFIO_TYPE1v2_IOMMU,
    PkvmPviommu = VFIO_PKVM_PVIOMMU,
    // ChromeOS specific vfio_iommu_type1 implementation that is optimized for
    // small, dynamic mappings. For clients which create large, relatively
    // static mappings, Type1V2 is still preferred.
    //
    // See crrev.com/c/3593528 for the implementation.
    Type1ChromeOS = 100001,
}
309 
/// VfioContainer contain multi VfioGroup, and delegate an IOMMU domain table
pub struct VfioContainer {
    // Open handle to /dev/vfio/vfio (or an FD handed to `new_from_container`).
    container: File,
    // Groups currently attached to this container, keyed by group id.
    groups: HashMap<u32, Arc<Mutex<VfioGroup>>>,
    // IOMMU driver selected via VFIO_SET_IOMMU; `None` until the first group
    // is added and `set_iommu_checked` succeeds.
    iommu_type: Option<IommuType>,
}
316 
extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T> where T: FromBytes,317 fn extract_vfio_struct<T>(bytes: &[u8], offset: usize) -> Option<T>
318 where
319     T: FromBytes,
320 {
321     bytes.get(offset..).and_then(T::read_from_prefix)
322 }
323 
// Expected result of VFIO_GET_API_VERSION; must match the kernel's value.
const VFIO_API_VERSION: u8 = 0;
impl VfioContainer {
    /// Opens `/dev/vfio/vfio` and wraps it in a new, empty container.
    pub fn new() -> Result<Self> {
        let container = OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/vfio/vfio")
            .map_err(VfioError::OpenContainer)?;

        Self::new_from_container(container)
    }

    // Construct a VfioContainer from an existing container file.
    pub fn new_from_container(container: File) -> Result<Self> {
        // SAFETY:
        // Safe as file is vfio container descriptor and ioctl is defined by kernel.
        let version = unsafe { ioctl(&container, VFIO_GET_API_VERSION) };
        if version as u8 != VFIO_API_VERSION {
            return Err(VfioError::VfioApiVersion);
        }

        Ok(VfioContainer {
            container,
            groups: HashMap::new(),
            iommu_type: None,
        })
    }

    /// Returns true if the group `group_id` is already attached to this
    /// container.
    fn is_group_set(&self, group_id: u32) -> bool {
        self.groups.contains_key(&group_id)
    }

    /// Asks the kernel (VFIO_CHECK_EXTENSION) whether this container supports
    /// the IOMMU driver `val`.
    fn check_extension(&self, val: IommuType) -> bool {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        let ret = unsafe { ioctl_with_val(self, VFIO_CHECK_EXTENSION, val as c_ulong) };
        ret != 0
    }

    /// Selects IOMMU driver `val` (VFIO_SET_IOMMU); returns the raw ioctl
    /// result (0 on success).
    fn set_iommu(&mut self, val: IommuType) -> i32 {
        // SAFETY:
        // Safe as file is vfio container and make sure val is valid.
        unsafe { ioctl_with_val(self, VFIO_SET_IOMMU, val as c_ulong) }
    }

    /// Verifies `val` is supported, enables it, and records it in
    /// `self.iommu_type` so later operations can dispatch on it.
    fn set_iommu_checked(&mut self, val: IommuType) -> Result<()> {
        if !self.check_extension(val) {
            Err(VfioError::VfioIommuSupport(val))
        } else if self.set_iommu(val) != 0 {
            Err(VfioError::ContainerSetIOMMU(val, get_error()))
        } else {
            self.iommu_type = Some(val);
            Ok(())
        }
    }

    /// Maps `size` bytes at host address `user_addr` to IOVA `iova`.
    /// Dispatches on the configured IOMMU type; PkvmPviommu does not support
    /// DMA mapping and returns `InvalidOperation`.
    ///
    /// Panics if called before an IOMMU driver has been configured.
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    pub unsafe fn vfio_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_map called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_map(iova, size, user_addr, write_en)
            }
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of `vfio_dma_map`; mappings are readable, and
    /// writable too when `write_en` is set.
    ///
    /// # Safety
    ///
    /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
    unsafe fn vfio_iommu_type1_dma_map(
        &self,
        iova: u64,
        size: u64,
        user_addr: u64,
        write_en: bool,
    ) -> Result<()> {
        let mut dma_map = vfio_iommu_type1_dma_map {
            argsz: mem::size_of::<vfio_iommu_type1_dma_map>() as u32,
            flags: VFIO_DMA_MAP_FLAG_READ,
            vaddr: user_addr,
            iova,
            size,
        };

        if write_en {
            dma_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
        }

        let ret = ioctl_with_ref(self, VFIO_IOMMU_MAP_DMA, &dma_map);
        if ret != 0 {
            return Err(VfioError::IommuDmaMap(get_error()));
        }

        Ok(())
    }

    /// Unmaps the `size`-byte IOVA range starting at `iova`.
    ///
    /// Panics if called before an IOMMU driver has been configured.
    pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        match self
            .iommu_type
            .expect("vfio_dma_unmap called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_dma_unmap(iova, size)
            }
            IommuType::PkvmPviommu => Err(VfioError::InvalidOperation),
        }
    }

    /// Type1 implementation of `vfio_dma_unmap`. The kernel writes back the
    /// number of bytes actually unmapped; a partial unmap is treated as an
    /// error.
    fn vfio_iommu_type1_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
        let mut dma_unmap = vfio_iommu_type1_dma_unmap {
            argsz: mem::size_of::<vfio_iommu_type1_dma_unmap>() as u32,
            flags: 0,
            iova,
            size,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, dma_unmap is constructed by us, and
        // we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_UNMAP_DMA, &mut dma_unmap) };
        if ret != 0 || dma_unmap.size != size {
            return Err(VfioError::IommuDmaUnmap(get_error()));
        }

        Ok(())
    }

    /// Returns the bitmask of IOMMU page sizes supported by this container
    /// (0 for PkvmPviommu, which reports no page sizes).
    pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
        match self
            .iommu_type
            .expect("vfio_get_iommu_page_size_mask called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iommu_page_size_mask()
            }
            IommuType::PkvmPviommu => Ok(0),
        }
    }

    /// Type1 implementation: reads `iova_pgsizes` from VFIO_IOMMU_GET_INFO.
    fn vfio_iommu_type1_get_iommu_page_size_mask(&self) -> Result<u64> {
        let mut iommu_info = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info) };
        if ret != 0 || (iommu_info.flags & VFIO_IOMMU_INFO_PGSIZES) == 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        Ok(iommu_info.iova_pgsizes)
    }

    /// Returns the valid IOVA ranges for this container (empty for
    /// PkvmPviommu).
    pub fn vfio_iommu_iova_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        match self
            .iommu_type
            .expect("vfio_iommu_iova_get_iova_ranges called before configuring IOMMU")
        {
            IommuType::Type1V2 | IommuType::Type1ChromeOS => {
                self.vfio_iommu_type1_get_iova_ranges()
            }
            IommuType::PkvmPviommu => Ok(Vec::new()),
        }
    }

    /// Type1 implementation: calls VFIO_IOMMU_GET_INFO twice (first to learn
    /// the required `argsz`, then with a big-enough buffer) and walks the
    /// capability chain looking for the IOVA-range capability.
    fn vfio_iommu_type1_get_iova_ranges(&self) -> Result<Vec<AddressRange>> {
        // Query the buffer size needed fetch the capabilities.
        let mut iommu_info_argsz = vfio_iommu_type1_info {
            argsz: mem::size_of::<vfio_iommu_type1_info>() as u32,
            flags: 0,
            iova_pgsizes: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as file is vfio container, iommu_info_argsz has valid values,
        // and we check the return value
        let ret = unsafe { ioctl_with_mut_ref(self, VFIO_IOMMU_GET_INFO, &mut iommu_info_argsz) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        if (iommu_info_argsz.flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Allocate a buffer of `argsz` bytes, laid out as the fixed struct
        // followed by the variable-length capability data.
        let mut iommu_info = vec_with_array_field::<vfio_iommu_type1_info, u8>(
            iommu_info_argsz.argsz as usize - mem::size_of::<vfio_iommu_type1_info>(),
        );
        iommu_info[0].argsz = iommu_info_argsz.argsz;
        let ret =
            // SAFETY:
            // Safe as file is vfio container, iommu_info has valid values,
            // and we check the return value
            unsafe { ioctl_with_mut_ptr(self, VFIO_IOMMU_GET_INFO, iommu_info.as_mut_ptr()) };
        if ret != 0 {
            return Err(VfioError::IommuGetInfo(get_error()));
        }

        // SAFETY:
        // Safe because we initialized iommu_info with enough space, u8 has less strict
        // alignment, and since it will no longer be mutated.
        let info_bytes = unsafe {
            std::slice::from_raw_parts(
                iommu_info.as_ptr() as *const u8,
                iommu_info_argsz.argsz as usize,
            )
        };

        if (iommu_info[0].flags & VFIO_IOMMU_INFO_CAPS) == 0 {
            return Err(VfioError::IommuGetCapInfo);
        }

        // Walk the capability chain: each header links to the next via a byte
        // offset into the buffer; 0 terminates the chain.
        let mut offset = iommu_info[0].cap_offset as usize;
        while offset != 0 {
            let header = extract_vfio_struct::<vfio_info_cap_header>(info_bytes, offset)
                .ok_or(VfioError::IommuGetCapInfo)?;

            if header.id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE as u16 && header.version == 1 {
                let iova_header =
                    extract_vfio_struct::<vfio_iommu_type1_info_cap_iova_range_header>(
                        info_bytes, offset,
                    )
                    .ok_or(VfioError::IommuGetCapInfo)?;
                let range_offset = offset + mem::size_of::<vfio_iommu_type1_info_cap_iova_range>();
                let mut ret = Vec::new();
                for i in 0..iova_header.nr_iovas {
                    ret.push(
                        extract_vfio_struct::<vfio_iova_range>(
                            info_bytes,
                            range_offset + i as usize * mem::size_of::<vfio_iova_range>(),
                        )
                        .ok_or(VfioError::IommuGetCapInfo)?,
                    );
                }
                return Ok(ret
                    .iter()
                    .map(|range| AddressRange {
                        start: range.start,
                        end: range.end,
                    })
                    .collect());
            }
            offset = header.next as usize;
        }

        // Chain exhausted without finding the IOVA-range capability.
        Err(VfioError::IommuGetCapInfo)
    }

    /// Picks and enables the IOMMU driver appropriate for `iommu_dev`.
    fn set_iommu_from(&mut self, iommu_dev: IommuDevType) -> Result<()> {
        match iommu_dev {
            IommuDevType::CoIommu | IommuDevType::VirtioIommu => {
                // If we expect granular, dynamic mappings, try the ChromeOS Type1ChromeOS first,
                // then fall back to upstream versions.
                self.set_iommu_checked(IommuType::Type1ChromeOS)
                    .or_else(|_| self.set_iommu_checked(IommuType::Type1V2))
            }
            IommuDevType::NoIommu => self.set_iommu_checked(IommuType::Type1V2),
            IommuDevType::PkvmPviommu => self.set_iommu_checked(IommuType::PkvmPviommu),
        }
    }

    /// Returns the group `id`, creating and attaching it if needed. On the
    /// first group, configures the IOMMU and (without an iommu device) maps
    /// all guest memory; also registers the group with the KVM VFIO device.
    fn get_group_with_vm(
        &mut self,
        id: u32,
        vm: &impl Vm,
        iommu_dev: IommuDevType,
    ) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));
        if self.groups.is_empty() {
            self.set_iommu_from(iommu_dev)?;
            // Before the first group is added into container, do once per container
            // initialization. Both coiommu and virtio-iommu rely on small, dynamic
            // mappings. However, if an iommu is not enabled, then we map the entirety
            // of guest memory as a small number of large, static mappings.
            match iommu_dev {
                IommuDevType::CoIommu | IommuDevType::PkvmPviommu | IommuDevType::VirtioIommu => {}
                IommuDevType::NoIommu => {
                    for region in vm.get_memory().regions() {
                        // SAFETY:
                        // Safe because the guest regions are guaranteed not to overlap
                        unsafe {
                            self.vfio_dma_map(
                                region.guest_addr.0,
                                region.size as u64,
                                region.host_addr as u64,
                                true,
                            )
                        }?;
                    }
                }
            }
        }

        let kvm_vfio_file = KVM_VFIO_FILE
            .get_or_try_init(|| vm.create_device(DeviceKind::Vfio))
            .map_err(VfioError::CreateVfioKvmDevice)?;
        group
            .lock()
            .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Add)?;

        self.groups.insert(id, group.clone());

        Ok(group)
    }

    /// Returns the group `id`, creating it if needed. Unlike
    /// `get_group_with_vm`, this always uses Type1V2 and does not register
    /// the group with the KVM VFIO device.
    fn get_group(&mut self, id: u32) -> Result<Arc<Mutex<VfioGroup>>> {
        if let Some(group) = self.groups.get(&id) {
            return Ok(group.clone());
        }

        let group = Arc::new(Mutex::new(VfioGroup::new(self, id)?));

        if self.groups.is_empty() {
            // Before the first group is added into container, do once per
            // container initialization.
            self.set_iommu_checked(IommuType::Type1V2)?;
        }

        self.groups.insert(id, group.clone());
        Ok(group)
    }

    /// Removes group `id` from the container once its device count reaches
    /// zero; when `reduce` is set, decrements the count first.
    fn remove_group(&mut self, id: u32, reduce: bool) {
        let mut remove = false;

        if let Some(group) = self.groups.get(&id) {
            if reduce {
                group.lock().reduce_device_num();
            }
            if group.lock().device_num() == 0 {
                let kvm_vfio_file = KVM_VFIO_FILE.get().expect("kvm vfio file isn't created");
                if group
                    .lock()
                    .kvm_device_set_group(kvm_vfio_file, KvmVfioGroupOps::Delete)
                    .is_err()
                {
                    warn!("failing in remove vfio group from kvm device");
                }
                remove = true;
            }
        }

        if remove {
            self.groups.remove(&id);
        }
    }

    /// Duplicates the container FD with `dup(2)`; the caller takes ownership
    /// of the returned descriptor and is responsible for closing it.
    pub fn clone_as_raw_descriptor(&self) -> Result<RawDescriptor> {
        // SAFETY: this call is safe because it doesn't modify any memory and we
        // check the return value.
        let raw_descriptor = unsafe { libc::dup(self.container.as_raw_descriptor()) };
        if raw_descriptor < 0 {
            Err(VfioError::ContainerDupError)
        } else {
            Ok(raw_descriptor)
        }
    }

    // Gets group ids for all groups in the container.
    pub fn group_ids(&self) -> Vec<&u32> {
        self.groups.keys().collect()
    }
}
709 
impl AsRawDescriptor for VfioContainer {
    fn as_raw_descriptor(&self) -> RawDescriptor {
        // Expose the underlying /dev/vfio/vfio FD for ioctls.
        self.container.as_raw_descriptor()
    }
}
715 
/// An open VFIO group (`/dev/vfio/<id>`) attached to a container.
struct VfioGroup {
    // Open handle to /dev/vfio/<id>.
    group: File,
    // Count of devices obtained from this group; the container removes the
    // group when it drops to zero.
    device_num: u32,
}
720 
721 impl VfioGroup {
new(container: &VfioContainer, id: u32) -> Result<Self>722     fn new(container: &VfioContainer, id: u32) -> Result<Self> {
723         let group_path = format!("/dev/vfio/{}", id);
724         let group_file = OpenOptions::new()
725             .read(true)
726             .write(true)
727             .open(Path::new(&group_path))
728             .map_err(|e| VfioError::OpenGroup(e, group_path))?;
729 
730         let mut group_status = vfio_group_status {
731             argsz: mem::size_of::<vfio_group_status>() as u32,
732             flags: 0,
733         };
734         let mut ret =
735             // SAFETY:
736             // Safe as we are the owner of group_file and group_status which are valid value.
737             unsafe { ioctl_with_mut_ref(&group_file, VFIO_GROUP_GET_STATUS, &mut group_status) };
738         if ret < 0 {
739             return Err(VfioError::GetGroupStatus(get_error()));
740         }
741 
742         if group_status.flags != VFIO_GROUP_FLAGS_VIABLE {
743             return Err(VfioError::GroupViable);
744         }
745 
746         let container_raw_descriptor = container.as_raw_descriptor();
747         // SAFETY:
748         // Safe as we are the owner of group_file and container_raw_descriptor which are valid
749         // value, and we verify the ret value
750         ret = unsafe {
751             ioctl_with_ref(
752                 &group_file,
753                 VFIO_GROUP_SET_CONTAINER,
754                 &container_raw_descriptor,
755             )
756         };
757         if ret < 0 {
758             return Err(VfioError::GroupSetContainer(get_error()));
759         }
760 
761         Ok(VfioGroup {
762             group: group_file,
763             device_num: 0,
764         })
765     }
766 
get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32>767     fn get_group_id<P: AsRef<Path>>(sysfspath: P) -> Result<u32> {
768         let mut uuid_path = PathBuf::new();
769         uuid_path.push(sysfspath);
770         uuid_path.push("iommu_group");
771         let group_path = uuid_path
772             .read_link()
773             .map_err(|e| VfioError::ReadLink(e, uuid_path))?;
774         let group_osstr = group_path.file_name().ok_or(VfioError::InvalidPath)?;
775         let group_str = group_osstr.to_str().ok_or(VfioError::InvalidPath)?;
776         let group_id = group_str
777             .parse::<u32>()
778             .map_err(|_| VfioError::InvalidPath)?;
779 
780         Ok(group_id)
781     }
782 
    /// Adds this VFIO group to, or removes it from, the KVM VFIO pseudo-device.
    ///
    /// `kvm_vfio_file` is the descriptor of the KVM VFIO device; `ops` selects
    /// between `KVM_DEV_VFIO_GROUP_ADD` and `KVM_DEV_VFIO_GROUP_DEL`.
    fn kvm_device_set_group(
        &self,
        kvm_vfio_file: &SafeDescriptor,
        ops: KvmVfioGroupOps,
    ) -> Result<()> {
        let group_descriptor = self.as_raw_descriptor();
        // KVM_SET_DEVICE_ATTR expects `addr` to hold a userspace pointer to the
        // group's fd; keep the fd in a local so the pointer stays valid for the
        // duration of the ioctl below.
        let group_descriptor_ptr = &group_descriptor as *const i32;
        let vfio_dev_attr = match ops {
            KvmVfioGroupOps::Add => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_ADD as u64,
                addr: group_descriptor_ptr as u64,
            },
            KvmVfioGroupOps::Delete => kvm_sys::kvm_device_attr {
                flags: 0,
                group: kvm_sys::KVM_DEV_VFIO_GROUP,
                attr: kvm_sys::KVM_DEV_VFIO_GROUP_DEL as u64,
                addr: group_descriptor_ptr as u64,
            },
        };

        // SAFETY:
        // Safe as we are the owner of vfio_dev_descriptor and vfio_dev_attr which are valid value,
        // and we verify the return value.
        if 0 != unsafe {
            ioctl_with_ref(kvm_vfio_file, kvm_sys::KVM_SET_DEVICE_ATTR, &vfio_dev_attr)
        } {
            return Err(VfioError::KvmSetDeviceAttr(get_error()));
        }

        Ok(())
    }
816 
    /// Opens the device named `name` within this group via
    /// VFIO_GROUP_GET_DEVICE_FD and returns the new fd as an owned `File`.
    fn get_device(&self, name: &str) -> Result<File> {
        let path: CString = CString::new(name.as_bytes()).expect("CString::new() failed");
        let path_ptr = path.as_ptr();

        // SAFETY:
        // Safe as we are the owner of self and path_ptr which are valid value.
        let ret = unsafe { ioctl_with_ptr(self, VFIO_GROUP_GET_DEVICE_FD, path_ptr) };
        if ret < 0 {
            return Err(VfioError::GroupGetDeviceFD(get_error()));
        }

        // SAFETY:
        // Safe as ret is valid descriptor
        Ok(unsafe { File::from_raw_descriptor(ret) })
    }
832 
    /// Records that one more device belonging to this group has been opened.
    fn add_device_num(&mut self) {
        self.device_num += 1;
    }
836 
    /// Records that one device belonging to this group has been released.
    fn reduce_device_num(&mut self) {
        self.device_num -= 1;
    }
840 
    /// Returns the number of devices currently opened from this group.
    fn device_num(&self) -> u32 {
        self.device_num
    }
844 }
845 
impl AsRawDescriptor for VfioGroup {
    // Exposes the underlying VFIO group file descriptor.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.group.as_raw_descriptor()
    }
}
851 
/// A helper struct for managing VFIO containers
#[derive(Default)]
pub struct VfioContainerManager {
    /// One VFIO container shared by all VFIO devices that don't attach to any IOMMU device.
    /// Created lazily on first use.
    no_iommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// For IOMMU enabled devices, all VFIO groups that share the same IOVA space are managed by
    /// one VFIO container.
    iommu_containers: Vec<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to the CoIOMMU device.
    /// Created lazily on first use.
    coiommu_container: Option<Arc<Mutex<VfioContainer>>>,

    /// One VFIO container shared by all VFIO devices that attach to pKVM.
    /// Created lazily on first use.
    pkvm_iommu_container: Option<Arc<Mutex<VfioContainer>>>,
}
868 
869 impl VfioContainerManager {
new() -> Self870     pub fn new() -> Self {
871         Self::default()
872     }
873 
874     /// The single place to create a VFIO container for a PCI endpoint.
875     ///
876     /// The policy to determine whether an individual or a shared VFIO container
877     /// will be created for this device is governed by the physical PCI topology,
878     /// and the argument iommu_type.
879     ///
880     ///  # Arguments
881     ///
882     ///  * `sysfspath` - the path to the PCI device, e.g. /sys/bus/pci/devices/0000:02:00.0
883     ///  * `iommu_type` - which type of IOMMU is enabled on this device
get_container<P: AsRef<Path>>( &mut self, iommu_type: IommuDevType, sysfspath: Option<P>, ) -> Result<Arc<Mutex<VfioContainer>>>884     pub fn get_container<P: AsRef<Path>>(
885         &mut self,
886         iommu_type: IommuDevType,
887         sysfspath: Option<P>,
888     ) -> Result<Arc<Mutex<VfioContainer>>> {
889         match iommu_type {
890             IommuDevType::NoIommu => {
891                 // One VFIO container is used for all IOMMU disabled groups.
892                 if let Some(container) = &self.no_iommu_container {
893                     Ok(container.clone())
894                 } else {
895                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
896                     self.no_iommu_container = Some(container.clone());
897                     Ok(container)
898                 }
899             }
900             IommuDevType::VirtioIommu => {
901                 let path = sysfspath.ok_or(VfioError::InvalidPath)?;
902                 let group_id = VfioGroup::get_group_id(path)?;
903 
904                 // One VFIO container is used for all devices that belong to one VFIO group.
905                 // NOTE: vfio_wrapper relies on each container containing exactly one group.
906                 if let Some(container) = self
907                     .iommu_containers
908                     .iter()
909                     .find(|container| container.lock().is_group_set(group_id))
910                 {
911                     Ok(container.clone())
912                 } else {
913                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
914                     self.iommu_containers.push(container.clone());
915                     Ok(container)
916                 }
917             }
918             IommuDevType::CoIommu => {
919                 // One VFIO container is used for devices attached to CoIommu
920                 if let Some(container) = &self.coiommu_container {
921                     Ok(container.clone())
922                 } else {
923                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
924                     self.coiommu_container = Some(container.clone());
925                     Ok(container)
926                 }
927             }
928             IommuDevType::PkvmPviommu => {
929                 // One VFIO container is used for devices attached to pKVM
930                 if let Some(container) = &self.pkvm_iommu_container {
931                     Ok(container.clone())
932                 } else {
933                     let container = Arc::new(Mutex::new(VfioContainer::new()?));
934                     self.pkvm_iommu_container = Some(container.clone());
935                     Ok(container)
936                 }
937             }
938         }
939     }
940 }
941 
/// Vfio Irq type used to enable/disable/mask/unmask vfio irq
pub enum VfioIrqType {
    /// Legacy INTx interrupt.
    Intx,
    /// Message Signaled Interrupt.
    Msi,
    /// Extended Message Signaled Interrupt.
    Msix,
}
948 
/// Vfio Irq information used to assign and enable/disable/mask/unmask vfio irq
pub struct VfioIrq {
    /// Flags reported by the kernel for this interrupt (from `vfio_irq_info`).
    pub flags: u32,
    /// Index of this interrupt within the device's interrupt list.
    pub index: u32,
}
954 
/// Address on VFIO memory region.
#[derive(Debug, Default, Clone)]
pub struct VfioRegionAddr {
    /// region number (index into the device's region list).
    pub index: usize,
    /// offset in the region.
    pub addr: u64,
}
963 
#[derive(Debug)]
pub struct VfioRegion {
    // flags for this region: read/write/mmap
    flags: u32,
    // total size of the region in bytes
    size: u64,
    // region offset used to read/write with vfio device descriptor
    offset: u64,
    // vectors for mmap offset and size (sparse mmap areas within the region)
    mmaps: Vec<vfio_region_sparse_mmap_area>,
    // type and subtype for cap type
    cap_info: Option<(u32, u32)>,
}
976 
/// Vfio device for exposing regions which could be read/write to kernel vfio device.
pub struct VfioDevice {
    // File wrapping the fd returned by VFIO_GROUP_GET_DEVICE_FD.
    dev: File,
    // Device name; the basename of the device's sysfs path.
    name: String,
    // Container the device's group was added to.
    container: Arc<Mutex<VfioContainer>>,
    // PCI or platform, derived from VFIO_DEVICE_GET_INFO flags.
    dev_type: VfioDeviceType,
    group_descriptor: RawDescriptor,
    group_id: u32,
    // vec for vfio device's regions
    regions: Vec<VfioRegion>,
    num_irqs: u32,

    // Allocator over the IOVA ranges reported by the container's IOMMU.
    iova_alloc: Arc<Mutex<AddressAllocator>>,
    // Device-tree symbol (node label), if one was supplied at creation.
    dt_symbol: Option<String>,
    // pKVM pvIOMMU instance and the vSIDs assigned to this device, if any.
    pviommu: Option<(Arc<Mutex<KvmVfioPviommu>>, Vec<u32>)>,
}
993 
994 impl VfioDevice {
    /// Create a new vfio device, then guest read/write on this device could be
    /// transfered into kernel vfio.
    /// sysfspath specify the vfio device path in sys file system.
    ///
    /// `vm` is needed so the group can be registered through
    /// `get_group_with_vm`; `iommu_dev` selects the IOMMU flavor, and
    /// `dt_symbol` is an optional device-tree node label kept for later query.
    pub fn new_passthrough<P: AsRef<Path>>(
        sysfspath: &P,
        vm: &impl Vm,
        container: Arc<Mutex<VfioContainer>>,
        iommu_dev: IommuDevType,
        dt_symbol: Option<String>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;

        let group = container
            .lock()
            .get_group_with_vm(group_id, vm, iommu_dev)?;
        // The device name is the basename of the sysfs path.
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);
        let dev = group.lock().get_device(&name)?;
        let (dev_info, dev_type) = Self::get_device_info(&dev)?;
        let regions = Self::get_regions(&dev, dev_info.num_regions)?;
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // Build an allocator over the IOVA ranges the container's IOMMU reports.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        let pviommu = if matches!(iommu_dev, IommuDevType::PkvmPviommu) {
            // We currently have a 1-to-1 mapping between pvIOMMUs and VFIO devices.
            let pviommu = KvmVfioPviommu::new(vm)?;

            // Draw one distinct random vSID per SID (sample() picks without
            // replacement) and attach each to the device in order.
            let vsids_len = KvmVfioPviommu::get_sid_count(vm, &dev)?.try_into().unwrap();
            let max_vsid = u32::MAX.try_into().unwrap();
            let random_vsids = sample(&mut thread_rng(), max_vsid, vsids_len).into_iter();
            let vsids = Vec::from_iter(random_vsids.map(|v| u32::try_from(v).unwrap()));
            for (i, vsid) in vsids.iter().enumerate() {
                pviommu.attach(&dev, i.try_into().unwrap(), *vsid)?;
            }

            Some((Arc::new(Mutex::new(pviommu)), vsids))
        } else {
            None
        };

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol,
            pviommu,
        })
    }
1057 
    /// Create a new vfio device like `new_passthrough`, but using
    /// `get_group` rather than `get_group_with_vm`, so no `Vm` is required.
    ///
    /// Failures while opening or querying the device remove the freshly
    /// obtained group from the container again before returning.
    pub fn new<P: AsRef<Path>>(
        sysfspath: &P,
        container: Arc<Mutex<VfioContainer>>,
    ) -> Result<Self> {
        let group_id = VfioGroup::get_group_id(sysfspath)?;
        let group = container.lock().get_group(group_id)?;
        // The device name is the basename of the sysfs path.
        let name_osstr = sysfspath
            .as_ref()
            .file_name()
            .ok_or(VfioError::InvalidPath)?;
        let name_str = name_osstr.to_str().ok_or(VfioError::InvalidPath)?;
        let name = String::from(name_str);

        let dev = match group.lock().get_device(&name) {
            Ok(dev) => dev,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let (dev_info, dev_type) = match Self::get_device_info(&dev) {
            Ok(dev_info) => dev_info,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        let regions = match Self::get_regions(&dev, dev_info.num_regions) {
            Ok(regions) => regions,
            Err(e) => {
                container.lock().remove_group(group_id, false);
                return Err(e);
            }
        };
        group.lock().add_device_num();
        let group_descriptor = group.lock().as_raw_descriptor();

        // NOTE(review): unlike the three failure paths above, an error from
        // this point on does not call remove_group(); confirm whether that
        // cleanup is intentionally skipped once the device has been counted.
        let iova_ranges = container.lock().vfio_iommu_iova_get_iova_ranges()?;
        let iova_alloc = AddressAllocator::new_from_list(iova_ranges, None, None)
            .map_err(VfioError::Resources)?;

        Ok(VfioDevice {
            dev,
            name,
            container,
            dev_type,
            group_descriptor,
            group_id,
            regions,
            num_irqs: dev_info.num_irqs,
            iova_alloc: Arc::new(Mutex::new(iova_alloc)),
            dt_symbol: None,
            pviommu: None,
        })
    }
1113 
    /// Returns the file for this device.
    pub fn dev_file(&self) -> &File {
        &self.dev
    }
1118 
    /// Returns PCI device name, formatted as BUS:DEVICE.FUNCTION string.
    // NOTE(review): returning `&str` would be more idiomatic than `&String`,
    // but that would change the public signature for existing callers.
    pub fn device_name(&self) -> &String {
        &self.name
    }
1123 
    /// Returns the type of this VFIO device (PCI or platform).
    pub fn device_type(&self) -> VfioDeviceType {
        self.dev_type
    }
1128 
    /// Returns the DT symbol (node label) of this VFIO device, if one was
    /// supplied at creation time.
    pub fn dt_symbol(&self) -> Option<&str> {
        self.dt_symbol.as_deref()
    }
1133 
1134     /// Returns the type and indentifier (if applicable) of the IOMMU used by this VFIO device and
1135     /// its master IDs.
iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])>1136     pub fn iommu(&self) -> Option<(IommuDevType, Option<u32>, &[u32])> {
1137         // We currently only report IommuDevType::PkvmPviommu.
1138         if let Some((ref pviommu, ref ids)) = self.pviommu {
1139             Some((
1140                 IommuDevType::PkvmPviommu,
1141                 Some(pviommu.lock().id()),
1142                 ids.as_ref(),
1143             ))
1144         } else {
1145             None
1146         }
1147     }
1148 
    /// enter the device's low power state
    ///
    /// Issues VFIO_DEVICE_FEATURE with SET | LOW_POWER_ENTRY; the feature
    /// struct carries no payload.
    pub fn pm_low_power_enter(&self) -> Result<()> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY;
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1163 
    /// enter the device's low power state with wakeup notification
    ///
    /// `wakeup_evt` is handed to the kernel inside a
    /// `vfio_device_low_power_entry_with_wakeup` payload; per the feature name
    /// it provides the wakeup notification eventfd.
    pub fn pm_low_power_enter_with_wakeup(&self, wakeup_evt: Event) -> Result<()> {
        let payload = vfio_device_low_power_entry_with_wakeup {
            wakeup_eventfd: wakeup_evt.as_raw_descriptor(),
            reserved: 0,
        };
        let payload_size = mem::size_of::<vfio_device_low_power_entry_with_wakeup>();
        // Feature struct plus trailing payload bytes.
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(payload_size);
        device_feature[0].argsz = (mem::size_of::<vfio_device_feature>() + payload_size) as u32;
        device_feature[0].flags =
            VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
        // SAFETY:
        // Safe as we know vfio_device_low_power_entry_with_wakeup has two 32-bit int fields
        unsafe {
            // The payload is two 32-bit fields, so its in-memory representation
            // is exactly [u8; 8]; copy those bytes into the flexible array.
            device_feature[0]
                .data
                .as_mut_slice(payload_size)
                .copy_from_slice(
                    mem::transmute::<vfio_device_low_power_entry_with_wakeup, [u8; 8]>(payload)
                        .as_slice(),
                );
        }
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerEnter(get_error()))
        } else {
            Ok(())
        }
    }
1195 
    /// exit the device's low power state
    ///
    /// Issues VFIO_DEVICE_FEATURE with SET | LOW_POWER_EXIT; the feature
    /// struct carries no payload.
    pub fn pm_low_power_exit(&self) -> Result<()> {
        let mut device_feature = vec_with_array_field::<vfio_device_feature, u8>(0);
        device_feature[0].argsz = mem::size_of::<vfio_device_feature>() as u32;
        device_feature[0].flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_LOW_POWER_EXIT;
        // SAFETY:
        // Safe as we are the owner of self and power_management which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_FEATURE, &device_feature[0]) };
        if ret < 0 {
            Err(VfioError::VfioPmLowPowerExit(get_error()))
        } else {
            Ok(())
        }
    }
1210 
1211     /// call _DSM from the device's ACPI table
acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>>1212     pub fn acpi_dsm(&self, args: &[u8]) -> Result<Vec<u8>> {
1213         let count = args.len();
1214         let mut dsm = vec_with_array_field::<vfio_acpi_dsm, u8>(count);
1215         dsm[0].argsz = (mem::size_of::<vfio_acpi_dsm>() + mem::size_of_val(args)) as u32;
1216         dsm[0].padding = 0;
1217         // SAFETY:
1218         // Safe as we allocated enough space to hold args
1219         unsafe {
1220             dsm[0].args.as_mut_slice(count).clone_from_slice(args);
1221         }
1222         // SAFETY:
1223         // Safe as we are the owner of self and dsm which are valid value
1224         let ret = unsafe { ioctl_with_mut_ref(&self.dev, VFIO_DEVICE_ACPI_DSM, &mut dsm[0]) };
1225         if ret < 0 {
1226             Err(VfioError::VfioAcpiDsm(get_error()))
1227         } else {
1228             // SAFETY:
1229             // Safe as we allocated enough space to hold args
1230             let res = unsafe { dsm[0].args.as_slice(count) };
1231             Ok(res.to_vec())
1232         }
1233     }
1234 
    /// Enable vfio device's ACPI notifications and associate EventFD with device.
    ///
    /// `index` selects the interrupt index to bind; the eventfd is passed to
    /// the kernel through a VFIO_DEVICE_SET_IRQS trigger action.
    pub fn acpi_notification_evt_enable(
        &self,
        acpi_notification_eventfd: &Event,
        index: u32,
    ) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let count = 1;

        // vfio_irq_set followed by `count` eventfd slots.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = count as u32;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        data.copy_from_slice(&acpi_notification_eventfd.as_raw_descriptor().to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationEnable(get_error()))
        } else {
            Ok(())
        }
    }
1265 
    /// Disable vfio device's ACPI notification and disconnect EventFd with device.
    ///
    /// Sends VFIO_DEVICE_SET_IRQS with DATA_NONE/count 0, which detaches any
    /// trigger previously registered on `index`.
    pub fn acpi_notification_disable(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationDisable(get_error()))
        } else {
            Ok(())
        }
    }
1284 
    /// Test vfio device's ACPI notification by simulating hardware triggering.
    /// When the signaling mechanism is set, the VFIO_IRQ_SET_DATA_BOOL can be used with
    /// VFIO_IRQ_SET_ACTION_TRIGGER to perform kernel level interrupt loopback testing.
    ///
    /// `val` is the boolean payload (as u32) written into the irq_set data.
    pub fn acpi_notification_test(&self, index: u32, val: u32) -> Result<()> {
        let u32_size = mem::size_of::<u32>();
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_BOOL | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // It is safe as enough space is reserved through vec_with_array_field(u32)<count>.
        let data = unsafe { irq_set[0].data.as_mut_slice(u32_size) };
        data.copy_from_slice(&val.to_ne_bytes()[..]);

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioAcpiNotificationTest(get_error()))
        } else {
            Ok(())
        }
    }
1311 
    /// Enable vfio device's irq and associate Irqfd Event with device.
    /// When MSIx is enabled, multi vectors will be supported, and vectors starting from subindex to
    /// subindex + descriptors length will be assigned with irqfd in the descriptors array.
    /// when index = VFIO_PCI_REQ_IRQ_INDEX, kernel vfio will trigger this event when physical
    /// device is removed.
    /// If descriptor is None, -1 is assigned to the irq. A value of -1 is used to either de-assign
    /// interrupts if already assigned or skip un-assigned interrupts.
    pub fn irq_enable(
        &self,
        descriptors: &[Option<&Event>],
        index: u32,
        subindex: u32,
    ) -> Result<()> {
        let count = descriptors.len();
        let u32_size = mem::size_of::<u32>();
        // vfio_irq_set followed by one u32 eventfd slot per descriptor.
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(count);
        irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + count * u32_size) as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = subindex;
        irq_set[0].count = count as u32;

        // SAFETY:
        // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data
        // is u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
        // together as u32. It is safe as enough space is reserved through
        // vec_with_array_field(u32)<count>.
        let mut data = unsafe { irq_set[0].data.as_mut_slice(count * u32_size) };
        // Write each fd (or -1 placeholder) into consecutive 4-byte slots by
        // repeatedly splitting the head off the remaining slice.
        for descriptor in descriptors.iter().take(count) {
            let (left, right) = data.split_at_mut(u32_size);
            match descriptor {
                Some(fd) => left.copy_from_slice(&fd.as_raw_descriptor().to_ne_bytes()[..]),
                None => left.copy_from_slice(&(-1i32).to_ne_bytes()[..]),
            }
            data = right;
        }

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqEnable(get_error()))
        } else {
            Ok(())
        }
    }
1358 
1359     /// When intx is enabled, irqfd is used to trigger a level interrupt into guest, resample irqfd
1360     /// is used to get guest EOI notification.
1361     /// When host hw generates interrupt, vfio irq handler in host kernel receive and handle it,
1362     /// this handler disable hw irq first, then trigger irqfd to inject interrupt into guest. When
1363     /// resample irqfd is triggered by guest EOI, vfio kernel could enable hw irq, so hw could
1364     /// generate another interrupts.
1365     /// This function enable resample irqfd and let vfio kernel could get EOI notification.
1366     ///
1367     /// descriptor: should be resample IrqFd.
resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()>1368     pub fn resample_virq_enable(&self, descriptor: &Event, index: u32) -> Result<()> {
1369         let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(1);
1370         irq_set[0].argsz = (mem::size_of::<vfio_irq_set>() + mem::size_of::<u32>()) as u32;
1371         irq_set[0].flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
1372         irq_set[0].index = index;
1373         irq_set[0].start = 0;
1374         irq_set[0].count = 1;
1375 
1376         {
1377             // SAFETY:
1378             // irq_set.data could be none, bool or descriptor according to flags, so irq_set.data is
1379             // u8 default, here irq_set.data is descriptor as u32, so 4 default u8 are combined
1380             // together as u32. It is safe as enough space is reserved through
1381             // vec_with_array_field(u32)<1>.
1382             let descriptors = unsafe { irq_set[0].data.as_mut_slice(4) };
1383             descriptors.copy_from_slice(&descriptor.as_raw_descriptor().to_le_bytes()[..]);
1384         }
1385 
1386         // SAFETY:
1387         // Safe as we are the owner of self and irq_set which are valid value
1388         let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
1389         if ret < 0 {
1390             Err(VfioError::VfioIrqEnable(get_error()))
1391         } else {
1392             Ok(())
1393         }
1394     }
1395 
    /// disable vfio device's irq and disconnect Irqfd Event with device
    ///
    /// Sends VFIO_DEVICE_SET_IRQS with DATA_NONE/count 0 for `index`.
    pub fn irq_disable(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 0;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqDisable(get_error()))
        } else {
            Ok(())
        }
    }
1414 
    /// Unmask vfio device irq
    ///
    /// Sends VFIO_DEVICE_SET_IRQS with DATA_NONE and ACTION_UNMASK for `index`.
    pub fn irq_unmask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqUnmask(get_error()))
        } else {
            Ok(())
        }
    }
1433 
    /// Mask vfio device irq
    ///
    /// Sends VFIO_DEVICE_SET_IRQS with DATA_NONE and ACTION_MASK for `index`.
    pub fn irq_mask(&self, index: u32) -> Result<()> {
        let mut irq_set = vec_with_array_field::<vfio_irq_set, u32>(0);
        irq_set[0].argsz = mem::size_of::<vfio_irq_set>() as u32;
        irq_set[0].flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
        irq_set[0].index = index;
        irq_set[0].start = 0;
        irq_set[0].count = 1;

        // SAFETY:
        // Safe as we are the owner of self and irq_set which are valid value
        let ret = unsafe { ioctl_with_ref(&self.dev, VFIO_DEVICE_SET_IRQS, &irq_set[0]) };
        if ret < 0 {
            Err(VfioError::VfioIrqMask(get_error()))
        } else {
            Ok(())
        }
    }
1452 
    /// Get and validate VFIO device information.
    ///
    /// Issues VFIO_DEVICE_GET_INFO and classifies the device as PCI or
    /// platform based on the returned flags.
    fn get_device_info(device_file: &File) -> Result<(vfio_device_info, VfioDeviceType)> {
        let mut dev_info = vfio_device_info {
            argsz: mem::size_of::<vfio_device_info>() as u32,
            flags: 0,
            num_regions: 0,
            num_irqs: 0,
            ..Default::default()
        };

        // SAFETY:
        // Safe as we are the owner of device_file and dev_info which are valid value,
        // and we verify the return value.
        let ret = unsafe { ioctl_with_mut_ref(device_file, VFIO_DEVICE_GET_INFO, &mut dev_info) };
        if ret < 0 {
            return Err(VfioError::VfioDeviceGetInfo(get_error()));
        }

        let dev_type = if (dev_info.flags & VFIO_DEVICE_FLAGS_PCI) != 0 {
            // A PCI device must expose at least the config-space region and the
            // standard interrupt indices up to MSI-X.
            if dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1
                || dev_info.num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1
            {
                // NOTE(review): the ioctl succeeded here, so get_error() reports
                // a stale errno; a dedicated validation error would be clearer.
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            VfioDeviceType::Pci
        } else if (dev_info.flags & VFIO_DEVICE_FLAGS_PLATFORM) != 0 {
            VfioDeviceType::Platform
        } else {
            return Err(VfioError::UnknownDeviceType(dev_info.flags));
        };

        Ok((dev_info, dev_type))
    }
1487 
    /// Query interrupt information
    /// return: Vector of interrupts information, each of which contains flags and index
    ///
    /// Iterates over every interrupt index reported by VFIO_DEVICE_GET_INFO and
    /// queries it with VFIO_DEVICE_GET_IRQ_INFO; any index whose count is not
    /// exactly 1 is treated as an error.
    pub fn get_irqs(&self) -> Result<Vec<VfioIrq>> {
        let mut irqs: Vec<VfioIrq> = Vec::new();

        for i in 0..self.num_irqs {
            let argsz = mem::size_of::<vfio_irq_info>() as u32;
            let mut irq_info = vfio_irq_info {
                argsz,
                flags: 0,
                index: i,
                count: 0,
            };
            // SAFETY:
            // Safe as we are the owner of dev and irq_info which are valid value,
            // and we verify the return value.
            let ret = unsafe {
                ioctl_with_mut_ref(self.device_file(), VFIO_DEVICE_GET_IRQ_INFO, &mut irq_info)
            };
            // NOTE(review): when the ioctl succeeds but count != 1, get_error()
            // reports a stale errno in the returned error.
            if ret < 0 || irq_info.count != 1 {
                return Err(VfioError::VfioDeviceGetInfo(get_error()));
            }

            let irq = VfioIrq {
                flags: irq_info.flags,
                index: irq_info.index,
            };
            irqs.push(irq);
        }
        Ok(irqs)
    }
1519 
    /// Query every region of `dev` via `VFIO_DEVICE_GET_REGION_INFO` and
    /// collect flags, size, file offset, sparse-mmap areas, and capability-type
    /// info for each one.
    ///
    /// NOTE(review): a region whose first info ioctl fails, or whose extended
    /// info lacks `VFIO_REGION_INFO_FLAG_CAPS`, is skipped with `continue`, so
    /// positions in the returned `Vec` may not line up with VFIO region indexes
    /// in that case — confirm this cannot happen for devices that pass
    /// `get_device_info()`.
    #[allow(clippy::cast_ptr_alignment)]
    fn get_regions(dev: &File, num_regions: u32) -> Result<Vec<VfioRegion>> {
        let mut regions: Vec<VfioRegion> = Vec::new();
        for i in 0..num_regions {
            let argsz = mem::size_of::<vfio_region_info>() as u32;
            let mut reg_info = vfio_region_info {
                argsz,
                flags: 0,
                index: i,
                cap_offset: 0,
                size: 0,
                offset: 0,
            };
            let ret =
                // SAFETY:
                // Safe as we are the owner of dev and reg_info which are valid value,
                // and we verify the return value.
                unsafe { ioctl_with_mut_ref(dev, VFIO_DEVICE_GET_REGION_INFO, &mut reg_info) };
            if ret < 0 {
                // Skip regions the kernel refuses to describe.
                continue;
            }

            let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::new();
            let mut cap_info: Option<(u32, u32)> = None;
            // A reported argsz larger than what we passed means the region has
            // a capability chain appended; re-issue the ioctl with a buffer
            // large enough to receive it.
            if reg_info.argsz > argsz {
                let cap_len: usize = (reg_info.argsz - argsz) as usize;
                let mut region_with_cap =
                    vec_with_array_field::<vfio_region_info_with_cap, u8>(cap_len);
                region_with_cap[0].region_info.argsz = reg_info.argsz;
                region_with_cap[0].region_info.flags = 0;
                region_with_cap[0].region_info.index = i;
                region_with_cap[0].region_info.cap_offset = 0;
                region_with_cap[0].region_info.size = 0;
                region_with_cap[0].region_info.offset = 0;
                // SAFETY:
                // Safe as we are the owner of dev and region_info which are valid value,
                // and we verify the return value.
                let ret = unsafe {
                    ioctl_with_mut_ref(
                        dev,
                        VFIO_DEVICE_GET_REGION_INFO,
                        &mut (region_with_cap[0].region_info),
                    )
                };
                if ret < 0 {
                    return Err(VfioError::VfioDeviceGetRegionInfo(get_error()));
                }

                if region_with_cap[0].region_info.flags & VFIO_REGION_INFO_FLAG_CAPS == 0 {
                    continue;
                }

                let cap_header_sz = mem::size_of::<vfio_info_cap_header>() as u32;
                let mmap_cap_sz = mem::size_of::<vfio_region_info_cap_sparse_mmap>() as u32;
                let mmap_area_sz = mem::size_of::<vfio_region_sparse_mmap_area>() as u32;
                let type_cap_sz = mem::size_of::<vfio_region_info_cap_type>() as u32;
                let region_info_sz = reg_info.argsz;

                // region_with_cap[0].cap_info may contain many structures, like
                // vfio_region_info_cap_sparse_mmap struct or vfio_region_info_cap_type struct.
                // Both of them begin with vfio_info_cap_header, so we will get individual cap from
                // vfio_into_cap_header.
                // Go through all the cap structs.
                let info_ptr = region_with_cap.as_ptr() as *mut u8;
                let mut offset = region_with_cap[0].region_info.cap_offset;
                // Each bound check below ensures that the whole struct about to
                // be read lies within the kernel-reported buffer size before
                // the raw pointer is dereferenced; an out-of-range offset ends
                // the walk early rather than reading past the allocation.
                while offset != 0 {
                    if offset + cap_header_sz > region_info_sz {
                        break;
                    }
                    // SAFETY:
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_ptr = unsafe { info_ptr.offset(offset as isize) };
                    // SAFETY:
                    // Safe, as cap_header struct is in this function allocated region_with_cap
                    // vec.
                    let cap_header = unsafe { &*(cap_ptr as *const vfio_info_cap_header) };
                    if cap_header.id as u32 == VFIO_REGION_INFO_CAP_SPARSE_MMAP {
                        if offset + mmap_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_sparse_mmap here
                        let sparse_mmap =
                            // SAFETY:
                            // Safe, this vfio_region_info_cap_sparse_mmap is in this function
                            // allocated region_with_cap vec.
                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_sparse_mmap) };

                        // Validate that the variable-length area array also
                        // fits inside the buffer before touching it.
                        let area_num = sparse_mmap.nr_areas;
                        if offset + mmap_cap_sz + area_num * mmap_area_sz > region_info_sz {
                            break;
                        }
                        let areas =
                            // SAFETY:
                            // Safe, these vfio_region_sparse_mmap_area are in this function allocated
                            // region_with_cap vec.
                            unsafe { sparse_mmap.areas.as_slice(sparse_mmap.nr_areas as usize) };
                        for area in areas.iter() {
                            mmaps.push(*area);
                        }
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_TYPE {
                        if offset + type_cap_sz > region_info_sz {
                            break;
                        }
                        // cap_ptr is vfio_region_info_cap_type here
                        let cap_type_info =
                            // SAFETY:
                            // Safe, this vfio_region_info_cap_type is in this function allocated
                            // region_with_cap vec
                            unsafe { &*(cap_ptr as *const vfio_region_info_cap_type) };

                        cap_info = Some((cap_type_info.type_, cap_type_info.subtype));
                    } else if cap_header.id as u32 == VFIO_REGION_INFO_CAP_MSIX_MAPPABLE {
                        // This cap carries no payload: it marks the entire
                        // region as mmappable.
                        mmaps.push(vfio_region_sparse_mmap_area {
                            offset: 0,
                            size: region_with_cap[0].region_info.size,
                        });
                    }

                    // Follow the chain; a `next` of 0 terminates the walk.
                    offset = cap_header.next;
                }
            } else if reg_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
                // No capability chain: a plain MMAP flag means the whole
                // region can be mapped as one area.
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: 0,
                    size: reg_info.size,
                });
            }

            let region = VfioRegion {
                flags: reg_info.flags,
                size: reg_info.size,
                offset: reg_info.offset,
                mmaps,
                cap_info,
            };
            regions.push(region);
        }

        Ok(regions)
    }
1660 
1661     /// get a region's flag
1662     /// the return's value may conatin:
1663     ///     VFIO_REGION_INFO_FLAG_READ:  region supports read
1664     ///     VFIO_REGION_INFO_FLAG_WRITE: region supports write
1665     ///     VFIO_REGION_INFO_FLAG_MMAP:  region supports mmap
1666     ///     VFIO_REGION_INFO_FLAG_CAPS:  region's info supports caps
get_region_flags(&self, index: usize) -> u321667     pub fn get_region_flags(&self, index: usize) -> u32 {
1668         match self.regions.get(index) {
1669             Some(v) => v.flags,
1670             None => {
1671                 warn!("get_region_flags() with invalid index: {}", index);
1672                 0
1673             }
1674         }
1675     }
1676 
1677     /// get a region's offset
1678     /// return: Region offset from the start of vfio device descriptor
get_region_offset(&self, index: usize) -> u641679     pub fn get_region_offset(&self, index: usize) -> u64 {
1680         match self.regions.get(index) {
1681             Some(v) => v.offset,
1682             None => {
1683                 warn!("get_region_offset with invalid index: {}", index);
1684                 0
1685             }
1686         }
1687     }
1688 
1689     /// get a region's size
1690     /// return: Region size from the start of vfio device descriptor
get_region_size(&self, index: usize) -> u641691     pub fn get_region_size(&self, index: usize) -> u64 {
1692         match self.regions.get(index) {
1693             Some(v) => v.size,
1694             None => {
1695                 warn!("get_region_size with invalid index: {}", index);
1696                 0
1697             }
1698         }
1699     }
1700 
1701     /// get a number of regions
1702     /// return: Number of regions of vfio device descriptor
get_region_count(&self) -> usize1703     pub fn get_region_count(&self) -> usize {
1704         self.regions.len()
1705     }
1706 
1707     /// get a region's mmap info vector
get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area>1708     pub fn get_region_mmap(&self, index: usize) -> Vec<vfio_region_sparse_mmap_area> {
1709         match self.regions.get(index) {
1710             Some(v) => v.mmaps.clone(),
1711             None => {
1712                 warn!("get_region_mmap with invalid index: {}", index);
1713                 Vec::new()
1714             }
1715         }
1716     }
1717 
1718     /// find the specified cap type in device regions
1719     /// Input:
1720     ///      type_:  cap type
1721     ///      sub_type: cap sub_type
1722     /// Output:
1723     ///     None: device doesn't have the specified cap type
1724     ///     Some((bar_index, region_size)): device has the specified cap type, return region's
1725     ///                                     index and size
get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)>1726     pub fn get_cap_type_info(&self, type_: u32, sub_type: u32) -> Option<(u32, u64)> {
1727         for (index, region) in self.regions.iter().enumerate() {
1728             if let Some(cap_info) = &region.cap_info {
1729                 if cap_info.0 == type_ && cap_info.1 == sub_type {
1730                     return Some((index as u32, region.size));
1731                 }
1732             }
1733         }
1734 
1735         None
1736     }
1737 
1738     /// Returns file offset corresponding to the given `VfioRegionAddr`.
1739     /// The offset can be used when reading/writing the VFIO device's FD directly.
get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64>1740     pub fn get_offset_for_addr(&self, addr: &VfioRegionAddr) -> Result<u64> {
1741         let region = self
1742             .regions
1743             .get(addr.index)
1744             .ok_or(VfioError::InvalidIndex(addr.index))?;
1745         Ok(region.offset + addr.addr)
1746     }
1747 
1748     /// Read region's data from VFIO device into buf
1749     /// index: region num
1750     /// buf: data destination and buf length is read size
1751     /// addr: offset in the region
region_read(&self, index: usize, buf: &mut [u8], addr: u64)1752     pub fn region_read(&self, index: usize, buf: &mut [u8], addr: u64) {
1753         let stub: &VfioRegion = self
1754             .regions
1755             .get(index)
1756             .unwrap_or_else(|| panic!("tried to read VFIO with an invalid index: {}", index));
1757 
1758         let size = buf.len() as u64;
1759         if size > stub.size || addr + size > stub.size {
1760             panic!(
1761                 "tried to read VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1762                 index, addr, size
1763             );
1764         }
1765 
1766         self.dev
1767             .read_exact_at(buf, stub.offset + addr)
1768             .unwrap_or_else(|e| {
1769                 panic!(
1770                     "failed to read region: index={}, addr=0x{:x}, error={}",
1771                     index, addr, e
1772                 )
1773             });
1774     }
1775 
    /// Reads a value from the specified `VfioRegionAddr.addr` + `offset`.
    ///
    /// The value is read byte-for-byte from the device region and reinterpreted
    /// as `T`, which is sound because `T: FromBytes`.
    ///
    /// # Panics
    ///
    /// Panics (via `region_read`) if the address range is invalid for the
    /// region or the underlying read fails.
    pub fn region_read_from_addr<T: FromBytes>(&self, addr: &VfioRegionAddr, offset: u64) -> T {
        let mut val = mem::MaybeUninit::zeroed();
        let buf =
            // SAFETY:
            // Safe because we have zero-initialized `size_of::<T>()` bytes.
            unsafe { slice::from_raw_parts_mut(val.as_mut_ptr() as *mut u8, mem::size_of::<T>()) };
        self.region_read(addr.index, buf, addr.addr + offset);
        // SAFETY:
        // Safe because any bit pattern is valid for a type that implements FromBytes.
        unsafe { val.assume_init() }
    }
1788 
1789     /// write the data from buf into a vfio device region
1790     /// index: region num
1791     /// buf: data src and buf length is write size
1792     /// addr: offset in the region
region_write(&self, index: usize, buf: &[u8], addr: u64)1793     pub fn region_write(&self, index: usize, buf: &[u8], addr: u64) {
1794         let stub: &VfioRegion = self
1795             .regions
1796             .get(index)
1797             .unwrap_or_else(|| panic!("tried to write VFIO with an invalid index: {}", index));
1798 
1799         let size = buf.len() as u64;
1800         if size > stub.size
1801             || addr + size > stub.size
1802             || (stub.flags & VFIO_REGION_INFO_FLAG_WRITE) == 0
1803         {
1804             panic!(
1805                 "tried to write VFIO region with invalid arguments: index={}, addr=0x{:x}, size=0x{:x}",
1806                 index, addr, size
1807             );
1808         }
1809 
1810         self.dev
1811             .write_all_at(buf, stub.offset + addr)
1812             .unwrap_or_else(|e| {
1813                 panic!(
1814                     "failed to write region: index={}, addr=0x{:x}, error={}",
1815                     index, addr, e
1816                 )
1817             });
1818     }
1819 
1820     /// Writes data into the specified `VfioRegionAddr.addr` + `offset`.
region_write_to_addr<T: AsBytes>(&self, val: &T, addr: &VfioRegionAddr, offset: u64)1821     pub fn region_write_to_addr<T: AsBytes>(&self, val: &T, addr: &VfioRegionAddr, offset: u64) {
1822         self.region_write(addr.index, val.as_bytes(), addr.addr + offset);
1823     }
1824 
1825     /// get vfio device's descriptors which are passed into minijail process
keep_rds(&self) -> Vec<RawDescriptor>1826     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
1827         vec![
1828             self.dev.as_raw_descriptor(),
1829             self.group_descriptor,
1830             self.container.lock().as_raw_descriptor(),
1831         ]
1832     }
1833 
1834     /// Add (iova, user_addr) map into vfio container iommu table
1835     /// # Safety
1836     ///
1837     /// The caller is responsible for determining the safety of the VFIO_IOMMU_MAP_DMA ioctl.
vfio_dma_map( &self, iova: u64, size: u64, user_addr: u64, write_en: bool, ) -> Result<()>1838     pub unsafe fn vfio_dma_map(
1839         &self,
1840         iova: u64,
1841         size: u64,
1842         user_addr: u64,
1843         write_en: bool,
1844     ) -> Result<()> {
1845         self.container
1846             .lock()
1847             .vfio_dma_map(iova, size, user_addr, write_en)
1848     }
1849 
1850     /// Remove (iova, user_addr) map from vfio container iommu table
vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()>1851     pub fn vfio_dma_unmap(&self, iova: u64, size: u64) -> Result<()> {
1852         self.container.lock().vfio_dma_unmap(iova, size)
1853     }
1854 
vfio_get_iommu_page_size_mask(&self) -> Result<u64>1855     pub fn vfio_get_iommu_page_size_mask(&self) -> Result<u64> {
1856         self.container.lock().vfio_get_iommu_page_size_mask()
1857     }
1858 
alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64>1859     pub fn alloc_iova(&self, size: u64, align_size: u64, alloc: Alloc) -> Result<u64> {
1860         self.iova_alloc
1861             .lock()
1862             .allocate_with_align(size, alloc, "alloc_iova".to_owned(), align_size)
1863             .map_err(VfioError::Resources)
1864     }
1865 
get_iova(&self, alloc: &Alloc) -> Option<AddressRange>1866     pub fn get_iova(&self, alloc: &Alloc) -> Option<AddressRange> {
1867         self.iova_alloc.lock().get(alloc).map(|res| res.0)
1868     }
1869 
release_iova(&self, alloc: Alloc) -> Result<AddressRange>1870     pub fn release_iova(&self, alloc: Alloc) -> Result<AddressRange> {
1871         self.iova_alloc
1872             .lock()
1873             .release(alloc)
1874             .map_err(VfioError::Resources)
1875     }
1876 
get_max_addr(&self) -> u641877     pub fn get_max_addr(&self) -> u64 {
1878         self.iova_alloc.lock().get_max_addr()
1879     }
1880 
    /// Gets the vfio device backing `File`.
    ///
    /// This is the descriptor that region offsets (see `get_offset_for_addr`)
    /// refer to when reading/writing the device directly.
    pub fn device_file(&self) -> &File {
        &self.dev
    }
1885 
    /// close vfio device
    ///
    /// Removes this device's group from the shared container.
    /// NOTE(review): the `true` second argument presumably requests extra
    /// cleanup in the container — confirm against `remove_group`'s definition.
    pub fn close(&self) {
        self.container.lock().remove_group(self.group_id, true);
    }
1890 }
1891 
/// Typed accessor for a VFIO PCI device's configuration space.
pub struct VfioPciConfig {
    // Device whose VFIO_PCI_CONFIG_REGION_INDEX region is read/written by the
    // accessor methods below.
    device: Arc<VfioDevice>,
}
1895 
1896 impl VfioPciConfig {
new(device: Arc<VfioDevice>) -> Self1897     pub fn new(device: Arc<VfioDevice>) -> Self {
1898         VfioPciConfig { device }
1899     }
1900 
read_config<T: FromBytes>(&self, offset: u32) -> T1901     pub fn read_config<T: FromBytes>(&self, offset: u32) -> T {
1902         let mut buf = vec![0u8; std::mem::size_of::<T>()];
1903         self.device.region_read(
1904             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1905             &mut buf,
1906             offset.into(),
1907         );
1908         T::read_from(&buf[..]).expect("failed to convert config data from slice")
1909     }
1910 
write_config<T: AsBytes>(&self, config: T, offset: u32)1911     pub fn write_config<T: AsBytes>(&self, config: T, offset: u32) {
1912         self.device.region_write(
1913             VFIO_PCI_CONFIG_REGION_INDEX as usize,
1914             config.as_bytes(),
1915             offset.into(),
1916         );
1917     }
1918 
1919     /// Set the VFIO device this config refers to as the bus master.
set_bus_master(&self)1920     pub fn set_bus_master(&self) {
1921         /// Constant definitions from `linux/pci_regs.h`.
1922         const PCI_COMMAND: u32 = 0x4;
1923         /// Enable bus mastering
1924         const PCI_COMMAND_MASTER: u16 = 0x4;
1925 
1926         let mut cmd: u16 = self.read_config(PCI_COMMAND);
1927 
1928         if cmd & PCI_COMMAND_MASTER != 0 {
1929             return;
1930         }
1931 
1932         cmd |= PCI_COMMAND_MASTER;
1933 
1934         self.write_config(cmd, PCI_COMMAND);
1935     }
1936 }
1937 
impl AsRawDescriptor for VfioDevice {
    // Delegates to the backing device `File`, so the VfioDevice itself can be
    // used anywhere a raw descriptor is expected.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.dev.as_raw_descriptor()
    }
}
1943