// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

use acpi_tables::aml::Aml;
use base::debug;
use base::error;
use base::pagesize;
use base::warn;
use base::AsRawDescriptor;
use base::AsRawDescriptors;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::MemCacheType;
use resources::AddressRange;
use resources::Alloc;
use resources::AllocOptions;
use resources::MmioType;
use resources::SystemAllocator;
use sync::Mutex;
use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
use vfio_sys::*;
use vm_control::api::VmMemoryClient;
use vm_control::HotPlugDeviceInfo;
use vm_control::HotPlugDeviceType;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRegionId;
use vm_control::VmMemorySource;
use vm_control::VmRequest;
use vm_control::VmResponse;

use crate::pci::acpi::DeviceVcfgRegister;
use crate::pci::acpi::DsmMethod;
use crate::pci::acpi::PowerResourceMethod;
use crate::pci::acpi::SHM_OFFSET;
use crate::pci::msi::MsiConfig;
use crate::pci::msi::MsiStatus;
use crate::pci::msi::PCI_MSI_FLAGS;
use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
use crate::pci::msi::PCI_MSI_NEXT_POINTER;
use crate::pci::msix::MsixConfig;
use crate::pci::msix::MsixStatus;
use crate::pci::msix::BITS_PER_PBA_ENTRY;
use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::Error as PciDeviceError;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::PreferredIrq;
use crate::pci::pm::PciPmCap;
use crate::pci::pm::PmConfig;
use crate::pci::pm::PM_CAP_LENGTH;
use crate::pci::PciAddress;
use crate::pci::PciBarConfiguration;
use crate::pci::PciBarIndex;
use crate::pci::PciBarPrefetchable;
use crate::pci::PciBarRegionType;
use crate::pci::PciCapabilityID;
use crate::pci::PciClassCode;
use crate::pci::PciId;
use crate::pci::PciInterruptPin;
use crate::pci::PCI_VCFG_DSM;
use crate::pci::PCI_VCFG_NOTY;
use crate::pci::PCI_VCFG_PM;
use crate::pci::PCI_VENDOR_ID_INTEL;
use crate::vfio::VfioDevice;
use crate::vfio::VfioError;
use crate::vfio::VfioIrqType;
use crate::vfio::VfioPciConfig;
use crate::IrqLevelEvent;
use crate::Suspendable;

const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// Extended Capabilities
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;

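/// Virtualized PCI power management capability. The capability header is read
/// from the device once at construction; PM control/status accesses are then
/// emulated by the wrapped `PmConfig`.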
struct VfioPmCap {
    offset: u32,
    capabilities: u32,
    config: PmConfig,
}

impl VfioPmCap {
    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
        let mut capabilities: u32 = config.read_config(cap_start);
        capabilities |= (PciPmCap::default_cap() as u32) << 16;
        VfioPmCap {
            offset: cap_start,
            capabilities,
            config: PmConfig::new(false),
        }
    }

    pub fn should_trigger_pme(&mut self) -> bool {
        self.config.should_trigger_pme()
    }

    fn is_pm_reg(&self, offset: u32) -> bool {
        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
    }

    pub fn read(&self, offset: u32) -> u32 {
        let offset = offset - self.offset;
        if offset == 0 {
            self.capabilities
        } else {
            let mut data = 0;
            self.config.read(&mut data);
            data
        }
    }

    pub fn write(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.offset as u64;
        if offset >= std::mem::size_of::<u32>() as u64 {
            let offset = offset - std::mem::size_of::<u32>() as u64;
            self.config.write(offset, data);
        }
    }
}

enum VfioMsiChange {
    Disable,
    Enable,
    FunctionChanged,
}

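/// Virtualized MSI capability: intercepts guest accesses to the MSI registers
/// and translates enable/disable transitions into `VfioMsiChange` events.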
struct VfioMsiCap {
    config: MsiConfig,
    offset: u32,
}

impl VfioMsiCap {
    fn new(
        config: &VfioPciConfig,
        msi_cap_start: u32,
        vm_socket_irq: Tube,
        device_id: u32,
        device_name: String,
    ) -> Self {
        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;

        VfioMsiCap {
            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
            offset: msi_cap_start,
        }
    }

    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        self.config.is_msi_reg(self.offset, index, len)
    }

    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let offset = index as u32 - self.offset;
        match self.config.write_msi_capability(offset, data) {
            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
            MsiStatus::NothingToDo => None,
        }
    }

    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.config.get_irqfd()
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

// MSI-X registers in MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending Bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR

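/// Virtualized MSI-X capability. Tracks where the device places its MSI-X
/// table and PBA (BAR index plus offset within that BAR) so that accesses to
/// those ranges can be trapped and emulated by `MsixConfig`.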
struct VfioMsixCap {
    config: MsixConfig,
    offset: u32,
    table_size: u16,
    table_pci_bar: PciBarIndex,
    table_offset: u64,
    table_size_bytes: u64,
    pba_pci_bar: PciBarIndex,
    pba_offset: u64,
    pba_size_bytes: u64,
    msix_interrupt_evt: Vec<Event>,
}

impl VfioMsixCap {
    fn new(
        config: &VfioPciConfig,
        msix_cap_start: u32,
        vm_socket_irq: Tube,
        pci_id: u32,
        device_name: String,
    ) -> Self {
        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
        let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
        let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;

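        // If the advertised table size would overlap the PBA in the same BAR,
        // trust the PBA offset and shrink the table size to fit.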
        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
        if table_pci_bar == pba_pci_bar
            && pba_offset > table_offset
            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
        {
            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
        }

        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
        let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
            / BITS_PER_PBA_ENTRY as u64)
            * MSIX_PBA_ENTRIES_MODULO;
        let mut msix_interrupt_evt = Vec::new();
        for _ in 0..table_size {
            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
        }
        VfioMsixCap {
            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
            offset: msix_cap_start,
            table_size: table_size as u16,
            table_pci_bar,
            table_offset,
            table_size_bytes,
            pba_pci_bar,
            pba_offset,
            pba_size_bytes,
            msix_interrupt_evt,
        }
    }

    // Only the MSI-X control register is writable and needs special handling
    // in PCI config read/write.
    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
        let control_start = self.offset + PCI_MSIX_FLAGS;
        let control_end = control_start + 2;

        offset < control_end && offset + size > control_start
    }

    fn read_msix_control(&self, data: &mut u32) {
        *data = self.config.read_msix_capability(*data);
    }

    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
        let old_enabled = self.config.enabled();
        let old_masked = self.config.masked();

        self.config
            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);

        let new_enabled = self.config.enabled();
        let new_masked = self.config.masked();

        if !old_enabled && new_enabled {
            Some(VfioMsiChange::Enable)
        } else if old_enabled && !new_enabled {
            Some(VfioMsiChange::Disable)
        } else if new_enabled && old_masked != new_masked {
            Some(VfioMsiChange::FunctionChanged)
        } else {
            None
        }
    }

    fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.table_pci_bar
            && offset >= self.table_offset
            && offset < self.table_offset + self.table_size_bytes
    }

    fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.table_pci_bar {
            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
        } else {
            None
        }
    }

    fn read_table(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.table_offset;
        self.config.read_msix_table(offset, data);
    }

    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let offset = offset - self.table_offset;
        self.config.write_msix_table(offset, data)
    }

    fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.pba_pci_bar
            && offset >= self.pba_offset
            && offset < self.pba_offset + self.pba_size_bytes
    }

    fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.pba_pci_bar {
            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
        } else {
            None
        }
    }

    fn read_pba(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.pba_offset;
        self.config.read_pba_entries(offset, data);
    }

    fn write_pba(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.pba_offset;
        self.config.write_pba_entries(offset, data);
    }

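    // Returns the irqfd that should be registered with VFIO for this vector.
    // While the vector is masked, the device-local event is handed out
    // instead, so the interrupt is routed through the worker thread, which
    // calls trigger() and lets MsixConfig latch the pending bit.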
    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
        let irqfd = self.config.get_irqfd(index);
        if let Some(fd) = irqfd {
            if self.msix_vector_masked(index) {
                Some(&self.msix_interrupt_evt[index])
            } else {
                Some(fd)
            }
        } else {
            None
        }
    }

    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
        let mut irqfds = Vec::new();

        for i in 0..self.table_size {
            irqfds.push(self.get_msix_irqfd(i as usize));
        }

        irqfds
    }

    fn table_size(&self) -> usize {
        self.table_size.into()
    }

    fn clone_msix_evt(&self) -> Vec<Event> {
        self.msix_interrupt_evt
            .iter()
            .map(|irq| irq.try_clone().unwrap())
            .collect()
    }

    fn msix_vector_masked(&self, index: usize) -> bool {
        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
    }

    fn trigger(&mut self, index: usize) {
        self.config.trigger(index as u16);
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

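// Free-range allocator over a VFIO region: free space is kept as a sorted set
// of non-overlapping address ranges, and allocations carve ranges out of the
// set.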
struct VfioResourceAllocator {
    // The region that is not allocated yet.
    regions: BTreeSet<AddressRange>,
}

impl VfioResourceAllocator {
    // Creates a new `VfioResourceAllocator` managing the given address range.
    // Returns `Err` if `pool` is empty.
    //
    // * `pool` - The address range to manage.
    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
        if pool.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }
        let mut regions = BTreeSet::new();
        regions.insert(pool);
        Ok(VfioResourceAllocator { regions })
    }

    fn internal_allocate_from_slot(
        &mut self,
        slot: AddressRange,
        range: AddressRange,
    ) -> Result<u64, PciDeviceError> {
        let slot_was_present = self.regions.remove(&slot);
        assert!(slot_was_present);

        let (before, after) = slot.non_overlapping_ranges(range);

        if !before.is_empty() {
            self.regions.insert(before);
        }
        if !after.is_empty() {
            self.regions.insert(after);
        }

        Ok(range.start)
    }

    // Allocates a range of addresses from the managed region with a minimal alignment.
    // Overlapping with a previous allocation is _not_ allowed.
    // Returns the allocated address.
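    //
    // Example: with a free pool of [0x1000, 0x2fff], allocate_with_align(0x800, 0x1000)
    // returns 0x1000 and leaves [0x1800, 0x2fff] free.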
    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
        if size == 0 {
            return Err(PciDeviceError::SizeZero);
        }
        if !alignment.is_power_of_two() {
            return Err(PciDeviceError::BadAlignment);
        }

        // finds first region matching alignment and size.
        let region = self.regions.iter().find(|range| {
            match range.start % alignment {
                0 => range.start.checked_add(size - 1),
                r => range.start.checked_add(size - 1 + alignment - r),
            }
            .map_or(false, |end| end <= range.end)
        });

        match region {
            Some(&slot) => {
                let start = match slot.start % alignment {
                    0 => slot.start,
                    r => slot.start + alignment - r,
                };
                let end = start + size - 1;
                let range = AddressRange::from_start_and_end(start, end);

                self.internal_allocate_from_slot(slot, range)
            }
            None => Err(PciDeviceError::OutOfSpace),
        }
    }

    // Allocates a range of addresses from the managed region with a required location.
    // Overlapping with a previous allocation is allowed.
    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
        if range.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }

        while let Some(&slot) = self
            .regions
            .iter()
            .find(|avail_range| avail_range.overlaps(range))
        {
            let _address = self.internal_allocate_from_slot(slot, range)?;
        }
        Ok(())
    }
}
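/// Worker-thread state for a VFIO PCI device. The worker waits on the
/// device's request IRQ (hot unplug), PM wakeup events, ACPI notifications,
/// and per-vector MSI-X events, and forwards them to the VMM over
/// `vm_socket`.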
struct VfioPciWorker {
    address: PciAddress,
    sysfs_path: PathBuf,
    vm_socket: Tube,
    name: String,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
}

impl VfioPciWorker {
    fn run(
        &mut self,
        req_irq_evt: Event,
        wakeup_evt: Event,
        acpi_notify_evt: Event,
        kill_evt: Event,
        msix_evt: Vec<Event>,
        is_in_low_power: Arc<Mutex<bool>>,
        gpe: Option<u32>,
        notification_val: Arc<Mutex<Vec<u32>>>,
    ) {
        #[derive(EventToken, Debug)]
        enum Token {
            ReqIrq,
            WakeUp,
            AcpiNotifyEvent,
            Kill,
            MsixIrqi { index: usize },
        }

        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
            (&req_irq_evt, Token::ReqIrq),
            (&wakeup_evt, Token::WakeUp),
            (&acpi_notify_evt, Token::AcpiNotifyEvent),
            (&kill_evt, Token::Kill),
        ]) {
            Ok(pc) => pc,
            Err(e) => {
                error!(
                    "{} failed creating vfio WaitContext: {}",
                    self.name.clone(),
                    e
                );
                return;
            }
        };

        for (index, msix_int) in msix_evt.iter().enumerate() {
            wait_ctx
                .add(msix_int, Token::MsixIrqi { index })
                .expect("Failed to create vfio WaitContext for msix interrupt event")
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{} failed polling vfio events: {}", self.name.clone(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::MsixIrqi { index } => {
                        if let Some(msix_cap) = &self.msix_cap {
                            msix_cap.lock().trigger(index);
                        }
                    }
                    Token::ReqIrq => {
                        let device = HotPlugDeviceInfo {
                            device_type: HotPlugDeviceType::EndPoint,
                            path: self.sysfs_path.clone(),
                            hp_interrupt: false,
                        };

                        let request = VmRequest::HotPlugVfioCommand { device, add: false };
                        if self.vm_socket.send(&request).is_ok() {
                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
                            } else {
                                break 'wait;
                            }
                        }
                    }
                    Token::WakeUp => {
                        let _ = wakeup_evt.wait();

                        if *is_in_low_power.lock() {
                            if let Some(pm_cap) = &self.pm_cap {
                                if pm_cap.lock().should_trigger_pme() {
                                    let request =
                                        VmRequest::PciPme(self.address.pme_requester_id());
                                    if self.vm_socket.send(&request).is_ok() {
                                        if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                            error!(
                                                "{} failed to send PME: {}",
                                                self.name.clone(),
                                                e
                                            );
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Token::AcpiNotifyEvent => {
                        if let Some(gpe) = gpe {
                            if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
                                notification_val.lock().push(val as u32);
                                let request = VmRequest::Gpe {
                                    gpe,
                                    clear_evt: None,
                                };
                                if self.vm_socket.send(&request).is_ok() {
                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                        error!("{} failed to send GPE: {}", self.name.clone(), e);
                                    }
                                }
                            } else {
                                error!("{} failed to read acpi_notify_evt", self.name.clone());
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
    }
}
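// A PCI Express extended capability header is a single dword: bits 15:0 hold
// the capability ID, bits 19:16 the version, and bits 31:20 the offset of the
// next capability. The 0xffc mask keeps the offset dword-aligned.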
fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}

fn is_skipped_ext_cap(cap_id: u16) -> bool {
    matches!(
        cap_id,
        // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
    )
}

enum DeviceData {
    IntelGfxData { opregion_index: u32 },
}

/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// cap offset in Configuration Space
    offset: u32,
    /// cap size
    size: u32,
    /// next-capability offset; rewritten to point at the next non-skipped cap
    next: u16,
    /// whether this capability is hidden from the guest
    is_skipped: bool,
}

/// Implements a VFIO PCI device, allowing a host PCI device to be passed
/// through to the VM.
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    hotplug: bool,
    hotplug_bus_number: Option<u8>,
    preferred_address: PciAddress,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<IrqLevelEvent>,
    acpi_notification_evt: Option<Event>,
    mmio_regions: Vec<PciBarConfiguration>,
    io_regions: Vec<PciBarConfiguration>,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
    irq_type: Option<VfioIrqType>,
    vm_memory_client: VmMemoryClient,
    device_data: Option<DeviceData>,
    pm_evt: Option<Event>,
    is_in_low_power: Arc<Mutex<bool>>,
    worker_thread: Option<WorkerThread<VfioPciWorker>>,
    vm_socket_vm: Option<Tube>,
    sysfs_path: PathBuf,
    // PCI Express Extended Capabilities
    ext_caps: Vec<ExtCap>,
    vcfg_shm_mmap: Option<MemoryMapping>,
    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
    activated: bool,
    acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
    gpe: Option<u32>,
    base_class_code: PciClassCode,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        sysfs_path: &Path,
        device: VfioDevice,
        hotplug: bool,
        hotplug_bus_number: Option<u8>,
        guest_address: Option<PciAddress>,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vm_memory_client: VmMemoryClient,
        vfio_device_socket_vm: Tube,
    ) -> Result<Self, PciDeviceError> {
        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
            debug!("hotplug bus {}", bus_num);
            PciAddress {
                // The caller specifies the PCIe bus number for a hotplug device.
                bus: bus_num,
                // devfn must be 0; otherwise the PCIe root port cannot detect the device.
                dev: 0,
                func: 0,
            }
        } else if let Some(guest_address) = guest_address {
            debug!("guest PCI address {}", guest_address);
            guest_address
        } else {
            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
            })?;
            debug!("parsed device PCI address {}", addr);
            addr
        };

        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;

        let mut is_pcie = false;
        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
        let device_id: u16 = config.read_config(PCI_DEVICE_ID);
        let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
            .unwrap_or(PciClassCode::Other);

        let pci_id = PciId::new(vendor_id, device_id);

        while cap_next != 0 {
            let cap_id: u8 = config.read_config(cap_next);
            if cap_id == PCI_CAP_ID_PM {
                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
            } else if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(
                        &config,
                        cap_next,
                        msi_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
                        &config,
                        cap_next,
                        msix_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ))));
                }
            } else if cap_id == PciCapabilityID::PciExpress as u8 {
                is_pcie = true;
            }
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config::<u8>(offset).into();
        }

        let mut ext_caps: Vec<ExtCap> = Vec::new();
        if is_pcie {
            let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
            while ext_cap_next != 0 {
                let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
                if ext_cap_config == 0 {
                    break;
                }
                ext_caps.push(ExtCap {
                    offset: ext_cap_next,
                    // The size is calculated later.
                    size: 0,
                    // Initialized to the hardware's next pointer; fixed up below.
                    next: get_next_from_extcap_header(ext_cap_config) as u16,
                    is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
                });
                ext_cap_next = get_next_from_extcap_header(ext_cap_config);
            }

            // Manage extended caps
            //
            // Extended capabilities are chained with each pointing to the next, so
            // we can drop anything other than the head of the chain simply by
            // modifying the previous next pointer. For the head of the chain, we
            // can modify the capability ID to something that cannot match a valid
            // capability. ID PCI_EXT_CAP_ID_CAC is used for this, since it is no
            // longer supported.
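            //
            // For example, a chain 0x100 -> 0x140 -> 0x200 with the cap at
            // 0x140 skipped becomes 0x100 -> 0x200; if the head at 0x100 were
            // itself skipped, its capability ID would instead be replaced with
            // PCI_EXT_CAP_ID_CAC.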
            //
            // reverse order by offset
            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
            let mut non_skipped_next: u16 = 0;
            for ext_cap in ext_caps.iter_mut() {
                if !ext_cap.is_skipped {
                    ext_cap.next = non_skipped_next;
                    non_skipped_next = ext_cap.offset as u16;
                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
                    ext_cap.next = non_skipped_next;
                }
                ext_cap.size = next_offset - ext_cap.offset;
                next_offset = ext_cap.offset;
            }
            // order by offset
            ext_caps.reverse();
        }

        let is_intel_gfx =
            base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                opregion_index: u32::MAX,
            })
        } else {
            None
        };

        Ok(VfioPciDevice {
            device: dev,
            config,
            hotplug,
            hotplug_bus_number,
            preferred_address,
            pci_address: None,
            interrupt_evt: None,
            acpi_notification_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            pm_cap,
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_memory_client,
            device_data,
            pm_evt: None,
            is_in_low_power: Arc::new(Mutex::new(false)),
            worker_thread: None,
            vm_socket_vm: Some(vfio_device_socket_vm),
            sysfs_path: sysfs_path.to_path_buf(),
            ext_caps,
            vcfg_shm_mmap: None,
            mapped_mmio_bars: BTreeMap::new(),
            activated: false,
            acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
            gpe: None,
            base_class_code,
        })
    }

    /// Gets the PCI address of the device, if one has already been allocated.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }

    pub fn is_gfx(&self) -> bool {
        self.base_class_code == PciClassCode::DisplayController
    }

    fn is_intel_gfx(&self) -> bool {
        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
    }

    fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
        }
        Err(PciDeviceError::AcpiNotifySetupFailed)
    }

    #[allow(dead_code)]
    fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
        }
        Err(PciDeviceError::AcpiNotifyDeactivationFailed)
    }

    #[allow(dead_code)]
    fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
                .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
        }
        Err(PciDeviceError::AcpiNotifyTestFailed)
    }

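    // Enables legacy INTx: registers the trigger event with VFIO, wires up
    // the resample (EOI) event while the line is masked, then unmasks it.
    // Any failure after the initial enable rolls back by disabling INTx.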
    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }

    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("{} Intx disable failed: {}", self.debug_label(), e);
        }
        self.irq_type = None;
    }

    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // disable_msi()/disable_msix() above re-enable INTx, so disable INTx
        // here as well.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

    fn enable_msi(&mut self) {
        self.disable_irqs();

        let irqfd = match &self.msi_cap {
            Some(cap) => {
                if let Some(fd) = cap.get_msi_irqfd() {
                    fd
                } else {
                    self.enable_intx();
                    return;
                }
            }
            None => {
                self.enable_intx();
                return;
            }
        };

        if let Err(e) = self
            .device
            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
        {
            error!("{} failed to enable msi: {}", self.debug_label(), e);
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msi);
    }

    fn disable_msi(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
            error!("{} failed to disable msi: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;

        self.enable_intx();
    }

    fn enable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }

        self.disable_irqs();
        let cap = self.msix_cap.as_ref().unwrap().lock();
        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());

        let mut failed = false;
        if !vector_in_use {
            // If there are no msix vectors currently in use, we explicitly assign a new
            // eventfd to vector 0. Then we enable it and immediately disable it, so that
            // vfio will activate the physical device. If there are msix vectors available,
            // just enable them instead.
            let fd = Event::new().expect("failed to create event");
            let table_size = cap.table_size();
            let mut irqfds = vec![None; table_size];
            irqfds[0] = Some(&fd);
            for fd in irqfds.iter_mut().skip(1) {
                *fd = None;
            }
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
            irqfds[0] = None;
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        } else {
            let result = self
                .device
                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
            if let Err(e) = result {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        }

        std::mem::drop(cap);
        if failed {
            self.enable_intx();
            return;
        }
        self.irq_type = Some(VfioIrqType::Msix);
    }

    fn disable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("{} failed to disable msix: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;
        self.enable_intx();
    }

    fn msix_vectors_update(&self) -> Result<(), VfioError> {
        if let Some(cap) = &self.msix_cap {
            self.device
                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
        }
        Ok(())
    }

    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
        if let Err(e) = self
            .device
            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
        {
            error!(
                "{} failed to update msix vector {}: {}",
                self.debug_label(),
                index,
                e
            );
        }
    }

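    // Removes `remove_mmaps` (e.g. the MSI-X table and PBA ranges) from the
    // device-provided sparse mmap areas, returning page-aligned areas that
    // are safe to map directly into the guest.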
    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // align offsets to page size
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }

    fn remove_bar_mmap_msix(
        &self,
        bar_index: PciBarIndex,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
        let mut msix_regions = Vec::new();

        if let Some(t) = msix_cap.get_msix_table(bar_index) {
            msix_regions.push(t);
        }
        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
            msix_regions.push(p);
        }

        if msix_regions.is_empty() {
            return bar_mmaps;
        }

        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
    }

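    // Maps the mmap-able portions of a BAR into guest physical memory by
    // registering slices of the VFIO device file at `bar_addr`, returning the
    // registered region IDs so the mappings can be torn down later.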
    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // The BAR pages holding the msix table and pba must not be mmap-ed;
            // they must stay trapped so that msix can be emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }

    fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
        for mmap_id in mmap_ids {
            if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
                error!("unregister_memory failed: {}", e);
            }
        }
    }

    fn disable_bars_mmap(&mut self) {
        for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
            self.remove_bar_mmap(mmap_ids);
        }
        self.mapped_mmio_bars.clear();
    }

    fn commit_bars_mmap(&mut self) {
        // Unmap all bars before remapping bars, to prevent issues with overlap
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
                    continue;
                } else {
                    self.remove_bar_mmap(&ids);
                }
            }

            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let ids = self.add_bar_mmap(*bar_idx, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
        }
    }

    fn close(&mut self) {
        if let Some(msi) = self.msi_cap.as_mut() {
            msi.destroy();
        }
        if let Some(msix) = &self.msix_cap {
            msix.lock().destroy();
        }
        self.disable_bars_mmap();
        self.device.close();
    }

    fn start_work_thread(&mut self) {
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }

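    // Discovers the device's BARs using the standard sizing probe: write all
    // 1s to each BAR register, read it back, and derive the size from the
    // bits that stayed zero. 64-bit memory BARs consume two registers; the
    // expansion ROM BAR lives at config offset 0x30.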
collect_bars(&mut self) -> Vec<PciBarConfiguration>1343     fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
1344         let mut i = VFIO_PCI_BAR0_REGION_INDEX;
1345         let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();
1346 
1347         while i <= VFIO_PCI_ROM_REGION_INDEX {
1348             let mut low: u32 = 0xffffffff;
1349             let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
1350                 0x30
1351             } else {
1352                 0x10 + i * 4
1353             };
1354             self.config.write_config(low, offset);
1355             low = self.config.read_config(offset);
1356 
1357             let low_flag = low & 0xf;
1358             let is_64bit = low_flag & 0x4 == 0x4;
1359             if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
1360                 let mut upper: u32 = 0xffffffff;
1361                 if is_64bit {
1362                     self.config.write_config(upper, offset + 4);
1363                     upper = self.config.read_config(offset + 4);
1364                 }
1365 
1366                 low &= 0xffff_fff0;
1367                 let mut size: u64 = u64::from(upper);
1368                 size <<= 32;
1369                 size |= u64::from(low);
1370                 size = !size + 1;
1371                 let region_type = if is_64bit {
1372                     PciBarRegionType::Memory64BitRegion
1373                 } else {
1374                     PciBarRegionType::Memory32BitRegion
1375                 };
1376                 let prefetch = if low_flag & 0x8 == 0x8 {
1377                     PciBarPrefetchable::Prefetchable
1378                 } else {
1379                     PciBarPrefetchable::NotPrefetchable
1380                 };
1381                 mem_bars.push(PciBarConfiguration::new(
1382                     i as usize,
1383                     size,
1384                     region_type,
1385                     prefetch,
1386                 ));
1387             } else if low_flag & 0x1 == 0x1 {
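                // I/O space BAR (bit 0 set): bits [31:2] carry the size mask.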
1388                 let size = !(low & 0xffff_fffc) + 1;
1389                 self.io_regions.push(PciBarConfiguration::new(
1390                     i as usize,
1391                     size.into(),
1392                     PciBarRegionType::IoRegion,
1393                     PciBarPrefetchable::NotPrefetchable,
1394                 ));
1395             }
1396 
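            // A 64-bit memory BAR occupies two consecutive BAR registers, so skip
            // the index holding its upper half.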
1397             if is_64bit {
1398                 i += 2;
1399             } else {
1400                 i += 1;
1401             }
1402         }
1403         mem_bars
1404     }
1405 
1406     fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
1407         let offset: u32 = bar_info.reg_index() as u32 * 4;
1408         let mmio_region = *bar_info;
1409         self.mmio_regions.push(mmio_region.set_address(bar_addr));
1410 
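        // Program the guest-assigned address into the device's BAR register,
        // preserving the read-only flag bits in the low nibble.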
1411         let val: u32 = self.config.read_config(offset);
1412         let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
1413         self.config.write_config(low, offset);
1414         if bar_info.is_64bit_memory() {
1415             let upper = (bar_addr >> 32) as u32;
1416             self.config.write_config(upper, offset + 4);
1417         }
1418     }
1419 
1420     fn allocate_root_barmem(
1421         &mut self,
1422         mem_bars: &[PciBarConfiguration],
1423         resources: &mut SystemAllocator,
1424     ) -> Result<Vec<BarRange>, PciDeviceError> {
1425         let address = self.pci_address.unwrap();
1426         let mut ranges: Vec<BarRange> = Vec::new();
1427         for mem_bar in mem_bars {
1428             let bar_size = mem_bar.size();
1429             let mut bar_addr: u64 = 0;
1430             // Don't allocate MMIO for a hotplugged device; the guest OS will
1431             // allocate it from its parent bridge window.
1432             if !self.hotplug {
1433                 bar_addr = resources
1434                     .allocate_mmio(
1435                         bar_size,
1436                         Alloc::PciBar {
1437                             bus: address.bus,
1438                             dev: address.dev,
1439                             func: address.func,
1440                             bar: mem_bar.bar_index() as u8,
1441                         },
1442                         "vfio_bar".to_string(),
1443                         AllocOptions::new()
1444                             .prefetchable(mem_bar.is_prefetchable())
1445                             .max_address(if mem_bar.is_64bit_memory() {
1446                                 u64::MAX
1447                             } else {
1448                                 u32::MAX.into()
1449                             })
1450                             .align(bar_size),
1451                     )
1452                     .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
1453                 ranges.push(BarRange {
1454                     addr: bar_addr,
1455                     size: bar_size,
1456                     prefetchable: mem_bar.is_prefetchable(),
1457                 });
1458             }
1459             self.configure_barmem(mem_bar, bar_addr);
1460         }
1461         Ok(ranges)
1462     }
1463 
1464     fn allocate_nonroot_barmem(
1465         &mut self,
1466         mem_bars: &mut [PciBarConfiguration],
1467         resources: &mut SystemAllocator,
1468     ) -> Result<Vec<BarRange>, PciDeviceError> {
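        // Behind a bridge, BARs are packed into two windows (non-prefetchable and
        // prefetchable) with a private allocator first; one contiguous region per
        // window is then requested from the system allocator.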
1469         const NON_PREFETCHABLE: usize = 0;
1470         const PREFETCHABLE: usize = 1;
1471         const ARRAY_SIZE: usize = 2;
1472         let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1473         let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1474             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1475                 Ok(a) => a,
1476                 Err(e) => {
1477                     error!(
1478                         "{} init nonroot VfioResourceAllocator failed: {}",
1479                         self.debug_label(),
1480                         e
1481                     );
1482                     return Err(e);
1483                 }
1484             },
1485             match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1486                 Ok(a) => a,
1487                 Err(e) => {
1488                     error!(
1489                         "{} init nonroot VfioResourceAllocator failed: {}",
1490                         self.debug_label(),
1491                         e
1492                     );
1493                     return Err(e);
1494                 }
1495             },
1496         ];
1497         let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1498         // The window must be 1MiB-aligned per the PCI spec.
1499         let mut window_sz: [u64; ARRAY_SIZE] = [0; ARRAY_SIZE];
1500         let mut alignment: [u64; ARRAY_SIZE] = [0x100000; ARRAY_SIZE];
1501 
1502         // Sort BARs by descending size; this can reduce the total space allocated for all the BARs.
1503         mem_bars.sort_by_key(|a| Reverse(a.size()));
1504         for mem_bar in mem_bars {
1505             let prefetchable = mem_bar.is_prefetchable();
1506             let is_64bit = mem_bar.is_64bit_memory();
1507 
1508             // If any prefetchable BAR is 32-bit, all prefetchable BARs must go in
1509             // low MMIO, since they all share a single prefetchable window.
1510             if prefetchable && !is_64bit {
1511                 memtype[PREFETCHABLE] = MmioType::Low;
1512             }
1513             let i = if prefetchable {
1514                 PREFETCHABLE
1515             } else {
1516                 NON_PREFETCHABLE
1517             };
1518             let bar_size = mem_bar.size();
1519             let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1520                 Ok(s) => s,
1521                 Err(e) => {
1522                     error!(
1523                         "{} nonroot allocate_with_align failed: {}",
1524                         self.debug_label(),
1525                         e
1526                     );
1527                     return Err(e);
1528                 }
1529             };
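            // Grow the window to cover this allocation and align the window itself
            // to the largest BAR it contains.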
1530             window_sz[i] = max(window_sz[i], start + bar_size);
1531             alignment[i] = max(alignment[i], bar_size);
1532             let mem_info = (*mem_bar).set_address(start);
1533             membars[i].push(mem_info);
1534         }
1535 
1536         let address = self.pci_address.unwrap();
1537         let mut ranges: Vec<BarRange> = Vec::new();
1538         for (index, bars) in membars.iter().enumerate() {
1539             if bars.is_empty() {
1540                 continue;
1541             }
1542 
1543             let i = if index == 1 {
1544                 PREFETCHABLE
1545             } else {
1546                 NON_PREFETCHABLE
1547             };
1548             let mut window_addr: u64 = 0;
1549             // Don't allocate MMIO for a hotplugged device; the guest OS will
1550             // allocate it from its parent bridge window.
1551             if !self.hotplug {
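                // Round the window up to the 1MiB bridge-window granularity.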
1552                 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1553                 let alloc = if i == NON_PREFETCHABLE {
1554                     Alloc::PciBridgeWindow {
1555                         bus: address.bus,
1556                         dev: address.dev,
1557                         func: address.func,
1558                     }
1559                 } else {
1560                     Alloc::PciBridgePrefetchWindow {
1561                         bus: address.bus,
1562                         dev: address.dev,
1563                         func: address.func,
1564                     }
1565                 };
1566                 window_addr = resources
1567                     .mmio_allocator(memtype[i])
1568                     .allocate_with_align(
1569                         window_sz[i],
1570                         alloc,
1571                         "vfio_bar_window".to_string(),
1572                         alignment[i],
1573                     )
1574                     .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1575                 for mem_info in bars {
1576                     let bar_addr = window_addr + mem_info.address();
1577                     ranges.push(BarRange {
1578                         addr: bar_addr,
1579                         size: mem_info.size(),
1580                         prefetchable: mem_info.is_prefetchable(),
1581                     });
1582                 }
1583             }
1584 
1585             for mem_info in bars {
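            // BAR offsets were allocated relative to the window; rebase them on
            // the window's guest-physical address when programming the device.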
1586                 let bar_addr = window_addr + mem_info.address();
1587                 self.configure_barmem(mem_info, bar_addr);
1588             }
1589         }
1590         Ok(ranges)
1591     }
1592 
1593     /// Return the supported iova max address of the Vfio Pci device
1594     pub fn get_max_iova(&self) -> u64 {
1595         self.device.get_max_addr()
1596     }
1597 
1598     fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1599         self.ext_caps
1600             .iter()
1601             .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1602             .cloned()
1603     }
1604 
1605     fn is_skipped_reg(&self, reg: u32) -> bool {
1606         // Fast path: registers in the standard PCI config space are never skipped.
1607         if reg < PCI_CONFIG_SPACE_SIZE {
1608             return false;
1609         }
1610 
1611         self.get_ext_cap_by_reg(reg)
1612             .map_or(false, |cap| cap.is_skipped)
1613     }
1614 }
1615 
1616 impl PciDevice for VfioPciDevice {
1617     fn debug_label(&self) -> String {
1618         format!("vfio {} device", self.device.device_name())
1619     }
1620 
1621     fn preferred_address(&self) -> Option<PciAddress> {
1622         Some(self.preferred_address)
1623     }
1624 
1625     fn allocate_address(
1626         &mut self,
1627         resources: &mut SystemAllocator,
1628     ) -> Result<PciAddress, PciDeviceError> {
1629         if self.pci_address.is_none() {
1630             let mut address = self.preferred_address;
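            // Try the preferred address first; if the device is on a hotplug bus,
            // probe the remaining functions of the slot until one can be reserved.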
1631             while address.func < 8 {
1632                 if resources.reserve_pci(
1633                     Alloc::PciBar {
1634                         bus: address.bus,
1635                         dev: address.dev,
1636                         func: address.func,
1637                         bar: 0,
1638                     },
1639                     self.debug_label(),
1640                 ) {
1641                     self.pci_address = Some(address);
1642                     break;
1643                 } else if self.hotplug_bus_number.is_none() {
1644                     break;
1645                 } else {
1646                     address.func += 1;
1647                 }
1648             }
1649         }
1650         self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1651     }
1652 
1653     fn keep_rds(&self) -> Vec<RawDescriptor> {
1654         let mut rds = self.device.keep_rds();
1655         if let Some(ref interrupt_evt) = self.interrupt_evt {
1656             rds.extend(interrupt_evt.as_raw_descriptors());
1657         }
1658         rds.push(self.vm_memory_client.as_raw_descriptor());
1659         if let Some(vm_socket_vm) = &self.vm_socket_vm {
1660             rds.push(vm_socket_vm.as_raw_descriptor());
1661         }
1662         if let Some(msi_cap) = &self.msi_cap {
1663             rds.push(msi_cap.config.get_msi_socket());
1664         }
1665         if let Some(msix_cap) = &self.msix_cap {
1666             rds.push(msix_cap.lock().config.as_raw_descriptor());
1667         }
1668         rds
1669     }
1670 
1671     fn preferred_irq(&self) -> PreferredIrq {
1672         // Is INTx configured?
1673         let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1674             1 => PciInterruptPin::IntA,
1675             2 => PciInterruptPin::IntB,
1676             3 => PciInterruptPin::IntC,
1677             4 => PciInterruptPin::IntD,
1678             _ => return PreferredIrq::None,
1679         };
1680 
1681         // TODO: replace sysfs/irq value parsing with vfio interface
1682         //       reporting host allocated interrupt number and type.
1683         let path = self.sysfs_path.join("irq");
1684         let gsi = fs::read_to_string(path)
1685             .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1686             .unwrap_or(0);
1687 
1688         PreferredIrq::Fixed { pin, gsi }
1689     }
1690 
1691     fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1692         // Keep event/resample event references.
1693         self.interrupt_evt = Some(irq_evt);
1694 
1695         // Enable INTx.
1696         self.enable_intx();
1697 
1698         self.config
1699             .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1700         self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1701     }
1702 
1703     fn allocate_io_bars(
1704         &mut self,
1705         resources: &mut SystemAllocator,
1706     ) -> Result<Vec<BarRange>, PciDeviceError> {
1707         let address = self
1708             .pci_address
1709             .expect("allocate_address must be called prior to allocate_device_bars");
1710 
1711         let mut mem_bars = self.collect_bars();
1712 
1713         let ranges = if address.bus == 0 {
1714             self.allocate_root_barmem(&mem_bars, resources)?
1715         } else {
1716             self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1717         };
1718 
1719         // Quirk: enable IGD memory for guest VGA arbitration; otherwise the kernel's
1720         // VGA arbiter driver doesn't claim this VGA device and Xorg can't start.
1721         if self.is_intel_gfx() {
1722             let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1723             cmd |= PCI_COMMAND_MEMORY;
1724             self.config.write_config(cmd, PCI_COMMAND);
1725         }
1726         Ok(ranges)
1727     }
1728 
1729     fn allocate_device_bars(
1730         &mut self,
1731         resources: &mut SystemAllocator,
1732     ) -> Result<Vec<BarRange>, PciDeviceError> {
1733         let mut ranges: Vec<BarRange> = Vec::new();
1734 
1735         if !self.is_intel_gfx() {
1736             return Ok(ranges);
1737         }
1738 
1739         // Expose the Intel graphics OpRegion as an MMIO BAR: allocate a GPA for
1740         // it, then write that GPA into the PCI config register.
1741         if let Some((index, size)) = self.device.get_cap_type_info(
1742             VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1743             VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1744         ) {
1745             let address = self
1746                 .pci_address
1747                 .expect("allocate_address must be called prior to allocate_device_bars");
1748             let bar_addr = resources
1749                 .allocate_mmio(
1750                     size,
1751                     Alloc::PciBar {
1752                         bus: address.bus,
1753                         dev: address.dev,
1754                         func: address.func,
1755                         bar: (index * 4) as u8,
1756                     },
1757                     "vfio_bar".to_string(),
1758                     AllocOptions::new().max_address(u32::MAX.into()),
1759                 )
1760                 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1761             ranges.push(BarRange {
1762                 addr: bar_addr,
1763                 size,
1764                 prefetchable: false,
1765             });
1766             self.device_data = Some(DeviceData::IntelGfxData {
1767                 opregion_index: index,
1768             });
1769 
1770             self.mmio_regions.push(
1771                 PciBarConfiguration::new(
1772                     index as usize,
1773                     size,
1774                     PciBarRegionType::Memory32BitRegion,
1775                     PciBarPrefetchable::NotPrefetchable,
1776                 )
1777                 .set_address(bar_addr),
1778             );
1779             self.config.write_config(bar_addr as u32, 0xFC);
1780         }
1781 
1782         Ok(ranges)
1783     }
1784 
1785     fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1786         for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
1787             if region.bar_index() == bar_num {
1788                 let command: u8 = self.config.read_config(PCI_COMMAND);
1789                 if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
1790                     return None;
1791                 } else {
1792                     return Some(*region);
1793                 }
1794             }
1795         }
1796 
1797         None
1798     }
1799 
1800     fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
1801         Ok(())
1802     }
1803 
1804     fn read_config_register(&self, reg_idx: usize) -> u32 {
1805         let reg: u32 = (reg_idx * 4) as u32;
1806         let mut config: u32 = self.config.read_config(reg);
1807 
1808         // See VfioPciDevice::new for details on how extended caps are managed.
1809         if reg >= PCI_CONFIG_SPACE_SIZE {
1810             let ext_cap = self.get_ext_cap_by_reg(reg);
1811             if let Some(ext_cap) = ext_cap {
1812                 if ext_cap.offset == reg {
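                    // Patch the "next capability" pointer (bits [31:20] of the
                    // extended capability header) with the virtualized chain.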
1813                     config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
1814                 }
1815 
1816                 if ext_cap.is_skipped {
1817                     if reg == PCI_CONFIG_SPACE_SIZE {
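                    // Hide a skipped capability: the slot at 0x100 must still
                    // anchor the chain, so it keeps its next pointer but reports
                    // the unused CAC capability ID; other slots read as zero.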
1818                         config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
1819                     } else {
1820                         config = 0;
1821                     }
1822                 }
1823             }
1824         }
1825 
1826         // Hide I/O BARs from the guest by reading them back as 0.
1827         if (0x10..=0x24).contains(&reg) {
1828             let bar_idx = (reg as usize - 0x10) / 4;
1829             if let Some(bar) = self.get_bar_configuration(bar_idx) {
1830                 if bar.is_io() {
1831                     config = 0;
1832                 }
1833             }
1834         } else if let Some(msix_cap) = &self.msix_cap {
1835             let msix_cap = msix_cap.lock();
1836             if msix_cap.is_msix_control_reg(reg, 4) {
1837                 msix_cap.read_msix_control(&mut config);
1838             }
1839         } else if let Some(pm_cap) = &self.pm_cap {
1840             let pm_cap = pm_cap.lock();
1841             if pm_cap.is_pm_reg(reg) {
1842                 config = pm_cap.read(reg);
1843             }
1844         }
1845 
1846         // Quirk for Intel graphics: report the stolen memory size as 0 in pci_cfg[0x51].
1847         if self.is_intel_gfx() && reg == 0x50 {
1848             config &= 0xffff00ff;
1849         }
1850 
1851         config
1852     }
1853 
1854     fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1855         // Start the worker thread on the guest's first config register write.
1856         if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
1857             self.start_work_thread();
1858         };
1859 
1860         let start = (reg_idx * 4) as u64 + offset;
1861 
1862         if let Some(pm_cap) = self.pm_cap.as_mut() {
1863             let mut pm_cap = pm_cap.lock();
1864             if pm_cap.is_pm_reg(start as u32) {
1865                 pm_cap.write(start, data);
1866             }
1867         }
1868 
1869         let mut msi_change: Option<VfioMsiChange> = None;
1870         if let Some(msi_cap) = self.msi_cap.as_mut() {
1871             if msi_cap.is_msi_reg(start, data.len()) {
1872                 msi_change = msi_cap.write_msi_reg(start, data);
1873             }
1874         }
1875 
1876         match msi_change {
1877             Some(VfioMsiChange::Enable) => self.enable_msi(),
1878             Some(VfioMsiChange::Disable) => self.disable_msi(),
1879             _ => (),
1880         }
1881 
1882         msi_change = None;
1883         if let Some(msix_cap) = &self.msix_cap {
1884             let mut msix_cap = msix_cap.lock();
1885             if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
1886                 msi_change = msix_cap.write_msix_control(data);
1887             }
1888         }
1889 
1890         match msi_change {
1891             Some(VfioMsiChange::Enable) => self.enable_msix(),
1892             Some(VfioMsiChange::Disable) => self.disable_msix(),
1893             Some(VfioMsiChange::FunctionChanged) => {
1894                 if let Err(e) = self.msix_vectors_update() {
1895                     error!("update msix vectors failed: {}", e);
1896                 }
1897             }
1898             _ => (),
1899         }
1900 
1901         if !self.is_skipped_reg(start as u32) {
1902             self.device
1903                 .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
1904         }
1905 
1906         // Once the guest enables memory access, mmap the BARs.
1907         if start == PCI_COMMAND as u64
1908             && data.len() == 2
1909             && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
1910         {
1911             self.commit_bars_mmap();
1912         } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
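            // Track BAR writes so mmio_regions mirrors the guest-programmed
            // addresses for both 32-bit and 64-bit BARs.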
1913             let bar_idx = (start as u32 - 0x10) / 4;
1914             let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
1915             let val = u32::from_le_bytes(value);
1916             let mut modify = false;
1917             for region in self.mmio_regions.iter_mut() {
1918                 if region.bar_index() == bar_idx as usize {
1919                     let old_addr = region.address();
1920                     let new_addr = val & 0xFFFFFFF0;
1921                     if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
1922                         // Change 32bit bar address
1923                         *region = region.set_address(u64::from(new_addr));
1924                         modify = true;
1925                     } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
1926                         // Change 64bit bar low address
1927                         *region =
1928                             region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
1929                         modify = true;
1930                     }
1931                     break;
1932                 } else if region.is_64bit_memory()
1933                     && ((bar_idx % 2) == 1)
1934                     && (region.bar_index() + 1 == bar_idx as usize)
1935                 {
1936                     // Change 64bit bar high address
1937                     let old_addr = region.address();
1938                     if val != (old_addr >> 32) as u32 {
1939                         let mut new_addr = (u64::from(val)) << 32;
1940                         new_addr |= old_addr & 0xFFFFFFFF;
1941                         *region = region.set_address(new_addr);
1942                         modify = true;
1943                     }
1944                     break;
1945                 }
1946             }
1947             if modify {
1948                 // If a BAR changed while memory access is enabled, mmap the
1949                 // new BAR immediately.
1950                 let cmd = self.config.read_config::<u8>(PCI_COMMAND);
1951                 if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
1952                     self.commit_bars_mmap();
1953                 }
1954             }
1955         }
1956     }
1957 
1958     fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
1959         if reg_idx == PCI_VCFG_NOTY {
1960             let mut q = self.acpi_notifier_val.lock();
1961             let mut val = 0;
1962             if !q.is_empty() {
1963                 val = q.remove(0);
1964             }
1965             drop(q);
1966             return val;
1967         }
1968 
1969         warn!(
1970             "{} read unsupported vcfg register {}",
1971             self.debug_label(),
1972             reg_idx
1973         );
1974         0xFFFF_FFFF
1975     }
1976 
1977     fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
1978         match reg_idx {
1979             PCI_VCFG_PM => {
1980                 match value {
1981                     0 => {
1982                         if let Some(pm_evt) =
1983                             self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
1984                         {
1985                             *self.is_in_low_power.lock() = true;
1986                             let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
1987                         } else {
1988                             let _ = self.device.pm_low_power_enter();
1989                         }
1990                     }
1991                     _ => {
1992                         *self.is_in_low_power.lock() = false;
1993                         let _ = self.device.pm_low_power_exit();
1994                     }
1995                 };
1996             }
1997             PCI_VCFG_DSM => {
1998                 if let Some(shm) = &self.vcfg_shm_mmap {
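                    // _DSM arguments and results are exchanged through the vcfg
                    // shared-memory page: read the argument buffer, evaluate the
                    // _DSM on the host device, then write the result back.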
1999                     let mut args = [0u8; 4096];
2000                     if let Err(e) = shm.read_slice(&mut args, 0) {
2001                         error!("failed to read DSM Args: {}", e);
2002                         return;
2003                     }
2004                     let res = match self.device.acpi_dsm(&args) {
2005                         Ok(r) => r,
2006                         Err(e) => {
2007                             error!("failed to call DSM: {}", e);
2008                             return;
2009                         }
2010                     };
2011                     if let Err(e) = shm.write_slice(&res, 0) {
2012                         error!("failed to write DSM result: {}", e);
2013                         return;
2014                     }
2015                     if let Err(e) = shm.msync() {
2016                         error!("failed to msync: {}", e)
2017                     }
2018                 }
2019             }
2020             _ => warn!(
2021                 "{} write unsupported vcfg register {}",
2022                 self.debug_label(),
2023                 reg_idx
2024             ),
2025         };
2026     }
2027 
2028     fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
2029         if let Some(msix_cap) = &self.msix_cap {
2030             let msix_cap = msix_cap.lock();
2031             if msix_cap.is_msix_table(bar_index, offset) {
2032                 msix_cap.read_table(offset, data);
2033                 return;
2034             } else if msix_cap.is_msix_pba(bar_index, offset) {
2035                 msix_cap.read_pba(offset, data);
2036                 return;
2037             }
2038         }
2039         self.device.region_read(bar_index, data, offset);
2040     }
2041 
2042     fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
2043         // Ignore writes to the IGD OpRegion.
2044         if let Some(device_data) = &self.device_data {
2045             match *device_data {
2046                 DeviceData::IntelGfxData { opregion_index } => {
2047                     if opregion_index == bar_index as u32 {
2048                         return;
2049                     }
2050                 }
2051             }
2052         }
2053 
2054         if let Some(msix_cap) = &self.msix_cap {
2055             let mut msix_cap = msix_cap.lock();
2056             if msix_cap.is_msix_table(bar_index, offset) {
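                // MSI-X table writes are emulated; when an entry changes, re-route
                // its irqfd to the newly programmed vector.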
2057                 let behavior = msix_cap.write_table(offset, data);
2058                 if let MsixStatus::EntryChanged(index) = behavior {
2059                     let irqfd = msix_cap.get_msix_irqfd(index);
2060                     self.msix_vector_update(index, irqfd);
2061                 }
2062                 return;
2063             } else if msix_cap.is_msix_pba(bar_index, offset) {
2064                 msix_cap.write_pba(offset, data);
2065                 return;
2066             }
2067         }
2068 
2069         self.device.region_write(bar_index, data, offset);
2070     }
2071 
2072     fn destroy_device(&mut self) {
2073         self.close();
2074     }
2075 
2076     fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
2077         let mut amls = Vec::new();
2078         let mut shm = None;
2079         if let Some(pci_address) = self.pci_address {
2080             let vcfg_offset = pci_address.to_config_address(0, 13);
2081             if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
2082                 vcfg_register.to_aml_bytes(&mut amls);
2083                 shm = vcfg_register
2084                     .create_shm_mmap()
2085                     .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
2086                 self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
2087                 // All vfio-pci devices should have a virtual _PRx method; otherwise
2088                 // the host couldn't tell whether the device has entered the suspend
2089                 // state and would always consider it active, so its parent PCIe
2090                 // switch could never suspend either.
2091                 PowerResourceMethod {}.to_aml_bytes(&mut amls);
2092                 // TODO: WIP: Ideally, we should generate DSM only if the physical
2093                 // device has a _DSM; however, such information is not provided by
2094                 // Linux. As a temporary workaround, we check whether there is an
2095                 // associated ACPI companion device node and skip generating guest
2096                 // _DSM if there is none.
2097                 let acpi_path = self.sysfs_path.join("firmware_node/path");
2098                 if acpi_path.exists() {
2099                     DsmMethod {}.to_aml_bytes(&mut amls);
2100                 }
2101             }
2102         }
2103 
2104         (amls, shm)
2105     }
2106 
2107     fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
2108         if let Some(gpe_nr) = resources.allocate_gpe() {
2109             base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
2110             self.gpe = Some(gpe_nr);
2111         }
2112         self.gpe
2113     }
2114 }
2115 
2116 impl Suspendable for VfioPciDevice {
2117     fn sleep(&mut self) -> anyhow::Result<()> {
2118         if let Some(worker_thread) = self.worker_thread.take() {
2119             let res = worker_thread.stop();
2120             self.pci_address = Some(res.address);
2121             self.sysfs_path = res.sysfs_path;
2122             self.pm_cap = res.pm_cap;
2123             self.msix_cap = res.msix_cap;
2124             self.vm_socket_vm = Some(res.vm_socket);
2125         }
2126         Ok(())
2127     }
2128 
2129     fn wake(&mut self) -> anyhow::Result<()> {
2130         if self.activated {
2131             self.start_work_thread();
2132         }
2133         Ok(())
2134     }
2135 }
2136 
2137 #[cfg(test)]
2138 mod tests {
2139     use resources::AddressRange;
2140 
2141     use super::VfioResourceAllocator;
2142 
2143     #[test]
2144     fn no_overlap() {
2145         // regions [32, 95]
2146         let mut memory =
2147             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2148         memory
2149             .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
2150             .unwrap();
2151         memory
2152             .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
2153             .unwrap();
2154 
2155         let mut iter = memory.regions.iter();
2156         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
2157     }
2158 
2159     #[test]
2160     fn complete_overlap() {
2161         // regions [32, 95]
2162         let mut memory =
2163             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2164         // regions [32, 47], [64, 95]
2165         memory
2166             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2167             .unwrap();
2168         // regions [64, 95]
2169         memory
2170             .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
2171             .unwrap();
2172 
2173         let mut iter = memory.regions.iter();
2174         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2175     }
2176 
2177     #[test]
2178     fn partial_overlap_one() {
2179         // regions [32, 95]
2180         let mut memory =
2181             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2182         // regions [32, 47], [64, 95]
2183         memory
2184             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2185             .unwrap();
2186         // regions [32, 39], [64, 95]
2187         memory
2188             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
2189             .unwrap();
2190 
2191         let mut iter = memory.regions.iter();
2192         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2193         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
2194     }
2195 
2196     #[test]
2197     fn partial_overlap_two() {
2198         // regions [32, 95]
2199         let mut memory =
2200             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2201         // regions [32, 47], [64, 95]
2202         memory
2203             .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
2204             .unwrap();
2205         // regions [32, 39], [72, 95]
2206         memory
2207             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
2208             .unwrap();
2209 
2210         let mut iter = memory.regions.iter();
2211         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
2212         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
2213     }
2214 
2215     #[test]
2216     fn partial_overlap_three() {
2217         // regions [32, 95]
2218         let mut memory =
2219             VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
2220         // regions [32, 39], [48, 95]
2221         memory
2222             .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
2223             .unwrap();
2224         // regions [32, 39], [48, 63], [72, 95]
2225         memory
2226             .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
2227             .unwrap();
2228         // regions [32, 35], [76, 95]
2229         memory
2230             .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
2231             .unwrap();
2232 
2233         let mut iter = memory.regions.iter();
2234         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
2235         assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
2236     }
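
    // A minimal sketch of an alignment test, not part of the original suite: it
    // assumes allocate_with_align(size, align) returns the lowest suitably
    // aligned free address, as relied upon by allocate_nonroot_barmem above.
    // The expected values are illustrative.
    #[test]
    fn allocate_with_align_sketch() {
        // regions [0, 255]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(0, 255)).unwrap();
        // First allocation starts at the bottom of the range.
        assert_eq!(memory.allocate_with_align(16, 16).unwrap(), 0);
        // [0, 15] is taken, so the next 32-byte-aligned start is 32.
        assert_eq!(memory.allocate_with_align(32, 32).unwrap(), 32);
    }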
2237 }
2238