// Copyright 2019 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::max;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

use acpi_tables::aml::Aml;
use base::debug;
use base::error;
use base::pagesize;
use base::warn;
use base::AsRawDescriptor;
use base::AsRawDescriptors;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::MemCacheType;
use resources::AddressRange;
use resources::Alloc;
use resources::AllocOptions;
use resources::MmioType;
use resources::SystemAllocator;
use sync::Mutex;
use vfio_sys::vfio::VFIO_PCI_ACPI_NTFY_IRQ_INDEX;
use vfio_sys::*;
use vm_control::api::VmMemoryClient;
use vm_control::HotPlugDeviceInfo;
use vm_control::HotPlugDeviceType;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRegionId;
use vm_control::VmMemorySource;
use vm_control::VmRequest;
use vm_control::VmResponse;

use crate::pci::acpi::DeviceVcfgRegister;
use crate::pci::acpi::DsmMethod;
use crate::pci::acpi::PowerResourceMethod;
use crate::pci::acpi::SHM_OFFSET;
use crate::pci::msi::MsiConfig;
use crate::pci::msi::MsiStatus;
use crate::pci::msi::PCI_MSI_FLAGS;
use crate::pci::msi::PCI_MSI_FLAGS_64BIT;
use crate::pci::msi::PCI_MSI_FLAGS_MASKBIT;
use crate::pci::msi::PCI_MSI_NEXT_POINTER;
use crate::pci::msix::MsixConfig;
use crate::pci::msix::MsixStatus;
use crate::pci::msix::BITS_PER_PBA_ENTRY;
use crate::pci::msix::MSIX_PBA_ENTRIES_MODULO;
use crate::pci::msix::MSIX_TABLE_ENTRIES_MODULO;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::Error as PciDeviceError;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::PreferredIrq;
use crate::pci::pm::PciPmCap;
use crate::pci::pm::PmConfig;
use crate::pci::pm::PM_CAP_LENGTH;
use crate::pci::PciAddress;
use crate::pci::PciBarConfiguration;
use crate::pci::PciBarIndex;
use crate::pci::PciBarPrefetchable;
use crate::pci::PciBarRegionType;
use crate::pci::PciCapabilityID;
use crate::pci::PciClassCode;
use crate::pci::PciId;
use crate::pci::PciInterruptPin;
use crate::pci::PCI_VCFG_DSM;
use crate::pci::PCI_VCFG_NOTY;
use crate::pci::PCI_VCFG_PM;
use crate::pci::PCI_VENDOR_ID_INTEL;
use crate::vfio::VfioDevice;
use crate::vfio::VfioError;
use crate::vfio::VfioIrqType;
use crate::vfio::VfioPciConfig;
use crate::IrqLevelEvent;
use crate::Suspendable;

const PCI_VENDOR_ID: u32 = 0x0;
const PCI_DEVICE_ID: u32 = 0x2;
const PCI_COMMAND: u32 = 0x4;
const PCI_COMMAND_MEMORY: u8 = 0x2;
const PCI_BASE_CLASS_CODE: u32 = 0x0B;
const PCI_INTERRUPT_NUM: u32 = 0x3C;
const PCI_INTERRUPT_PIN: u32 = 0x3D;

const PCI_CAPABILITY_LIST: u32 = 0x34;
const PCI_CAP_ID_MSI: u8 = 0x05;
const PCI_CAP_ID_MSIX: u8 = 0x11;
const PCI_CAP_ID_PM: u8 = 0x01;

// Size of the standard PCI config space
const PCI_CONFIG_SPACE_SIZE: u32 = 0x100;
// Size of the standard PCIe config space: 4KB
const PCIE_CONFIG_SPACE_SIZE: u32 = 0x1000;

// Extended Capabilities
const PCI_EXT_CAP_ID_CAC: u16 = 0x0C;
const PCI_EXT_CAP_ID_ARI: u16 = 0x0E;
const PCI_EXT_CAP_ID_SRIOV: u16 = 0x10;
const PCI_EXT_CAP_ID_REBAR: u16 = 0x15;

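/// Emulated PCI power management capability. Guest reads return the capability
/// header (with the emulated capability bits merged in) plus `PmConfig` state;
/// guest writes are routed to `PmConfig`, which decides when a PME is due.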
struct VfioPmCap {
    offset: u32,
    capabilities: u32,
    config: PmConfig,
}

impl VfioPmCap {
    fn new(config: &VfioPciConfig, cap_start: u32) -> Self {
        let mut capabilities: u32 = config.read_config(cap_start);
        capabilities |= (PciPmCap::default_cap() as u32) << 16;
        VfioPmCap {
            offset: cap_start,
            capabilities,
            config: PmConfig::new(false),
        }
    }

    pub fn should_trigger_pme(&mut self) -> bool {
        self.config.should_trigger_pme()
    }

    fn is_pm_reg(&self, offset: u32) -> bool {
        (offset >= self.offset) && (offset < self.offset + PM_CAP_LENGTH as u32)
    }

    pub fn read(&self, offset: u32) -> u32 {
        let offset = offset - self.offset;
        if offset == 0 {
            self.capabilities
        } else {
            let mut data = 0;
            self.config.read(&mut data);
            data
        }
    }

    pub fn write(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.offset as u64;
        if offset >= std::mem::size_of::<u32>() as u64 {
            let offset = offset - std::mem::size_of::<u32>() as u64;
            self.config.write(offset, data);
        }
    }
}

enum VfioMsiChange {
    Disable,
    Enable,
    FunctionChanged,
}

struct VfioMsiCap {
    config: MsiConfig,
    offset: u32,
}

impl VfioMsiCap {
    fn new(
        config: &VfioPciConfig,
        msi_cap_start: u32,
        vm_socket_irq: Tube,
        device_id: u32,
        device_name: String,
    ) -> Self {
        let msi_ctl: u16 = config.read_config(msi_cap_start + PCI_MSI_FLAGS);
        let is_64bit = (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0;
        let mask_cap = (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0;

        VfioMsiCap {
            config: MsiConfig::new(is_64bit, mask_cap, vm_socket_irq, device_id, device_name),
            offset: msi_cap_start,
        }
    }

    fn is_msi_reg(&self, index: u64, len: usize) -> bool {
        self.config.is_msi_reg(self.offset, index, len)
    }

    fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> {
        let offset = index as u32 - self.offset;
        match self.config.write_msi_capability(offset, data) {
            MsiStatus::Enabled => Some(VfioMsiChange::Enable),
            MsiStatus::Disabled => Some(VfioMsiChange::Disable),
            MsiStatus::NothingToDo => None,
        }
    }

    fn get_msi_irqfd(&self) -> Option<&Event> {
        self.config.get_irqfd()
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

// MSI-X registers in the MSI-X capability
const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control
const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size
const PCI_MSIX_TABLE: u32 = 0x04; // Table offset
const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR
const PCI_MSIX_PBA: u32 = 0x08; // Pending Bit Array offset
const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index
const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR

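/// Emulated MSI-X capability. The MSI-X table and PBA live inside device BARs;
/// this struct records where they are so that accesses to them can be trapped
/// and emulated, and it owns one per-vector event used while a vector is masked.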
struct VfioMsixCap {
    config: MsixConfig,
    offset: u32,
    table_size: u16,
    table_pci_bar: PciBarIndex,
    table_offset: u64,
    table_size_bytes: u64,
    pba_pci_bar: PciBarIndex,
    pba_offset: u64,
    pba_size_bytes: u64,
    msix_interrupt_evt: Vec<Event>,
}

impl VfioMsixCap {
    fn new(
        config: &VfioPciConfig,
        msix_cap_start: u32,
        vm_socket_irq: Tube,
        pci_id: u32,
        device_name: String,
    ) -> Self {
        let msix_ctl: u16 = config.read_config(msix_cap_start + PCI_MSIX_FLAGS);
        let table: u32 = config.read_config(msix_cap_start + PCI_MSIX_TABLE);
        let table_pci_bar = (table & PCI_MSIX_TABLE_BIR) as PciBarIndex;
        let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64;
        let pba: u32 = config.read_config(msix_cap_start + PCI_MSIX_PBA);
        let pba_pci_bar = (pba & PCI_MSIX_PBA_BIR) as PciBarIndex;
        let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64;

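        // Each MSI-X table entry occupies MSIX_TABLE_ENTRIES_MODULO bytes. If the
        // reported table size would run past a PBA placed after it in the same BAR,
        // clamp the table so that it ends where the PBA begins.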
        let mut table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) as u64 + 1;
        if table_pci_bar == pba_pci_bar
            && pba_offset > table_offset
            && (table_offset + table_size * MSIX_TABLE_ENTRIES_MODULO) > pba_offset
        {
            table_size = (pba_offset - table_offset) / MSIX_TABLE_ENTRIES_MODULO;
        }

        let table_size_bytes = table_size * MSIX_TABLE_ENTRIES_MODULO;
        let pba_size_bytes = ((table_size + BITS_PER_PBA_ENTRY as u64 - 1)
            / BITS_PER_PBA_ENTRY as u64)
            * MSIX_PBA_ENTRIES_MODULO;
        let mut msix_interrupt_evt = Vec::new();
        for _ in 0..table_size {
            msix_interrupt_evt.push(Event::new().expect("failed to create msix interrupt"));
        }
        VfioMsixCap {
            config: MsixConfig::new(table_size as u16, vm_socket_irq, pci_id, device_name),
            offset: msix_cap_start,
            table_size: table_size as u16,
            table_pci_bar,
            table_offset,
            table_size_bytes,
            pba_pci_bar,
            pba_offset,
            pba_size_bytes,
            msix_interrupt_evt,
        }
    }

    // Only the MSI-X control register is writable and needs special handling in
    // PCI config read/write.
    fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool {
        let control_start = self.offset + PCI_MSIX_FLAGS;
        let control_end = control_start + 2;

        offset < control_end && offset + size > control_start
    }

    fn read_msix_control(&self, data: &mut u32) {
        *data = self.config.read_msix_capability(*data);
    }

    fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> {
        let old_enabled = self.config.enabled();
        let old_masked = self.config.masked();

        self.config
            .write_msix_capability(PCI_MSIX_FLAGS.into(), data);

        let new_enabled = self.config.enabled();
        let new_masked = self.config.masked();

        if !old_enabled && new_enabled {
            Some(VfioMsiChange::Enable)
        } else if old_enabled && !new_enabled {
            Some(VfioMsiChange::Disable)
        } else if new_enabled && old_masked != new_masked {
            Some(VfioMsiChange::FunctionChanged)
        } else {
            None
        }
    }

    fn is_msix_table(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.table_pci_bar
            && offset >= self.table_offset
            && offset < self.table_offset + self.table_size_bytes
    }

    fn get_msix_table(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.table_pci_bar {
            AddressRange::from_start_and_size(self.table_offset, self.table_size_bytes)
        } else {
            None
        }
    }

    fn read_table(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.table_offset;
        self.config.read_msix_table(offset, data);
    }

    fn write_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let offset = offset - self.table_offset;
        self.config.write_msix_table(offset, data)
    }

    fn is_msix_pba(&self, bar_index: PciBarIndex, offset: u64) -> bool {
        bar_index == self.pba_pci_bar
            && offset >= self.pba_offset
            && offset < self.pba_offset + self.pba_size_bytes
    }

    fn get_msix_pba(&self, bar_index: PciBarIndex) -> Option<AddressRange> {
        if bar_index == self.pba_pci_bar {
            AddressRange::from_start_and_size(self.pba_offset, self.pba_size_bytes)
        } else {
            None
        }
    }

    fn read_pba(&self, offset: u64, data: &mut [u8]) {
        let offset = offset - self.pba_offset;
        self.config.read_pba_entries(offset, data);
    }

    fn write_pba(&mut self, offset: u64, data: &[u8]) {
        let offset = offset - self.pba_offset;
        self.config.write_pba_entries(offset, data);
    }

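    // While a vector is masked, hand VFIO a locally owned event instead of the guest
    // irqfd: the worker thread observes that event and calls `trigger()`, which lets
    // MsixConfig record the interrupt as pending rather than injecting it.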
    fn get_msix_irqfd(&self, index: usize) -> Option<&Event> {
        let irqfd = self.config.get_irqfd(index);
        if let Some(fd) = irqfd {
            if self.msix_vector_masked(index) {
                Some(&self.msix_interrupt_evt[index])
            } else {
                Some(fd)
            }
        } else {
            None
        }
    }

    fn get_msix_irqfds(&self) -> Vec<Option<&Event>> {
        let mut irqfds = Vec::new();

        for i in 0..self.table_size {
            irqfds.push(self.get_msix_irqfd(i as usize));
        }

        irqfds
    }

    fn table_size(&self) -> usize {
        self.table_size.into()
    }

    fn clone_msix_evt(&self) -> Vec<Event> {
        self.msix_interrupt_evt
            .iter()
            .map(|irq| irq.try_clone().unwrap())
            .collect()
    }

    fn msix_vector_masked(&self, index: usize) -> bool {
        !self.config.enabled() || self.config.masked() || self.config.table_masked(index)
    }

    fn trigger(&mut self, index: usize) {
        self.config.trigger(index as u16);
    }

    fn destroy(&mut self) {
        self.config.destroy()
    }
}

struct VfioResourceAllocator {
    // The regions that are not allocated yet.
    regions: BTreeSet<AddressRange>,
}

impl VfioResourceAllocator {
    // Creates a new `VfioResourceAllocator` for managing VFIO resources.
    // Returns `Err` if `pool` is empty.
    //
    // * `pool` - The address range to manage.
    fn new(pool: AddressRange) -> Result<Self, PciDeviceError> {
        if pool.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }
        let mut regions = BTreeSet::new();
        regions.insert(pool);
        Ok(VfioResourceAllocator { regions })
    }

    fn internal_allocate_from_slot(
        &mut self,
        slot: AddressRange,
        range: AddressRange,
    ) -> Result<u64, PciDeviceError> {
        let slot_was_present = self.regions.remove(&slot);
        assert!(slot_was_present);

        let (before, after) = slot.non_overlapping_ranges(range);

        if !before.is_empty() {
            self.regions.insert(before);
        }
        if !after.is_empty() {
            self.regions.insert(after);
        }

        Ok(range.start)
    }

    // Allocates a range of addresses from the managed region with a minimal alignment.
    // Overlapping with a previous allocation is _not_ allowed.
    // Returns the allocated address.
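    // Example (for illustration): with a single free region [0x1000, 0x2fff],
    // allocate_with_align(0x800, 0x1000) returns 0x1000 and leaves [0x1800, 0x2fff]
    // free; a second identical call returns 0x2000 and leaves [0x1800, 0x1fff] and
    // [0x2800, 0x2fff] free.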
    fn allocate_with_align(&mut self, size: u64, alignment: u64) -> Result<u64, PciDeviceError> {
        if size == 0 {
            return Err(PciDeviceError::SizeZero);
        }
        if !alignment.is_power_of_two() {
            return Err(PciDeviceError::BadAlignment);
        }

        // Finds the first region matching alignment and size.
        let region = self.regions.iter().find(|range| {
            match range.start % alignment {
                0 => range.start.checked_add(size - 1),
                r => range.start.checked_add(size - 1 + alignment - r),
            }
            .map_or(false, |end| end <= range.end)
        });

        match region {
            Some(&slot) => {
                let start = match slot.start % alignment {
                    0 => slot.start,
                    r => slot.start + alignment - r,
                };
                let end = start + size - 1;
                let range = AddressRange::from_start_and_end(start, end);

                self.internal_allocate_from_slot(slot, range)
            }
            None => Err(PciDeviceError::OutOfSpace),
        }
    }

    // Allocates a range of addresses from the managed region with a required location.
    // Overlapping with a previous allocation is allowed.
    fn allocate_at_can_overlap(&mut self, range: AddressRange) -> Result<(), PciDeviceError> {
        if range.is_empty() {
            return Err(PciDeviceError::SizeZero);
        }

        while let Some(&slot) = self
            .regions
            .iter()
            .find(|avail_range| avail_range.overlaps(range))
        {
            let _address = self.internal_allocate_from_slot(slot, range)?;
        }
        Ok(())
    }
}

struct VfioPciWorker {
    address: PciAddress,
    sysfs_path: PathBuf,
    vm_socket: Tube,
    name: String,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
}

impl VfioPciWorker {
    fn run(
        &mut self,
        req_irq_evt: Event,
        wakeup_evt: Event,
        acpi_notify_evt: Event,
        kill_evt: Event,
        msix_evt: Vec<Event>,
        is_in_low_power: Arc<Mutex<bool>>,
        gpe: Option<u32>,
        notification_val: Arc<Mutex<Vec<u32>>>,
    ) {
        #[derive(EventToken, Debug)]
        enum Token {
            ReqIrq,
            WakeUp,
            AcpiNotifyEvent,
            Kill,
            MsixIrqi { index: usize },
        }

        let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
            (&req_irq_evt, Token::ReqIrq),
            (&wakeup_evt, Token::WakeUp),
            (&acpi_notify_evt, Token::AcpiNotifyEvent),
            (&kill_evt, Token::Kill),
        ]) {
            Ok(pc) => pc,
            Err(e) => {
                error!(
                    "{} failed creating vfio WaitContext: {}",
                    self.name.clone(),
                    e
                );
                return;
            }
        };

        for (index, msix_int) in msix_evt.iter().enumerate() {
            wait_ctx
                .add(msix_int, Token::MsixIrqi { index })
                .expect("Failed to create vfio WaitContext for msix interrupt event")
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{} failed polling vfio events: {}", self.name.clone(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::MsixIrqi { index } => {
                        if let Some(msix_cap) = &self.msix_cap {
                            msix_cap.lock().trigger(index);
                        }
                    }
                    Token::ReqIrq => {
                        let device = HotPlugDeviceInfo {
                            device_type: HotPlugDeviceType::EndPoint,
                            path: self.sysfs_path.clone(),
                            hp_interrupt: false,
                        };

                        let request = VmRequest::HotPlugVfioCommand { device, add: false };
                        if self.vm_socket.send(&request).is_ok() {
                            if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                error!("{} failed to remove vfio_device: {}", self.name.clone(), e);
                            } else {
                                break 'wait;
                            }
                        }
                    }
                    Token::WakeUp => {
                        let _ = wakeup_evt.wait();

                        if *is_in_low_power.lock() {
                            if let Some(pm_cap) = &self.pm_cap {
                                if pm_cap.lock().should_trigger_pme() {
                                    let request =
                                        VmRequest::PciPme(self.address.pme_requester_id());
                                    if self.vm_socket.send(&request).is_ok() {
                                        if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                            error!(
                                                "{} failed to send PME: {}",
                                                self.name.clone(),
                                                e
                                            );
                                        }
                                    }
                                }
                            }
                        }
                    }
                    Token::AcpiNotifyEvent => {
                        if let Some(gpe) = gpe {
                            if let Ok(val) = base::EventExt::read_count(&acpi_notify_evt) {
                                notification_val.lock().push(val as u32);
                                let request = VmRequest::Gpe {
                                    gpe,
                                    clear_evt: None,
                                };
                                if self.vm_socket.send(&request).is_ok() {
                                    if let Err(e) = self.vm_socket.recv::<VmResponse>() {
                                        error!("{} failed to send GPE: {}", self.name.clone(), e);
                                    }
                                }
                            } else {
                                error!("{} failed to read acpi_notify_evt", self.name.clone());
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
    }
}

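// A PCIe Extended Capability header is a single dword: bits [15:0] hold the
// capability ID, bits [19:16] the version, and bits [31:20] the offset of the
// next capability. Masking the shifted value with 0xffc keeps it dword-aligned.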
fn get_next_from_extcap_header(cap_header: u32) -> u32 {
    (cap_header >> 20) & 0xffc
}

fn is_skipped_ext_cap(cap_id: u16) -> bool {
    matches!(
        cap_id,
        // SR-IOV/ARI/Resizable_BAR capabilities are not well handled and should not be exposed
        PCI_EXT_CAP_ID_ARI | PCI_EXT_CAP_ID_SRIOV | PCI_EXT_CAP_ID_REBAR
    )
}

enum DeviceData {
    IntelGfxData { opregion_index: u32 },
}

/// PCI Express Extended Capabilities information
#[derive(Copy, Clone)]
struct ExtCap {
    /// cap offset in Configuration Space
    offset: u32,
    /// cap size
    size: u32,
    /// offset of the next capability; for non-skipped caps this is rewritten to
    /// point at the next non-skipped capability
    next: u16,
    /// whether this capability is hidden from the guest
    is_skipped: bool,
}

/// Implements the VFIO PCI device, so that a host PCI device can be added to the VM.
pub struct VfioPciDevice {
    device: Arc<VfioDevice>,
    config: VfioPciConfig,
    hotplug: bool,
    hotplug_bus_number: Option<u8>,
    preferred_address: PciAddress,
    pci_address: Option<PciAddress>,
    interrupt_evt: Option<IrqLevelEvent>,
    acpi_notification_evt: Option<Event>,
    mmio_regions: Vec<PciBarConfiguration>,
    io_regions: Vec<PciBarConfiguration>,
    pm_cap: Option<Arc<Mutex<VfioPmCap>>>,
    msi_cap: Option<VfioMsiCap>,
    msix_cap: Option<Arc<Mutex<VfioMsixCap>>>,
    irq_type: Option<VfioIrqType>,
    vm_memory_client: VmMemoryClient,
    device_data: Option<DeviceData>,
    pm_evt: Option<Event>,
    is_in_low_power: Arc<Mutex<bool>>,
    worker_thread: Option<WorkerThread<VfioPciWorker>>,
    vm_socket_vm: Option<Tube>,
    sysfs_path: PathBuf,
    // PCI Express Extended Capabilities
    ext_caps: Vec<ExtCap>,
    vcfg_shm_mmap: Option<MemoryMapping>,
    mapped_mmio_bars: BTreeMap<PciBarIndex, (u64, Vec<VmMemoryRegionId>)>,
    activated: bool,
    acpi_notifier_val: Arc<Mutex<Vec<u32>>>,
    gpe: Option<u32>,
    base_class_code: PciClassCode,
}

impl VfioPciDevice {
    /// Constructs a new VFIO PCI device for the given VFIO device.
    pub fn new(
        sysfs_path: &Path,
        device: VfioDevice,
        hotplug: bool,
        hotplug_bus_number: Option<u8>,
        guest_address: Option<PciAddress>,
        vfio_device_socket_msi: Tube,
        vfio_device_socket_msix: Tube,
        vm_memory_client: VmMemoryClient,
        vfio_device_socket_vm: Tube,
    ) -> Result<Self, PciDeviceError> {
        let preferred_address = if let Some(bus_num) = hotplug_bus_number {
            debug!("hotplug bus {}", bus_num);
            PciAddress {
                // The caller specifies the PCIe bus number for a hotplug device.
                bus: bus_num,
                // devfn must be 0, otherwise the PCIe root port cannot detect the device.
                dev: 0,
                func: 0,
            }
        } else if let Some(guest_address) = guest_address {
            debug!("guest PCI address {}", guest_address);
            guest_address
        } else {
            let addr = PciAddress::from_str(device.device_name()).map_err(|e| {
                PciDeviceError::PciAddressParseFailure(device.device_name().clone(), e)
            })?;
            debug!("parsed device PCI address {}", addr);
            addr
        };

        let dev = Arc::new(device);
        let config = VfioPciConfig::new(Arc::clone(&dev));
        let mut msi_socket = Some(vfio_device_socket_msi);
        let mut msix_socket = Some(vfio_device_socket_msix);
        let mut msi_cap: Option<VfioMsiCap> = None;
        let mut msix_cap: Option<Arc<Mutex<VfioMsixCap>>> = None;
        let mut pm_cap: Option<Arc<Mutex<VfioPmCap>>> = None;

        let mut is_pcie = false;
        let mut cap_next: u32 = config.read_config::<u8>(PCI_CAPABILITY_LIST).into();
        let vendor_id: u16 = config.read_config(PCI_VENDOR_ID);
        let device_id: u16 = config.read_config(PCI_DEVICE_ID);
        let base_class_code = PciClassCode::try_from(config.read_config::<u8>(PCI_BASE_CLASS_CODE))
            .unwrap_or(PciClassCode::Other);

        let pci_id = PciId::new(vendor_id, device_id);

        while cap_next != 0 {
            let cap_id: u8 = config.read_config(cap_next);
            if cap_id == PCI_CAP_ID_PM {
                pm_cap = Some(Arc::new(Mutex::new(VfioPmCap::new(&config, cap_next))));
            } else if cap_id == PCI_CAP_ID_MSI {
                if let Some(msi_socket) = msi_socket.take() {
                    msi_cap = Some(VfioMsiCap::new(
                        &config,
                        cap_next,
                        msi_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ));
                }
            } else if cap_id == PCI_CAP_ID_MSIX {
                if let Some(msix_socket) = msix_socket.take() {
                    msix_cap = Some(Arc::new(Mutex::new(VfioMsixCap::new(
                        &config,
                        cap_next,
                        msix_socket,
                        pci_id.into(),
                        dev.device_name().to_string(),
                    ))));
                }
            } else if cap_id == PciCapabilityID::PciExpress as u8 {
                is_pcie = true;
            }
            let offset = cap_next + PCI_MSI_NEXT_POINTER;
            cap_next = config.read_config::<u8>(offset).into();
        }

        let mut ext_caps: Vec<ExtCap> = Vec::new();
        if is_pcie {
            let mut ext_cap_next: u32 = PCI_CONFIG_SPACE_SIZE;
            while ext_cap_next != 0 {
                let ext_cap_config: u32 = config.read_config::<u32>(ext_cap_next);
                if ext_cap_config == 0 {
                    break;
                }
                ext_caps.push(ExtCap {
                    offset: ext_cap_next,
                    // Calculated later from the offset of the following capability.
                    size: 0,
                    // Initialized to the real next pointer; fixed up below for
                    // non-skipped capabilities.
                    next: get_next_from_extcap_header(ext_cap_config) as u16,
                    is_skipped: is_skipped_ext_cap((ext_cap_config & 0xffff) as u16),
                });
                ext_cap_next = get_next_from_extcap_header(ext_cap_config);
            }

            // Manage extended caps
            //
            // Extended capabilities are chained with each pointing to the next, so
            // we can drop anything other than the head of the chain simply by
            // modifying the previous next pointer. For the head of the chain, we
            // can modify the capability ID to something that cannot match a valid
            // capability. PCI_EXT_CAP_ID_CAC is used for this since it is no longer
            // supported.
            //
            // Sort in reverse order by offset.
            ext_caps.sort_by(|a, b| b.offset.cmp(&a.offset));
            let mut next_offset: u32 = PCIE_CONFIG_SPACE_SIZE;
            let mut non_skipped_next: u16 = 0;
            for ext_cap in ext_caps.iter_mut() {
                if !ext_cap.is_skipped {
                    ext_cap.next = non_skipped_next;
                    non_skipped_next = ext_cap.offset as u16;
                } else if ext_cap.offset == PCI_CONFIG_SPACE_SIZE {
                    ext_cap.next = non_skipped_next;
                }
                ext_cap.size = next_offset - ext_cap.offset;
                next_offset = ext_cap.offset;
            }
            // Restore ascending order by offset.
            ext_caps.reverse();
        }

        let is_intel_gfx =
            base_class_code == PciClassCode::DisplayController && vendor_id == PCI_VENDOR_ID_INTEL;
        let device_data = if is_intel_gfx {
            Some(DeviceData::IntelGfxData {
                opregion_index: u32::MAX,
            })
        } else {
            None
        };

        Ok(VfioPciDevice {
            device: dev,
            config,
            hotplug,
            hotplug_bus_number,
            preferred_address,
            pci_address: None,
            interrupt_evt: None,
            acpi_notification_evt: None,
            mmio_regions: Vec::new(),
            io_regions: Vec::new(),
            pm_cap,
            msi_cap,
            msix_cap,
            irq_type: None,
            vm_memory_client,
            device_data,
            pm_evt: None,
            is_in_low_power: Arc::new(Mutex::new(false)),
            worker_thread: None,
            vm_socket_vm: Some(vfio_device_socket_vm),
            sysfs_path: sysfs_path.to_path_buf(),
            ext_caps,
            vcfg_shm_mmap: None,
            mapped_mmio_bars: BTreeMap::new(),
            activated: false,
            acpi_notifier_val: Arc::new(Mutex::new(Vec::new())),
            gpe: None,
            base_class_code,
        })
    }

    /// Gets the PCI address of the device, if one has already been allocated.
    pub fn pci_address(&self) -> Option<PciAddress> {
        self.pci_address
    }

    pub fn is_gfx(&self) -> bool {
        self.base_class_code == PciClassCode::DisplayController
    }

    fn is_intel_gfx(&self) -> bool {
        matches!(self.device_data, Some(DeviceData::IntelGfxData { .. }))
    }

    fn enable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_evt_enable(acpi_notification_evt, VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifySetupFailed);
        }
        Err(PciDeviceError::AcpiNotifySetupFailed)
    }

    #[allow(dead_code)]
    fn disable_acpi_notification(&mut self) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_disable(VFIO_PCI_ACPI_NTFY_IRQ_INDEX)
                .map_err(|_| PciDeviceError::AcpiNotifyDeactivationFailed);
        }
        Err(PciDeviceError::AcpiNotifyDeactivationFailed)
    }

    #[allow(dead_code)]
    fn test_acpi_notification(&mut self, val: u32) -> Result<(), PciDeviceError> {
        if let Some(ref _acpi_notification_evt) = self.acpi_notification_evt {
            return self
                .device
                .acpi_notification_test(VFIO_PCI_ACPI_NTFY_IRQ_INDEX, val)
                .map_err(|_| PciDeviceError::AcpiNotifyTestFailed);
        }
        Err(PciDeviceError::AcpiNotifyTestFailed)
    }

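    // INTx bring-up order matters: register the trigger eventfd, mask the line while
    // the resample (EOI) eventfd is wired up, then unmask. Any failure along the way
    // rolls back by disabling INTx.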
    fn enable_intx(&mut self) {
        if let Some(ref interrupt_evt) = self.interrupt_evt {
            if let Err(e) = self.device.irq_enable(
                &[Some(interrupt_evt.get_trigger())],
                VFIO_PCI_INTX_IRQ_INDEX,
                0,
            ) {
                error!("{} Intx enable failed: {}", self.debug_label(), e);
                return;
            }
            if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx mask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self
                .device
                .resample_virq_enable(interrupt_evt.get_resample(), VFIO_PCI_INTX_IRQ_INDEX)
            {
                error!("{} resample enable failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) {
                error!("{} Intx unmask failed: {}", self.debug_label(), e);
                self.disable_intx();
                return;
            }
            self.irq_type = Some(VfioIrqType::Intx);
        }
    }

    fn disable_intx(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) {
            error!("{} Intx disable failed: {}", self.debug_label(), e);
        }
        self.irq_type = None;
    }

    fn disable_irqs(&mut self) {
        match self.irq_type {
            Some(VfioIrqType::Msi) => self.disable_msi(),
            Some(VfioIrqType::Msix) => self.disable_msix(),
            _ => (),
        }

        // disable_msi() and disable_msix() above re-enable INTx,
        // so disable INTx here as well.
        if let Some(VfioIrqType::Intx) = self.irq_type {
            self.disable_intx();
        }
    }

    fn enable_msi(&mut self) {
        self.disable_irqs();

        let irqfd = match &self.msi_cap {
            Some(cap) => {
                if let Some(fd) = cap.get_msi_irqfd() {
                    fd
                } else {
                    self.enable_intx();
                    return;
                }
            }
            None => {
                self.enable_intx();
                return;
            }
        };

        if let Err(e) = self
            .device
            .irq_enable(&[Some(irqfd)], VFIO_PCI_MSI_IRQ_INDEX, 0)
        {
            error!("{} failed to enable msi: {}", self.debug_label(), e);
            self.enable_intx();
            return;
        }

        self.irq_type = Some(VfioIrqType::Msi);
    }

    fn disable_msi(&mut self) {
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) {
            error!("{} failed to disable msi: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;

        self.enable_intx();
    }

    fn enable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }

        self.disable_irqs();
        let cap = self.msix_cap.as_ref().unwrap().lock();
        let vector_in_use = cap.get_msix_irqfds().iter().any(|&irq| irq.is_some());

        let mut failed = false;
        if !vector_in_use {
            // If there are no msix vectors currently in use, we explicitly assign a new eventfd
            // to vector 0. Then we enable it and immediately disable it, so that VFIO will
            // activate the physical device. If there are available msix vectors, just enable
            // them instead.
            let fd = Event::new().expect("failed to create event");
            let table_size = cap.table_size();
            let mut irqfds = vec![None; table_size];
            irqfds[0] = Some(&fd);
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
            irqfds[0] = None;
            if let Err(e) = self.device.irq_enable(&irqfds, VFIO_PCI_MSIX_IRQ_INDEX, 0) {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        } else {
            let result = self
                .device
                .irq_enable(&cap.get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0);
            if let Err(e) = result {
                error!("{} failed to enable msix: {}", self.debug_label(), e);
                failed = true;
            }
        }

        std::mem::drop(cap);
        if failed {
            self.enable_intx();
            return;
        }
        self.irq_type = Some(VfioIrqType::Msix);
    }

    fn disable_msix(&mut self) {
        if self.msix_cap.is_none() {
            return;
        }
        if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) {
            error!("{} failed to disable msix: {}", self.debug_label(), e);
            return;
        }
        self.irq_type = None;
        self.enable_intx();
    }

    fn msix_vectors_update(&self) -> Result<(), VfioError> {
        if let Some(cap) = &self.msix_cap {
            self.device
                .irq_enable(&cap.lock().get_msix_irqfds(), VFIO_PCI_MSIX_IRQ_INDEX, 0)?;
        }
        Ok(())
    }

    fn msix_vector_update(&self, index: usize, irqfd: Option<&Event>) {
        if let Err(e) = self
            .device
            .irq_enable(&[irqfd], VFIO_PCI_MSIX_IRQ_INDEX, index as u32)
        {
            error!(
                "{} failed to update msix vector {}: {}",
                self.debug_label(),
                index,
                e
            );
        }
    }

    fn adjust_bar_mmap(
        &self,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
        remove_mmaps: &[AddressRange],
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let mut mmaps: Vec<vfio_region_sparse_mmap_area> = Vec::with_capacity(bar_mmaps.len());
        let pgmask = (pagesize() as u64) - 1;

        for mmap in bar_mmaps.iter() {
            let mmap_range = if let Some(mmap_range) =
                AddressRange::from_start_and_size(mmap.offset, mmap.size)
            {
                mmap_range
            } else {
                continue;
            };
            let mut to_mmap = match VfioResourceAllocator::new(mmap_range) {
                Ok(a) => a,
                Err(e) => {
                    error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    mmaps.clear();
                    return mmaps;
                }
            };

            for &(mut remove_range) in remove_mmaps.iter() {
                remove_range = remove_range.intersect(mmap_range);
                if !remove_range.is_empty() {
                    // Align the offsets to page size.
                    let begin = remove_range.start & !pgmask;
                    let end = ((remove_range.end + 1 + pgmask) & !pgmask) - 1;
                    let remove_range = AddressRange::from_start_and_end(begin, end);
                    if let Err(e) = to_mmap.allocate_at_can_overlap(remove_range) {
                        error!("{} adjust_bar_mmap failed: {}", self.debug_label(), e);
                    }
                }
            }

            for mmap in to_mmap.regions {
                mmaps.push(vfio_region_sparse_mmap_area {
                    offset: mmap.start,
                    size: mmap.end - mmap.start + 1,
                });
            }
        }

        mmaps
    }

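    // Punch the MSI-X table and PBA ranges out of a BAR's sparse-mmap areas so that
    // guest accesses to them trap into the emulated MSI-X code instead of reaching
    // the physical device.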
    fn remove_bar_mmap_msix(
        &self,
        bar_index: PciBarIndex,
        bar_mmaps: Vec<vfio_region_sparse_mmap_area>,
    ) -> Vec<vfio_region_sparse_mmap_area> {
        let msix_cap = &self.msix_cap.as_ref().unwrap().lock();
        let mut msix_regions = Vec::new();

        if let Some(t) = msix_cap.get_msix_table(bar_index) {
            msix_regions.push(t);
        }
        if let Some(p) = msix_cap.get_msix_pba(bar_index) {
            msix_regions.push(p);
        }

        if msix_regions.is_empty() {
            return bar_mmaps;
        }

        self.adjust_bar_mmap(bar_mmaps, &msix_regions)
    }

    fn add_bar_mmap(&self, index: PciBarIndex, bar_addr: u64) -> Vec<VmMemoryRegionId> {
        let mut mmaps_ids: Vec<VmMemoryRegionId> = Vec::new();
        if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 {
            // The BAR regions holding the MSI-X table and PBA must not be mmapped;
            // accesses to them must trap so that MSI-X can be emulated.
            let mut mmaps = self.device.get_region_mmap(index);

            if self.msix_cap.is_some() {
                mmaps = self.remove_bar_mmap_msix(index, mmaps);
            }
            if mmaps.is_empty() {
                return mmaps_ids;
            }

            for mmap in mmaps.iter() {
                let mmap_offset = mmap.offset;
                let mmap_size = mmap.size;
                let guest_map_start = bar_addr + mmap_offset;
                let region_offset = self.device.get_region_offset(index);
                let offset = region_offset + mmap_offset;
                let descriptor = match self.device.device_file().try_clone() {
                    Ok(device_file) => device_file.into(),
                    Err(_) => break,
                };
                match self.vm_memory_client.register_memory(
                    VmMemorySource::Descriptor {
                        descriptor,
                        offset,
                        size: mmap_size,
                    },
                    VmMemoryDestination::GuestPhysicalAddress(guest_map_start),
                    Protection::read_write(),
                    MemCacheType::CacheCoherent,
                ) {
                    Ok(id) => {
                        mmaps_ids.push(id);
                    }
                    Err(e) => {
                        error!("register_memory failed: {}", e);
                        break;
                    }
                }
            }
        }

        mmaps_ids
    }

    fn remove_bar_mmap(&self, mmap_ids: &[VmMemoryRegionId]) {
        for mmap_id in mmap_ids {
            if let Err(e) = self.vm_memory_client.unregister_memory(*mmap_id) {
                error!("unregister_memory failed: {}", e);
            }
        }
    }

    fn disable_bars_mmap(&mut self) {
        for (_, (_, mmap_ids)) in self.mapped_mmio_bars.iter() {
            self.remove_bar_mmap(mmap_ids);
        }
        self.mapped_mmio_bars.clear();
    }

    fn commit_bars_mmap(&mut self) {
        // Unmap all BARs before remapping them, to prevent issues with overlap.
        let mut needs_map = Vec::new();
        for mmio_info in self.mmio_regions.iter() {
            let bar_idx = mmio_info.bar_index();
            let addr = mmio_info.address();

            if let Some((cur_addr, ids)) = self.mapped_mmio_bars.remove(&bar_idx) {
                if cur_addr == addr {
                    self.mapped_mmio_bars.insert(bar_idx, (cur_addr, ids));
                    continue;
                } else {
                    self.remove_bar_mmap(&ids);
                }
            }

            if addr != 0 {
                needs_map.push((bar_idx, addr));
            }
        }

        for (bar_idx, addr) in needs_map.iter() {
            let ids = self.add_bar_mmap(*bar_idx, *addr);
            self.mapped_mmio_bars.insert(*bar_idx, (*addr, ids));
        }
    }

    fn close(&mut self) {
        if let Some(msi) = self.msi_cap.as_mut() {
            msi.destroy();
        }
        if let Some(msix) = &self.msix_cap {
            msix.lock().destroy();
        }
        self.disable_bars_mmap();
        self.device.close();
    }

    fn start_work_thread(&mut self) {
        let vm_socket = match self.vm_socket_vm.take() {
            Some(socket) => socket,
            None => return,
        };

        let req_evt = match Event::new() {
            Ok(evt) => {
                if let Err(e) = self
                    .device
                    .irq_enable(&[Some(&evt)], VFIO_PCI_REQ_IRQ_INDEX, 0)
                {
                    error!("{} enable req_irq failed: {}", self.debug_label(), e);
                    return;
                }
                evt
            }
            Err(_) => return,
        };

        let (self_pm_evt, pm_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{} failed creating PM Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };
        self.pm_evt = Some(self_pm_evt);

        let (self_acpi_notify_evt, acpi_notify_evt) =
            match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
                Ok(v) => v,
                Err(e) => {
                    error!(
                        "{} failed creating ACPI Event pair: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
        self.acpi_notification_evt = Some(self_acpi_notify_evt);

        if let Err(e) = self.enable_acpi_notification() {
            error!("{}: {}", self.debug_label(), e);
        }

        let mut msix_evt = Vec::new();
        if let Some(msix_cap) = &self.msix_cap {
            msix_evt = msix_cap.lock().clone_msix_evt();
        }

        let name = self.device.device_name().to_string();
        let address = self.pci_address.expect("Unassigned PCI Address.");
        let sysfs_path = self.sysfs_path.clone();
        let pm_cap = self.pm_cap.clone();
        let msix_cap = self.msix_cap.clone();
        let is_in_low_power = self.is_in_low_power.clone();
        let gpe_nr = self.gpe;
        let notification_val = self.acpi_notifier_val.clone();
        self.worker_thread = Some(WorkerThread::start("vfio_pci", move |kill_evt| {
            let mut worker = VfioPciWorker {
                address,
                sysfs_path,
                vm_socket,
                name,
                pm_cap,
                msix_cap,
            };
            worker.run(
                req_evt,
                pm_evt,
                acpi_notify_evt,
                kill_evt,
                msix_evt,
                is_in_low_power,
                gpe_nr,
                notification_val,
            );
            worker
        }));
        self.activated = true;
    }

    fn collect_bars(&mut self) -> Vec<PciBarConfiguration> {
        let mut i = VFIO_PCI_BAR0_REGION_INDEX;
        let mut mem_bars: Vec<PciBarConfiguration> = Vec::new();

        while i <= VFIO_PCI_ROM_REGION_INDEX {
            let mut low: u32 = 0xffffffff;
            let offset: u32 = if i == VFIO_PCI_ROM_REGION_INDEX {
                0x30
            } else {
                0x10 + i * 4
            };
            self.config.write_config(low, offset);
            low = self.config.read_config(offset);

            let low_flag = low & 0xf;
            let is_64bit = low_flag & 0x4 == 0x4;
            if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 {
                let mut upper: u32 = 0xffffffff;
                if is_64bit {
                    self.config.write_config(upper, offset + 4);
                    upper = self.config.read_config(offset + 4);
                }

                low &= 0xffff_fff0;
                let mut size: u64 = u64::from(upper);
                size <<= 32;
                size |= u64::from(low);
                size = !size + 1;
                let region_type = if is_64bit {
                    PciBarRegionType::Memory64BitRegion
                } else {
                    PciBarRegionType::Memory32BitRegion
                };
                let prefetch = if low_flag & 0x8 == 0x8 {
                    PciBarPrefetchable::Prefetchable
                } else {
                    PciBarPrefetchable::NotPrefetchable
                };
                mem_bars.push(PciBarConfiguration::new(
                    i as usize,
                    size,
                    region_type,
                    prefetch,
                ));
            } else if low_flag & 0x1 == 0x1 {
                let size = !(low & 0xffff_fffc) + 1;
                self.io_regions.push(PciBarConfiguration::new(
                    i as usize,
                    size.into(),
                    PciBarRegionType::IoRegion,
                    PciBarPrefetchable::NotPrefetchable,
                ));
            }

            if is_64bit {
                i += 2;
            } else {
                i += 1;
            }
        }
        mem_bars
    }

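    // Program a BAR with its allocated guest address: write the low dword while
    // preserving the read-only flag bits in [3:0], and for a 64-bit BAR also write
    // the high dword. The mapping is recorded in `mmio_regions`.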
    fn configure_barmem(&mut self, bar_info: &PciBarConfiguration, bar_addr: u64) {
        let offset: u32 = bar_info.reg_index() as u32 * 4;
        let mmio_region = *bar_info;
        self.mmio_regions.push(mmio_region.set_address(bar_addr));

        let val: u32 = self.config.read_config(offset);
        let low = ((bar_addr & !0xf) as u32) | (val & 0xf);
        self.config.write_config(low, offset);
        if bar_info.is_64bit_memory() {
            let upper = (bar_addr >> 32) as u32;
            self.config.write_config(upper, offset + 4);
        }
    }

    fn allocate_root_barmem(
        &mut self,
        mem_bars: &[PciBarConfiguration],
        resources: &mut SystemAllocator,
    ) -> Result<Vec<BarRange>, PciDeviceError> {
        let address = self.pci_address.unwrap();
        let mut ranges: Vec<BarRange> = Vec::new();
        for mem_bar in mem_bars {
            let bar_size = mem_bar.size();
            let mut bar_addr: u64 = 0;
            // Don't allocate MMIO for a hotplug device; the OS will allocate it from
            // its parent's bridge window.
            if !self.hotplug {
                bar_addr = resources
                    .allocate_mmio(
                        bar_size,
                        Alloc::PciBar {
                            bus: address.bus,
                            dev: address.dev,
                            func: address.func,
                            bar: mem_bar.bar_index() as u8,
                        },
                        "vfio_bar".to_string(),
                        AllocOptions::new()
                            .prefetchable(mem_bar.is_prefetchable())
                            .max_address(if mem_bar.is_64bit_memory() {
                                u64::MAX
                            } else {
                                u32::MAX.into()
                            })
                            .align(bar_size),
                    )
                    .map_err(|e| PciDeviceError::IoAllocationFailed(bar_size, e))?;
                ranges.push(BarRange {
                    addr: bar_addr,
                    size: bar_size,
                    prefetchable: mem_bar.is_prefetchable(),
                });
            }
            self.configure_barmem(mem_bar, bar_addr);
        }
        Ok(ranges)
    }

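    // Behind a PCI bridge, BARs cannot be placed independently: non-prefetchable BARs
    // must land inside the bridge's memory window and prefetchable BARs inside its
    // prefetchable window. Pack each group into a contiguous window (largest BAR
    // first), then allocate the two windows from guest MMIO space.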
allocate_nonroot_barmem( &mut self, mem_bars: &mut [PciBarConfiguration], resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1464 fn allocate_nonroot_barmem(
1465 &mut self,
1466 mem_bars: &mut [PciBarConfiguration],
1467 resources: &mut SystemAllocator,
1468 ) -> Result<Vec<BarRange>, PciDeviceError> {
1469 const NON_PREFETCHABLE: usize = 0;
1470 const PREFETCHABLE: usize = 1;
1471 const ARRAY_SIZE: usize = 2;
1472 let mut membars: [Vec<PciBarConfiguration>; ARRAY_SIZE] = [Vec::new(), Vec::new()];
1473 let mut allocator: [VfioResourceAllocator; ARRAY_SIZE] = [
1474 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u32::MAX as u64)) {
1475 Ok(a) => a,
1476 Err(e) => {
1477 error!(
1478 "{} init nonroot VfioResourceAllocator failed: {}",
1479 self.debug_label(),
1480 e
1481 );
1482 return Err(e);
1483 }
1484 },
1485 match VfioResourceAllocator::new(AddressRange::from_start_and_end(0, u64::MAX)) {
1486 Ok(a) => a,
1487 Err(e) => {
1488 error!(
1489 "{} init nonroot VfioResourceAllocator failed: {}",
1490 self.debug_label(),
1491 e
1492 );
1493 return Err(e);
1494 }
1495 },
1496 ];
1497 let mut memtype: [MmioType; ARRAY_SIZE] = [MmioType::Low, MmioType::High];
1498 // the window must be 1M-aligned as per the PCI spec
1499 let mut window_sz: [u64; ARRAY_SIZE] = [0; 2];
1500 let mut alignment: [u64; ARRAY_SIZE] = [0x100000; 2];
1501
1502 // Descend by bar size, this could reduce allocated size for all the bars.
1503 mem_bars.sort_by_key(|a| Reverse(a.size()));
1504 for mem_bar in mem_bars {
1505 let prefetchable = mem_bar.is_prefetchable();
1506 let is_64bit = mem_bar.is_64bit_memory();
1507
1508 // if one prefetchable bar is 32bit, all the prefetchable bars should be in Low MMIO,
1509 // as all the prefetchable bars should be in one region
1510 if prefetchable && !is_64bit {
1511 memtype[PREFETCHABLE] = MmioType::Low;
1512 }
1513 let i = if prefetchable {
1514 PREFETCHABLE
1515 } else {
1516 NON_PREFETCHABLE
1517 };
1518 let bar_size = mem_bar.size();
1519 let start = match allocator[i].allocate_with_align(bar_size, bar_size) {
1520 Ok(s) => s,
1521 Err(e) => {
1522 error!(
1523 "{} nonroot allocate_wit_align failed: {}",
1524 self.debug_label(),
1525 e
1526 );
1527 return Err(e);
1528 }
1529 };
1530 window_sz[i] = max(window_sz[i], start + bar_size);
1531 alignment[i] = max(alignment[i], bar_size);
1532 let mem_info = (*mem_bar).set_address(start);
1533 membars[i].push(mem_info);
1534 }
1535
1536 let address = self.pci_address.unwrap();
1537 let mut ranges: Vec<BarRange> = Vec::new();
1538 for (index, bars) in membars.iter().enumerate() {
1539 if bars.is_empty() {
1540 continue;
1541 }
1542
1543 let i = if index == 1 {
1544 PREFETCHABLE
1545 } else {
1546 NON_PREFETCHABLE
1547 };
1548 let mut window_addr: u64 = 0;
1549 // Don't allocate mmio for hotplug device, OS will allocate it from
1550 // its parent's bridge window.
1551 if !self.hotplug {
1552 window_sz[i] = (window_sz[i] + 0xfffff) & !0xfffff;
1553 let alloc = if i == NON_PREFETCHABLE {
1554 Alloc::PciBridgeWindow {
1555 bus: address.bus,
1556 dev: address.dev,
1557 func: address.func,
1558 }
1559 } else {
1560 Alloc::PciBridgePrefetchWindow {
1561 bus: address.bus,
1562 dev: address.dev,
1563 func: address.func,
1564 }
1565 };
1566 window_addr = resources
1567 .mmio_allocator(memtype[i])
1568 .allocate_with_align(
1569 window_sz[i],
1570 alloc,
1571 "vfio_bar_window".to_string(),
1572 alignment[i],
1573 )
1574 .map_err(|e| PciDeviceError::IoAllocationFailed(window_sz[i], e))?;
1575 for mem_info in bars {
1576 let bar_addr = window_addr + mem_info.address();
1577 ranges.push(BarRange {
1578 addr: bar_addr,
1579 size: mem_info.size(),
1580 prefetchable: mem_info.is_prefetchable(),
1581 });
1582 }
1583 }
1584
1585 for mem_info in bars {
1586 let bar_addr = window_addr + mem_info.address();
1587 self.configure_barmem(mem_info, bar_addr);
1588 }
1589 }
1590 Ok(ranges)
1591 }
1592
1593 /// Return the supported iova max address of the Vfio Pci device
get_max_iova(&self) -> u641594 pub fn get_max_iova(&self) -> u64 {
1595 self.device.get_max_addr()
1596 }
1597
get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap>1598 fn get_ext_cap_by_reg(&self, reg: u32) -> Option<ExtCap> {
1599 self.ext_caps
1600 .iter()
1601 .find(|ext_cap| reg >= ext_cap.offset && reg < ext_cap.offset + ext_cap.size)
1602 .cloned()
1603 }
1604
is_skipped_reg(&self, reg: u32) -> bool1605 fn is_skipped_reg(&self, reg: u32) -> bool {
1606 // fast handle for pci config space
1607 if reg < PCI_CONFIG_SPACE_SIZE {
1608 return false;
1609 }
1610
1611 self.get_ext_cap_by_reg(reg)
1612 .map_or(false, |cap| cap.is_skipped)
1613 }
1614 }
1615
1616 impl PciDevice for VfioPciDevice {
debug_label(&self) -> String1617 fn debug_label(&self) -> String {
1618 format!("vfio {} device", self.device.device_name())
1619 }
1620
preferred_address(&self) -> Option<PciAddress>1621 fn preferred_address(&self) -> Option<PciAddress> {
1622 Some(self.preferred_address)
1623 }
1624
allocate_address( &mut self, resources: &mut SystemAllocator, ) -> Result<PciAddress, PciDeviceError>1625 fn allocate_address(
1626 &mut self,
1627 resources: &mut SystemAllocator,
1628 ) -> Result<PciAddress, PciDeviceError> {
1629 if self.pci_address.is_none() {
1630 let mut address = self.preferred_address;
1631 while address.func < 8 {
1632 if resources.reserve_pci(
1633 Alloc::PciBar {
1634 bus: address.bus,
1635 dev: address.dev,
1636 func: address.func,
1637 bar: 0,
1638 },
1639 self.debug_label(),
1640 ) {
1641 self.pci_address = Some(address);
1642 break;
1643 } else if self.hotplug_bus_number.is_none() {
1644 break;
1645 } else {
1646 address.func += 1;
1647 }
1648 }
1649 }
1650 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1651 }
1652
keep_rds(&self) -> Vec<RawDescriptor>1653 fn keep_rds(&self) -> Vec<RawDescriptor> {
1654 let mut rds = self.device.keep_rds();
1655 if let Some(ref interrupt_evt) = self.interrupt_evt {
1656 rds.extend(interrupt_evt.as_raw_descriptors());
1657 }
1658 rds.push(self.vm_memory_client.as_raw_descriptor());
1659 if let Some(vm_socket_vm) = &self.vm_socket_vm {
1660 rds.push(vm_socket_vm.as_raw_descriptor());
1661 }
1662 if let Some(msi_cap) = &self.msi_cap {
1663 rds.push(msi_cap.config.get_msi_socket());
1664 }
1665 if let Some(msix_cap) = &self.msix_cap {
1666 rds.push(msix_cap.lock().config.as_raw_descriptor());
1667 }
1668 rds
1669 }
1670
preferred_irq(&self) -> PreferredIrq1671 fn preferred_irq(&self) -> PreferredIrq {
1672 // Is INTx configured?
1673 let pin = match self.config.read_config::<u8>(PCI_INTERRUPT_PIN) {
1674 1 => PciInterruptPin::IntA,
1675 2 => PciInterruptPin::IntB,
1676 3 => PciInterruptPin::IntC,
1677 4 => PciInterruptPin::IntD,
1678 _ => return PreferredIrq::None,
1679 };
1680
1681 // TODO: replace sysfs/irq value parsing with vfio interface
1682 // reporting host allocated interrupt number and type.
1683 let path = self.sysfs_path.join("irq");
1684 let gsi = fs::read_to_string(path)
1685 .map(|v| v.trim().parse::<u32>().unwrap_or(0))
1686 .unwrap_or(0);
1687
1688 PreferredIrq::Fixed { pin, gsi }
1689 }
1690
assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32)1691 fn assign_irq(&mut self, irq_evt: IrqLevelEvent, pin: PciInterruptPin, irq_num: u32) {
1692 // Keep event/resample event references.
1693 self.interrupt_evt = Some(irq_evt);
1694
1695 // enable INTX
1696 self.enable_intx();
1697
1698 self.config
1699 .write_config(pin.to_mask() as u8, PCI_INTERRUPT_PIN);
1700 self.config.write_config(irq_num as u8, PCI_INTERRUPT_NUM);
1701 }
1702
allocate_io_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1703 fn allocate_io_bars(
1704 &mut self,
1705 resources: &mut SystemAllocator,
1706 ) -> Result<Vec<BarRange>, PciDeviceError> {
1707 let address = self
1708 .pci_address
1709 .expect("allocate_address must be called prior to allocate_device_bars");
1710
1711 let mut mem_bars = self.collect_bars();
1712
1713 let ranges = if address.bus == 0 {
1714 self.allocate_root_barmem(&mem_bars, resources)?
1715 } else {
1716 self.allocate_nonroot_barmem(&mut mem_bars, resources)?
1717 };
1718
1719 // Quirk, enable igd memory for guest vga arbitrate, otherwise kernel vga arbitrate
1720 // driver doesn't claim this vga device, then xorg couldn't boot up.
1721 if self.is_intel_gfx() {
1722 let mut cmd = self.config.read_config::<u8>(PCI_COMMAND);
1723 cmd |= PCI_COMMAND_MEMORY;
1724 self.config.write_config(cmd, PCI_COMMAND);
1725 }
1726 Ok(ranges)
1727 }
1728
allocate_device_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<BarRange>, PciDeviceError>1729 fn allocate_device_bars(
1730 &mut self,
1731 resources: &mut SystemAllocator,
1732 ) -> Result<Vec<BarRange>, PciDeviceError> {
1733 let mut ranges: Vec<BarRange> = Vec::new();
1734
1735 if !self.is_intel_gfx() {
1736 return Ok(ranges);
1737 }
1738
1739 // Make intel gfx's opregion as mmio bar, and allocate a gpa for it
1740 // then write this gpa into pci cfg register
1741 if let Some((index, size)) = self.device.get_cap_type_info(
1742 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (PCI_VENDOR_ID_INTEL as u32),
1743 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
1744 ) {
1745 let address = self
1746 .pci_address
1747 .expect("allocate_address must be called prior to allocate_device_bars");
1748 let bar_addr = resources
1749 .allocate_mmio(
1750 size,
1751 Alloc::PciBar {
1752 bus: address.bus,
1753 dev: address.dev,
1754 func: address.func,
1755 bar: (index * 4) as u8,
1756 },
1757 "vfio_bar".to_string(),
1758 AllocOptions::new().max_address(u32::MAX.into()),
1759 )
1760 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1761 ranges.push(BarRange {
1762 addr: bar_addr,
1763 size,
1764 prefetchable: false,
1765 });
1766 self.device_data = Some(DeviceData::IntelGfxData {
1767 opregion_index: index,
1768 });
1769
1770 self.mmio_regions.push(
1771 PciBarConfiguration::new(
1772 index as usize,
1773 size,
1774 PciBarRegionType::Memory32BitRegion,
1775 PciBarPrefetchable::NotPrefetchable,
1776 )
1777 .set_address(bar_addr),
1778 );
1779 self.config.write_config(bar_addr as u32, 0xFC);
1780 }
1781
1782 Ok(ranges)
1783 }
1784
    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        for region in self.mmio_regions.iter().chain(self.io_regions.iter()) {
            if region.bar_index() == bar_num {
                let command: u8 = self.config.read_config(PCI_COMMAND);
                if (region.is_memory() && (command & PCI_COMMAND_MEMORY == 0)) || region.is_io() {
                    return None;
                } else {
                    return Some(*region);
                }
            }
        }

        None
    }

    fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> {
        Ok(())
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        let reg: u32 = (reg_idx * 4) as u32;
        let mut config: u32 = self.config.read_config(reg);

        // See VfioPciDevice::new for details on how extended caps are managed.
        if reg >= PCI_CONFIG_SPACE_SIZE {
            let ext_cap = self.get_ext_cap_by_reg(reg);
            if let Some(ext_cap) = ext_cap {
                if ext_cap.offset == reg {
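                    // A PCIe extended capability header keeps the
                    // next-capability offset in its top 12 bits ([31:20]).
                    // Offsets are dword-aligned, so patching bits [31:22] with
                    // the virtualized next pointer is sufficient.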
                    config = (config & !(0xffc << 20)) | (((ext_cap.next & 0xffc) as u32) << 20);
                }

                if ext_cap.is_skipped {
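                    // A skipped capability is hidden from the guest. The first
                    // extended cap (at offset 0x100) must still present a
                    // valid header, so keep the patched next pointer and
                    // report a benign capability ID; anywhere else the whole
                    // dword reads as zero.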
                    if reg == PCI_CONFIG_SPACE_SIZE {
                        config = (config & (0xffc << 20)) | (PCI_EXT_CAP_ID_CAC as u32);
                    } else {
                        config = 0;
                    }
                }
            }
        }

        // Ignore I/O BARs: report them as absent to the guest.
        if (0x10..=0x24).contains(&reg) {
            let bar_idx = (reg as usize - 0x10) / 4;
            if let Some(bar) = self.get_bar_configuration(bar_idx) {
                if bar.is_io() {
                    config = 0;
                }
            }
        } else if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(reg, 4) {
                msix_cap.read_msix_control(&mut config);
            }
        } else if let Some(pm_cap) = &self.pm_cap {
            let pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(reg) {
                config = pm_cap.read(reg);
            }
        }

        // Quirk for Intel graphics: report the stolen memory size as 0 in
        // pci_cfg[0x51].
        if self.is_intel_gfx() && reg == 0x50 {
            config &= 0xffff00ff;
        }

        config
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        // When the guest writes a config register for the first time, start
        // the worker thread.
        if self.worker_thread.is_none() && self.vm_socket_vm.is_some() {
            self.start_work_thread();
        };

        let start = (reg_idx * 4) as u64 + offset;

        if let Some(pm_cap) = self.pm_cap.as_mut() {
            let mut pm_cap = pm_cap.lock();
            if pm_cap.is_pm_reg(start as u32) {
                pm_cap.write(start, data);
            }
        }

        let mut msi_change: Option<VfioMsiChange> = None;
        if let Some(msi_cap) = self.msi_cap.as_mut() {
            if msi_cap.is_msi_reg(start, data.len()) {
                msi_change = msi_cap.write_msi_reg(start, data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msi(),
            Some(VfioMsiChange::Disable) => self.disable_msi(),
            _ => (),
        }

        msi_change = None;
        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) {
                msi_change = msix_cap.write_msix_control(data);
            }
        }

        match msi_change {
            Some(VfioMsiChange::Enable) => self.enable_msix(),
            Some(VfioMsiChange::Disable) => self.disable_msix(),
            Some(VfioMsiChange::FunctionChanged) => {
                if let Err(e) = self.msix_vectors_update() {
                    error!("update msix vectors failed: {}", e);
                }
            }
            _ => (),
        }

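        // Forward the write to the physical device unless the register is
        // emulated ("skipped") in the virtual config space.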
        if !self.is_skipped_reg(start as u32) {
            self.device
                .region_write(VFIO_PCI_CONFIG_REGION_INDEX as usize, data, start);
        }

        // If the guest enables memory access, commit the BAR mmaps once.
        if start == PCI_COMMAND as u64
            && data.len() == 2
            && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY
        {
            self.commit_bars_mmap();
        } else if (0x10..=0x24).contains(&start) && data.len() == 4 {
            let bar_idx = (start as u32 - 0x10) / 4;
            let value: [u8; 4] = [data[0], data[1], data[2], data[3]];
            let val = u32::from_le_bytes(value);
            let mut modify = false;
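            // A 64-bit BAR occupies two consecutive config dwords: the low
            // half lives at its own index and the high half at index + 1, so
            // an odd bar_idx may address the upper 32 bits of the preceding
            // region.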
            for region in self.mmio_regions.iter_mut() {
                if region.bar_index() == bar_idx as usize {
                    let old_addr = region.address();
                    let new_addr = val & 0xFFFFFFF0;
                    if !region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change the 32-bit BAR address.
                        *region = region.set_address(u64::from(new_addr));
                        modify = true;
                    } else if region.is_64bit_memory() && (old_addr as u32) != new_addr {
                        // Change the low half of a 64-bit BAR address.
                        *region =
                            region.set_address(u64::from(new_addr) | ((old_addr >> 32) << 32));
                        modify = true;
                    }
                    break;
                } else if region.is_64bit_memory()
                    && ((bar_idx % 2) == 1)
                    && (region.bar_index() + 1 == bar_idx as usize)
                {
                    // Change the high half of a 64-bit BAR address.
                    let old_addr = region.address();
                    if val != (old_addr >> 32) as u32 {
                        let mut new_addr = (u64::from(val)) << 32;
                        new_addr |= old_addr & 0xFFFFFFFF;
                        *region = region.set_address(new_addr);
                        modify = true;
                    }
                    break;
                }
            }
            if modify {
                // If a BAR changes while memory access is enabled, mmap the
                // new BAR immediately.
                let cmd = self.config.read_config::<u8>(PCI_COMMAND);
                if cmd & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY {
                    self.commit_bars_mmap();
                }
            }
        }
    }

    fn read_virtual_config_register(&self, reg_idx: usize) -> u32 {
        // Reading PCI_VCFG_NOTY pops the oldest queued ACPI notification
        // value, or returns 0 if none is pending.
        if reg_idx == PCI_VCFG_NOTY {
            let mut q = self.acpi_notifier_val.lock();
            let mut val = 0;
            if !q.is_empty() {
                val = q.remove(0);
            }
            drop(q);
            return val;
        }

        warn!(
            "{} read unsupported vcfg register {}",
            self.debug_label(),
            reg_idx
        );
        0xFFFF_FFFF
    }

    fn write_virtual_config_register(&mut self, reg_idx: usize, value: u32) {
        match reg_idx {
            PCI_VCFG_PM => {
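                // Writing 0 requests low-power entry (with a wakeup
                // notification if a PM event has been registered); any other
                // value requests low-power exit.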
                match value {
                    0 => {
                        if let Some(pm_evt) =
                            self.pm_evt.as_ref().map(|evt| evt.try_clone().unwrap())
                        {
                            *self.is_in_low_power.lock() = true;
                            let _ = self.device.pm_low_power_enter_with_wakeup(pm_evt);
                        } else {
                            let _ = self.device.pm_low_power_enter();
                        }
                    }
                    _ => {
                        *self.is_in_low_power.lock() = false;
                        let _ = self.device.pm_low_power_exit();
                    }
                };
            }
            PCI_VCFG_DSM => {
                if let Some(shm) = &self.vcfg_shm_mmap {
                    let mut args = [0u8; 4096];
                    if let Err(e) = shm.read_slice(&mut args, 0) {
                        error!("failed to read DSM args: {}", e);
                        return;
                    }
                    let res = match self.device.acpi_dsm(&args) {
                        Ok(r) => r,
                        Err(e) => {
                            error!("failed to call DSM: {}", e);
                            return;
                        }
                    };
                    if let Err(e) = shm.write_slice(&res, 0) {
                        error!("failed to write DSM result: {}", e);
                        return;
                    }
                    if let Err(e) = shm.msync() {
                        error!("failed to msync: {}", e)
                    }
                }
            }
            _ => warn!(
                "{} write unsupported vcfg register {}",
                self.debug_label(),
                reg_idx
            ),
        };
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        if let Some(msix_cap) = &self.msix_cap {
            let msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                msix_cap.read_table(offset, data);
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.read_pba(offset, data);
                return;
            }
        }
        self.device.region_read(bar_index, data, offset);
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        // Ignore writes to the IGD OpRegion.
        if let Some(device_data) = &self.device_data {
            match *device_data {
                DeviceData::IntelGfxData { opregion_index } => {
                    if opregion_index == bar_index as u32 {
                        return;
                    }
                }
            }
        }

        if let Some(msix_cap) = &self.msix_cap {
            let mut msix_cap = msix_cap.lock();
            if msix_cap.is_msix_table(bar_index, offset) {
                let behavior = msix_cap.write_table(offset, data);
                if let MsixStatus::EntryChanged(index) = behavior {
                    let irqfd = msix_cap.get_msix_irqfd(index);
                    self.msix_vector_update(index, irqfd);
                }
                return;
            } else if msix_cap.is_msix_pba(bar_index, offset) {
                msix_cap.write_pba(offset, data);
                return;
            }
        }

        self.device.region_write(bar_index, data, offset);
    }

    fn destroy_device(&mut self) {
        self.close();
    }

    fn generate_acpi_methods(&mut self) -> (Vec<u8>, Option<(u32, MemoryMapping)>) {
        let mut amls = Vec::new();
        let mut shm = None;
        if let Some(pci_address) = self.pci_address {
            let vcfg_offset = pci_address.to_config_address(0, 13);
            if let Ok(vcfg_register) = DeviceVcfgRegister::new(vcfg_offset) {
                vcfg_register.to_aml_bytes(&mut amls);
                shm = vcfg_register
                    .create_shm_mmap()
                    .map(|shm| (vcfg_offset + SHM_OFFSET, shm));
                self.vcfg_shm_mmap = vcfg_register.create_shm_mmap();
                // All vfio-pci devices should have a virtual _PRx method;
                // otherwise the host can't tell whether the device has entered
                // a suspend state and will always consider it active, which
                // prevents its parent PCIe switch from suspending.
                PowerResourceMethod {}.to_aml_bytes(&mut amls);
                // TODO: WIP: Ideally, we should generate a _DSM only if the
                // physical device has one; however, Linux does not expose that
                // information. As a temporary workaround, check whether there
                // is an associated ACPI companion device node and skip
                // generating the guest _DSM if there is none.
                let acpi_path = self.sysfs_path.join("firmware_node/path");
                if acpi_path.exists() {
                    DsmMethod {}.to_aml_bytes(&mut amls);
                }
            }
        }

        (amls, shm)
    }

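    // Allocates an ACPI general-purpose event (GPE) number for this device;
    // the GPE is used to deliver ACPI notifications to the guest.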
    fn set_gpe(&mut self, resources: &mut SystemAllocator) -> Option<u32> {
        if let Some(gpe_nr) = resources.allocate_gpe() {
            base::debug!("set_gpe: gpe-nr {} addr {:?}", gpe_nr, self.pci_address);
            self.gpe = Some(gpe_nr);
        }
        self.gpe
    }
}

impl Suspendable for VfioPciDevice {
    fn sleep(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let res = worker_thread.stop();
            self.pci_address = Some(res.address);
            self.sysfs_path = res.sysfs_path;
            self.pm_cap = res.pm_cap;
            self.msix_cap = res.msix_cap;
            self.vm_socket_vm = Some(res.vm_socket);
        }
        Ok(())
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            self.start_work_thread();
        }
        Ok(())
    }
}

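// The tests below exercise VfioResourceAllocator::allocate_at_can_overlap,
// which carves the requested range out of the allocator's free list even when
// the range only partially intersects the remaining free regions. A minimal
// sketch of the behavior (illustrative only; error handling elided):
//
//     // free list: [0x20, 0x5f]
//     let mut alloc =
//         VfioResourceAllocator::new(AddressRange::from_start_and_end(0x20, 0x5f)).unwrap();
//     // The request overlaps only the tail of the free region ...
//     alloc
//         .allocate_at_can_overlap(AddressRange::from_start_and_end(0x40, 0x7f))
//         .unwrap();
//     // ... so the free list shrinks to [0x20, 0x3f].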
#[cfg(test)]
mod tests {
    use resources::AddressRange;

    use super::VfioResourceAllocator;

    #[test]
    fn no_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(0, 15))
            .unwrap();
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(100, 115))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 95)));
    }

    #[test]
    fn complete_overlap() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(32, 47))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_one() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 55))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(64, 95)));
    }

    #[test]
    fn partial_overlap_two() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 47], [64, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(48, 63))
            .unwrap();
        // regions [32, 39], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 71))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 39)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(72, 95)));
    }

    #[test]
    fn partial_overlap_three() {
        // regions [32, 95]
        let mut memory =
            VfioResourceAllocator::new(AddressRange::from_start_and_end(32, 95)).unwrap();
        // regions [32, 39], [48, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(40, 47))
            .unwrap();
        // regions [32, 39], [48, 63], [72, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(64, 71))
            .unwrap();
        // regions [32, 35], [76, 95]
        memory
            .allocate_at_can_overlap(AddressRange::from_start_and_end(36, 75))
            .unwrap();

        let mut iter = memory.regions.iter();
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(32, 35)));
        assert_eq!(iter.next(), Some(&AddressRange::from_start_and_end(76, 95)));
    }
}