xref: /aosp_15_r20/external/crosvm/devices/src/pci/msix.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::convert::TryInto;
6 
7 use anyhow::Context;
8 use base::error;
9 use base::info;
10 use base::AsRawDescriptor;
11 use base::Error as SysError;
12 use base::Event;
13 use base::RawDescriptor;
14 use base::Tube;
15 use base::TubeError;
16 use bit_field::*;
17 use remain::sorted;
18 use serde::Deserialize;
19 use serde::Serialize;
20 use thiserror::Error;
21 use vm_control::VmIrqRequest;
22 use vm_control::VmIrqResponse;
23 use zerocopy::AsBytes;
24 use zerocopy::FromBytes;
25 use zerocopy::FromZeroes;
26 
27 use crate::pci::pci_configuration::PciCapConfig;
28 use crate::pci::pci_configuration::PciCapConfigWriteResult;
29 use crate::pci::PciCapability;
30 use crate::pci::PciCapabilityID;
31 
// Maximum number of MSI-X vectors a single device may expose (PCI spec limit;
// Table Size is an 11-bit N-1 encoded field).
const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048;
/// Size in bytes of a single MSI-X table entry (4 DWORDs).
pub const MSIX_TABLE_ENTRIES_MODULO: u64 = 16;
/// Size in bytes of a single PBA entry (one QWORD).
pub const MSIX_PBA_ENTRIES_MODULO: u64 = 8;
/// Number of pending bits packed into one 64-bit PBA entry.
pub const BITS_PER_PBA_ENTRY: usize = 64;
// Message Control word: Function Mask bit (bit 14) — masks all vectors.
const FUNCTION_MASK_BIT: u16 = 0x4000;
// Message Control word: MSI-X Enable bit (bit 15).
const MSIX_ENABLE_BIT: u16 = 0x8000;
// Vector Control DWORD: per-vector Mask bit (bit 0).
const MSIX_TABLE_ENTRY_MASK_BIT: u32 = 0x1;
39 
/// In-memory representation of one MSI-X table entry: message address
/// (low/high), message data, and the vector control DWORD.
#[derive(Serialize, Deserialize, Clone, Default)]
struct MsixTableEntry {
    // Low 32 bits of the message address.
    msg_addr_lo: u32,
    // High 32 bits of the message address.
    msg_addr_hi: u32,
    // Data written to the message address to signal this interrupt.
    msg_data: u32,
    // Vector control DWORD; only bit 0 (Mask) is interpreted here.
    vector_ctl: u32,
}
47 
48 impl MsixTableEntry {
masked(&self) -> bool49     fn masked(&self) -> bool {
50         self.vector_ctl & MSIX_TABLE_ENTRY_MASK_BIT == MSIX_TABLE_ENTRY_MASK_BIT
51     }
52 }
53 
/// Pairing of an irq eventfd with the GSI number it was routed to.
struct IrqfdGsi {
    // Event signalled to inject the interrupt into the guest.
    irqfd: Event,
    // Global system interrupt number allocated for this vector.
    gsi: u32,
}
58 
/// Wrapper over MSI-X Capability Structure and MSI-X Tables
pub struct MsixConfig {
    // One entry per vector; guest-visible MSI-X table state.
    table_entries: Vec<MsixTableEntry>,
    // Pending Bit Array, 64 vectors per entry.
    pba_entries: Vec<u64>,
    // Allocated irqfd/GSI pair per vector; None until the vector is enabled.
    irq_vec: Vec<Option<IrqfdGsi>>,
    // Function Mask bit of Message Control (masks all vectors when set).
    masked: bool,
    // MSI-X Enable bit of Message Control.
    enabled: bool,
    // Tube used to request MSI allocation/routing from the VM.
    msi_device_socket: Tube,
    // Number of vectors in this configuration.
    msix_num: u16,
    pci_id: u32,
    device_name: String,
}
71 
/// Serializable image of [MsixConfig] used for snapshot/restore. Irqfds
/// cannot be serialized, so only the GSI of each vector is recorded.
#[derive(Serialize, Deserialize)]
struct MsixConfigSnapshot {
    table_entries: Vec<MsixTableEntry>,
    pba_entries: Vec<u64>,
    /// Just like MsixConfig::irq_vec, but only the GSI.
    irq_gsi_vec: Vec<Option<u32>>,
    masked: bool,
    enabled: bool,
    msix_num: u16,
    pci_id: u32,
    device_name: String,
}
84 
/// Errors that can occur while configuring, snapshotting, or restoring
/// MSI-X state. Variants are kept alphabetically sorted (enforced by
/// `#[sorted]`).
#[sorted]
#[derive(Error, Debug)]
pub enum MsixError {
    #[error("AddMsiRoute failed: {0}")]
    AddMsiRoute(SysError),
    #[error("failed to receive AddMsiRoute response: {0}")]
    AddMsiRouteRecv(TubeError),
    #[error("failed to send AddMsiRoute request: {0}")]
    AddMsiRouteSend(TubeError),
    #[error("AllocateOneMsi failed: {0}")]
    AllocateOneMsi(SysError),
    #[error("failed to receive AllocateOneMsi response: {0}")]
    AllocateOneMsiRecv(TubeError),
    #[error("failed to send AllocateOneMsi request: {0}")]
    AllocateOneMsiSend(TubeError),
    #[error("failed to deserialize snapshot: {0}")]
    DeserializationFailed(serde_json::Error),
    #[error("invalid vector length in snapshot: {0}")]
    InvalidVectorLength(std::num::TryFromIntError),
    #[error("ReleaseOneIrq failed: {0}")]
    ReleaseOneIrq(base::Error),
    #[error("failed to receive ReleaseOneIrq response: {0}")]
    ReleaseOneIrqRecv(TubeError),
    #[error("failed to send ReleaseOneIrq request: {0}")]
    ReleaseOneIrqSend(TubeError),
}
111 
/// Convenience alias for results of MSI-X operations.
type MsixResult<T> = std::result::Result<T, MsixError>;

/// Outcome of a write to MSI-X configuration state, reported so callers can
/// react to enable/mask transitions.
#[derive(Copy, Clone)]
pub enum MsixStatus {
    // Global enable/mask state changed; all vectors may be affected.
    Changed,
    // Only the vector at the contained index changed state.
    EntryChanged(usize),
    // The write changed nothing the caller needs to act on.
    NothingToDo,
}

impl PciCapConfigWriteResult for MsixStatus {}
122 
impl MsixConfig {
    /// Creates a new `MsixConfig` with `msix_vectors` vectors (all initially
    /// masked, per PCI spec), using `vm_socket` to request MSI allocation and
    /// routing from the VM's irq chip.
    pub fn new(msix_vectors: u16, vm_socket: Tube, pci_id: u32, device_name: String) -> Self {
        assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE);

        let mut table_entries: Vec<MsixTableEntry> = Vec::new();
        table_entries.resize_with(msix_vectors as usize, Default::default);
        // Every vector starts out masked; the guest unmasks explicitly.
        table_entries
            .iter_mut()
            .for_each(|entry| entry.vector_ctl |= MSIX_TABLE_ENTRY_MASK_BIT);
        let mut pba_entries: Vec<u64> = Vec::new();
        // One 64-bit PBA entry per 64 vectors, rounded up.
        let num_pba_entries: usize =
            ((msix_vectors as usize) + BITS_PER_PBA_ENTRY - 1) / BITS_PER_PBA_ENTRY;
        pba_entries.resize_with(num_pba_entries, Default::default);

        let mut irq_vec = Vec::new();
        irq_vec.resize_with(msix_vectors.into(), || None::<IrqfdGsi>);

        MsixConfig {
            table_entries,
            pba_entries,
            irq_vec,
            masked: false,
            enabled: false,
            msi_device_socket: vm_socket,
            msix_num: msix_vectors,
            pci_id,
            device_name,
        }
    }

    /// Get the number of MSI-X vectors in this configuration.
    pub fn num_vectors(&self) -> u16 {
        self.msix_num
    }

    /// Check whether the Function Mask bit in Message Control word in set or not.
    /// if 1, all of the vectors associated with the function are masked,
    /// regardless of their per-vector Mask bit states.
    /// If 0, each vector's Mask bit determines whether the vector is masked or not.
    pub fn masked(&self) -> bool {
        self.masked
    }

    /// Check whether the Function Mask bit in MSIX table Message Control
    /// word in set or not.
    /// If true, the vector is masked.
    /// If false, the vector is unmasked.
    ///
    /// Out-of-range indices report `true` (treated as masked).
    pub fn table_masked(&self, index: usize) -> bool {
        if index >= self.table_entries.len() {
            true
        } else {
            self.table_entries[index].masked()
        }
    }

    /// Check whether the MSI-X Enable bit in Message Control word in set or not.
    /// if 1, the function is permitted to use MSI-X to request service.
    pub fn enabled(&self) -> bool {
        self.enabled
    }

    /// Read the MSI-X Capability Structure.
    /// The top 2 bits in Message Control word are emulated and all other
    /// bits are read only.
    pub fn read_msix_capability(&self, data: u32) -> u32 {
        // Message Control occupies the upper 16 bits of the register.
        let mut msg_ctl = (data >> 16) as u16;
        msg_ctl &= !(MSIX_ENABLE_BIT | FUNCTION_MASK_BIT);

        if self.enabled {
            msg_ctl |= MSIX_ENABLE_BIT;
        }
        if self.masked {
            msg_ctl |= FUNCTION_MASK_BIT;
        }
        // Recombine the emulated Message Control with the read-only low word.
        (msg_ctl as u32) << 16 | (data & u16::MAX as u32)
    }

    /// Write to the MSI-X Capability Structure.
    /// Only the top 2 bits in Message Control Word are writable.
    ///
    /// Returns `MsixStatus::Changed` when the Function Mask bit flipped,
    /// `NothingToDo` otherwise.
    pub fn write_msix_capability(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        // Only a 2-byte write at offset 2 (the Message Control word) is valid.
        if offset == 2 && data.len() == 2 {
            let reg = u16::from_le_bytes([data[0], data[1]]);
            let old_masked = self.masked;
            let old_enabled = self.enabled;

            self.masked = (reg & FUNCTION_MASK_BIT) == FUNCTION_MASK_BIT;
            self.enabled = (reg & MSIX_ENABLE_BIT) == MSIX_ENABLE_BIT;

            // Enable transition 0 -> 1: wire up irqfds/routes for all
            // currently-unmasked vectors.
            if !old_enabled && self.enabled {
                if let Err(e) = self.msix_enable_all() {
                    error!("failed to enable MSI-X: {}", e);
                    self.enabled = false;
                }
            }

            // If the Function Mask bit was set, and has just been cleared, it's
            // important to go through the entire PBA to check if there was any
            // pending MSI-X message to inject, given that the vector is not
            // masked.
            if old_masked && !self.masked {
                // Clone the table so we can mutate `self` (clear PBA bits)
                // while iterating.
                for (index, entry) in self.table_entries.clone().iter().enumerate() {
                    if !entry.masked() && self.get_pba_bit(index as u16) == 1 {
                        self.inject_msix_and_clear_pba(index);
                    }
                }
                return MsixStatus::Changed;
            } else if !old_masked && self.masked {
                return MsixStatus::Changed;
            }
        } else {
            error!(
                "invalid write to MSI-X Capability Structure offset {:x}",
                offset
            );
        }
        MsixStatus::NothingToDo
    }

    /// Create a snapshot of the current MsixConfig struct for use in
    /// snapshotting.
    pub fn snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        serde_json::to_value(MsixConfigSnapshot {
            table_entries: self.table_entries.clone(),
            pba_entries: self.pba_entries.clone(),
            masked: self.masked,
            enabled: self.enabled,
            msix_num: self.msix_num,
            pci_id: self.pci_id,
            device_name: self.device_name.clone(),
            // Irqfds cannot be serialized; keep only each vector's GSI.
            irq_gsi_vec: self
                .irq_vec
                .iter()
                .map(|irq_opt| irq_opt.as_ref().map(|irq| irq.gsi))
                .collect(),
        })
        .context("failed to serialize MsixConfigSnapshot")
    }

    /// Restore a MsixConfig struct based on a snapshot. In short, this will
    /// restore all data exposed via MMIO, and recreate all MSI-X vectors (they
    /// will be re-wired to the irq chip).
    pub fn restore(&mut self, snapshot: serde_json::Value) -> MsixResult<()> {
        let snapshot: MsixConfigSnapshot =
            serde_json::from_value(snapshot).map_err(MsixError::DeserializationFailed)?;

        self.table_entries = snapshot.table_entries;
        self.pba_entries = snapshot.pba_entries;
        self.masked = snapshot.masked;
        self.enabled = snapshot.enabled;
        self.msix_num = snapshot.msix_num;
        self.pci_id = snapshot.pci_id;
        self.device_name = snapshot.device_name;

        // Drop any MSIs registered before the restore (e.g. on warm restore)
        // so stale routes don't linger.
        self.msix_release_all()?;
        self.irq_vec
            .resize_with(snapshot.irq_gsi_vec.len(), || None::<IrqfdGsi>);
        for (vector, gsi) in snapshot.irq_gsi_vec.iter().enumerate() {
            if let Some(gsi_num) = gsi {
                self.msix_restore_one(vector, *gsi_num)?;
            } else {
                info!(
                    "skipping restore of vector {} for device {}",
                    vector, self.device_name
                );
            }
        }
        Ok(())
    }

    /// Restore the specified MSI-X vector.
    ///
    /// Note: we skip the checks from [MsixConfig::msix_enable_one] because for
    /// an interrupt to be present in [MsixConfigSnapshot::irq_gsi_vec], it must
    /// have passed those checks.
    fn msix_restore_one(&mut self, index: usize, gsi: u32) -> MsixResult<()> {
        let irqfd = Event::new().map_err(MsixError::AllocateOneMsi)?;
        let request = VmIrqRequest::AllocateOneMsiAtGsi {
            irqfd,
            gsi,
            device_id: self.pci_id,
            queue_id: index,
            device_name: self.device_name.clone(),
        };
        self.msi_device_socket
            .send(&request)
            .map_err(MsixError::AllocateOneMsiSend)?;
        if let VmIrqResponse::Err(e) = self
            .msi_device_socket
            .recv()
            .map_err(MsixError::AllocateOneMsiRecv)?
        {
            return Err(MsixError::AllocateOneMsi(e));
        };

        // Reclaim the irqfd from the request we still own (send() serialized
        // a copy of the descriptor, not the Event itself).
        self.irq_vec[index] = Some(IrqfdGsi {
            irqfd: match request {
                VmIrqRequest::AllocateOneMsiAtGsi { irqfd, .. } => irqfd,
                _ => unreachable!(),
            },
            gsi,
        });
        self.add_msi_route(index as u16, gsi)?;
        Ok(())
    }

    /// On warm restore, there could already be MSIs registered. We need to
    /// release them in case the routing has changed (e.g. different
    /// data <-> GSI).
    fn msix_release_all(&mut self) -> MsixResult<()> {
        for irqfd_gsi in self.irq_vec.drain(..).flatten() {
            let request = VmIrqRequest::ReleaseOneIrq {
                gsi: irqfd_gsi.gsi,
                irqfd: irqfd_gsi.irqfd,
            };

            self.msi_device_socket
                .send(&request)
                .map_err(MsixError::ReleaseOneIrqSend)?;
            if let VmIrqResponse::Err(e) = self
                .msi_device_socket
                .recv()
                .map_err(MsixError::ReleaseOneIrqRecv)?
            {
                return Err(MsixError::ReleaseOneIrq(e));
            }
        }
        Ok(())
    }

    /// Reads the vector's address/data from the MSI-X table and asks the VM
    /// to (re)program the route for `gsi`. A zero message address means the
    /// entry is not yet programmed, so no route is added.
    fn add_msi_route(&mut self, index: u16, gsi: u32) -> MsixResult<()> {
        // Message address occupies the first 8 bytes of the 16-byte entry.
        let mut data: [u8; 8] = [0, 0, 0, 0, 0, 0, 0, 0];
        self.read_msix_table((index * 16).into(), data.as_mut());
        let msi_address: u64 = u64::from_le_bytes(data);
        // Message data is the DWORD at entry offset 8.
        let mut data: [u8; 4] = [0, 0, 0, 0];
        self.read_msix_table((index * 16 + 8).into(), data.as_mut());
        let msi_data: u32 = u32::from_le_bytes(data);

        if msi_address == 0 {
            return Ok(());
        }

        self.msi_device_socket
            .send(&VmIrqRequest::AddMsiRoute {
                gsi,
                msi_address,
                msi_data,
            })
            .map_err(MsixError::AddMsiRouteSend)?;
        if let VmIrqResponse::Err(e) = self
            .msi_device_socket
            .recv()
            .map_err(MsixError::AddMsiRouteRecv)?
        {
            return Err(MsixError::AddMsiRoute(e));
        }
        Ok(())
    }

    // Enable MSI-X
    fn msix_enable_all(&mut self) -> MsixResult<()> {
        for index in 0..self.irq_vec.len() {
            self.msix_enable_one(index)?;
        }
        Ok(())
    }

    // Use a new MSI-X vector
    // Create a new eventfd and bind them to a new msi
    fn msix_enable_one(&mut self, index: usize) -> MsixResult<()> {
        // Skip vectors that are already wired up, or that cannot deliver
        // interrupts right now (function disabled, function masked, or the
        // individual vector masked).
        if self.irq_vec[index].is_some()
            || !self.enabled()
            || self.masked()
            || self.table_masked(index)
        {
            return Ok(());
        }
        let irqfd = Event::new().map_err(MsixError::AllocateOneMsi)?;
        let request = VmIrqRequest::AllocateOneMsi {
            irqfd,
            device_id: self.pci_id,
            queue_id: index,
            device_name: self.device_name.clone(),
        };
        self.msi_device_socket
            .send(&request)
            .map_err(MsixError::AllocateOneMsiSend)?;
        let irq_num: u32 = match self
            .msi_device_socket
            .recv()
            .map_err(MsixError::AllocateOneMsiRecv)?
        {
            VmIrqResponse::AllocateOneMsi { gsi } => gsi,
            VmIrqResponse::Err(e) => return Err(MsixError::AllocateOneMsi(e)),
            _ => unreachable!(),
        };
        // Reclaim the irqfd from the request we still own.
        self.irq_vec[index] = Some(IrqfdGsi {
            irqfd: match request {
                VmIrqRequest::AllocateOneMsi { irqfd, .. } => irqfd,
                _ => unreachable!(),
            },
            gsi: irq_num,
        });

        self.add_msi_route(index as u16, irq_num)?;
        Ok(())
    }

    /// Read MSI-X table
    ///  # Arguments
    ///  * 'offset' - the offset within the MSI-X Table
    ///  * 'data' - used to store the read results
    ///
    /// For all accesses to MSI-X Table and MSI-X PBA fields, software must use aligned full
    /// DWORD or aligned full QWORD transactions; otherwise, the result is undefined.
    ///
    ///   location: DWORD3            DWORD2      DWORD1            DWORD0
    ///   entry 0:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
    ///   entry 1:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
    ///   entry 2:  Vector Control    Msg Data    Msg Upper Addr    Msg Addr
    ///   ...
    pub fn read_msix_table(&self, offset: u64, data: &mut [u8]) {
        // Each table entry is 16 bytes; split the offset into entry index
        // and DWORD offset within the entry.
        let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize;
        let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO;

        match data.len() {
            4 => {
                let value = match modulo_offset {
                    0x0 => self.table_entries[index].msg_addr_lo,
                    0x4 => self.table_entries[index].msg_addr_hi,
                    0x8 => self.table_entries[index].msg_data,
                    0xc => self.table_entries[index].vector_ctl,
                    _ => {
                        error!("invalid offset");
                        0
                    }
                };

                data.copy_from_slice(&value.to_le_bytes());
            }
            8 => {
                let value = match modulo_offset {
                    0x0 => {
                        (u64::from(self.table_entries[index].msg_addr_hi) << 32)
                            | u64::from(self.table_entries[index].msg_addr_lo)
                    }
                    0x8 => {
                        (u64::from(self.table_entries[index].vector_ctl) << 32)
                            | u64::from(self.table_entries[index].msg_data)
                    }
                    _ => {
                        error!("invalid offset");
                        0
                    }
                };

                data.copy_from_slice(&value.to_le_bytes());
            }
            _ => error!("invalid data length"),
        };
    }

    /// Write to MSI-X table
    ///
    /// Message Address: the contents of this field specifies the address
    ///     for the memory write transaction; different MSI-X vectors have
    ///     different Message Address values
    /// Message Data: the contents of this field specifies the data driven
    ///     on AD\[31::00\] during the memory write transaction's data phase.
    /// Vector Control: only bit 0 (Mask Bit) is not reserved: when this bit
    ///     is set, the function is prohibited from sending a message using
    ///     this MSI-X Table entry.
    pub fn write_msix_table(&mut self, offset: u64, data: &[u8]) -> MsixStatus {
        let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize;
        let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO;

        // Store the value of the entry before modification
        let old_entry = self.table_entries[index].clone();

        match data.len() {
            4 => {
                let value = u32::from_le_bytes(data.try_into().unwrap());
                match modulo_offset {
                    0x0 => self.table_entries[index].msg_addr_lo = value,
                    0x4 => self.table_entries[index].msg_addr_hi = value,
                    0x8 => self.table_entries[index].msg_data = value,
                    0xc => self.table_entries[index].vector_ctl = value,
                    _ => error!("invalid offset"),
                };
            }
            8 => {
                let value = u64::from_le_bytes(data.try_into().unwrap());
                match modulo_offset {
                    0x0 => {
                        self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32;
                        self.table_entries[index].msg_addr_hi = (value >> 32) as u32;
                    }
                    0x8 => {
                        self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32;
                        self.table_entries[index].vector_ctl = (value >> 32) as u32;
                    }
                    _ => error!("invalid offset"),
                };
            }
            _ => error!("invalid data length"),
        };

        let new_entry = self.table_entries[index].clone();

        // This MSI-X vector is enabled for the first time.
        if self.enabled()
            && !self.masked()
            && self.irq_vec[index].is_none()
            && old_entry.masked()
            && !new_entry.masked()
        {
            if let Err(e) = self.msix_enable_one(index) {
                error!("failed to enable MSI-X vector {}: {}", index, e);
                // Re-mask the vector so guest-visible state reflects that it
                // could not be enabled.
                self.table_entries[index].vector_ctl |= MSIX_TABLE_ENTRY_MASK_BIT;
            }
            return MsixStatus::EntryChanged(index);
        }

        // Address or data changed on an enabled function: reprogram the route.
        if self.enabled()
            && (old_entry.msg_addr_lo != new_entry.msg_addr_lo
                || old_entry.msg_addr_hi != new_entry.msg_addr_hi
                || old_entry.msg_data != new_entry.msg_data)
        {
            if let Some(irqfd_gsi) = &self.irq_vec[index] {
                let irq_num = irqfd_gsi.gsi;
                if let Err(e) = self.add_msi_route(index as u16, irq_num) {
                    error!("add_msi_route failed: {}", e);
                }
            }
        }

        // After the MSI-X table entry has been updated, it is necessary to
        // check if the vector control masking bit has changed. In case the
        // bit has been flipped from 1 to 0, we need to inject a MSI message
        // if the corresponding pending bit from the PBA is set. Once the MSI
        // has been injected, the pending bit in the PBA needs to be cleared.
        // All of this is valid only if MSI-X has not been masked for the whole
        // device.

        // Check if bit has been flipped
        if !self.masked() {
            if old_entry.masked() && !self.table_entries[index].masked() {
                if self.get_pba_bit(index as u16) == 1 {
                    self.inject_msix_and_clear_pba(index);
                }
                return MsixStatus::EntryChanged(index);
            } else if !old_entry.masked() && self.table_entries[index].masked() {
                return MsixStatus::EntryChanged(index);
            }
        }
        MsixStatus::NothingToDo
    }

    /// Read PBA Entries
    ///  # Arguments
    ///  * 'offset' - the offset within the PBA entries
    ///  * 'data' - used to store the read results
    ///
    /// Pending Bits\[63::00\]: For each Pending Bit that is set, the function
    /// has a pending message for the associated MSI-X Table entry.
    pub fn read_pba_entries(&self, offset: u64, data: &mut [u8]) {
        // Each PBA entry is 8 bytes wide.
        let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize;
        let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO;

        match data.len() {
            4 => {
                let value: u32 = match modulo_offset {
                    0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32,
                    0x4 => (self.pba_entries[index] >> 32) as u32,
                    _ => {
                        error!("invalid offset");
                        0
                    }
                };

                data.copy_from_slice(&value.to_le_bytes());
            }
            8 => {
                let value: u64 = match modulo_offset {
                    0x0 => self.pba_entries[index],
                    _ => {
                        error!("invalid offset");
                        0
                    }
                };

                data.copy_from_slice(&value.to_le_bytes());
            }
            _ => error!("invalid data length"),
        }
    }

    /// Write to PBA Entries
    ///
    /// Software should never write, and should only read Pending Bits.
    /// If software writes to Pending Bits, the result is undefined.
    pub fn write_pba_entries(&mut self, _offset: u64, _data: &[u8]) {
        error!("Pending Bit Array is read only");
    }

    /// Sets or clears the pending bit for `vector` in the PBA.
    fn set_pba_bit(&mut self, vector: u16, set: bool) {
        assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE);

        let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY;
        let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY;
        let mut mask: u64 = (1 << shift) as u64;

        if set {
            self.pba_entries[index] |= mask;
        } else {
            mask = !mask;
            self.pba_entries[index] &= mask;
        }
    }

    /// Returns the pending bit (0 or 1) for `vector` from the PBA.
    fn get_pba_bit(&self, vector: u16) -> u8 {
        assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE);

        let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY;
        let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY;

        ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8
    }

    /// Signals the vector's irqfd (if one is wired up) and clears its
    /// pending bit in the PBA.
    fn inject_msix_and_clear_pba(&mut self, vector: usize) {
        if let Some(irq) = &self.irq_vec[vector] {
            irq.irqfd.signal().unwrap();
        }

        // Clear the bit from PBA
        self.set_pba_bit(vector as u16, false);
    }

    /// Inject virtual interrupt to the guest
    ///
    ///  # Arguments
    ///  * 'vector' - the index to the MSI-X Table entry
    ///
    /// PCI Spec 3.0 6.8.3.5: while a vector is masked, the function is
    /// prohibited from sending the associated message, and the function
    /// must set the associated Pending bit whenever the function would
    /// otherwise send the message. When software unmasks a vector whose
    /// associated Pending bit is set, the function must schedule sending
    /// the associated message, and clear the Pending bit as soon as the
    /// message has been sent.
    ///
    /// If the vector is unmasked, writing to irqfd which wakes up KVM to
    /// inject virtual interrupt to the guest.
    pub fn trigger(&mut self, vector: u16) {
        if self.table_entries[vector as usize].masked() || self.masked() {
            self.set_pba_bit(vector, true);
        } else if let Some(irq) = self.irq_vec.get(vector as usize).unwrap_or(&None) {
            irq.irqfd.signal().unwrap();
        }
    }

    /// Return the raw descriptor of the MSI device socket
    pub fn get_msi_socket(&self) -> RawDescriptor {
        self.msi_device_socket.as_raw_descriptor()
    }

    /// Return irqfd of MSI-X Table entry
    ///
    ///  # Arguments
    ///  * 'vector' - the index to the MSI-X table entry
    pub fn get_irqfd(&self, vector: usize) -> Option<&Event> {
        match self.irq_vec.get(vector).unwrap_or(&None) {
            Some(irq) => Some(&irq.irqfd),
            None => None,
        }
    }

    /// Releases every allocated irq back to the VM, best-effort: send
    /// failures are ignored so teardown always completes.
    pub fn destroy(&mut self) {
        while let Some(irq) = self.irq_vec.pop() {
            if let Some(irq) = irq {
                let request = VmIrqRequest::ReleaseOneIrq {
                    gsi: irq.gsi,
                    irqfd: irq.irqfd,
                };
                if self.msi_device_socket.send(&request).is_err() {
                    continue;
                }
                let _ = self.msi_device_socket.recv::<VmIrqResponse>();
            }
        }
    }
}
714 
// Read mask for the MSI-X capability registers: only the Enable and Function
// Mask bits (top two bits of Message Control in register 0) are emulated here.
const MSIX_CONFIG_READ_MASK: [u32; 3] = [0xc000_0000, 0, 0];
716 
717 impl PciCapConfig for MsixConfig {
read_mask(&self) -> &'static [u32]718     fn read_mask(&self) -> &'static [u32] {
719         &MSIX_CONFIG_READ_MASK
720     }
721 
read_reg(&self, reg_idx: usize) -> u32722     fn read_reg(&self, reg_idx: usize) -> u32 {
723         if reg_idx == 0 {
724             self.read_msix_capability(0)
725         } else {
726             0
727         }
728     }
729 
write_reg( &mut self, reg_idx: usize, offset: u64, data: &[u8], ) -> Option<Box<dyn PciCapConfigWriteResult>>730     fn write_reg(
731         &mut self,
732         reg_idx: usize,
733         offset: u64,
734         data: &[u8],
735     ) -> Option<Box<dyn PciCapConfigWriteResult>> {
736         let status = if reg_idx == 0 {
737             self.write_msix_capability(offset, data)
738         } else {
739             MsixStatus::NothingToDo
740         };
741         Some(Box::new(status))
742     }
743 }
744 
impl AsRawDescriptor for MsixConfig {
    // Exposes the underlying MSI device socket's descriptor.
    fn as_raw_descriptor(&self) -> RawDescriptor {
        self.msi_device_socket.as_raw_descriptor()
    }
}
750 
751 /// Message Control Register
752 //   10-0:  MSI-X Table size
753 //   13-11: Reserved
754 //   14:    Mask. Mask all MSI-X when set.
755 //   15:    Enable. Enable all MSI-X when set.
756 // See <https://wiki.osdev.org/PCI#Enabling_MSI-X> for the details.
757 #[bitfield]
758 #[derive(Copy, Clone, Default, AsBytes, FromZeroes, FromBytes)]
759 pub struct MsixCtrl {
760     table_size: B10,
761     reserved: B4,
762     mask: B1,
763     enable: B1,
764 }
765 
#[allow(dead_code)]
#[repr(C)]
#[derive(Clone, Copy, Default, AsBytes, FromZeroes, FromBytes)]
/// MSI-X Capability Structure
pub struct MsixCap {
    // To make add_capability() happy
    _cap_vndr: u8,
    _cap_next: u8,
    // Message Control Register
    msg_ctl: MsixCtrl,
    // Table. Contains the offset and the BAR indicator (BIR)
    //   2-0:  Table BAR indicator (BIR). Can be 0 to 5.
    //   31-3: Table offset in the BAR pointed by the BIR.
    table: u32,
    // Pending Bit Array. Contains the offset and the BAR indicator (BIR)
    //   2-0:  PBA BAR indicator (BIR). Can be 0 to 5.
    //   31-3: PBA offset in the BAR pointed by the BIR.
    pba: u32,
}
785 
786 impl PciCapability for MsixCap {
bytes(&self) -> &[u8]787     fn bytes(&self) -> &[u8] {
788         self.as_bytes()
789     }
790 
id(&self) -> PciCapabilityID791     fn id(&self) -> PciCapabilityID {
792         PciCapabilityID::Msix
793     }
794 
writable_bits(&self) -> Vec<u32>795     fn writable_bits(&self) -> Vec<u32> {
796         // Only msg_ctl[15:14] is writable
797         vec![0x3000_0000, 0, 0]
798     }
799 }
800 
801 impl MsixCap {
new( table_pci_bar: u8, table_size: u16, table_off: u32, pba_pci_bar: u8, pba_off: u32, ) -> Self802     pub fn new(
803         table_pci_bar: u8,
804         table_size: u16,
805         table_off: u32,
806         pba_pci_bar: u8,
807         pba_off: u32,
808     ) -> Self {
809         assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE);
810 
811         // Set the table size and enable MSI-X.
812         let mut msg_ctl = MsixCtrl::new();
813         msg_ctl.set_enable(1);
814         // Table Size is N - 1 encoded.
815         msg_ctl.set_table_size(table_size - 1);
816 
817         MsixCap {
818             _cap_vndr: 0,
819             _cap_next: 0,
820             msg_ctl,
821             table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8),
822             pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8),
823         }
824     }
825 }
826 
#[cfg(test)]
mod tests {

    use std::thread;

    use super::*;

    /// Receives one message from the fake irqchip tube, asserting that it is
    /// an `AllocateOneMsiAtGsi` request, and returns the requested GSI.
    /// Panics on any other message type.
    #[track_caller]
    fn recv_allocate_msi(t: &Tube) -> u32 {
        match t.recv::<VmIrqRequest>().unwrap() {
            VmIrqRequest::AllocateOneMsiAtGsi { gsi, .. } => gsi,
            msg => panic!("unexpected irqchip message: {:?}", msg),
        }
    }

    /// Fields captured from an `AddMsiRoute` request for later assertions.
    struct MsiRouteDetails {
        gsi: u32,
        msi_address: u64,
        msi_data: u32,
    }

    /// Receives one message from the fake irqchip tube, asserting that it is
    /// an `AddMsiRoute` request, and returns its routing details.
    /// Panics on any other message type.
    #[track_caller]
    fn recv_add_msi_route(t: &Tube) -> MsiRouteDetails {
        match t.recv::<VmIrqRequest>().unwrap() {
            VmIrqRequest::AddMsiRoute {
                gsi,
                msi_address,
                msi_data,
            } => MsiRouteDetails {
                gsi,
                msi_address,
                msi_data,
            },
            msg => panic!("unexpected irqchip message: {:?}", msg),
        }
    }

    /// Receives one message from the fake irqchip tube, asserting that it is
    /// a `ReleaseOneIrq` request, and returns the released GSI.
    /// Panics on any other message type.
    #[track_caller]
    fn recv_release_one_irq(t: &Tube) -> u32 {
        match t.recv::<VmIrqRequest>().unwrap() {
            VmIrqRequest::ReleaseOneIrq { gsi, irqfd: _ } => gsi,
            msg => panic!("unexpected irqchip message: {:?}", msg),
        }
    }

    /// Replies `VmIrqResponse::Ok` on the fake irqchip tube.
    #[track_caller]
    fn send_ok(t: &Tube) {
        t.send(&VmIrqResponse::Ok).unwrap();
    }

    /// Tests a cold restore where there are no existing vectors at the time
    /// restore is called.
    #[test]
    fn verify_msix_restore_cold_smoke() {
        let (irqchip_tube, msix_config_tube) = Tube::pair().unwrap();
        // The config being snapshotted never talks to an irqchip in this
        // test, so its tube's peer is simply kept alive and unused.
        let (_unused, unused_config_tube) = Tube::pair().unwrap();

        let mut cfg = MsixConfig::new(2, unused_config_tube, 0, "test_device".to_owned());

        // Set up two MSI-X vectors (0 and 1).
        // Data is 0xdVEC_NUM. Address is 0xaVEC_NUM.
        cfg.table_entries[0].msg_data = 0xd0;
        cfg.table_entries[0].msg_addr_lo = 0xa0;
        cfg.table_entries[0].msg_addr_hi = 0;
        cfg.table_entries[1].msg_data = 0xd1;
        cfg.table_entries[1].msg_addr_lo = 0xa1;
        cfg.table_entries[1].msg_addr_hi = 0;

        // Pretend that these vectors were hooked up to GSIs 10 & 20,
        // respectively.
        cfg.irq_vec = vec![
            Some(IrqfdGsi {
                gsi: 10,
                irqfd: Event::new().unwrap(),
            }),
            Some(IrqfdGsi {
                gsi: 20,
                irqfd: Event::new().unwrap(),
            }),
        ];

        // Take a snapshot of MsixConfig.
        let snapshot = cfg.snapshot().unwrap();

        // Create a fake irqchip to respond to our requests. Restore is
        // expected to re-allocate each GSI and re-add its MSI route, in
        // vector order, with an Ok reply consumed after every request.
        let irqchip_fake = thread::spawn(move || {
            assert_eq!(recv_allocate_msi(&irqchip_tube), 10);
            send_ok(&irqchip_tube);
            let route_one = recv_add_msi_route(&irqchip_tube);
            assert_eq!(route_one.gsi, 10);
            assert_eq!(route_one.msi_address, 0xa0);
            assert_eq!(route_one.msi_data, 0xd0);
            send_ok(&irqchip_tube);

            assert_eq!(recv_allocate_msi(&irqchip_tube), 20);
            send_ok(&irqchip_tube);
            let route_two = recv_add_msi_route(&irqchip_tube);
            assert_eq!(route_two.gsi, 20);
            assert_eq!(route_two.msi_address, 0xa1);
            assert_eq!(route_two.msi_data, 0xd1);
            send_ok(&irqchip_tube);
            // Return the tube so it outlives the restore (dropping it early
            // would break the connection).
            irqchip_tube
        });

        // Deliberately construct the restore target with different
        // parameters (vector count, pci_id, device name); restore must
        // overwrite them with the snapshotted values.
        let mut restored_cfg = MsixConfig::new(10, msix_config_tube, 10, "some_device".to_owned());
        restored_cfg.restore(snapshot).unwrap();
        irqchip_fake.join().unwrap();

        assert_eq!(restored_cfg.pci_id, 0);
        assert_eq!(restored_cfg.device_name, "test_device");
    }

    /// Tests a warm restore where there are existing vectors at the time
    /// restore is called. These vectors need to be released first.
    #[test]
    fn verify_msix_restore_warm_smoke() {
        let (irqchip_tube, msix_config_tube) = Tube::pair().unwrap();

        let mut cfg = MsixConfig::new(2, msix_config_tube, 0, "test_device".to_owned());

        // Set up two MSI-X vectors (0 and 1).
        // Data is 0xdVEC_NUM. Address is 0xaVEC_NUM.
        cfg.table_entries[0].msg_data = 0xd0;
        cfg.table_entries[0].msg_addr_lo = 0xa0;
        cfg.table_entries[0].msg_addr_hi = 0;
        cfg.table_entries[1].msg_data = 0xd1;
        cfg.table_entries[1].msg_addr_lo = 0xa1;
        cfg.table_entries[1].msg_addr_hi = 0;

        // Pretend that these vectors were hooked up to GSIs 10 & 20,
        // respectively.
        cfg.irq_vec = vec![
            Some(IrqfdGsi {
                gsi: 10,
                irqfd: Event::new().unwrap(),
            }),
            Some(IrqfdGsi {
                gsi: 20,
                irqfd: Event::new().unwrap(),
            }),
        ];

        // Take a snapshot of MsixConfig.
        let snapshot = cfg.snapshot().unwrap();

        // Create a fake irqchip to respond to our requests
        let irqchip_fake = thread::spawn(move || {
            // First, we free the existing vectors / GSIs.
            assert_eq!(recv_release_one_irq(&irqchip_tube), 10);
            send_ok(&irqchip_tube);
            assert_eq!(recv_release_one_irq(&irqchip_tube), 20);
            send_ok(&irqchip_tube);

            // Now we re-allocate them.
            assert_eq!(recv_allocate_msi(&irqchip_tube), 10);
            send_ok(&irqchip_tube);
            let route_one = recv_add_msi_route(&irqchip_tube);
            assert_eq!(route_one.gsi, 10);
            assert_eq!(route_one.msi_address, 0xa0);
            assert_eq!(route_one.msi_data, 0xd0);
            send_ok(&irqchip_tube);

            assert_eq!(recv_allocate_msi(&irqchip_tube), 20);
            send_ok(&irqchip_tube);
            let route_two = recv_add_msi_route(&irqchip_tube);
            assert_eq!(route_two.gsi, 20);
            assert_eq!(route_two.msi_address, 0xa1);
            assert_eq!(route_two.msi_data, 0xd1);
            send_ok(&irqchip_tube);
            irqchip_tube
        });

        // Restore into the same (still-live) config: the existing vectors
        // must be released before being re-established.
        cfg.restore(snapshot).unwrap();
        irqchip_fake.join().unwrap();

        assert_eq!(cfg.pci_id, 0);
        assert_eq!(cfg.device_name, "test_device");
    }
}
1006