xref: /aosp_15_r20/external/crosvm/devices/src/pci/pcie/pcie_host.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::fs::read;
6 use std::fs::write;
7 use std::fs::File;
8 use std::fs::OpenOptions;
9 use std::os::unix::fs::FileExt;
10 use std::path::Path;
11 use std::path::PathBuf;
12 use std::sync::Arc;
13 use std::thread;
14 
15 use anyhow::anyhow;
16 use anyhow::bail;
17 use anyhow::Context;
18 use anyhow::Result;
19 use base::error;
20 use base::Tube;
21 use sync::Mutex;
22 use vm_control::HotPlugDeviceInfo;
23 use vm_control::HotPlugDeviceType;
24 use vm_control::VmRequest;
25 use vm_control::VmResponse;
26 use zerocopy::AsBytes;
27 use zerocopy::FromBytes;
28 
29 use crate::pci::pci_configuration::PciBridgeSubclass;
30 use crate::pci::pci_configuration::CAPABILITY_LIST_HEAD_OFFSET;
31 use crate::pci::pci_configuration::HEADER_TYPE_REG;
32 use crate::pci::pci_configuration::PCI_CAP_NEXT_POINTER;
33 use crate::pci::pcie::pci_bridge::PciBridgeBusRange;
34 use crate::pci::pcie::pci_bridge::BR_BUS_NUMBER_REG;
35 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_MASK;
36 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_SHIFT;
37 use crate::pci::pcie::pci_bridge::BR_MEM_LIMIT_MASK;
38 use crate::pci::pcie::pci_bridge::BR_MEM_MINIMUM;
39 use crate::pci::pcie::pci_bridge::BR_MEM_REG;
40 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_64BIT;
41 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_BASE_HIGH_REG;
42 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LIMIT_HIGH_REG;
43 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LOW_REG;
44 use crate::pci::pcie::pci_bridge::BR_WINDOW_ALIGNMENT;
45 use crate::pci::pcie::PcieDevicePortType;
46 use crate::pci::PciCapabilityID;
47 use crate::pci::PciClassCode;
48 
49 // Host Pci device's sysfs config file
50 struct PciHostConfig {
51     config_file: File,
52 }
53 
54 impl PciHostConfig {
55     // Create a new host pci device's sysfs config file
new(host_sysfs_path: &Path) -> Result<Self>56     fn new(host_sysfs_path: &Path) -> Result<Self> {
57         let mut config_path = PathBuf::new();
58         config_path.push(host_sysfs_path);
59         config_path.push("config");
60         let f = OpenOptions::new()
61             .write(true)
62             .read(true)
63             .open(config_path.as_path())
64             .with_context(|| format!("failed to open: {}", config_path.display()))?;
65         Ok(PciHostConfig { config_file: f })
66     }
67 
68     // Read host pci device's config register
read_config<T: AsBytes + FromBytes + Copy + Default>(&self, offset: u64) -> T69     fn read_config<T: AsBytes + FromBytes + Copy + Default>(&self, offset: u64) -> T {
70         let length = std::mem::size_of::<T>();
71         let mut val = T::default();
72         if offset % length as u64 != 0 {
73             error!(
74                 "read_config, offset {} isn't aligned to length {}",
75                 offset, length
76             );
77         } else if let Err(e) = self.config_file.read_exact_at(val.as_bytes_mut(), offset) {
78             error!("failed to read host sysfs config: {}", e);
79         }
80 
81         val
82     }
83 
84     // write host pci device's config register
85     #[allow(dead_code)]
write_config(&self, offset: u64, data: &[u8])86     fn write_config(&self, offset: u64, data: &[u8]) {
87         if offset % data.len() as u64 != 0 {
88             error!(
89                 "write_config, offset {} isn't aligned to length {}",
90                 offset,
91                 data.len()
92             );
93             return;
94         }
95         if let Err(e) = self.config_file.write_all_at(data, offset) {
96             error!("failed to write host sysfs config: {}", e);
97         }
98     }
99 }
100 
101 // Find all the added pcie devices
visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()>102 fn visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()> {
103     // Each pci device has a sysfs directory
104     if !dir.is_dir() {
105         bail!("{} isn't directory", dir.display());
106     }
107     // Loop device sysfs subdirectory
108     let entries = dir
109         .read_dir()
110         .with_context(|| format!("failed to read dir {}", dir.display()))?;
111     let mut devices = Vec::new();
112     for entry in entries {
113         let sub_dir = match entry {
114             Ok(sub) => sub,
115             _ => continue,
116         };
117 
118         if !sub_dir.path().is_dir() {
119             continue;
120         }
121 
122         let name = sub_dir
123             .file_name()
124             .into_string()
125             .map_err(|_| anyhow!("failed to get dir name"))?;
126         // Child pci device has name format 0000:xx:xx.x, length is 12
127         if name.len() != 12 || !name.starts_with("0000:") {
128             continue;
129         }
130         let child_path = dir.join(name);
131         devices.push(child_path);
132     }
133     devices.reverse();
134     let mut iter = devices.iter().peekable();
135     while let Some(device) = iter.next() {
136         let class_path = device.join("class");
137         let class_id = read(class_path.as_path())
138             .with_context(|| format!("failed to read {}", class_path.display()))?;
139         let hp_interrupt = iter.peek().is_none();
140         if !class_id.starts_with("0x0604".as_bytes()) {
141             // If the device isn't pci bridge, this is a pcie endpoint device
142             children.push(HotPlugDeviceInfo {
143                 device_type: HotPlugDeviceType::EndPoint,
144                 path: device.to_path_buf(),
145                 hp_interrupt,
146             });
147             // No need to look further
148             return Ok(());
149         } else {
150             // Find the pci express cap to get the port type of the pcie bridge
151             let host_config = PciHostConfig::new(device)?;
152             let mut cap_pointer: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
153             while cap_pointer != 0x0 {
154                 let cap_id: u8 = host_config.read_config(cap_pointer as u64);
155                 if cap_id == PciCapabilityID::PciExpress as u8 {
156                     break;
157                 }
158                 cap_pointer = host_config.read_config(cap_pointer as u64 + 0x1);
159             }
160             if cap_pointer == 0x0 {
161                 bail!(
162                     "Failed to get pcie express capability for {}",
163                     device.display()
164                 );
165             }
166             let express_cap_reg: u16 = host_config.read_config(cap_pointer as u64 + 0x2);
167             match (express_cap_reg & 0xf0) >> 4 {
168                 x if x == PcieDevicePortType::UpstreamPort as u16 => {
169                     children.push(HotPlugDeviceInfo {
170                         device_type: HotPlugDeviceType::UpstreamPort,
171                         path: device.to_path_buf(),
172                         hp_interrupt,
173                     })
174                 }
175                 x if x == PcieDevicePortType::DownstreamPort as u16 => {
176                     children.push(HotPlugDeviceInfo {
177                         device_type: HotPlugDeviceType::DownstreamPort,
178                         path: device.to_path_buf(),
179                         hp_interrupt,
180                     })
181                 }
182                 _ => (),
183             }
184         }
185     }
186     for device in devices.iter() {
187         visit_children(device.as_path(), children)?;
188     }
189     Ok(())
190 }
191 
192 struct HotplugWorker {
193     host_name: String,
194 }
195 
196 impl HotplugWorker {
run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()>197     fn run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()> {
198         let mut host_sysfs = PathBuf::new();
199         host_sysfs.push("/sys/bus/pci/devices/");
200         host_sysfs.push(self.host_name.clone());
201         let rescan_path = host_sysfs.join("rescan");
202         // Let pcie root port rescan to find the added or removed children devices
203         write(rescan_path.as_path(), "1")
204             .with_context(|| format!("failed to write {}", rescan_path.display()))?;
205 
206         // If child device existed, but code run here again, this means host has a
207         // hotplug out event, after the above rescan, host should find the removed
208         // child device, and host vfio-pci kernel driver should notify crosvm vfio-pci
209         // devie such hotplug out event, so nothing is needed to do here, just return
210         // it now.
211         let mut child_exist = child_exist.lock();
212         if *child_exist {
213             return Ok(());
214         }
215 
216         // Probe the new added pcie endpoint devices
217         let mut children: Vec<HotPlugDeviceInfo> = Vec::new();
218         visit_children(host_sysfs.as_path(), &mut children)?;
219 
220         // Without reverse children, physical larger BDF device is at the top, it will be
221         // added into guest first with smaller virtual function number, so physical smaller
222         // BDF device has larger virtual function number, phyiscal larger BDF device has
223         // smaller virtual function number. During hotplug out process, host pcie root port
224         // driver remove physical smaller BDF pcie endpoint device first, so host vfio-pci
225         // driver send plug out event first for smaller BDF device and wait for this device
226         // removed from crosvm, when crosvm receives this plug out event, crosvm will remove
227         // all the children devices, crosvm remove smaller virtual function number device
228         // first, this isn't the target device which host vfio-pci driver is waiting for.
229         // Host vfio-pci driver holds a lock when it is waiting, when crosvm remove another
230         // device throgh vfio-pci which try to get the same lock, so deadlock happens in
231         // host kernel.
232         //
233         // In order to fix the deadlock, children is reversed, so physical smaller BDF
234         // device has smaller virtual function number, and it will have the same order
235         // between host kernel and crosvm during hotplug out process.
236         children.reverse();
237         while let Some(child) = children.pop() {
238             if let HotPlugDeviceType::EndPoint = child.device_type {
239                 // In order to bind device to vfio-pci driver, get device VID and DID
240                 let vendor_path = child.path.join("vendor");
241                 let vendor_id = read(vendor_path.as_path())
242                     .with_context(|| format!("failed to read {}", vendor_path.display()))?;
243                 // Remove the first two elements 0x
244                 let prefix: &str = "0x";
245                 let vendor = match vendor_id.strip_prefix(prefix.as_bytes()) {
246                     Some(v) => v.to_vec(),
247                     None => vendor_id,
248                 };
249                 let device_path = child.path.join("device");
250                 let device_id = read(device_path.as_path())
251                     .with_context(|| format!("failed to read {}", device_path.display()))?;
252                 // Remove the first two elements 0x
253                 let device = match device_id.strip_prefix(prefix.as_bytes()) {
254                     Some(d) => d.to_vec(),
255                     None => device_id,
256                 };
257                 let new_id = [
258                     String::from_utf8_lossy(&vendor),
259                     String::from_utf8_lossy(&device),
260                 ]
261                 .join(" ");
262                 if Path::new("/sys/bus/pci/drivers/vfio-pci-pm/new_id").exists() {
263                     let _ = write("/sys/bus/pci/drivers/vfio-pci-pm/new_id", &new_id);
264                 }
265                 // This is normal - either the kernel doesn't support vfio-pci-pm driver,
266                 // or the device failed to attach to vfio-pci-pm driver (most likely due to
267                 // lack of power management capability).
268                 if !child.path.join("driver/unbind").exists() {
269                     write("/sys/bus/pci/drivers/vfio-pci/new_id", &new_id).with_context(|| {
270                         format!("failed to write {} into vfio-pci/new_id", new_id)
271                     })?;
272                 }
273             }
274             // Request to hotplug the new added pcie device into guest
275             let request = VmRequest::HotPlugVfioCommand {
276                 device: child.clone(),
277                 add: true,
278             };
279             let vm_socket = vm_socket.lock();
280             vm_socket
281                 .send(&request)
282                 .with_context(|| format!("failed to send hotplug request for {:?}", child))?;
283             let response = vm_socket
284                 .recv::<VmResponse>()
285                 .with_context(|| format!("failed to receive hotplug response for {:?}", child))?;
286             match response {
287                 VmResponse::Ok => {}
288                 _ => bail!("unexpected hotplug response: {response}"),
289             };
290             if !*child_exist {
291                 *child_exist = true;
292             }
293         }
294 
295         Ok(())
296     }
297 }
298 
299 const PCI_CONFIG_DEVICE_ID: u64 = 0x02;
300 const PCI_BASE_CLASS_CODE: u64 = 0x0B;
301 const PCI_SUB_CLASS_CODE: u64 = 0x0A;
302 
303 /// Pcie root port device has a corresponding host pcie root port.
304 pub struct PcieHostPort {
305     host_config: PciHostConfig,
306     host_name: String,
307     hotplug_in_process: Arc<Mutex<bool>>,
308     hotplug_child_exist: Arc<Mutex<bool>>,
309     vm_socket: Arc<Mutex<Tube>>,
310 }
311 
312 impl PcieHostPort {
313     /// Create PcieHostPort, host_syfsfs_patch specify host pcie port
314     /// sysfs path.
new(host_sysfs_path: &Path, socket: Tube) -> Result<Self>315     pub fn new(host_sysfs_path: &Path, socket: Tube) -> Result<Self> {
316         let host_config = PciHostConfig::new(host_sysfs_path)?;
317         let host_name = host_sysfs_path
318             .file_name()
319             .unwrap()
320             .to_str()
321             .unwrap()
322             .to_owned();
323         let base_class: u8 = host_config.read_config(PCI_BASE_CLASS_CODE);
324         if base_class != PciClassCode::BridgeDevice.get_register_value() {
325             return Err(anyhow!("host {} isn't bridge", host_name));
326         }
327         let sub_class: u8 = host_config.read_config(PCI_SUB_CLASS_CODE);
328         if sub_class != PciBridgeSubclass::PciToPciBridge as u8 {
329             return Err(anyhow!("host {} isn't pci to pci bridge", host_name));
330         }
331 
332         let mut pcie_cap_reg: u8 = 0;
333 
334         let mut cap_next: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
335         let mut counter: u16 = 0;
336         while cap_next != 0 && counter < 256 {
337             let cap_id: u8 = host_config.read_config(cap_next.into());
338             if cap_id == PciCapabilityID::PciExpress as u8 {
339                 pcie_cap_reg = cap_next;
340                 break;
341             }
342             let offset = cap_next as u64 + PCI_CAP_NEXT_POINTER as u64;
343             cap_next = host_config.read_config(offset);
344             counter += 1;
345         }
346 
347         if pcie_cap_reg == 0 {
348             return Err(anyhow!("host {} isn't pcie device", host_name));
349         }
350 
351         Ok(PcieHostPort {
352             host_config,
353             host_name,
354             hotplug_in_process: Arc::new(Mutex::new(false)),
355             hotplug_child_exist: Arc::new(Mutex::new(false)),
356             vm_socket: Arc::new(Mutex::new(socket)),
357         })
358     }
359 
get_bus_range(&self) -> PciBridgeBusRange360     pub fn get_bus_range(&self) -> PciBridgeBusRange {
361         let bus_num: u32 = self.host_config.read_config((BR_BUS_NUMBER_REG * 4) as u64);
362         let primary = (bus_num & 0xFF) as u8;
363         let secondary = ((bus_num >> 8) & 0xFF) as u8;
364         let subordinate = ((bus_num >> 16) & 0xFF) as u8;
365 
366         PciBridgeBusRange {
367             primary,
368             secondary,
369             subordinate,
370         }
371     }
372 
read_device_id(&self) -> u16373     pub fn read_device_id(&self) -> u16 {
374         self.host_config.read_config::<u16>(PCI_CONFIG_DEVICE_ID)
375     }
376 
host_name(&self) -> String377     pub fn host_name(&self) -> String {
378         self.host_name.clone()
379     }
380 
read_config(&self, reg_idx: usize, data: &mut u32)381     pub fn read_config(&self, reg_idx: usize, data: &mut u32) {
382         if reg_idx == HEADER_TYPE_REG {
383             *data = self.host_config.read_config((HEADER_TYPE_REG as u64) * 4)
384         }
385     }
386 
write_config(&mut self, _reg_idx: usize, _offset: u64, _data: &[u8])387     pub fn write_config(&mut self, _reg_idx: usize, _offset: u64, _data: &[u8]) {}
388 
get_bridge_window_size(&self) -> (u64, u64)389     pub fn get_bridge_window_size(&self) -> (u64, u64) {
390         let br_memory: u32 = self.host_config.read_config(BR_MEM_REG as u64 * 4);
391         let mem_base = (br_memory & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
392         let mem_limit = br_memory & BR_MEM_LIMIT_MASK;
393         let mem_size = if mem_limit > mem_base {
394             (mem_limit - mem_base) as u64 + BR_WINDOW_ALIGNMENT
395         } else {
396             BR_MEM_MINIMUM
397         };
398         let br_pref_mem_low: u32 = self.host_config.read_config(BR_PREF_MEM_LOW_REG as u64 * 4);
399         let pref_mem_base_low = (br_pref_mem_low & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
400         let pref_mem_limit_low = br_pref_mem_low & BR_MEM_LIMIT_MASK;
401         let mut pref_mem_base: u64 = pref_mem_base_low as u64;
402         let mut pref_mem_limit: u64 = pref_mem_limit_low as u64;
403         if br_pref_mem_low & BR_PREF_MEM_64BIT == BR_PREF_MEM_64BIT {
404             // 64bit prefetch memory
405             let pref_mem_base_high: u32 = self
406                 .host_config
407                 .read_config(BR_PREF_MEM_BASE_HIGH_REG as u64 * 4);
408             let pref_mem_limit_high: u32 = self
409                 .host_config
410                 .read_config(BR_PREF_MEM_LIMIT_HIGH_REG as u64 * 4);
411             pref_mem_base = ((pref_mem_base_high as u64) << 32) | (pref_mem_base_low as u64);
412             pref_mem_limit = ((pref_mem_limit_high as u64) << 32) | (pref_mem_limit_low as u64);
413         }
414         let pref_mem_size = if pref_mem_limit > pref_mem_base {
415             pref_mem_limit - pref_mem_base + BR_WINDOW_ALIGNMENT
416         } else {
417             BR_MEM_MINIMUM
418         };
419 
420         (mem_size, pref_mem_size)
421     }
422 
hotplug_probe(&mut self)423     pub fn hotplug_probe(&mut self) {
424         if *self.hotplug_in_process.lock() {
425             return;
426         }
427 
428         let hotplug_process = self.hotplug_in_process.clone();
429         let child_exist = self.hotplug_child_exist.clone();
430         let socket = self.vm_socket.clone();
431         let name = self.host_name.clone();
432         let _ = thread::Builder::new()
433             .name("pcie_hotplug".to_string())
434             .spawn(move || {
435                 let mut hotplug = hotplug_process.lock();
436                 *hotplug = true;
437                 let hotplug_worker = HotplugWorker { host_name: name };
438                 let _ = hotplug_worker.run(socket, child_exist);
439                 *hotplug = false;
440             });
441     }
442 
hot_unplug(&mut self)443     pub fn hot_unplug(&mut self) {
444         *self.hotplug_child_exist.lock() = false;
445     }
446 }
447