1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fs::read;
6 use std::fs::write;
7 use std::fs::File;
8 use std::fs::OpenOptions;
9 use std::os::unix::fs::FileExt;
10 use std::path::Path;
11 use std::path::PathBuf;
12 use std::sync::Arc;
13 use std::thread;
14
15 use anyhow::anyhow;
16 use anyhow::bail;
17 use anyhow::Context;
18 use anyhow::Result;
19 use base::error;
20 use base::Tube;
21 use sync::Mutex;
22 use vm_control::HotPlugDeviceInfo;
23 use vm_control::HotPlugDeviceType;
24 use vm_control::VmRequest;
25 use vm_control::VmResponse;
26 use zerocopy::AsBytes;
27 use zerocopy::FromBytes;
28
29 use crate::pci::pci_configuration::PciBridgeSubclass;
30 use crate::pci::pci_configuration::CAPABILITY_LIST_HEAD_OFFSET;
31 use crate::pci::pci_configuration::HEADER_TYPE_REG;
32 use crate::pci::pci_configuration::PCI_CAP_NEXT_POINTER;
33 use crate::pci::pcie::pci_bridge::PciBridgeBusRange;
34 use crate::pci::pcie::pci_bridge::BR_BUS_NUMBER_REG;
35 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_MASK;
36 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_SHIFT;
37 use crate::pci::pcie::pci_bridge::BR_MEM_LIMIT_MASK;
38 use crate::pci::pcie::pci_bridge::BR_MEM_MINIMUM;
39 use crate::pci::pcie::pci_bridge::BR_MEM_REG;
40 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_64BIT;
41 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_BASE_HIGH_REG;
42 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LIMIT_HIGH_REG;
43 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LOW_REG;
44 use crate::pci::pcie::pci_bridge::BR_WINDOW_ALIGNMENT;
45 use crate::pci::pcie::PcieDevicePortType;
46 use crate::pci::PciCapabilityID;
47 use crate::pci::PciClassCode;
48
// Handle on a host pci device's sysfs `config` file, through which the device's
// configuration registers are read and written.
struct PciHostConfig {
    // The opened `config` file of the host device.
    config_file: File,
}
53
54 impl PciHostConfig {
55 // Create a new host pci device's sysfs config file
new(host_sysfs_path: &Path) -> Result<Self>56 fn new(host_sysfs_path: &Path) -> Result<Self> {
57 let mut config_path = PathBuf::new();
58 config_path.push(host_sysfs_path);
59 config_path.push("config");
60 let f = OpenOptions::new()
61 .write(true)
62 .read(true)
63 .open(config_path.as_path())
64 .with_context(|| format!("failed to open: {}", config_path.display()))?;
65 Ok(PciHostConfig { config_file: f })
66 }
67
68 // Read host pci device's config register
read_config<T: AsBytes + FromBytes + Copy + Default>(&self, offset: u64) -> T69 fn read_config<T: AsBytes + FromBytes + Copy + Default>(&self, offset: u64) -> T {
70 let length = std::mem::size_of::<T>();
71 let mut val = T::default();
72 if offset % length as u64 != 0 {
73 error!(
74 "read_config, offset {} isn't aligned to length {}",
75 offset, length
76 );
77 } else if let Err(e) = self.config_file.read_exact_at(val.as_bytes_mut(), offset) {
78 error!("failed to read host sysfs config: {}", e);
79 }
80
81 val
82 }
83
84 // write host pci device's config register
85 #[allow(dead_code)]
write_config(&self, offset: u64, data: &[u8])86 fn write_config(&self, offset: u64, data: &[u8]) {
87 if offset % data.len() as u64 != 0 {
88 error!(
89 "write_config, offset {} isn't aligned to length {}",
90 offset,
91 data.len()
92 );
93 return;
94 }
95 if let Err(e) = self.config_file.write_all_at(data, offset) {
96 error!("failed to write host sysfs config: {}", e);
97 }
98 }
99 }
100
101 // Find all the added pcie devices
visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()>102 fn visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()> {
103 // Each pci device has a sysfs directory
104 if !dir.is_dir() {
105 bail!("{} isn't directory", dir.display());
106 }
107 // Loop device sysfs subdirectory
108 let entries = dir
109 .read_dir()
110 .with_context(|| format!("failed to read dir {}", dir.display()))?;
111 let mut devices = Vec::new();
112 for entry in entries {
113 let sub_dir = match entry {
114 Ok(sub) => sub,
115 _ => continue,
116 };
117
118 if !sub_dir.path().is_dir() {
119 continue;
120 }
121
122 let name = sub_dir
123 .file_name()
124 .into_string()
125 .map_err(|_| anyhow!("failed to get dir name"))?;
126 // Child pci device has name format 0000:xx:xx.x, length is 12
127 if name.len() != 12 || !name.starts_with("0000:") {
128 continue;
129 }
130 let child_path = dir.join(name);
131 devices.push(child_path);
132 }
133 devices.reverse();
134 let mut iter = devices.iter().peekable();
135 while let Some(device) = iter.next() {
136 let class_path = device.join("class");
137 let class_id = read(class_path.as_path())
138 .with_context(|| format!("failed to read {}", class_path.display()))?;
139 let hp_interrupt = iter.peek().is_none();
140 if !class_id.starts_with("0x0604".as_bytes()) {
141 // If the device isn't pci bridge, this is a pcie endpoint device
142 children.push(HotPlugDeviceInfo {
143 device_type: HotPlugDeviceType::EndPoint,
144 path: device.to_path_buf(),
145 hp_interrupt,
146 });
147 // No need to look further
148 return Ok(());
149 } else {
150 // Find the pci express cap to get the port type of the pcie bridge
151 let host_config = PciHostConfig::new(device)?;
152 let mut cap_pointer: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
153 while cap_pointer != 0x0 {
154 let cap_id: u8 = host_config.read_config(cap_pointer as u64);
155 if cap_id == PciCapabilityID::PciExpress as u8 {
156 break;
157 }
158 cap_pointer = host_config.read_config(cap_pointer as u64 + 0x1);
159 }
160 if cap_pointer == 0x0 {
161 bail!(
162 "Failed to get pcie express capability for {}",
163 device.display()
164 );
165 }
166 let express_cap_reg: u16 = host_config.read_config(cap_pointer as u64 + 0x2);
167 match (express_cap_reg & 0xf0) >> 4 {
168 x if x == PcieDevicePortType::UpstreamPort as u16 => {
169 children.push(HotPlugDeviceInfo {
170 device_type: HotPlugDeviceType::UpstreamPort,
171 path: device.to_path_buf(),
172 hp_interrupt,
173 })
174 }
175 x if x == PcieDevicePortType::DownstreamPort as u16 => {
176 children.push(HotPlugDeviceInfo {
177 device_type: HotPlugDeviceType::DownstreamPort,
178 path: device.to_path_buf(),
179 hp_interrupt,
180 })
181 }
182 _ => (),
183 }
184 }
185 }
186 for device in devices.iter() {
187 visit_children(device.as_path(), children)?;
188 }
189 Ok(())
190 }
191
// Worker that handles one hotplug event of a host pcie root port.
struct HotplugWorker {
    // Name of the host pcie root port's directory under /sys/bus/pci/devices/.
    host_name: String,
}
195
196 impl HotplugWorker {
run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()>197 fn run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()> {
198 let mut host_sysfs = PathBuf::new();
199 host_sysfs.push("/sys/bus/pci/devices/");
200 host_sysfs.push(self.host_name.clone());
201 let rescan_path = host_sysfs.join("rescan");
202 // Let pcie root port rescan to find the added or removed children devices
203 write(rescan_path.as_path(), "1")
204 .with_context(|| format!("failed to write {}", rescan_path.display()))?;
205
206 // If child device existed, but code run here again, this means host has a
207 // hotplug out event, after the above rescan, host should find the removed
208 // child device, and host vfio-pci kernel driver should notify crosvm vfio-pci
209 // devie such hotplug out event, so nothing is needed to do here, just return
210 // it now.
211 let mut child_exist = child_exist.lock();
212 if *child_exist {
213 return Ok(());
214 }
215
216 // Probe the new added pcie endpoint devices
217 let mut children: Vec<HotPlugDeviceInfo> = Vec::new();
218 visit_children(host_sysfs.as_path(), &mut children)?;
219
220 // Without reverse children, physical larger BDF device is at the top, it will be
221 // added into guest first with smaller virtual function number, so physical smaller
222 // BDF device has larger virtual function number, phyiscal larger BDF device has
223 // smaller virtual function number. During hotplug out process, host pcie root port
224 // driver remove physical smaller BDF pcie endpoint device first, so host vfio-pci
225 // driver send plug out event first for smaller BDF device and wait for this device
226 // removed from crosvm, when crosvm receives this plug out event, crosvm will remove
227 // all the children devices, crosvm remove smaller virtual function number device
228 // first, this isn't the target device which host vfio-pci driver is waiting for.
229 // Host vfio-pci driver holds a lock when it is waiting, when crosvm remove another
230 // device throgh vfio-pci which try to get the same lock, so deadlock happens in
231 // host kernel.
232 //
233 // In order to fix the deadlock, children is reversed, so physical smaller BDF
234 // device has smaller virtual function number, and it will have the same order
235 // between host kernel and crosvm during hotplug out process.
236 children.reverse();
237 while let Some(child) = children.pop() {
238 if let HotPlugDeviceType::EndPoint = child.device_type {
239 // In order to bind device to vfio-pci driver, get device VID and DID
240 let vendor_path = child.path.join("vendor");
241 let vendor_id = read(vendor_path.as_path())
242 .with_context(|| format!("failed to read {}", vendor_path.display()))?;
243 // Remove the first two elements 0x
244 let prefix: &str = "0x";
245 let vendor = match vendor_id.strip_prefix(prefix.as_bytes()) {
246 Some(v) => v.to_vec(),
247 None => vendor_id,
248 };
249 let device_path = child.path.join("device");
250 let device_id = read(device_path.as_path())
251 .with_context(|| format!("failed to read {}", device_path.display()))?;
252 // Remove the first two elements 0x
253 let device = match device_id.strip_prefix(prefix.as_bytes()) {
254 Some(d) => d.to_vec(),
255 None => device_id,
256 };
257 let new_id = [
258 String::from_utf8_lossy(&vendor),
259 String::from_utf8_lossy(&device),
260 ]
261 .join(" ");
262 if Path::new("/sys/bus/pci/drivers/vfio-pci-pm/new_id").exists() {
263 let _ = write("/sys/bus/pci/drivers/vfio-pci-pm/new_id", &new_id);
264 }
265 // This is normal - either the kernel doesn't support vfio-pci-pm driver,
266 // or the device failed to attach to vfio-pci-pm driver (most likely due to
267 // lack of power management capability).
268 if !child.path.join("driver/unbind").exists() {
269 write("/sys/bus/pci/drivers/vfio-pci/new_id", &new_id).with_context(|| {
270 format!("failed to write {} into vfio-pci/new_id", new_id)
271 })?;
272 }
273 }
274 // Request to hotplug the new added pcie device into guest
275 let request = VmRequest::HotPlugVfioCommand {
276 device: child.clone(),
277 add: true,
278 };
279 let vm_socket = vm_socket.lock();
280 vm_socket
281 .send(&request)
282 .with_context(|| format!("failed to send hotplug request for {:?}", child))?;
283 let response = vm_socket
284 .recv::<VmResponse>()
285 .with_context(|| format!("failed to receive hotplug response for {:?}", child))?;
286 match response {
287 VmResponse::Ok => {}
288 _ => bail!("unexpected hotplug response: {response}"),
289 };
290 if !*child_exist {
291 *child_exist = true;
292 }
293 }
294
295 Ok(())
296 }
297 }
298
// Byte offsets into the host pci configuration space.
// Device id, read as a 16-bit register.
const PCI_CONFIG_DEVICE_ID: u64 = 0x02;
// Base class code, read as a single byte.
const PCI_BASE_CLASS_CODE: u64 = 0x0B;
// Sub class code, read as a single byte.
const PCI_SUB_CLASS_CODE: u64 = 0x0A;
302
303 /// Pcie root port device has a corresponding host pcie root port.
304 pub struct PcieHostPort {
305 host_config: PciHostConfig,
306 host_name: String,
307 hotplug_in_process: Arc<Mutex<bool>>,
308 hotplug_child_exist: Arc<Mutex<bool>>,
309 vm_socket: Arc<Mutex<Tube>>,
310 }
311
312 impl PcieHostPort {
313 /// Create PcieHostPort, host_syfsfs_patch specify host pcie port
314 /// sysfs path.
new(host_sysfs_path: &Path, socket: Tube) -> Result<Self>315 pub fn new(host_sysfs_path: &Path, socket: Tube) -> Result<Self> {
316 let host_config = PciHostConfig::new(host_sysfs_path)?;
317 let host_name = host_sysfs_path
318 .file_name()
319 .unwrap()
320 .to_str()
321 .unwrap()
322 .to_owned();
323 let base_class: u8 = host_config.read_config(PCI_BASE_CLASS_CODE);
324 if base_class != PciClassCode::BridgeDevice.get_register_value() {
325 return Err(anyhow!("host {} isn't bridge", host_name));
326 }
327 let sub_class: u8 = host_config.read_config(PCI_SUB_CLASS_CODE);
328 if sub_class != PciBridgeSubclass::PciToPciBridge as u8 {
329 return Err(anyhow!("host {} isn't pci to pci bridge", host_name));
330 }
331
332 let mut pcie_cap_reg: u8 = 0;
333
334 let mut cap_next: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
335 let mut counter: u16 = 0;
336 while cap_next != 0 && counter < 256 {
337 let cap_id: u8 = host_config.read_config(cap_next.into());
338 if cap_id == PciCapabilityID::PciExpress as u8 {
339 pcie_cap_reg = cap_next;
340 break;
341 }
342 let offset = cap_next as u64 + PCI_CAP_NEXT_POINTER as u64;
343 cap_next = host_config.read_config(offset);
344 counter += 1;
345 }
346
347 if pcie_cap_reg == 0 {
348 return Err(anyhow!("host {} isn't pcie device", host_name));
349 }
350
351 Ok(PcieHostPort {
352 host_config,
353 host_name,
354 hotplug_in_process: Arc::new(Mutex::new(false)),
355 hotplug_child_exist: Arc::new(Mutex::new(false)),
356 vm_socket: Arc::new(Mutex::new(socket)),
357 })
358 }
359
get_bus_range(&self) -> PciBridgeBusRange360 pub fn get_bus_range(&self) -> PciBridgeBusRange {
361 let bus_num: u32 = self.host_config.read_config((BR_BUS_NUMBER_REG * 4) as u64);
362 let primary = (bus_num & 0xFF) as u8;
363 let secondary = ((bus_num >> 8) & 0xFF) as u8;
364 let subordinate = ((bus_num >> 16) & 0xFF) as u8;
365
366 PciBridgeBusRange {
367 primary,
368 secondary,
369 subordinate,
370 }
371 }
372
read_device_id(&self) -> u16373 pub fn read_device_id(&self) -> u16 {
374 self.host_config.read_config::<u16>(PCI_CONFIG_DEVICE_ID)
375 }
376
host_name(&self) -> String377 pub fn host_name(&self) -> String {
378 self.host_name.clone()
379 }
380
read_config(&self, reg_idx: usize, data: &mut u32)381 pub fn read_config(&self, reg_idx: usize, data: &mut u32) {
382 if reg_idx == HEADER_TYPE_REG {
383 *data = self.host_config.read_config((HEADER_TYPE_REG as u64) * 4)
384 }
385 }
386
write_config(&mut self, _reg_idx: usize, _offset: u64, _data: &[u8])387 pub fn write_config(&mut self, _reg_idx: usize, _offset: u64, _data: &[u8]) {}
388
get_bridge_window_size(&self) -> (u64, u64)389 pub fn get_bridge_window_size(&self) -> (u64, u64) {
390 let br_memory: u32 = self.host_config.read_config(BR_MEM_REG as u64 * 4);
391 let mem_base = (br_memory & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
392 let mem_limit = br_memory & BR_MEM_LIMIT_MASK;
393 let mem_size = if mem_limit > mem_base {
394 (mem_limit - mem_base) as u64 + BR_WINDOW_ALIGNMENT
395 } else {
396 BR_MEM_MINIMUM
397 };
398 let br_pref_mem_low: u32 = self.host_config.read_config(BR_PREF_MEM_LOW_REG as u64 * 4);
399 let pref_mem_base_low = (br_pref_mem_low & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
400 let pref_mem_limit_low = br_pref_mem_low & BR_MEM_LIMIT_MASK;
401 let mut pref_mem_base: u64 = pref_mem_base_low as u64;
402 let mut pref_mem_limit: u64 = pref_mem_limit_low as u64;
403 if br_pref_mem_low & BR_PREF_MEM_64BIT == BR_PREF_MEM_64BIT {
404 // 64bit prefetch memory
405 let pref_mem_base_high: u32 = self
406 .host_config
407 .read_config(BR_PREF_MEM_BASE_HIGH_REG as u64 * 4);
408 let pref_mem_limit_high: u32 = self
409 .host_config
410 .read_config(BR_PREF_MEM_LIMIT_HIGH_REG as u64 * 4);
411 pref_mem_base = ((pref_mem_base_high as u64) << 32) | (pref_mem_base_low as u64);
412 pref_mem_limit = ((pref_mem_limit_high as u64) << 32) | (pref_mem_limit_low as u64);
413 }
414 let pref_mem_size = if pref_mem_limit > pref_mem_base {
415 pref_mem_limit - pref_mem_base + BR_WINDOW_ALIGNMENT
416 } else {
417 BR_MEM_MINIMUM
418 };
419
420 (mem_size, pref_mem_size)
421 }
422
hotplug_probe(&mut self)423 pub fn hotplug_probe(&mut self) {
424 if *self.hotplug_in_process.lock() {
425 return;
426 }
427
428 let hotplug_process = self.hotplug_in_process.clone();
429 let child_exist = self.hotplug_child_exist.clone();
430 let socket = self.vm_socket.clone();
431 let name = self.host_name.clone();
432 let _ = thread::Builder::new()
433 .name("pcie_hotplug".to_string())
434 .spawn(move || {
435 let mut hotplug = hotplug_process.lock();
436 *hotplug = true;
437 let hotplug_worker = HotplugWorker { host_name: name };
438 let _ = hotplug_worker.run(socket, child_exist);
439 *hotplug = false;
440 });
441 }
442
hot_unplug(&mut self)443 pub fn hot_unplug(&mut self) {
444 *self.hotplug_child_exist.lock() = false;
445 }
446 }
447