1 // Copyright 2020 The ChromiumOS Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 use std::convert::TryFrom; 6 use std::convert::TryInto; 7 use std::fmt; 8 use std::fmt::Display; 9 use std::iter; 10 use std::sync::Arc; 11 12 cfg_if::cfg_if! { 13 if #[cfg(test)] { 14 use base::{FakeClock as Clock, FakeTimer as Timer}; 15 } else { 16 use base::{Clock, Timer}; 17 } 18 } 19 use anyhow::Context; 20 use base::error; 21 use base::info; 22 use base::warn; 23 use base::AsRawDescriptor; 24 use base::Descriptor; 25 use base::Error; 26 use base::Event; 27 use base::EventToken; 28 use base::Result; 29 use base::Tube; 30 use base::WaitContext; 31 use base::WorkerThread; 32 use hypervisor::DeliveryMode; 33 use hypervisor::IoapicState; 34 use hypervisor::IrqRoute; 35 use hypervisor::IrqSource; 36 use hypervisor::IrqSourceChip; 37 use hypervisor::LapicState; 38 use hypervisor::MPState; 39 use hypervisor::MsiAddressMessage; 40 use hypervisor::MsiDataMessage; 41 use hypervisor::PicSelect; 42 use hypervisor::PicState; 43 use hypervisor::PitState; 44 use hypervisor::Vcpu; 45 use hypervisor::VcpuX86_64; 46 use resources::SystemAllocator; 47 use sync::Condvar; 48 use sync::Mutex; 49 50 use crate::bus::BusDeviceSync; 51 use crate::irqchip::Apic; 52 use crate::irqchip::ApicBusMsg; 53 use crate::irqchip::DelayedIoApicIrqEvents; 54 use crate::irqchip::Interrupt; 55 use crate::irqchip::InterruptData; 56 use crate::irqchip::InterruptDestination; 57 use crate::irqchip::Ioapic; 58 use crate::irqchip::IrqEvent; 59 use crate::irqchip::IrqEventIndex; 60 use crate::irqchip::Pic; 61 use crate::irqchip::Routes; 62 use crate::irqchip::VcpuRunState; 63 use crate::irqchip::APIC_BASE_ADDRESS; 64 use crate::irqchip::APIC_MEM_LENGTH_BYTES; 65 use crate::irqchip::IOAPIC_BASE_ADDRESS; 66 use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES; 67 use crate::pci::CrosvmDeviceId; 68 use crate::Bus; 69 use crate::BusAccessInfo; 70 use crate::BusDevice; 
use crate::DeviceId;
use crate::IrqChip;
use crate::IrqChipCap;
use crate::IrqChipX86_64;
use crate::IrqEdgeEvent;
use crate::IrqEventSource;
use crate::IrqLevelEvent;
use crate::Pit;
use crate::PitError;
use crate::Suspendable;

/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;

/// An `IrqChip` with all interrupt devices emulated in userspace. `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
pub struct UserspaceIrqChip<V: VcpuX86_64> {
    pub vcpus: Arc<Mutex<Vec<Option<V>>>>,
    routes: Arc<Mutex<Routes>>,
    pit: Arc<Mutex<Pit>>,
    pic: Arc<Mutex<Pic>>,
    ioapic: Arc<Mutex<Ioapic>>,
    ioapic_pins: usize,
    // One emulated local APIC per vcpu, indexed by vcpu id.
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object, that contains the delayed events because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends a AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>,
    // Shared `Dropper` owning the timer worker threads; shared by all clones of this chip.
    dropper: Arc<Mutex<Dropper>>,
    // Set to true by finalize_devices(); wake() only spawns timer workers once activated.
    activated: bool,
}

/// Helper that implements `Drop` on behalf of `UserspaceIrqChip`. The many cloned copies of an irq
/// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is
/// dropped.
struct Dropper {
    /// Worker threads that deliver timer events to the APICs.
    workers: Vec<WorkerThread<TimerWorkerResult<()>>>,
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    /// Constructs a new `UserspaceIrqChip`.
    pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
        let clock = Arc::new(Mutex::new(Clock::new()));
        Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
    }

    /// Constructs a new `UserspaceIrqChip`, with a clock. Used for testing.
    pub fn new_with_clock(
        num_vcpus: usize,
        irq_tube: Tube,
        ioapic_pins: Option<usize>,
        clock: Arc<Mutex<Clock>>,
    ) -> Result<Self> {
        let pit_evt = IrqEdgeEvent::new()?;
        // For test only, this clock instance is FakeClock. It needs to be cloned for every Timer
        // instance, so make a clone for it now.
        #[cfg(test)]
        let test_clock = clock.clone();
        // Map Pit construction failures onto base::Error; SpawnThread carries no base error, so
        // report it as a generic EIO.
        let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
            PitError::CloneEvent(err) => err,
            PitError::CreateEvent(err) => err,
            PitError::CreateWaitContext(err) => err,
            PitError::TimerCreateError(err) => err,
            PitError::WaitError(err) => err,
            PitError::SpawnThread(_) => Error::new(libc::EIO),
        })?;
        let pit_event_source = IrqEventSource::from_device(&pit);

        let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
        let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;

        let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
        let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
        for id in 0..num_vcpus {
            cfg_if::cfg_if! {
                if #[cfg(test)] {
                    let timer = Timer::new(test_clock.clone());
                } else {
                    let timer = Timer::new()?;
                }
            }
            // Timers are owned by the apics, which outlive the raw descriptors stored here and in
            // the worker threads.
            timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));

            let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
            let apic = Apic::new(id, Box::new(timer));
            apics.push(Arc::new(Mutex::new(apic)));
        }
        // Workers are spawned later by wake(), after finalize_devices() sets `activated`.
        let dropper = Dropper {
            workers: Vec::new(),
        };

        let mut chip = UserspaceIrqChip {
            vcpus: Arc::new(Mutex::new(
                iter::repeat_with(|| None).take(num_vcpus).collect(),
            )),
            waiters: iter::repeat_with(Default::default)
                .take(num_vcpus)
                .collect(),
            routes: Arc::new(Mutex::new(Routes::new())),
            pit: Arc::new(Mutex::new(pit)),
            pic: Arc::new(Mutex::new(Pic::new())),
            ioapic: Arc::new(Mutex::new(ioapic)),
            ioapic_pins,
            apics,
            timer_descriptors,
            delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)),
            irq_events: Arc::new(Mutex::new(Vec::new())),
            dropper: Arc::new(Mutex::new(dropper)),
            activated: false,
        };

        // Setup standard x86 irq routes
        chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;

        chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
        Ok(chip)
    }

    /// Handles a message from an APIC.
    fn handle_msg(&self, msg: ApicBusMsg) {
        match msg {
            ApicBusMsg::Eoi(vector) => {
                let _ = self.broadcast_eoi(vector);
            }
            ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
        }
    }

    /// Sends a Message Signaled Interrupt to one or more APICs. MSIs are a 64-bit address and
    /// 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the address
    /// are used.
    fn send_msi(&self, addr: u32, data: u32) {
        let mut msi_addr = MsiAddressMessage::new();
        msi_addr.set(0, 32, addr as u64);
        let dest = match InterruptDestination::try_from(&msi_addr) {
            Ok(dest) => dest,
            Err(e) => {
                // Malformed guest MSI addresses are dropped with a warning, not fatal.
                warn!("Invalid MSI message: {}", e);
                return;
            }
        };

        let mut msi_data = MsiDataMessage::new();
        msi_data.set(0, 32, data as u64);
        let data = InterruptData::from(&msi_data);

        self.send_irq_to_apics(&Interrupt { dest, data });
    }

    /// Delivers `irq` to the APIC with the given `id`, then wakes that vcpu so it can notice the
    /// pending interrupt.
    pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
        // id can come from the guest, so check bounds.
        if let Some(apic) = self.apics.get(id) {
            apic.lock().accept_irq(irq);
        } else {
            error!("Interrupt for non-existent apic {}: {:?}", id, irq);
        }
        if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
            vcpu.set_interrupt_window_requested(true);
        } else {
            error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
        }
        self.waiters[id].notify();
    }

    /// Sends an interrupt to one or more APICs. Used for sending MSIs and IPIs.
    pub fn send_irq_to_apics(&self, irq: &Interrupt) {
        match irq.data.delivery {
            DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
            _ => info!("UserspaceIrqChip received special irq: {:?}", irq),
        }

        // First try the fast path, where the destination is a single APIC we can send to directly.
        if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
            self.send_irq_to_apic(apic_id as usize, &irq.data);
            return;
        }

        let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
        let mut lowest_priority = u8::MAX;
        let mut lowest_apic: Option<usize> = None;

        for (i, apic) in self.apics.iter().enumerate() {
            // Decide whether to send while holding the apic lock, but deliver after dropping it,
            // since send_irq_to_apic re-locks the apic.
            let send = {
                let apic = apic.lock();
                if !apic.match_dest(&irq.dest) {
                    false
                } else if lowest_mode {
                    // In lowest-priority mode, only track the best candidate here; the single
                    // winner is delivered after the scan.
                    let priority = apic.get_processor_priority();
                    if priority <= lowest_priority {
                        lowest_priority = priority;
                        lowest_apic = Some(i);
                    }
                    false
                } else {
                    true
                }
            };
            if send {
                self.send_irq_to_apic(i, &irq.data);
            }
        }

        if lowest_mode {
            if let Some(index) = lowest_apic {
                self.send_irq_to_apic(index, &irq.data);
            } else {
                // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
                // this happen. If the OS is misconfigured then drop the interrupt and log a
                // warning.
                warn!(
                    "Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
                    irq
                );
            }
        }
    }

    /// Delivers a startup IPI to `vcpu`.
    fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> {
        // This comes from Intel SDM volume 3, chapter 8.4. The vector specifies a page aligned
        // address where execution should start. cs.base is the offset for the code segment with an
        // RIP of 0. The cs.selector is just the base shifted right by 4 bits.
        let mut sregs = vcpu.get_sregs()?;
        sregs.cs.base = (vector as u64) << 12;
        sregs.cs.selector = (vector as u16) << 8;

        // Set CR0 to its INIT value per the manual. Application processors won't boot with the CR0
        // protected mode and paging bits set by setup_sregs(). Kernel APIC doesn't have this
        // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
        // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
        sregs.cr0 = X86_CR0_INIT;
        vcpu.set_sregs(&sregs)?;

        let mut regs = vcpu.get_regs()?;
        regs.rip = 0;
        vcpu.set_regs(&regs)?;

        Ok(())
    }

    /// Checks if the specified VCPU is in a runnable state.
    fn is_runnable(&self, vcpu_id: usize) -> bool {
        self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
    }
}

impl Dropper {
    /// Stops and joins all timer worker threads, propagating any worker error.
    fn sleep(&mut self) -> anyhow::Result<()> {
        for thread in self.workers.split_off(0).into_iter() {
            thread
                .stop()
                .context("UserspaceIrqChip worker thread exited with error")?;
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    /// Clones `irq_event` (and optional `resample_event`) into the shared `irq_events` table and
    /// returns the new entry's index.
    fn register_irq_event(
        &mut self,
        irq: u32,
        irq_event: &Event,
        resample_event: Option<&Event>,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        let mut evt = IrqEvent {
            gsi: irq,
            event: irq_event.try_clone()?,
            resample_event: None,
            source,
        };
        if let Some(resample_event) = resample_event {
            evt.resample_event = Some(resample_event.try_clone()?);
        }

        // Note: slots cleared by unregister_irq_event are not reused; new registrations always
        // append, so indices handed out earlier stay stable.
        let mut irq_events = self.irq_events.lock();
        let index = irq_events.len();
        irq_events.push(Some(evt));
        Ok(Some(index))
    }

    /// Clears the first table entry matching (`irq`, `irq_event`); silently a no-op if absent.
    fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> {
        let mut irq_events = self.irq_events.lock();
        for (index, evt) in irq_events.iter().enumerate() {
            if let Some(evt) = evt {
                if evt.gsi == irq && irq_event.eq(&evt.event) {
                    irq_events[index] = None;
                    break;
                }
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> {
    fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
        self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?);
        Ok(())
    }

    fn register_edge_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqEdgeEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(irq, irq_event.get_trigger(), None, source)
    }

    fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn register_level_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqLevelEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(
            irq,
            irq_event.get_trigger(),
            Some(irq_event.get_resample()),
            source,
        )
    }

    fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn route_irq(&mut self, route: IrqRoute) -> Result<()> {
        self.routes.lock().add(route)
    }
set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()>433 fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> { 434 self.routes.lock().replace_all(routes) 435 } 436 irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>>437 fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> { 438 let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new(); 439 for (index, evt) in self.irq_events.lock().iter().enumerate() { 440 if let Some(evt) = evt { 441 tokens.push((index, evt.source.clone(), evt.event.try_clone()?)); 442 } 443 } 444 Ok(tokens) 445 } 446 service_irq(&mut self, irq: u32, level: bool) -> Result<()>447 fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> { 448 for route in self.routes.lock()[irq as usize].iter() { 449 match *route { 450 IrqSource::Irqchip { 451 chip: IrqSourceChip::PicPrimary, 452 pin, 453 } 454 | IrqSource::Irqchip { 455 chip: IrqSourceChip::PicSecondary, 456 pin, 457 } => { 458 self.pic.lock().service_irq(pin as u8, level); 459 } 460 IrqSource::Irqchip { 461 chip: IrqSourceChip::Ioapic, 462 pin, 463 } => { 464 self.ioapic.lock().service_irq(pin as usize, level); 465 } 466 // service_irq's level parameter is ignored for MSIs. MSI data specifies the level. 467 IrqSource::Msi { address, data } => self.send_msi(address as u32, data), 468 _ => { 469 error!("Unexpected route source {:?}", route); 470 return Err(Error::new(libc::EINVAL)); 471 } 472 } 473 } 474 Ok(()) 475 } 476 477 /// Services an IRQ event by asserting then deasserting an IRQ line. The associated Event 478 /// that triggered the irq event will be read from. If the irq is associated with a resample 479 /// Event, then the deassert will only happen after an EOI is broadcast for a vector 480 /// associated with the irq line. 481 /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC, 482 /// or APIC (MSI). 
If it's a PIC or IOAPIC route, we attempt to call service_irq on those 483 /// chips. If the IOAPIC is unable to be immediately locked, we add the irq to the 484 /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq 485 /// event). If it's an MSI route, we call send_msi to decode the MSI and send it to the 486 /// destination APIC(s). service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()>487 fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> { 488 let irq_events = self.irq_events.lock(); 489 let evt = if let Some(evt) = &irq_events[event_index] { 490 evt 491 } else { 492 return Ok(()); 493 }; 494 evt.event.wait()?; 495 496 for route in self.routes.lock()[evt.gsi as usize].iter() { 497 match *route { 498 IrqSource::Irqchip { 499 chip: IrqSourceChip::PicPrimary, 500 pin, 501 } 502 | IrqSource::Irqchip { 503 chip: IrqSourceChip::PicSecondary, 504 pin, 505 } => { 506 let mut pic = self.pic.lock(); 507 if evt.resample_event.is_some() { 508 pic.service_irq(pin as u8, true); 509 } else { 510 pic.service_irq(pin as u8, true); 511 pic.service_irq(pin as u8, false); 512 } 513 } 514 IrqSource::Irqchip { 515 chip: IrqSourceChip::Ioapic, 516 pin, 517 } => { 518 if let Ok(mut ioapic) = self.ioapic.try_lock() { 519 if evt.resample_event.is_some() { 520 ioapic.service_irq(pin as usize, true); 521 } else { 522 ioapic.service_irq(pin as usize, true); 523 ioapic.service_irq(pin as usize, false); 524 } 525 } else { 526 let mut delayed_events = self.delayed_ioapic_irq_events.lock(); 527 delayed_events.events.push(event_index); 528 delayed_events.trigger.signal().unwrap(); 529 } 530 } 531 IrqSource::Msi { address, data } => self.send_msi(address as u32, data), 532 _ => { 533 error!("Unexpected route source {:?}", route); 534 return Err(Error::new(libc::EINVAL)); 535 } 536 } 537 } 538 539 Ok(()) 540 } 541 542 /// Broadcasts an end of interrupt. For UserspaceIrqChip this sends the EOI to the ioapic. 
broadcast_eoi(&self, vector: u8) -> Result<()>543 fn broadcast_eoi(&self, vector: u8) -> Result<()> { 544 self.ioapic.lock().end_of_interrupt(vector); 545 Ok(()) 546 } 547 548 /// Injects any pending interrupts for `vcpu`. 549 /// 550 /// For UserspaceIrqChip this: 551 /// * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt 552 /// * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject 553 /// * Injects APIC NMIs 554 /// * Handles APIC INIT IPIs 555 /// * Handles APIC SIPIs 556 /// * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()>557 fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> { 558 let vcpu: &V = vcpu 559 .downcast_ref() 560 .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type"); 561 let vcpu_id = vcpu.id(); 562 let mut vcpu_ready = vcpu.ready_for_interrupt(); 563 564 let mut pic_needs_window = false; 565 if vcpu_id == 0 { 566 let mut pic = self.pic.lock(); 567 if vcpu_ready { 568 if let Some(vector) = pic.get_external_interrupt() { 569 vcpu.interrupt(vector)?; 570 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 571 // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected. 572 vcpu_ready = false; 573 } 574 } 575 pic_needs_window = pic.interrupt_requested(); 576 } 577 578 let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready); 579 if let Some(vector) = irqs.fixed { 580 let do_interrupt = { 581 let mut apic = self.apics[vcpu_id].lock(); 582 match apic.get_mp_state() { 583 MPState::Runnable | MPState::Halted => { 584 // APIC interrupts should only be injectable when the MPState is 585 // Halted or Runnable. 586 apic.set_mp_state(&MPState::Runnable); 587 true 588 } 589 s => { 590 // This shouldn't happen, but log a helpful error if it does. 
591 error!("Interrupt cannot be injected while in state: {:?}", s); 592 false 593 } 594 } 595 }; 596 597 if do_interrupt { 598 vcpu.interrupt(vector)?; 599 } 600 } 601 for _ in 0..irqs.nmis { 602 let prev_state = self.apics[vcpu_id].lock().get_mp_state(); 603 vcpu.inject_nmi()?; 604 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 605 info!( 606 "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}", 607 vcpu_id, 608 prev_state, 609 MPState::Runnable 610 ); 611 } 612 if irqs.init { 613 { 614 let mut apic = self.apics[vcpu_id].lock(); 615 apic.load_reset_state(); 616 apic.set_mp_state(&MPState::InitReceived); 617 } 618 info!("Delivered INIT IPI to cpu {}", vcpu_id); 619 } 620 if let Some(vector) = irqs.startup { 621 // If our state is not MPState::InitReceived then this is probably 622 // the second SIPI in the INIT-SIPI-SIPI sequence; ignore. 623 if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived { 624 self.deliver_startup(vcpu, vector)?; 625 self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable); 626 info!("Delivered SIPI to cpu {}", vcpu_id); 627 } 628 } 629 630 let needs_window = pic_needs_window || irqs.needs_window; 631 vcpu.set_interrupt_window_requested(needs_window); 632 633 Ok(()) 634 } 635 636 /// Notifies the irq chip that the specified VCPU has executed a halt instruction. 637 /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`. halted(&self, vcpu_id: usize)638 fn halted(&self, vcpu_id: usize) { 639 self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted) 640 } 641 642 /// Blocks until `vcpu` is in a runnable state or until interrupted by 643 /// `IrqChip::kick_halted_vcpus`. Returns `VcpuRunState::Runnable if vcpu is runnable, or 644 /// `VcpuRunState::Interrupted` if the wait was interrupted. 
645 /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new 646 /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not 647 /// runnable. wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState>648 fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> { 649 let vcpu_id = vcpu.id(); 650 let waiter = &self.waiters[vcpu_id]; 651 let mut interrupted_lock = waiter.mtx.lock(); 652 loop { 653 if *interrupted_lock { 654 *interrupted_lock = false; 655 info!("wait_until_runnable interrupted on cpu {}", vcpu_id); 656 return Ok(VcpuRunState::Interrupted); 657 } 658 if self.is_runnable(vcpu_id) { 659 return Ok(VcpuRunState::Runnable); 660 } 661 662 self.inject_interrupts(vcpu)?; 663 if self.is_runnable(vcpu_id) { 664 return Ok(VcpuRunState::Runnable); 665 } 666 interrupted_lock = waiter.cvar.wait(interrupted_lock); 667 } 668 } 669 670 /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`. 671 /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to 672 /// `wait_until_runnable` will immediately return false. After that one kick, subsequent 673 /// `wait_until_runnable` calls go back to waiting for runnability normally. 
    fn kick_halted_vcpus(&self) {
        for waiter in self.waiters.iter() {
            waiter.set_and_notify(/* interrupted= */ true);
        }
    }

    fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
        Ok(self.apics[vcpu_id].lock().get_mp_state())
    }

    fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> {
        self.apics[vcpu_id].lock().set_mp_state(state);
        Ok(())
    }

    fn try_clone(&self) -> Result<Self> {
        // kill_evts and timer_descriptors don't change, so they could be a plain Vec with each
        // element cloned. But the Arc<Mutex> avoids a quadratic number of open descriptors from
        // cloning, and those fields aren't performance critical.
        Ok(UserspaceIrqChip {
            vcpus: self.vcpus.clone(),
            waiters: self.waiters.clone(),
            routes: self.routes.clone(),
            pit: self.pit.clone(),
            pic: self.pic.clone(),
            ioapic: self.ioapic.clone(),
            ioapic_pins: self.ioapic_pins,
            apics: self.apics.clone(),
            timer_descriptors: self.timer_descriptors.clone(),
            delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(),
            irq_events: self.irq_events.clone(),
            dropper: self.dropper.clone(),
            activated: self.activated,
        })
    }

    // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
    fn finalize_devices(
        &mut self,
        resources: &mut SystemAllocator,
        io_bus: &Bus,
        mmio_bus: &Bus,
    ) -> Result<()> {
        // Insert pit into io_bus
        io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
        io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();

        // Insert pic into io_bus
        io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();

        // Insert ioapic into mmio_bus
        mmio_bus
            .insert(
                self.ioapic.clone(),
                IOAPIC_BASE_ADDRESS,
                IOAPIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // Insert self into mmio_bus for handling APIC mmio
        mmio_bus
            .insert_sync(
                Arc::new(self.try_clone()?),
                APIC_BASE_ADDRESS,
                APIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // At this point, all of our devices have been created and they have registered their
        // irq events, so we can clone our resample events
        let mut ioapic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();
        let mut pic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();

        for evt in self.irq_events.lock().iter().flatten() {
            // GSIs above the ioapic pin count have no PIC/IOAPIC line to resample.
            if (evt.gsi as usize) >= self.ioapic_pins {
                continue;
            }
            if let Some(resample_evt) = &evt.resample_event {
                ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
                pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
            }
        }

        // Register resample events with the ioapic
        self.ioapic
            .lock()
            .register_resample_events(ioapic_resample_events);
        // Register resample events with the pic
        self.pic
            .lock()
            .register_resample_events(pic_resample_events);

        // Make sure all future irq numbers are >= self.ioapic_pins
        let mut irq_num = resources.allocate_irq().unwrap();
        while irq_num < self.ioapic_pins as u32 {
            irq_num = resources.allocate_irq().unwrap();
        }

        // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
        self.activated = true;
        let _ = self.wake();

        Ok(())
    }

    /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
    /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
    /// tube communication back to the main thread. Thus, we do not want the main thread to
    /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
    /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
    /// processes each delayed event in the vec each time it's called. If the ioapic is still
    /// locked, we keep the queued irqs for the next time this function is called.
    fn process_delayed_irq_events(&mut self) -> Result<()> {
        // Lock ordering: irq_events before delayed_ioapic_irq_events (see field docs).
        let irq_events = self.irq_events.lock();
        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
        // Keep (retain) only the events that still cannot be serviced.
        delayed_events.events.retain(|&event_index| {
            if let Some(evt) = &irq_events[event_index] {
                if let Ok(mut ioapic) = self.ioapic.try_lock() {
                    if evt.resample_event.is_some() {
                        ioapic.service_irq(evt.gsi as usize, true);
                    } else {
                        ioapic.service_irq(evt.gsi as usize, true);
                        ioapic.service_irq(evt.gsi as usize, false);
                    }

                    false
                } else {
                    // ioapic still locked; retry on the next call.
                    true
                }
            } else {
                // Event was unregistered while queued; drop it.
                true
            }
        });

        // Reset the trigger only once the queue has fully drained.
        if delayed_events.events.is_empty() {
            delayed_events.trigger.wait()?;
        }
        Ok(())
    }

    fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
        Ok(Some(
            self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
        ))
    }

    fn check_capability(&self, c: IrqChipCap) -> bool {
        match c {
            IrqChipCap::TscDeadlineTimer => false,
            IrqChipCap::X2Apic => false,
            IrqChipCap::MpStateGetSet => true,
        }
    }
}

impl<V: VcpuX86_64 + 'static> BusDevice for UserspaceIrqChip<V> {
    fn debug_label(&self) -> String {
        "UserspaceIrqChip APIC".to_string()
    }

    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::UserspaceIrqChip.into()
    }
}

impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> {
    /// Stops and joins the timer worker threads (via the shared `Dropper`).
    fn sleep(&mut self) -> anyhow::Result<()> {
        let mut dropper = self.dropper.lock();
        dropper.sleep()
    }

    /// (Re)spawns one timer worker per vcpu. No-op until `finalize_devices` sets `activated`.
    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // create workers and run them.
            let mut dropper = self.dropper.lock();
            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
                let mut worker = TimerWorker {
                    id: i,
                    apic: self.apics[i].clone(),
                    descriptor: *descriptor,
                    vcpus: self.vcpus.clone(),
                    waiter: self.waiters[i].clone(),
                };
                let worker_thread = WorkerThread::start(
                    format!("UserspaceIrqChip timer worker {}", i),
                    move |evt| worker.run(evt),
                );
                dropper.workers.push(worker_thread);
            }
        }
        Ok(())
    }
}

// APIC MMIO handling: `info.id` selects the per-vcpu APIC.
impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> {
    fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
        self.apics[info.id].lock().read(info.offset, data)
    }

    fn write(&self, info: BusAccessInfo, data: &[u8]) {
        // A write may produce a bus message (EOI or IPI) that must be routed by the chip.
        let msg = self.apics[info.id].lock().write(info.offset, data);
        if let Some(m) = msg {
            self.handle_msg(m);
        }
    }
}

impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> {
    fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> {
        Ok(Box::new(self.try_clone()?))
    }

    fn as_irq_chip(&self) -> &dyn IrqChip {
        self
    }
as_irq_chip_mut(&mut self) -> &mut dyn IrqChip892 fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip { 893 self 894 } 895 get_pic_state(&self, select: PicSelect) -> Result<PicState>896 fn get_pic_state(&self, select: PicSelect) -> Result<PicState> { 897 Ok(self.pic.lock().get_pic_state(select)) 898 } 899 set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()>900 fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> { 901 self.pic.lock().set_pic_state(select, state); 902 Ok(()) 903 } 904 get_ioapic_state(&self) -> Result<IoapicState>905 fn get_ioapic_state(&self) -> Result<IoapicState> { 906 Ok(self.ioapic.lock().get_ioapic_state()) 907 } 908 set_ioapic_state(&mut self, state: &IoapicState) -> Result<()>909 fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> { 910 self.ioapic.lock().set_ioapic_state(state); 911 Ok(()) 912 } 913 get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState>914 fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> { 915 Ok(self.apics[vcpu_id].lock().get_state()) 916 } 917 set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()>918 fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> { 919 self.apics[vcpu_id].lock().set_state(state); 920 Ok(()) 921 } 922 923 /// Get the lapic frequency in Hz lapic_frequency(&self) -> u32924 fn lapic_frequency(&self) -> u32 { 925 Apic::frequency() 926 } 927 get_pit(&self) -> Result<PitState>928 fn get_pit(&self) -> Result<PitState> { 929 Ok(self.pit.lock().get_pit_state()) 930 } 931 set_pit(&mut self, state: &PitState) -> Result<()>932 fn set_pit(&mut self, state: &PitState) -> Result<()> { 933 self.pit.lock().set_pit_state(state); 934 Ok(()) 935 } 936 937 /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused. 938 /// devices::Pit uses 0x61. 
pit_uses_speaker_port(&self) -> bool939 fn pit_uses_speaker_port(&self) -> bool { 940 true 941 } 942 snapshot_chip_specific(&self) -> anyhow::Result<serde_json::Value>943 fn snapshot_chip_specific(&self) -> anyhow::Result<serde_json::Value> { 944 Err(anyhow::anyhow!("Not supported yet in userspace")) 945 } restore_chip_specific(&mut self, _data: serde_json::Value) -> anyhow::Result<()>946 fn restore_chip_specific(&mut self, _data: serde_json::Value) -> anyhow::Result<()> { 947 Err(anyhow::anyhow!("Not supported yet in userspace")) 948 } 949 } 950 951 /// Condition variable used by `UserspaceIrqChip::wait_until_runnable`. 952 #[derive(Default)] 953 struct Waiter { 954 // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called. 955 mtx: Mutex<bool>, 956 cvar: Condvar, 957 } 958 959 impl Waiter { 960 /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state. notify(&self)961 pub fn notify(&self) { 962 let _lock = self.mtx.lock(); 963 self.cvar.notify_all(); 964 } 965 966 /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted 967 /// flag and vcpu runnable state. If `interrupted` is true, then `wait_until_runnable` should 968 /// stop waiting for a runnable vcpu and return immediately. set_and_notify(&self, interrupted: bool)969 pub fn set_and_notify(&self, interrupted: bool) { 970 let mut interrupted_lock = self.mtx.lock(); 971 *interrupted_lock = interrupted; 972 self.cvar.notify_all(); 973 } 974 } 975 976 /// Worker thread for polling timer events and sending them to an APIC. 
977 struct TimerWorker<V: VcpuX86_64> { 978 id: usize, 979 apic: Arc<Mutex<Apic>>, 980 vcpus: Arc<Mutex<Vec<Option<V>>>>, 981 descriptor: Descriptor, 982 waiter: Arc<Waiter>, 983 } 984 985 impl<V: VcpuX86_64> TimerWorker<V> { run(&mut self, kill_evt: Event) -> TimerWorkerResult<()>986 fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> { 987 #[derive(EventToken)] 988 enum Token { 989 // The timer expired. 990 TimerExpire, 991 // The parent thread requested an exit. 992 Kill, 993 } 994 995 let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[ 996 (&self.descriptor, Token::TimerExpire), 997 (&kill_evt, Token::Kill), 998 ]) 999 .map_err(TimerWorkerError::CreateWaitContext)?; 1000 1001 loop { 1002 let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?; 1003 for event in events.iter().filter(|e| e.is_readable) { 1004 match event.token { 1005 Token::TimerExpire => { 1006 self.apic.lock().handle_timer_expiration(); 1007 if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) { 1008 vcpu.set_interrupt_window_requested(true); 1009 } 1010 self.waiter.notify(); 1011 } 1012 Token::Kill => return Ok(()), 1013 } 1014 } 1015 } 1016 } 1017 } 1018 1019 #[derive(Debug)] 1020 enum TimerWorkerError { 1021 /// Creating WaitContext failed. 1022 CreateWaitContext(Error), 1023 /// Error while waiting for events. 1024 WaitError(Error), 1025 } 1026 1027 impl Display for TimerWorkerError { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1028 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 1029 use self::TimerWorkerError::*; 1030 1031 match self { 1032 CreateWaitContext(e) => write!(f, "failed to create event context: {}", e), 1033 WaitError(e) => write!(f, "failed to wait for events: {}", e), 1034 } 1035 } 1036 } 1037 1038 impl std::error::Error for TimerWorkerError {} 1039 1040 type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>; 1041