// Copyright 2020 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::convert::TryFrom;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::iter;
use std::sync::Arc;

cfg_if::cfg_if! {
    if #[cfg(test)] {
        use base::{FakeClock as Clock, FakeTimer as Timer};
    } else {
        use base::{Clock, Timer};
    }
}
use anyhow::Context;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
use base::Descriptor;
use base::Error;
use base::Event;
use base::EventToken;
use base::Result;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::DeliveryMode;
use hypervisor::IoapicState;
use hypervisor::IrqRoute;
use hypervisor::IrqSource;
use hypervisor::IrqSourceChip;
use hypervisor::LapicState;
use hypervisor::MPState;
use hypervisor::MsiAddressMessage;
use hypervisor::MsiDataMessage;
use hypervisor::PicSelect;
use hypervisor::PicState;
use hypervisor::PitState;
use hypervisor::Vcpu;
use hypervisor::VcpuX86_64;
use resources::SystemAllocator;
use sync::Condvar;
use sync::Mutex;

use crate::bus::BusDeviceSync;
use crate::irqchip::Apic;
use crate::irqchip::ApicBusMsg;
use crate::irqchip::DelayedIoApicIrqEvents;
use crate::irqchip::Interrupt;
use crate::irqchip::InterruptData;
use crate::irqchip::InterruptDestination;
use crate::irqchip::Ioapic;
use crate::irqchip::IrqEvent;
use crate::irqchip::IrqEventIndex;
use crate::irqchip::Pic;
use crate::irqchip::Routes;
use crate::irqchip::VcpuRunState;
use crate::irqchip::APIC_BASE_ADDRESS;
use crate::irqchip::APIC_MEM_LENGTH_BYTES;
use crate::irqchip::IOAPIC_BASE_ADDRESS;
use crate::irqchip::IOAPIC_MEM_LENGTH_BYTES;
use crate::pci::CrosvmDeviceId;
use crate::Bus;
use crate::BusAccessInfo;
use crate::BusDevice;
use crate::DeviceId;
use crate::IrqChip;
use crate::IrqChipCap;
use crate::IrqChipX86_64;
use crate::IrqEdgeEvent;
use crate::IrqEventSource;
use crate::IrqLevelEvent;
use crate::Pit;
use crate::PitError;
use crate::Suspendable;

/// PIT channel 0 timer is connected to IRQ 0
const PIT_CHANNEL0_IRQ: u32 = 0;
/// CR0 extension type bit
const X86_CR0_ET: u64 = 0x00000010;
/// CR0 not write through bit
const X86_CR0_NW: u64 = 0x20000000;
/// CR0 cache disable bit
const X86_CR0_CD: u64 = 0x40000000;
/// Default power on state of CR0 register, according to the Intel manual.
const X86_CR0_INIT: u64 = X86_CR0_ET | X86_CR0_NW | X86_CR0_CD;
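// Worked value (illustrative): X86_CR0_INIT = 0x10 | 0x2000_0000 | 0x4000_0000 = 0x6000_0010,
// i.e. ET, NW, and CD set, with protected mode (PE, bit 0) and paging (PG, bit 31) clear.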

/// An `IrqChip` with all interrupt devices emulated in userspace.  `UserspaceIrqChip` works with
/// any hypervisor, but only supports x86.
pub struct UserspaceIrqChip<V: VcpuX86_64> {
    pub vcpus: Arc<Mutex<Vec<Option<V>>>>,
    routes: Arc<Mutex<Routes>>,
    pit: Arc<Mutex<Pit>>,
    pic: Arc<Mutex<Pic>>,
    ioapic: Arc<Mutex<Ioapic>>,
    ioapic_pins: usize,
    pub apics: Vec<Arc<Mutex<Apic>>>,
    // Condition variables used by wait_until_runnable.
    waiters: Vec<Arc<Waiter>>,
    // Raw descriptors of the apic Timers.
    timer_descriptors: Vec<Descriptor>,
    /// Delayed ioapic irq object that contains the events delayed because the ioapic was locked
    /// when service_irq was called on the irqchip. This prevents deadlocks when a Vcpu thread has
    /// locked the ioapic and the ioapic sends an AddMsiRoute signal to the main thread (which
    /// itself may be busy trying to call service_irq).
    ///
    /// ## Note:
    /// This lock may be locked by itself to access the `DelayedIoApicIrqEvents`. If accessed in
    /// conjunction with the `irq_events` field, that lock should be taken first to prevent
    /// deadlocks stemming from lock-ordering issues.
    delayed_ioapic_irq_events: Arc<Mutex<DelayedIoApicIrqEvents>>,
    // Array of Events that devices will use to assert ioapic pins.
    irq_events: Arc<Mutex<Vec<Option<IrqEvent>>>>,
    dropper: Arc<Mutex<Dropper>>,
    activated: bool,
}

/// Helper that implements `Drop` on behalf of `UserspaceIrqChip`.  The many cloned copies of an irq
/// chip share a single arc'ed `Dropper`, which only runs its drop when the last irq chip copy is
/// dropped.
struct Dropper {
    /// Worker threads that deliver timer events to the APICs.
    workers: Vec<WorkerThread<TimerWorkerResult<()>>>,
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    /// Constructs a new `UserspaceIrqChip`.
    pub fn new(num_vcpus: usize, irq_tube: Tube, ioapic_pins: Option<usize>) -> Result<Self> {
        let clock = Arc::new(Mutex::new(Clock::new()));
        Self::new_with_clock(num_vcpus, irq_tube, ioapic_pins, clock)
    }
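    // Illustrative construction sketch (not part of this file): callers pass the Tube on which
    // the ioapic will send AddMsiRoute requests to the main thread, e.g. one end of a Tube pair:
    //
    //     let (irq_tube, _main_tube) = Tube::pair()?;   // assumed helper
    //     let chip = UserspaceIrqChip::<MyVcpu>::new(8, irq_tube, /* ioapic_pins= */ None)?;
    //
    // `MyVcpu` stands in for the hypervisor's VcpuX86_64 type.  Passing `None` for `ioapic_pins`
    // falls back to `hypervisor::NUM_IOAPIC_PINS` (see `new_with_clock` below).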

    /// Constructs a new `UserspaceIrqChip`, with a clock.  Used for testing.
    pub fn new_with_clock(
        num_vcpus: usize,
        irq_tube: Tube,
        ioapic_pins: Option<usize>,
        clock: Arc<Mutex<Clock>>,
    ) -> Result<Self> {
        let pit_evt = IrqEdgeEvent::new()?;
        // For test only, this clock instance is FakeClock. It needs to be cloned for every Timer
        // instance, so make a clone for it now.
        #[cfg(test)]
        let test_clock = clock.clone();
        let pit = Pit::new(pit_evt.try_clone()?, clock).map_err(|e| match e {
            PitError::CloneEvent(err) => err,
            PitError::CreateEvent(err) => err,
            PitError::CreateWaitContext(err) => err,
            PitError::TimerCreateError(err) => err,
            PitError::WaitError(err) => err,
            PitError::SpawnThread(_) => Error::new(libc::EIO),
        })?;
        let pit_event_source = IrqEventSource::from_device(&pit);

        let ioapic_pins = ioapic_pins.unwrap_or(hypervisor::NUM_IOAPIC_PINS);
        let ioapic = Ioapic::new(irq_tube, ioapic_pins)?;

        let mut timer_descriptors: Vec<Descriptor> = Vec::with_capacity(num_vcpus);
        let mut apics: Vec<Arc<Mutex<Apic>>> = Vec::with_capacity(num_vcpus);
        for id in 0..num_vcpus {
            cfg_if::cfg_if! {
                if #[cfg(test)] {
                    let timer = Timer::new(test_clock.clone());
                } else {
                    let timer = Timer::new()?;
                }
            }
            // Timers are owned by the apics, which outlive the raw descriptors stored here and in
            // the worker threads.
            timer_descriptors.push(Descriptor(timer.as_raw_descriptor()));

            let id: u8 = id.try_into().or(Err(Error::new(libc::EINVAL)))?;
            let apic = Apic::new(id, Box::new(timer));
            apics.push(Arc::new(Mutex::new(apic)));
        }
        let dropper = Dropper {
            workers: Vec::new(),
        };

        let mut chip = UserspaceIrqChip {
            vcpus: Arc::new(Mutex::new(
                iter::repeat_with(|| None).take(num_vcpus).collect(),
            )),
            waiters: iter::repeat_with(Default::default)
                .take(num_vcpus)
                .collect(),
            routes: Arc::new(Mutex::new(Routes::new())),
            pit: Arc::new(Mutex::new(pit)),
            pic: Arc::new(Mutex::new(Pic::new())),
            ioapic: Arc::new(Mutex::new(ioapic)),
            ioapic_pins,
            apics,
            timer_descriptors,
            delayed_ioapic_irq_events: Arc::new(Mutex::new(DelayedIoApicIrqEvents::new()?)),
            irq_events: Arc::new(Mutex::new(Vec::new())),
            dropper: Arc::new(Mutex::new(dropper)),
            activated: false,
        };

        // Setup standard x86 irq routes
        chip.set_irq_routes(&Routes::default_pic_ioapic_routes(ioapic_pins))?;

        chip.register_edge_irq_event(PIT_CHANNEL0_IRQ, &pit_evt, pit_event_source)?;
        Ok(chip)
    }

    /// Handles a message from an APIC.
    fn handle_msg(&self, msg: ApicBusMsg) {
        match msg {
            ApicBusMsg::Eoi(vector) => {
                let _ = self.broadcast_eoi(vector);
            }
            ApicBusMsg::Ipi(interrupt) => self.send_irq_to_apics(&interrupt),
        }
    }

    /// Sends a Message Signaled Interrupt to one or more APICs.  MSIs are a 64-bit address and
    /// 32-bit data, but in the Intel spec we're implementing, only the low 32 bits of the address
    /// are used.
    fn send_msi(&self, addr: u32, data: u32) {
        let mut msi_addr = MsiAddressMessage::new();
        msi_addr.set(0, 32, addr as u64);
        let dest = match InterruptDestination::try_from(&msi_addr) {
            Ok(dest) => dest,
            Err(e) => {
                warn!("Invalid MSI message: {}", e);
                return;
            }
        };

        let mut msi_data = MsiDataMessage::new();
        msi_data.set(0, 32, data as u64);
        let data = InterruptData::from(&msi_data);

        self.send_irq_to_apics(&Interrupt { dest, data });
    }
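    // Worked decode (illustrative), following the Intel SDM MSI layout used above:
    //
    //   addr = 0xFEE0_3000 -> bits 31:20 = 0xFEE, bits 19:12 = 0x03 (physical destination
    //                         APIC ID 3, redirection hint and destination mode clear)
    //   data = 0x0000_0041 -> bits 7:0 = vector 0x41, bits 10:8 = 000 (fixed delivery),
    //                         bit 15 = 0 (edge trigger)
    //
    // so `send_msi(0xFEE0_3000, 0x41)` becomes a fixed, edge-triggered interrupt with vector
    // 0x41 delivered to the APIC with ID 3 via `send_irq_to_apics`.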

    pub fn send_irq_to_apic(&self, id: usize, irq: &InterruptData) {
        // id can come from the guest, so check bounds.
        if let Some(apic) = self.apics.get(id) {
            apic.lock().accept_irq(irq);
        } else {
            error!("Interrupt for non-existent apic {}: {:?}", id, irq);
        }
        if let Some(Some(vcpu)) = self.vcpus.lock().get(id) {
            vcpu.set_interrupt_window_requested(true);
        } else {
            error!("Interrupt for non-existent vcpu {}: {:?}", id, irq);
        }
        self.waiters[id].notify();
    }

    /// Sends an interrupt to one or more APICs.  Used for sending MSIs and IPIs.
    pub fn send_irq_to_apics(&self, irq: &Interrupt) {
        match irq.data.delivery {
            DeliveryMode::Fixed | DeliveryMode::Lowest | DeliveryMode::RemoteRead => {}
            _ => info!("UserspaceIrqChip received special irq: {:?}", irq),
        }

        // First try the fast path, where the destination is a single APIC we can send to directly.
        if let Some(apic_id) = Apic::single_dest_fast(&irq.dest) {
            self.send_irq_to_apic(apic_id as usize, &irq.data);
            return;
        }

        let lowest_mode = irq.data.delivery == DeliveryMode::Lowest;
        let mut lowest_priority = u8::MAX;
        let mut lowest_apic: Option<usize> = None;

        for (i, apic) in self.apics.iter().enumerate() {
            let send = {
                let apic = apic.lock();
                if !apic.match_dest(&irq.dest) {
                    false
                } else if lowest_mode {
                    let priority = apic.get_processor_priority();
                    if priority <= lowest_priority {
                        lowest_priority = priority;
                        lowest_apic = Some(i);
                    }
                    false
                } else {
                    true
                }
            };
            if send {
                self.send_irq_to_apic(i, &irq.data);
            }
        }

        if lowest_mode {
            if let Some(index) = lowest_apic {
                self.send_irq_to_apic(index, &irq.data);
            } else {
                // According to sections 10.6.2.1 and 10.6.2.2 of the SDM, the OS should not let
                // this happen.  If the OS is misconfigured then drop the interrupt and log a
                // warning.
                warn!(
                    "Lowest priority interrupt sent, but no apics configured as valid target: {:?}",
                    irq
                );
            }
        }
    }
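    // Arbitration example (illustrative): for a lowest-priority IPI matching APICs 1 and 3 with
    // processor priorities 0x20 and 0x10, only APIC 3 receives the interrupt.  Because the
    // comparison above is `<=`, a tie between matching APICs goes to the highest-numbered one.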

    /// Delivers a startup IPI to `vcpu`.
    fn deliver_startup(&self, vcpu: &V, vector: u8) -> Result<()> {
        // This comes from Intel SDM volume 3, chapter 8.4.  The vector specifies a page aligned
        // address where execution should start.  cs.base is the offset for the code segment with an
        // RIP of 0.  The cs.selector is just the base shifted right by 4 bits.
        let mut sregs = vcpu.get_sregs()?;
        sregs.cs.base = (vector as u64) << 12;
        sregs.cs.selector = (vector as u16) << 8;

        // Set CR0 to its INIT value per the manual.  Application processors won't boot with the CR0
        // protected mode and paging bits set by setup_sregs().  Kernel APIC doesn't have this
        // issue, probably because it uses MSRs instead of MMIO, so it's less affected when the AP's
        // state (CR3 etc.) doesn't reflect changes that Linux made while booting vcpu 0.
        sregs.cr0 = X86_CR0_INIT;
        vcpu.set_sregs(&sregs)?;

        let mut regs = vcpu.get_regs()?;
        regs.rip = 0;
        vcpu.set_regs(&regs)?;

        Ok(())
    }
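    // Worked example: a SIPI with vector 0x9F starts the AP at physical address 0x9F000
    // (vector << 12), with cs.base = 0x9F000, cs.selector = 0x9F00 (base >> 4), and rip = 0,
    // so cs.base + rip points at the 4 KiB-aligned trampoline page.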

    /// Checks if the specified VCPU is in a runnable state.
    fn is_runnable(&self, vcpu_id: usize) -> bool {
        self.apics[vcpu_id].lock().get_mp_state() == MPState::Runnable
    }
}

impl Dropper {
    fn sleep(&mut self) -> anyhow::Result<()> {
        for thread in self.workers.split_off(0).into_iter() {
            thread
                .stop()
                .context("UserspaceIrqChip worker thread exited with error")?;
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> UserspaceIrqChip<V> {
    fn register_irq_event(
        &mut self,
        irq: u32,
        irq_event: &Event,
        resample_event: Option<&Event>,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        let mut evt = IrqEvent {
            gsi: irq,
            event: irq_event.try_clone()?,
            resample_event: None,
            source,
        };
        if let Some(resample_event) = resample_event {
            evt.resample_event = Some(resample_event.try_clone()?);
        }

        let mut irq_events = self.irq_events.lock();
        let index = irq_events.len();
        irq_events.push(Some(evt));
        Ok(Some(index))
    }

    fn unregister_irq_event(&mut self, irq: u32, irq_event: &Event) -> Result<()> {
        let mut irq_events = self.irq_events.lock();
        for (index, evt) in irq_events.iter().enumerate() {
            if let Some(evt) = evt {
                if evt.gsi == irq && irq_event.eq(&evt.event) {
                    irq_events[index] = None;
                    break;
                }
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> IrqChip for UserspaceIrqChip<V> {
    fn add_vcpu(&mut self, vcpu_id: usize, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::add_vcpu called with incorrect vcpu type");
        self.vcpus.lock()[vcpu_id] = Some(vcpu.try_clone()?);
        Ok(())
    }

    fn register_edge_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqEdgeEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(irq, irq_event.get_trigger(), None, source)
    }

    fn unregister_edge_irq_event(&mut self, irq: u32, irq_event: &IrqEdgeEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn register_level_irq_event(
        &mut self,
        irq: u32,
        irq_event: &IrqLevelEvent,
        source: IrqEventSource,
    ) -> Result<Option<IrqEventIndex>> {
        self.register_irq_event(
            irq,
            irq_event.get_trigger(),
            Some(irq_event.get_resample()),
            source,
        )
    }

    fn unregister_level_irq_event(&mut self, irq: u32, irq_event: &IrqLevelEvent) -> Result<()> {
        self.unregister_irq_event(irq, irq_event.get_trigger())
    }

    fn route_irq(&mut self, route: IrqRoute) -> Result<()> {
        self.routes.lock().add(route)
    }

    fn set_irq_routes(&mut self, routes: &[IrqRoute]) -> Result<()> {
        self.routes.lock().replace_all(routes)
    }

    fn irq_event_tokens(&self) -> Result<Vec<(IrqEventIndex, IrqEventSource, Event)>> {
        let mut tokens: Vec<(IrqEventIndex, IrqEventSource, Event)> = Vec::new();
        for (index, evt) in self.irq_events.lock().iter().enumerate() {
            if let Some(evt) = evt {
                tokens.push((index, evt.source.clone(), evt.event.try_clone()?));
            }
        }
        Ok(tokens)
    }

    fn service_irq(&mut self, irq: u32, level: bool) -> Result<()> {
        for route in self.routes.lock()[irq as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    self.pic.lock().service_irq(pin as u8, level);
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    self.ioapic.lock().service_irq(pin as usize, level);
                }
                // service_irq's level parameter is ignored for MSIs.  MSI data specifies the level.
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }
        Ok(())
    }

    /// Services an IRQ event by asserting then deasserting an IRQ line.  The associated Event
    /// that triggered the irq event will be read from.  If the irq is associated with a resample
    /// Event, then the deassert will only happen after an EOI is broadcast for a vector
    /// associated with the irq line.
    /// For UserspaceIrqChip, this function identifies the destination(s) of the irq: PIC, IOAPIC,
    /// or APIC (MSI).  If it's a PIC or IOAPIC route, we attempt to call service_irq on those
    /// chips.  If the IOAPIC is unable to be immediately locked, we add the irq to the
    /// delayed_ioapic_irq_events (though we still read from the Event that triggered the irq
    /// event).  If it's an MSI route, we call send_msi to decode the MSI and send it to the
    /// destination APIC(s).
    fn service_irq_event(&mut self, event_index: IrqEventIndex) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let evt = if let Some(evt) = &irq_events[event_index] {
            evt
        } else {
            return Ok(());
        };
        evt.event.wait()?;

        for route in self.routes.lock()[evt.gsi as usize].iter() {
            match *route {
                IrqSource::Irqchip {
                    chip: IrqSourceChip::PicPrimary,
                    pin,
                }
                | IrqSource::Irqchip {
                    chip: IrqSourceChip::PicSecondary,
                    pin,
                } => {
                    let mut pic = self.pic.lock();
                    if evt.resample_event.is_some() {
                        pic.service_irq(pin as u8, true);
                    } else {
                        pic.service_irq(pin as u8, true);
                        pic.service_irq(pin as u8, false);
                    }
                }
                IrqSource::Irqchip {
                    chip: IrqSourceChip::Ioapic,
                    pin,
                } => {
                    if let Ok(mut ioapic) = self.ioapic.try_lock() {
                        if evt.resample_event.is_some() {
                            ioapic.service_irq(pin as usize, true);
                        } else {
                            ioapic.service_irq(pin as usize, true);
                            ioapic.service_irq(pin as usize, false);
                        }
                    } else {
                        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
                        delayed_events.events.push(event_index);
                        delayed_events.trigger.signal().unwrap();
                    }
                }
                IrqSource::Msi { address, data } => self.send_msi(address as u32, data),
                _ => {
                    error!("Unexpected route source {:?}", route);
                    return Err(Error::new(libc::EINVAL));
                }
            }
        }

        Ok(())
    }

    /// Broadcasts an end of interrupt.  For UserspaceIrqChip this sends the EOI to the ioapic.
    fn broadcast_eoi(&self, vector: u8) -> Result<()> {
        self.ioapic.lock().end_of_interrupt(vector);
        Ok(())
    }

    /// Injects any pending interrupts for `vcpu`.
    ///
    /// For UserspaceIrqChip this:
    ///   * Injects a PIC interrupt, if vcpu_id is 0 and vcpu is ready for interrupt
    ///   * Injects an APIC fixed interrupt, if vcpu is ready for interrupt and PIC didn't inject
    ///   * Injects APIC NMIs
    ///   * Handles APIC INIT IPIs
    ///   * Handles APIC SIPIs
    ///   * Requests an interrupt window, if PIC or APIC still has pending interrupts for this vcpu
    fn inject_interrupts(&self, vcpu: &dyn Vcpu) -> Result<()> {
        let vcpu: &V = vcpu
            .downcast_ref()
            .expect("UserspaceIrqChip::inject_interrupts called with incorrect vcpu type");
        let vcpu_id = vcpu.id();
        let mut vcpu_ready = vcpu.ready_for_interrupt();

        let mut pic_needs_window = false;
        if vcpu_id == 0 {
            let mut pic = self.pic.lock();
            if vcpu_ready {
                if let Some(vector) = pic.get_external_interrupt() {
                    vcpu.interrupt(vector)?;
                    self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                    // Already injected a PIC interrupt, so APIC fixed interrupt can't be injected.
                    vcpu_ready = false;
                }
            }
            pic_needs_window = pic.interrupt_requested();
        }

        let irqs = self.apics[vcpu_id].lock().get_pending_irqs(vcpu_ready);
        if let Some(vector) = irqs.fixed {
            let do_interrupt = {
                let mut apic = self.apics[vcpu_id].lock();
                match apic.get_mp_state() {
                    MPState::Runnable | MPState::Halted => {
                        // APIC interrupts should only be injectable when the MPState is
                        // Halted or Runnable.
                        apic.set_mp_state(&MPState::Runnable);
                        true
                    }
                    s => {
                        // This shouldn't happen, but log a helpful error if it does.
                        error!("Interrupt cannot be injected while in state: {:?}", s);
                        false
                    }
                }
            };

            if do_interrupt {
                vcpu.interrupt(vector)?;
            }
        }
        for _ in 0..irqs.nmis {
            let prev_state = self.apics[vcpu_id].lock().get_mp_state();
            vcpu.inject_nmi()?;
            self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
            info!(
                "Delivered NMI to cpu {}, mp_state was {:?}, now is {:?}",
                vcpu_id,
                prev_state,
                MPState::Runnable
            );
        }
        if irqs.init {
            {
                let mut apic = self.apics[vcpu_id].lock();
                apic.load_reset_state();
                apic.set_mp_state(&MPState::InitReceived);
            }
            info!("Delivered INIT IPI to cpu {}", vcpu_id);
        }
        if let Some(vector) = irqs.startup {
            // If our state is not MPState::InitReceived then this is probably
            // the second SIPI in the INIT-SIPI-SIPI sequence; ignore.
            if self.apics[vcpu_id].lock().get_mp_state() == MPState::InitReceived {
                self.deliver_startup(vcpu, vector)?;
                self.apics[vcpu_id].lock().set_mp_state(&MPState::Runnable);
                info!("Delivered SIPI to cpu {}", vcpu_id);
            }
        }

        let needs_window = pic_needs_window || irqs.needs_window;
        vcpu.set_interrupt_window_requested(needs_window);

        Ok(())
    }

    /// Notifies the irq chip that the specified VCPU has executed a halt instruction.
    /// For `UserspaceIrqChip`, it sets the APIC's mp_state to `MPState::Halted`.
    fn halted(&self, vcpu_id: usize) {
        self.apics[vcpu_id].lock().set_mp_state(&MPState::Halted)
    }

    /// Blocks until `vcpu` is in a runnable state or until interrupted by
    /// `IrqChip::kick_halted_vcpus`.  Returns `VcpuRunState::Runnable` if vcpu is runnable, or
    /// `VcpuRunState::Interrupted` if the wait was interrupted.
    /// For `UserspaceIrqChip`, if the APIC isn't `MPState::Runnable`, sleep until there are new
    /// interrupts pending on the APIC, inject the interrupts, and go back to sleep if still not
    /// runnable.
    fn wait_until_runnable(&self, vcpu: &dyn Vcpu) -> Result<VcpuRunState> {
        let vcpu_id = vcpu.id();
        let waiter = &self.waiters[vcpu_id];
        let mut interrupted_lock = waiter.mtx.lock();
        loop {
            if *interrupted_lock {
                *interrupted_lock = false;
                info!("wait_until_runnable interrupted on cpu {}", vcpu_id);
                return Ok(VcpuRunState::Interrupted);
            }
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }

            self.inject_interrupts(vcpu)?;
            if self.is_runnable(vcpu_id) {
                return Ok(VcpuRunState::Runnable);
            }
            interrupted_lock = waiter.cvar.wait(interrupted_lock);
        }
    }
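    // Typical halt/wake sequence (illustrative): the vcpu executes HLT, the run loop calls
    // `halted()` (MPState::Halted) and then parks here.  Later a timer expiration or IPI calls
    // `Waiter::notify()`, the loop above re-runs `inject_interrupts()`, the APIC transitions back
    // to MPState::Runnable, and `VcpuRunState::Runnable` is returned to the run loop.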

    /// Makes unrunnable VCPUs return immediately from `wait_until_runnable`.
    /// For UserspaceIrqChip, every vcpu gets kicked so its current or next call to
    /// `wait_until_runnable` will immediately return `VcpuRunState::Interrupted`.  After that one
    /// kick, subsequent `wait_until_runnable` calls go back to waiting for runnability normally.
    fn kick_halted_vcpus(&self) {
        for waiter in self.waiters.iter() {
            waiter.set_and_notify(/* interrupted= */ true);
        }
    }

    fn get_mp_state(&self, vcpu_id: usize) -> Result<MPState> {
        Ok(self.apics[vcpu_id].lock().get_mp_state())
    }

    fn set_mp_state(&mut self, vcpu_id: usize, state: &MPState) -> Result<()> {
        self.apics[vcpu_id].lock().set_mp_state(state);
        Ok(())
    }

    fn try_clone(&self) -> Result<Self> {
        // kill_evts and timer_descriptors don't change, so they could be a plain Vec with each
        // element cloned.  But the Arc<Mutex> avoids a quadratic number of open descriptors from
        // cloning, and those fields aren't performance critical.
        Ok(UserspaceIrqChip {
            vcpus: self.vcpus.clone(),
            waiters: self.waiters.clone(),
            routes: self.routes.clone(),
            pit: self.pit.clone(),
            pic: self.pic.clone(),
            ioapic: self.ioapic.clone(),
            ioapic_pins: self.ioapic_pins,
            apics: self.apics.clone(),
            timer_descriptors: self.timer_descriptors.clone(),
            delayed_ioapic_irq_events: self.delayed_ioapic_irq_events.clone(),
            irq_events: self.irq_events.clone(),
            dropper: self.dropper.clone(),
            activated: self.activated,
        })
    }

    // TODO(srichman): factor out UserspaceIrqChip and KvmSplitIrqChip::finalize_devices
    fn finalize_devices(
        &mut self,
        resources: &mut SystemAllocator,
        io_bus: &Bus,
        mmio_bus: &Bus,
    ) -> Result<()> {
        // Insert pit into io_bus
        io_bus.insert(self.pit.clone(), 0x040, 0x8).unwrap();
        io_bus.insert(self.pit.clone(), 0x061, 0x1).unwrap();

        // Insert pic into io_bus
        io_bus.insert(self.pic.clone(), 0x20, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0xa0, 0x2).unwrap();
        io_bus.insert(self.pic.clone(), 0x4d0, 0x2).unwrap();

        // Insert ioapic into mmio_bus
        mmio_bus
            .insert(
                self.ioapic.clone(),
                IOAPIC_BASE_ADDRESS,
                IOAPIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // Insert self into mmio_bus for handling APIC mmio
        mmio_bus
            .insert_sync(
                Arc::new(self.try_clone()?),
                APIC_BASE_ADDRESS,
                APIC_MEM_LENGTH_BYTES,
            )
            .unwrap();

        // At this point, all of our devices have been created and they have registered their
        // irq events, so we can clone our resample events
        let mut ioapic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();
        let mut pic_resample_events: Vec<Vec<Event>> =
            (0..self.ioapic_pins).map(|_| Vec::new()).collect();

        for evt in self.irq_events.lock().iter().flatten() {
            if (evt.gsi as usize) >= self.ioapic_pins {
                continue;
            }
            if let Some(resample_evt) = &evt.resample_event {
                ioapic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
                pic_resample_events[evt.gsi as usize].push(resample_evt.try_clone()?);
            }
        }

        // Register resample events with the ioapic
        self.ioapic
            .lock()
            .register_resample_events(ioapic_resample_events);
        // Register resample events with the pic
        self.pic
            .lock()
            .register_resample_events(pic_resample_events);

        // Make sure all future irq numbers are >= self.ioapic_pins
        let mut irq_num = resources.allocate_irq().unwrap();
        while irq_num < self.ioapic_pins as u32 {
            irq_num = resources.allocate_irq().unwrap();
        }

        // Spawn timer threads here instead of in new(), in case crosvm is in sandbox mode.
        self.activated = true;
        let _ = self.wake();

        Ok(())
    }

    /// The UserspaceIrqChip's ioapic may be locked because a vcpu thread is currently writing to
    /// the ioapic, and the ioapic may be blocking on adding MSI routes, which requires blocking
    /// tube communication back to the main thread.  Thus, we do not want the main thread to
    /// block on a locked ioapic, so any irqs that could not be serviced because the ioapic could
    /// not be immediately locked are added to the delayed_ioapic_irq_events Vec. This function
    /// processes each delayed event in the vec each time it's called. If the ioapic is still
    /// locked, we keep the queued irqs for the next time this function is called.
    fn process_delayed_irq_events(&mut self) -> Result<()> {
        let irq_events = self.irq_events.lock();
        let mut delayed_events = self.delayed_ioapic_irq_events.lock();
        delayed_events.events.retain(|&event_index| {
            if let Some(evt) = &irq_events[event_index] {
                if let Ok(mut ioapic) = self.ioapic.try_lock() {
                    if evt.resample_event.is_some() {
                        ioapic.service_irq(evt.gsi as usize, true);
                    } else {
                        ioapic.service_irq(evt.gsi as usize, true);
                        ioapic.service_irq(evt.gsi as usize, false);
                    }

                    false
                } else {
                    true
                }
            } else {
                true
            }
        });

        if delayed_events.events.is_empty() {
            delayed_events.trigger.wait()?;
        }
        Ok(())
    }
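    // Flow sketch: a vcpu thread holds the ioapic lock, so `service_irq_event()` queues the event
    // index and signals `trigger`; the main thread wakes via `irq_delayed_event_token()` and calls
    // this function, which retains only the entries whose `try_lock()` failed again.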

    fn irq_delayed_event_token(&self) -> Result<Option<Event>> {
        Ok(Some(
            self.delayed_ioapic_irq_events.lock().trigger.try_clone()?,
        ))
    }

    fn check_capability(&self, c: IrqChipCap) -> bool {
        match c {
            IrqChipCap::TscDeadlineTimer => false,
            IrqChipCap::X2Apic => false,
            IrqChipCap::MpStateGetSet => true,
        }
    }
}

impl<V: VcpuX86_64 + 'static> BusDevice for UserspaceIrqChip<V> {
    fn debug_label(&self) -> String {
        "UserspaceIrqChip APIC".to_string()
    }
    fn device_id(&self) -> DeviceId {
        CrosvmDeviceId::UserspaceIrqChip.into()
    }
}

impl<V: VcpuX86_64 + 'static> Suspendable for UserspaceIrqChip<V> {
    fn sleep(&mut self) -> anyhow::Result<()> {
        let mut dropper = self.dropper.lock();
        dropper.sleep()
    }

    fn wake(&mut self) -> anyhow::Result<()> {
        if self.activated {
            // create workers and run them.
            let mut dropper = self.dropper.lock();
            for (i, descriptor) in self.timer_descriptors.iter().enumerate() {
                let mut worker = TimerWorker {
                    id: i,
                    apic: self.apics[i].clone(),
                    descriptor: *descriptor,
                    vcpus: self.vcpus.clone(),
                    waiter: self.waiters[i].clone(),
                };
                let worker_thread = WorkerThread::start(
                    format!("UserspaceIrqChip timer worker {}", i),
                    move |evt| worker.run(evt),
                );
                dropper.workers.push(worker_thread);
            }
        }
        Ok(())
    }
}

impl<V: VcpuX86_64 + 'static> BusDeviceSync for UserspaceIrqChip<V> {
    fn read(&self, info: BusAccessInfo, data: &mut [u8]) {
        self.apics[info.id].lock().read(info.offset, data)
    }
    fn write(&self, info: BusAccessInfo, data: &[u8]) {
        let msg = self.apics[info.id].lock().write(info.offset, data);
        if let Some(m) = msg {
            self.handle_msg(m);
        }
    }
}

impl<V: VcpuX86_64 + 'static> IrqChipX86_64 for UserspaceIrqChip<V> {
    fn try_box_clone(&self) -> Result<Box<dyn IrqChipX86_64>> {
        Ok(Box::new(self.try_clone()?))
    }

    fn as_irq_chip(&self) -> &dyn IrqChip {
        self
    }

    fn as_irq_chip_mut(&mut self) -> &mut dyn IrqChip {
        self
    }

    fn get_pic_state(&self, select: PicSelect) -> Result<PicState> {
        Ok(self.pic.lock().get_pic_state(select))
    }

    fn set_pic_state(&mut self, select: PicSelect, state: &PicState) -> Result<()> {
        self.pic.lock().set_pic_state(select, state);
        Ok(())
    }

    fn get_ioapic_state(&self) -> Result<IoapicState> {
        Ok(self.ioapic.lock().get_ioapic_state())
    }

    fn set_ioapic_state(&mut self, state: &IoapicState) -> Result<()> {
        self.ioapic.lock().set_ioapic_state(state);
        Ok(())
    }

    fn get_lapic_state(&self, vcpu_id: usize) -> Result<LapicState> {
        Ok(self.apics[vcpu_id].lock().get_state())
    }

    fn set_lapic_state(&mut self, vcpu_id: usize, state: &LapicState) -> Result<()> {
        self.apics[vcpu_id].lock().set_state(state);
        Ok(())
    }

    /// Get the lapic frequency in Hz
    fn lapic_frequency(&self) -> u32 {
        Apic::frequency()
    }

    fn get_pit(&self) -> Result<PitState> {
        Ok(self.pit.lock().get_pit_state())
    }

    fn set_pit(&mut self, state: &PitState) -> Result<()> {
        self.pit.lock().set_pit_state(state);
        Ok(())
    }

    /// Returns true if the PIT uses port 0x61 for the PC speaker, false if 0x61 is unused.
    /// devices::Pit uses 0x61.
    fn pit_uses_speaker_port(&self) -> bool {
        true
    }

    fn snapshot_chip_specific(&self) -> anyhow::Result<serde_json::Value> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
    fn restore_chip_specific(&mut self, _data: serde_json::Value) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("Not supported yet in userspace"))
    }
}

/// Condition variable used by `UserspaceIrqChip::wait_until_runnable`.
#[derive(Default)]
struct Waiter {
    // mtx stores an "interrupted" bool that's true if `kick_halted_vcpus` has been called.
    mtx: Mutex<bool>,
    cvar: Condvar,
}

impl Waiter {
    /// Wakes up `wait_until_runnable` to recheck the interrupted flag and vcpu runnable state.
    pub fn notify(&self) {
        let _lock = self.mtx.lock();
        self.cvar.notify_all();
    }

    /// Sets the interrupted flag, and wakes up `wait_until_runnable` to recheck the interrupted
    /// flag and vcpu runnable state.  If `interrupted` is true, then `wait_until_runnable` should
    /// stop waiting for a runnable vcpu and return immediately.
    pub fn set_and_notify(&self, interrupted: bool) {
        let mut interrupted_lock = self.mtx.lock();
        *interrupted_lock = interrupted;
        self.cvar.notify_all();
    }
}

/// Worker thread for polling timer events and sending them to an APIC.
struct TimerWorker<V: VcpuX86_64> {
    id: usize,
    apic: Arc<Mutex<Apic>>,
    vcpus: Arc<Mutex<Vec<Option<V>>>>,
    descriptor: Descriptor,
    waiter: Arc<Waiter>,
}

impl<V: VcpuX86_64> TimerWorker<V> {
    fn run(&mut self, kill_evt: Event) -> TimerWorkerResult<()> {
        #[derive(EventToken)]
        enum Token {
            // The timer expired.
            TimerExpire,
            // The parent thread requested an exit.
            Kill,
        }

        let wait_ctx: WaitContext<Token> = WaitContext::build_with(&[
            (&self.descriptor, Token::TimerExpire),
            (&kill_evt, Token::Kill),
        ])
        .map_err(TimerWorkerError::CreateWaitContext)?;

        loop {
            let events = wait_ctx.wait().map_err(TimerWorkerError::WaitError)?;
            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::TimerExpire => {
                        self.apic.lock().handle_timer_expiration();
                        if let Some(Some(vcpu)) = self.vcpus.lock().get(self.id) {
                            vcpu.set_interrupt_window_requested(true);
                        }
                        self.waiter.notify();
                    }
                    Token::Kill => return Ok(()),
                }
            }
        }
    }
}

#[derive(Debug)]
enum TimerWorkerError {
    /// Creating WaitContext failed.
    CreateWaitContext(Error),
    /// Error while waiting for events.
    WaitError(Error),
}

impl Display for TimerWorkerError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::TimerWorkerError::*;

        match self {
            CreateWaitContext(e) => write!(f, "failed to create event context: {}", e),
            WaitError(e) => write!(f, "failed to wait for events: {}", e),
        }
    }
}

impl std::error::Error for TimerWorkerError {}

type TimerWorkerResult<T> = std::result::Result<T, TimerWorkerError>;