// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! that provides fine-grained pinning for VFIO PCI passthrough devices so
//! that the hypervisor doesn't need to pin the entire VM's memory, which
//! improves memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
//!
//! It was also presented at USENIX ATC '20:
//! <https://www.usenix.org/conference/atc20/presentation/tian>
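//!
//! A sketch of how this device is typically configured (hedged; the exact
//! command-line plumbing lives outside this file): crosvm parses the
//! `CoIommuParameters` defined below from a key-value string such as
//! `unpin_policy=lru,unpin_interval=60`, e.g. via a `--coiommu`-style flag,
//! alongside the VFIO device that should use fine-grained pinning.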

use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::fmt;
use std::mem;
use std::panic;
use std::sync::atomic::fence;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time::Duration;

use anyhow::bail;
use anyhow::ensure;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::AsRawDescriptor;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::MemoryMappingBuilder;
use base::Protection;
use base::RawDescriptor;
use base::SafeDescriptor;
use base::SharedMemory;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::TubeError;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::Datamatch;
use hypervisor::MemCacheType;
use resources::Alloc;
use resources::AllocOptions;
use resources::SystemAllocator;
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use serde_keyvalue::FromKeyValues;
use sync::Mutex;
use thiserror::Error as ThisError;
use vm_control::api::VmMemoryClient;
use vm_control::VmMemoryDestination;
use vm_control::VmMemorySource;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

use crate::pci::pci_configuration::PciBarConfiguration;
use crate::pci::pci_configuration::PciBarPrefetchable;
use crate::pci::pci_configuration::PciBarRegionType;
use crate::pci::pci_configuration::PciClassCode;
use crate::pci::pci_configuration::PciConfiguration;
use crate::pci::pci_configuration::PciHeaderType;
use crate::pci::pci_configuration::PciOtherSubclass;
use crate::pci::pci_configuration::COMMAND_REG;
use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::Result as PciResult;
use crate::pci::PciAddress;
use crate::pci::PciBarIndex;
use crate::pci::PciDeviceError;
use crate::vfio::VfioContainer;
use crate::Suspendable;
use crate::UnpinRequest;
use crate::UnpinResponse;

const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
const COIOMMU_REVISION_ID: u8 = 0x10;
const COIOMMU_MMIO_BAR: PciBarIndex = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;

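// A leaf DTT entry (DTTE) is a u32 shared with the guest: bit 31 is the
// PINNED flag set by this backend, and bit 30 is the ACCESSED flag set by the
// guest when it maps the page for DMA. The remaining low bits appear to hold
// the guest's DMA map count (inferred from the compare_exchange against
// DTTE_PINNED_FLAG in unpin_page below).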
const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;

#[derive(ThisError, Debug)]
enum Error {
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
}

// The default unpin interval is 60 seconds.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Holds the coiommu unpin policy
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum CoIommuUnpinPolicy {
    #[default]
    Off,
    Lru,
}

impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;

        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}

fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Duration, D::Error> {
    let secs = u64::deserialize(deserializer)?;

    Ok(Duration::from_secs(secs))
}

fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Option<u64>, D::Error> {
    let limit = u64::deserialize(deserializer)?;

    match limit {
        0 => Err(serde::de::Error::custom(
            "Please use a non-zero unpin_limit value",
        )),
        limit => Ok(Some(limit)),
    }
}

fn unpin_interval_default() -> Duration {
    UNPIN_DEFAULT_INTERVAL
}

fn unpin_gen_threshold_default() -> u64 {
    UNPIN_GEN_DEFAULT_THRES
}

/// Holds the parameters for a coiommu device
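///
/// These are parsed from a key-value string via `serde_keyvalue`. A minimal
/// sketch of the parsing (hedged; the surrounding command-line flag is
/// defined elsewhere):
///
/// ```ignore
/// let params: CoIommuParameters =
///     serde_keyvalue::from_key_values("unpin_policy=lru,unpin_interval=60,unpin_limit=1024")
///         .unwrap();
/// assert_eq!(params.unpin_policy, CoIommuUnpinPolicy::Lru);
/// assert_eq!(params.unpin_interval, Duration::from_secs(60));
/// ```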
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields)]
pub struct CoIommuParameters {
    #[serde(default)]
    pub unpin_policy: CoIommuUnpinPolicy,
    #[serde(
        deserialize_with = "deserialize_unpin_interval",
        default = "unpin_interval_default"
    )]
    pub unpin_interval: Duration,
    #[serde(deserialize_with = "deserialize_unpin_limit", default)]
    pub unpin_limit: Option<u64>,
    // Number of unpin intervals a pinned page must be busy for to be aged into the
    // older, less frequently checked generation.
    #[serde(default = "unpin_gen_threshold_default")]
    pub unpin_gen_threshold: u64,
}

impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}

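// MMIO register layout in BAR0, as decoded by read_mmio/write_mmio below:
// offset 0x0 holds dtt_root, 0x8 holds cmd, and 0x10 holds dtt_level. The
// per-vcpu notify registers that the guest writes to trigger pinning follow
// immediately after this struct in the BAR.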
#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    dtt_root: u64,
    cmd: u64,
    dtt_level: u64,
}

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
struct PinnedPageInfo {
    gfn: u64,
    unpin_busy_cnt: u64,
}

impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        PinnedPageInfo {
            gfn,
            unpin_busy_cnt,
        }
    }
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinThreadState {
    Unparked,
    Parked,
}

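// Pinned pages start in the new generation; after failing to be unpinned for
// unpin_gen_threshold passes they age into the old generation, which is
// scanned less frequently (see UnpinWorker::lru_unpin_pages).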
struct CoIommuPinState {
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    old_gen_pinned_pages: VecDeque<u64>,
    unpin_thread_state: UnpinThreadState,
    unpin_park_count: u64,
}

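// SAFETY contract (making the implicit requirement explicit): callers must
// pass a `user_addr` that points to valid, page-aligned host memory of at
// least `size` bytes that stays mapped for the lifetime of the DMA mapping;
// the call sites guarantee this via the MemoryMapping interface.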
unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
                if errno == libc::EEXIST {
                    // Already pinned; report success so the caller sets the
                    // PINNED flag.
                    error!("CoIommu: iova 0x{:x} already pinned", iova);
                    return true;
                }
            }
            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    match vfio_container.lock().vfio_dma_unmap(iova, size) {
        Ok(_) => true,
        Err(e) => {
            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

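// Header of a batched pin request in guest memory: the bdf of the requesting
// endpoint plus the page count, followed in memory by `nr_pages` u64 gfns
// (see PinWorker::pin_pages_in_batch).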
#[derive(Default, Debug, Copy, Clone, FromZeroes, FromBytes, AsBytes)]
#[repr(C)]
struct PinPageInfo {
    bdf: u16,
    pad: [u16; 3],
    nr_pages: u64,
}

const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;

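// Computes the index of `gfn` within the DTT page-table page at `level`:
// level 1 (the leaf level) uses the low 10 bits, and each level above it
// consumes 9 bits higher up. A worked example of the arithmetic (not from
// the original source): level_to_offset(0x12345, 1) == 0x345,
// level_to_offset(0x12345, 2) == 0x48, and level_to_offset(0x12345, 3) == 0.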
fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    if level == 1 {
        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
    }

    if level == 0 {
        bail!("Invalid level for gfn 0x{:x}", gfn);
    }

    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
}

struct DTTIter {
    ptr: *const u8,
    gfn: u64,
}

impl Default for DTTIter {
    fn default() -> Self {
        DTTIter {
            ptr: std::ptr::null(),
            gfn: 0,
        }
    }
}

// Get a DMA Tracking Table (DTT) entry associated with the gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the GFN to get the
// corresponding entry. The DTT is shared between frontend and
// backend. It is a page-table-like structure and the entries are
// indexed by GFN. The argument dtt_root represents the gpa of the
// root page and dtt_level represents the maximum page table level.
//
// #2. Calculate the entry address via the argument dtt_iter. dtt_iter
// stores an entry address and the associated gfn. If the target gfn is
// in the same page table page as the gfn in dtt_iter, then the target
// entry address can be calculated from the entry address in dtt_iter.
//
// As the DTT entry is shared between frontend and backend, accesses to
// it should be atomic, so the returned value is converted to an
// AtomicU32 pointer.
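//
// A sketch of the fast-path arithmetic in case #2 (not spelled out in the
// original comment): a leaf DTT page holds 1 << COIOMMU_PT_LEVEL_STRIDE ==
// 1024 u32 entries, so gfns 0x1000 and 0x13ff share a leaf page, and with
// dtt_iter.gfn == 0x1000 the pte for gfn 0x13ff lives at
// dtt_iter.ptr + size_of::<AtomicU32>() * (0x13ff - 0x1000) bytes.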
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        // Slow path to walk the DTT to get the pte entry
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;

        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;

            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }

            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }

        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;

        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else if gfn > dtt_iter.gfn {
        // SAFETY:
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        unsafe {
            dtt_iter
                .ptr
                .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
        }
    } else {
        // SAFETY:
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        unsafe {
            dtt_iter
                .ptr
                .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
        }
    };

    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;

    Ok(ptr as *const AtomicU32)
}

fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;

    let gpa = gfn << PAGE_SHIFT_4K;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;

    // SAFETY:
    // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
    // Test the PINNED flag.
    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }

    // SAFETY:
    // Safe because the gpa is validated by gfn_to_dtt_pte and the host_addr
    // is guaranteed valid by the MemoryMapping interface.
    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
        // SAFETY:
        // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
        // Set the PINNED flag.
        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
        if policy == CoIommuUnpinPolicy::Lru {
            pinstate
                .new_gen_pinned_pages
                .push_back(PinnedPageInfo::new(gfn, 0));
        }
    }

    Ok(())
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinResult {
    UnpinlistEmpty,
    Unpinned,
    NotPinned,
    NotUnpinned,
    FailedUnpin,
    UnpinParked,
}

fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }

    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the dtt. For such a page the pte doesn't exist yet,
            // so there is no need to report an error.
            // The force == false case is used by coiommu to periodically
            // unpin pages which have been mapped in the dtt, so the pte for
            // such a page does exist. However, with an unpin request from
            // virtio-balloon such pages may already be unpinned, and the DTT
            // pages might have been reclaimed by the guest OS kernel as well,
            // so it is also possible to get here. Don't report an error in
            // this case either.
            return UnpinResult::NotPinned;
        }
    };

    if force {
        // SAFETY:
        // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin can succeed.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }

    // SAFETY:
    // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The compare_exchange failed because the original leaf entry was
        // not exactly DTTE_PINNED_FLAG, so the unpin cannot be done.
        if entry == 0 {
            // The GFN is already unpinned. This is very similar to the
            // gfn_to_dtt_pte error case, with the only difference being
            // that the dtt_pte happens to be on a present page table.
            UnpinResult::NotPinned
        } else {
            if !force {
                // SAFETY:
                // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
                // The ACCESSED flag is set by the guest when it requires a DMA
                // mapping for this page; it represents whether the page has
                // been touched by the guest. By clearing this flag after an
                // unpin pass, we can detect whether the page is touched again
                // before the next pass. If the ACCESSED flag is set at the
                // next pass, unpinning this page will fail and we will be here
                // again to clear the flag. If it is not set at the next pass,
                // unpinning this page will probably succeed.
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // If we're here, then the guest is trying to release a page via
                // the balloon that it still has pinned. This most likely means
                // that something is wrong in the guest kernel. Just leave the
                // page pinned and log an error.
                // This failure blocks the balloon from removing the page, which
                // ensures that the guest's view of memory will remain consistent
                // with device DMA's view of memory. Also note that the host
                // kernel maintains an elevated refcount for pinned pages, which
                // is a second guarantee that pages accessible by device DMA
                // won't be freed until after they are unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            // The GFN cannot be unpinned, either because its map count is
            // non-zero or because it has the ACCESSED flag set.
            UnpinResult::NotUnpinned
        }
    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the page.
        let gpa = gfn << PAGE_SHIFT_4K;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // SAFETY:
            // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
            // Make sure the PINNED flag stays set.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            // This gfn needs to be put back onto the pinned list.
            UnpinResult::FailedUnpin
        }
    }
}

struct PinWorker {
    mem: GuestMemory,
    endpoints: Vec<u16>,
    notifymap_mmap: Arc<MemoryMapping>,
    dtt_level: u64,
    dtt_root: u64,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl PinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        let offset = index * mem::size_of::<u64>();
                        if let Some(event) = self.ioevents.get(index) {
                            if let Err(e) = event.wait() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }

    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;

        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );

        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;

            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;

            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }

        Ok(())
    }

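    // The value written to a notify register packs either a single page as
    // (gfn << 16) | bdf or, when bit 63 (PIN_PAGES_IN_BATCH) is set, the
    // guest physical address of a PinPageInfo header followed by the gfns.
    // For example (a sketch, not from the original source): 0x1_2345_0800
    // decodes to bdf 0x0800 and gfn 0x12345.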
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );

            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}

struct UnpinWorker {
    mem: GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    vfio_container: Arc<Mutex<VfioContainer>>,
    unpin_tube: Option<Tube>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
    unpin_gen_threshold: u64,
}

impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!(
                    "{}: failed to add unpin_tube to WaitContext: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        }

        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset_repeating(self.params.unpin_interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!(
                    "{}: failed to add timer to WaitContext: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            Some(timer)
        } else {
            None
        };

        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.mark_waited() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        // Locking with respect to pin_pages isn't necessary
                                        // for this case because the unpinned pages in the range
                                        // should all be in the balloon and so nothing will attempt
                                        // to pin them.
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv Unpin Request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        self.unpin_tube = unpin_tube;
    }

    fn unpin_pages(&mut self) {
        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
            self.lru_unpin_pages();
        }
    }

    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let pageinfo = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };

        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
            (
                unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    pageinfo.gfn,
                    false,
                ),
                Some(pageinfo),
            )
        })
    }

    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();

        // If has_limit is true but limit_count is 0, no unpinning will be done.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // The page came off the new_gen queue but was not
                            // successfully unpinned, so check it against the
                            // unpin_gen threshold and move it to the old_gen
                            // queue if the threshold has been reached. Pages
                            // that did not come from the new_gen queue go
                            // straight back to the old_gen queue.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specially since parking is
                    // expected to be relatively rare.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }

        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }

        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }

        unpinned_count
    }

    fn lru_unpin_pages(&mut self) {
        let mut unpin_count = 0;
        if self.params.unpin_gen_threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive (old_gen) queue first once the
                // threshold is reached.
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with the unpin_limit reduced by whatever
        // was unpinned from the old_gen queue.
        self.lru_unpin_pages_in_loop(
            self.params
                .unpin_limit
                .map(|limit| limit.saturating_sub(unpin_count)),
            true,
        );
    }

    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);

            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed: {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}

pub struct CoIommuDev {
    config_regs: PciConfiguration,
    pci_address: Option<PciAddress>,
    mem: GuestMemory,
    coiommu_reg: CoIommuReg,
    endpoints: Vec<u16>,
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    mmapped: bool,
    vm_memory_client: VmMemoryClient,
    pin_thread: Option<WorkerThread<PinWorker>>,
    unpin_thread: Option<WorkerThread<UnpinWorker>>,
    unpin_tube: Option<Tube>,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        vm_memory_client: VmMemoryClient,
        unpin_tube: Option<Tube>,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, // No Programming interface.
            PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );

        // notifymap_mem is used as BAR2 for the guest to check whether a
        // request has been completed by coIOMMU.
        let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(&notifymap_mem)
                .offset(0)
                .build()?,
        );

        // topologymap_mem is used as BAR4 for the guest to check which
        // devices sit on top of coIOMMU.
        let topologymap_mem =
            SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );

        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "Coiommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }

        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }

        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            vm_memory_client,
            pin_thread: None,
            unpin_thread: None,
            unpin_tube,
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }

    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        prot: Protection,
    ) -> Result<()> {
        let _region = self
            .vm_memory_client
            .register_memory(
                VmMemorySource::Descriptor {
                    descriptor,
                    offset,
                    size: size as u64,
                },
                VmMemoryDestination::GuestPhysicalAddress(gpa),
                prot,
                MemCacheType::CacheCoherent,
            )
            .context("register_mmap register_memory failed")?;
        Ok(())
    }

    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            match self.register_mmap(
                self.notifymap_mem.try_clone().unwrap(),
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                Protection::read_write(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
                }
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            match self.register_mmap(
                self.topologymap_mem.try_clone().unwrap(),
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                Protection::read(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
                }
            }
        }

        self.mmapped = true;
    }

    fn start_workers(&mut self) {
        if self.pin_thread.is_none() {
            self.start_pin_thread();
        }

        if self.unpin_thread.is_none() {
            self.start_unpin_thread();
        }
    }

    fn start_pin_thread(&mut self) {
        let mem = self.mem.clone();
        let endpoints = self.endpoints.to_vec();
        let notifymap_mmap = self.notifymap_mmap.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let ioevents: Vec<Event> = self
            .ioevents
            .iter()
            .map(|e| e.try_clone().unwrap())
            .collect();

        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR);
        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
        for (i, evt) in self.ioevents.iter().enumerate() {
            self.vm_memory_client
                .register_io_event(
                    evt.try_clone().expect("failed to clone event"),
                    notify_base + i as u64,
                    Datamatch::AnyLength,
                )
                .expect("failed to register ioevent");
        }

        let vfio_container = self.vfio_container.clone();
        let pinstate = self.pinstate.clone();
        let params = self.params;

        self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
            let mut worker = PinWorker {
                mem,
                endpoints,
                notifymap_mmap,
                dtt_root,
                dtt_level,
                ioevents,
                vfio_container,
                pinstate,
                params,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn start_unpin_thread(&mut self) {
        let mem = self.mem.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let vfio_container = self.vfio_container.clone();
        let unpin_tube = self.unpin_tube.take();
        let pinstate = self.pinstate.clone();
        let params = self.params;
        self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
            let mut worker = UnpinWorker {
                mem,
                dtt_level,
                dtt_root,
                vfio_container,
                unpin_tube,
                pinstate,
                params,
                unpin_gen_threshold: 0,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let addr = resources
            .allocate_mmio(
                size,
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: bar_num,
                },
                name.to_string(),
                AllocOptions::new().prefetchable(true).align(size),
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        let bar = PciBarConfiguration::new(
            bar_num as usize,
            size,
            PciBarRegionType::Memory64BitRegion,
            PciBarPrefetchable::Prefetchable,
        )
        .set_address(addr);

        self.config_regs
            .add_pci_bar(bar)
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }

    fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
        if offset >= mem::size_of::<CoIommuReg>() as u64 {
            error!(
                "{}: read_mmio: invalid offset 0x{:x}",
                self.debug_label(),
                offset
            );
            return;
        }

        // Sanity check: accesses must be 64-bit aligned.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: read_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let v = match offset / 8 {
            0 => self.coiommu_reg.dtt_root,
            1 => self.coiommu_reg.cmd,
            2 => self.coiommu_reg.dtt_level,
            _ => return,
        };

        data.copy_from_slice(&v.to_ne_bytes());
    }
write_mmio(&mut self, offset: u64, data: &[u8])1332     fn write_mmio(&mut self, offset: u64, data: &[u8]) {
1333         let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1334         if offset >= mmio_len {
1335             if data.len() != 1 {
1336                 error!(
1337                     "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1338                     self.debug_label(),
1339                     offset,
1340                     data.len()
1341                 );
1342                 return;
1343             }
1344 
1345             // Usually will not be here as this is for the per-vcpu notify
1346             // register which is monitored by the ioevents. For the notify
1347             // register which is not covered by the ioevents, they are not
1348             // be used by the frontend driver. In case the frontend driver
1349             // went here, do a simple handle to make sure the frontend driver
1350             // will not be blocked, and through an error log.
1351             let index = (offset - mmio_len) as usize;
1352             if let Some(event) = self.ioevents.get(index) {
1353                 let _ = event.signal();
1354             } else {
1355                 self.notifymap_mmap
1356                     .write_obj::<u64>(0, index * mem::size_of::<u64>())
1357                     .unwrap();
1358                 error!(
1359                     "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1360                     self.debug_label(),
1361                     offset
1362                 );
1363             }
1364             return;
1365         }
1366 
1367         // Sanity check, must be 64bit aligned accessing for CoIommuReg
1368         if offset % 8 != 0 || data.len() != 8 {
1369             error!(
1370                 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1371                 self.debug_label(),
1372                 offset,
1373                 data.len()
1374             );
1375             return;
1376         }
1377 
1378         let index = offset / 8;
1379         let v = u64::from_ne_bytes(data.try_into().unwrap());
1380         match index {
1381             0 => {
1382                 if self.coiommu_reg.dtt_root == 0 {
1383                     self.coiommu_reg.dtt_root = v;
1384                 }
1385             }
1386             1 => match v {
1387                 // Deactivate can happen if the frontend driver in the guest
1388                 // fails during probing or if the CoIommu device is removed
1389                 // by the guest. Neither of these cases is expected, and if
1390                 // either happens the guest will be non-functional due to
1391                 // pass-through devices which rely on CoIommu not working.
1392                 // So just fail hard and panic.
1393                 COIOMMU_CMD_DEACTIVATE => {
1394                     panic!("{}: Deactivate is not supported", self.debug_label())
1395                 }
1396                 COIOMMU_CMD_ACTIVATE => {
1397                     if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1398                         self.start_workers();
1399                     }
1400                 }
                COIOMMU_CMD_PARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
                        pinstate.unpin_park_count = v;
                    } else {
                        panic!("{}: Park request count overflowed", self.debug_label());
                    }
                }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count has already reached 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}
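
// Illustrative sketch (an assumption, not part of this device's code): the
// write_mmio handling above implies the guest frontend driver programs the
// device roughly as follows, using the 8-byte registers at offset 0x0
// (dtt_root), 0x8 (command), and 0x10 (dtt_level):
//
//     mmio_write64(bar0 + 0x0, dtt_root_gpa);         // effectively write-once
//     mmio_write64(bar0 + 0x10, dtt_levels);          // effectively write-once
//     mmio_write64(bar0 + 0x8, COIOMMU_CMD_ACTIVATE); // starts the workers
//
// `mmio_write64`, `dtt_root_gpa`, and `dtt_levels` are hypothetical
// guest-side names used only for illustration.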

impl PciDevice for CoIommuDev {
    fn debug_label(&self) -> String {
        "CoIommu".to_owned()
    }

    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
        if self.pci_address.is_none() {
            self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
                Some(Alloc::PciBar {
                    bus,
                    dev,
                    func,
                    bar: _,
                }) => Some(PciAddress { bus, dev, func }),
                _ => None,
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        // Allocate one bar for the structures pointed to by the capability structures.
        let mut ranges: Vec<BarRange> = Vec::new();

        let mmio_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE,
            COIOMMU_MMIO_BAR as u8,
            "coiommu-mmiobar",
        )?;

        ranges.push(BarRange {
            addr: mmio_addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_TOPOLOGYMAP_SIZE as u64,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);
        ranges.push(BarRange {
            addr: topologymap_addr,
            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
            prefetchable: false,
        });

        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_NOTIFYMAP_SIZE as u64,
            COIOMMU_NOTIFYMAP_BAR as u8,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);
        ranges.push(BarRange {
            addr: notifymap_addr,
            size: COIOMMU_NOTIFYMAP_SIZE as u64,
            prefetchable: false,
        });

        Ok(ranges)
    }
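
// Taken together, the device exposes three BARs: COIOMMU_MMIO_BAR for the
// CoIommuReg control registers and per-vcpu notify registers, plus the
// topologymap and notifymap BARs allocated above, which are backed by the
// shared memory regions kept alive in keep_rds() below.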

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        self.config_regs.read_reg(reg_idx)
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
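        // Enabling memory-space decoding in the command register is the
        // first point at which the guest can access the BARs, so the BAR
        // backing memory is lazily mapped here on the first enable.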
        if reg_idx == COMMAND_REG
            && data.len() == 2
            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
            && !self.mmapped
        {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.vm_memory_client.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
        rds
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                // While the coiommu device is activated, accessing the
                // notifymap bar does not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so there is no need to handle this
                // notifymap read.
            }
            _ => {}
        }
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                // While the coiommu device is activated, accessing the
                // notifymap bar does not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so there is no need to handle this
                // notifymap write.
            }
            _ => {}
        }
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        self.config_regs.get_bar_configuration(bar_num)
    }
}

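// The empty impl below relies on the Suspendable trait's default method
// implementations; no CoIommu-specific suspend/resume handling is provided.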
impl Suspendable for CoIommuDev {}