xref: /aosp_15_r20/external/crosvm/swap/src/userfaultfd.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Provides wrapper of userfaultfd crate for vmm-swap feature.
6 
7 #![deny(missing_docs)]
8 
9 use std::convert::From;
10 use std::fs::File;
11 use std::fs::OpenOptions;
12 use std::ops::Range;
13 use std::os::unix::io::AsRawFd;
14 use std::os::unix::prelude::FromRawFd;
15 use std::os::unix::prelude::OpenOptionsExt;
16 
17 use anyhow::Context;
18 use base::errno_result;
19 use base::info;
20 use base::ioctl_io_nr;
21 use base::ioctl_iowr_nr;
22 use base::ioctl_with_mut_ref;
23 use base::ioctl_with_val;
24 use base::linux::MemoryMappingUnix;
25 use base::AsRawDescriptor;
26 use base::AsRawDescriptors;
27 use base::FromRawDescriptor;
28 use base::IntoRawDescriptor;
29 use base::MappedRegion;
30 use base::MemoryMapping;
31 use base::MemoryMappingBuilder;
32 use base::RawDescriptor;
33 use thiserror::Error as ThisError;
34 use userfaultfd::Error as UffdError;
35 pub use userfaultfd::Event as UffdEvent;
36 use userfaultfd::FeatureFlags;
37 use userfaultfd::IoctlFlags;
38 use userfaultfd::Uffd;
39 use userfaultfd::UffdBuilder;
40 
41 use crate::pagesize::pages_to_bytes;
42 
// Path of the userfaultfd device file that `Factory::new` tries to open.
const DEV_USERFAULTFD_PATH: &str = "/dev/userfaultfd";
// ioctl type used to build the request number issued on the device file.
const USERFAULTFD_IOC: u32 = 0xAA;
// Defines `USERFAULTFD_IOC_NEW`, the request `Factory::create` issues on the device file to
// obtain a new userfaultfd descriptor.
ioctl_io_nr!(USERFAULTFD_IOC_NEW, USERFAULTFD_IOC, 0x00);
// Defines `UFFDIO_API`, the request `Factory::create` issues on a freshly created userfaultfd
// with a `uffdio_api` struct describing the API version and the requested features.
ioctl_iowr_nr!(
    UFFDIO_API,
    userfaultfd_sys::UFFDIO,
    userfaultfd_sys::_UFFDIO_API,
    userfaultfd_sys::uffdio_api
);
52 
/// Result type of the userfaultfd operations in this module, with [Error] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
55 
/// Errors for Userfaultfd
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("userfaultfd error: {0:?}")]
    /// Unrecoverable userfaultfd error.
    Userfaultfd(UffdError),
    #[error("copy partially succeeded: {0:?} bytes copied")]
    /// `UFFDIO_COPY` partially succeeded. The value is the number of bytes copied.
    PartiallyCopied(usize),
    #[error("the page is already filled")]
    /// The page is already filled.
    PageExist,
    #[error("the uffd in the corresponding process is already closed")]
    /// The corresponding process is already dead or has run exec(2).
    UffdClosed,
    #[error("clone error: {0:?}")]
    /// Failed to clone userfaultfd.
    Clone(base::Error),
}
75 
76 impl From<UffdError> for Error {
from(e: UffdError) -> Self77     fn from(e: UffdError) -> Self {
78         match e {
79             UffdError::PartiallyCopied(copied) => Self::PartiallyCopied(copied),
80             UffdError::CopyFailed(errno) if errno as i32 == libc::ESRCH => Self::UffdClosed,
81             UffdError::ZeropageFailed(errno) if errno as i32 == libc::EEXIST => Self::PageExist,
82             UffdError::ZeropageFailed(errno) if errno as i32 == libc::ESRCH => Self::UffdClosed,
83             other => Self::Userfaultfd(other),
84         }
85     }
86 }
87 
88 /// Register all the regions to all the userfaultfd
89 ///
90 /// # Arguments
91 ///
92 /// * `regions` - the list of address range of regions.
93 /// * `uffds` - the reference to the list of [Userfaultfd] for all the processes which may touch the
94 ///   `address_range` to be registered.
95 ///
96 /// # Safety
97 ///
98 /// Each address range in `regions` must be from guest memory.
99 ///
100 /// The `uffds` must cover all the processes which may touch the `address_range`. otherwise some
101 /// pages are zeroed by kernel on the unregistered process instead of swapping in from the swap
102 /// file.
103 #[deny(unsafe_op_in_unsafe_fn)]
register_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()>104 pub unsafe fn register_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
105     for address_range in regions {
106         for uffd in uffds {
107             // SAFETY:
108             // Safe because the range is from the guest memory region.
109             let result = unsafe {
110                 uffd.register(address_range.start, address_range.end - address_range.start)
111             };
112             match result {
113                 Ok(_) => {}
114                 // Skip the userfaultfd for dead processes.
115                 Err(Error::UffdClosed) => {}
116                 Err(e) => {
117                     return Err(e);
118                 }
119             };
120         }
121     }
122     Ok(())
123 }
124 
125 /// Unregister all the regions from all the userfaultfd.
126 ///
127 /// `UFFDIO_UNREGISTER` unblocks any threads currently waiting on the region and remove page fault
128 /// events on the region from the userfaultfd event queue.
129 ///
130 /// # Arguments
131 ///
132 /// * `regions` - the list of address range of regions.
133 /// * `uffds` - the reference to the list of registered [Userfaultfd].
unregister_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()>134 pub fn unregister_regions(regions: &[Range<usize>], uffds: &[Userfaultfd]) -> Result<()> {
135     for address_range in regions {
136         for uffd in uffds {
137             let result =
138                 uffd.unregister(address_range.start, address_range.end - address_range.start);
139             match result {
140                 Ok(_) => {}
141                 // Skip the userfaultfd for dead processes.
142                 Err(Error::UffdClosed) => {}
143                 Err(e) => {
144                     return Err(e);
145                 }
146             };
147         }
148     }
149     Ok(())
150 }
151 
/// Factory for [Userfaultfd].
///
/// If `/dev/userfaultfd` (introduced in Linux 6.1) exists, creates userfaultfd from the dev file.
/// Otherwise uses `userfaultfd(2)` to create a userfaultfd.
pub struct Factory {
    // `Some` when `/dev/userfaultfd` was opened successfully. `None` makes `create()` fall back to
    // the `userfaultfd(2)` syscall.
    dev_file: Option<File>,
}
159 
160 impl Default for Factory {
default() -> Self161     fn default() -> Self {
162         Self::new()
163     }
164 }
165 
166 impl Factory {
167     /// Create [Factory] and try open `/dev/userfaultfd`.
168     ///
169     /// If it fails to open `/dev/userfaultfd`, userfaultfd creation fallback to `userfaultfd(2)`
170     /// syscall.
new() -> Self171     pub fn new() -> Self {
172         let dev_file = OpenOptions::new()
173             .read(true)
174             .custom_flags(libc::O_CLOEXEC | libc::O_NONBLOCK)
175             .open(DEV_USERFAULTFD_PATH);
176         match dev_file {
177             Ok(dev_file) => Self {
178                 dev_file: Some(dev_file),
179             },
180             Err(e) => {
181                 info!(
182                     "Failed to open /dev/userfaultfd ({:?}), will fall back to userfaultfd(2)",
183                     e
184                 );
185                 Self { dev_file: None }
186             }
187         }
188     }
189 
190     /// Creates a new [Userfaultfd] for this process.
create(&self) -> anyhow::Result<Userfaultfd>191     pub fn create(&self) -> anyhow::Result<Userfaultfd> {
192         if let Some(dev_file) = &self.dev_file {
193             // SAFETY:
194             // Safe because ioctl(2) USERFAULTFD_IOC_NEW with does not change Rust memory safety.
195             let res = unsafe {
196                 ioctl_with_val(
197                     dev_file,
198                     USERFAULTFD_IOC_NEW,
199                     (libc::O_CLOEXEC | libc::O_NONBLOCK) as libc::c_ulong,
200                 )
201             };
202             let uffd = if res < 0 {
203                 return errno_result().context("USERFAULTFD_IOC_NEW");
204             } else {
205                 // Safe because the uffd is not owned by anyone in this process.
206                 // SAFETY:
207                 unsafe { Userfaultfd::from_raw_descriptor(res) }
208             };
209             let mut api = userfaultfd_sys::uffdio_api {
210                 api: userfaultfd_sys::UFFD_API,
211                 features: (FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE).bits(),
212                 ioctls: 0,
213             };
214             // SAFETY:
215             // Safe because ioctl(2) UFFDIO_API with does not change Rust memory safety.
216             let res = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_API, &mut api) };
217             if res < 0 {
218                 errno_result().context("UFFDIO_API")
219             } else {
220                 Ok(uffd)
221             }
222         } else {
223             Userfaultfd::new().context("create userfaultfd")
224         }
225     }
226 
227     /// Create a new [Factory] object.
try_clone(&self) -> anyhow::Result<Self>228     pub fn try_clone(&self) -> anyhow::Result<Self> {
229         let dev_file = self.dev_file.as_ref().map(File::try_clone).transpose()?;
230         Ok(Self { dev_file })
231     }
232 }
233 
234 impl AsRawDescriptors for Factory {
as_raw_descriptors(&self) -> Vec<RawDescriptor>235     fn as_raw_descriptors(&self) -> Vec<RawDescriptor> {
236         if let Some(dev_file) = &self.dev_file {
237             vec![dev_file.as_raw_descriptor()]
238         } else {
239             Vec::new()
240         }
241     }
242 }
243 
/// Wrapper for [`userfaultfd::Uffd`] to be used in the vmm-swap feature.
///
/// # Safety
///
/// The userfaultfd operations (`UFFDIO_COPY` and `UFFDIO_ZEROPAGE`) look unsafe since they fill
/// memory content directly. However, they are actually not the unsafe operations; it is
/// `UFFDIO_REGISTER` that should be the unsafe operation for Rust memory safety.
///
/// According to [the Rust document](https://doc.rust-lang.org/nomicon/uninitialized.html),
///
/// > All runtime-allocated memory in a Rust program begins its life as uninitialized.
///
/// The userfaultfd operations do not change/overwrite existing memory contents; they just set up
/// the "uninitialized" pages. If the page was already initialized, the userfaultfd operations
/// fail and return an EEXIST error (which is unfortunately not documented). So by themselves they
/// do not affect the Rust memory safety.
///
/// The "uninitialized" page in this context has 2 patterns:
///
/// 1. pages which have never been touched or,
/// 2. pages which have never been touched after MADV_REMOVE
///
/// Filling the (1) pages with any contents should not affect the Rust memory safety.
///
/// Filling the (2) pages may potentially break the memory used by Rust. But the safety should be
/// examined at `MADV_REMOVE` and `UFFDIO_REGISTER` timing.
#[derive(Debug)]
pub struct Userfaultfd {
    // The wrapped userfaultfd object from the `userfaultfd` crate.
    uffd: Uffd,
}
274 
275 impl Userfaultfd {
276     /// Creates a new userfaultfd using userfaultfd(2) syscall.
277     ///
278     /// This is public for tests.
new() -> Result<Self>279     pub fn new() -> Result<Self> {
280         let uffd = UffdBuilder::new()
281             .close_on_exec(true)
282             .non_blocking(true)
283             .user_mode_only(false)
284             .require_features(FeatureFlags::MISSING_SHMEM | FeatureFlags::EVENT_REMOVE)
285             .create()?;
286         Ok(Self { uffd })
287     }
288 
289     /// Register a range of memory to the userfaultfd.
290     ///
291     /// After this registration, any page faults on the range will be caught by the userfaultfd.
292     ///
293     /// # Arguments
294     ///
295     /// * `addr` - the starting address of the range of memory.
296     /// * `len` - the length in bytes of the range of memory.
297     ///
298     /// # Safety
299     ///
300     /// [addr, addr+len) must lie within a [MemoryMapping], and that mapping
301     /// must live for the lifespan of the userfaultfd kernel object (which may be distinct from the
302     /// `Userfaultfd` rust object in this process).
register(&self, addr: usize, len: usize) -> Result<IoctlFlags>303     pub unsafe fn register(&self, addr: usize, len: usize) -> Result<IoctlFlags> {
304         match self.uffd.register(addr as *mut libc::c_void, len) {
305             Ok(flags) => Ok(flags),
306             Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
307                 // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
308                 // program by `exec` system call.
309                 // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
310                 // other reasons.
311                 Err(Error::UffdClosed)
312             }
313             Err(e) => Err(e.into()),
314         }
315     }
316 
317     /// Unregister a range of memory from the userfaultfd.
318     ///
319     /// # Arguments
320     ///
321     /// * `addr` - the starting address of the range of memory.
322     /// * `len` - the length in bytes of the range of memory.
unregister(&self, addr: usize, len: usize) -> Result<()>323     pub fn unregister(&self, addr: usize, len: usize) -> Result<()> {
324         match self.uffd.unregister(addr as *mut libc::c_void, len) {
325             Ok(_) => Ok(()),
326             Err(UffdError::SystemError(errno)) if errno as i32 == libc::ENOMEM => {
327                 // Userfaultfd returns `ENOMEM` if the corresponding process dies or run as another
328                 // program by `exec` system call.
329                 // TODO(b/267124393): Verify UFFDIO_ZEROPAGE + ESRCH as well since ENOMEM may be for
330                 // other reasons.
331                 Err(Error::UffdClosed)
332             }
333             Err(e) => Err(e.into()),
334         }
335     }
336 
337     /// Initialize page(s) and fill it with zero.
338     ///
339     /// # Arguments
340     ///
341     /// * `addr` - the starting address of the page(s) to be initialzed with zero.
342     /// * `len` - the length in bytes of the page(s).
343     /// * `wake` - whether or not to unblock the faulting thread.
zero(&self, addr: usize, len: usize, wake: bool) -> Result<usize>344     pub fn zero(&self, addr: usize, len: usize, wake: bool) -> Result<usize> {
345         // SAFETY:
346         // safe because zeroing untouched pages does not break the Rust memory safety since "All
347         // runtime-allocated memory in a Rust program begins its life as uninitialized."
348         // https://doc.rust-lang.org/nomicon/uninitialized.html
349         Ok(unsafe { self.uffd.zeropage(addr as *mut libc::c_void, len, wake) }?)
350     }
351 
352     /// Copy the `data` to the page(s) starting from `addr`.
353     ///
354     /// # Arguments
355     ///
356     /// * `addr` - the starting address of the page(s) to be initialzed with data.
357     /// * `len` - the length in bytes of the page(s).
358     /// * `data` - the starting address of the content.
359     /// * `wake` - whether or not to unblock the faulting thread.
copy(&self, addr: usize, len: usize, data: *const u8, wake: bool) -> Result<usize>360     pub fn copy(&self, addr: usize, len: usize, data: *const u8, wake: bool) -> Result<usize> {
361         Ok(
362             // SAFETY:
363             // safe because filling untouched pages with data does not break the Rust memory safety
364             // since "All runtime-allocated memory in a Rust program begins its life as
365             // uninitialized." https://doc.rust-lang.org/nomicon/uninitialized.html
366             unsafe {
367                 self.uffd.copy(
368                     data as *const libc::c_void,
369                     addr as *mut libc::c_void,
370                     len,
371                     wake,
372                 )
373             }?,
374         )
375     }
376 
377     /// Wake the faulting thread blocked by the page(s).
378     ///
379     /// If the page is not initialized, the thread causes a page fault again.
380     ///
381     /// # Arguments
382     ///
383     /// * `addr` - the starting address of the page(s).
384     /// * `len` - the length in bytes of the page(s).
wake(&self, addr: usize, len: usize) -> Result<()>385     pub fn wake(&self, addr: usize, len: usize) -> Result<()> {
386         Ok(self.uffd.wake(addr as *mut libc::c_void, len)?)
387     }
388 
389     /// Read an event from the userfaultfd.
390     ///
391     /// Return `None` immediately if no events is ready to read.
read_event(&self) -> Result<Option<UffdEvent>>392     pub fn read_event(&self) -> Result<Option<UffdEvent>> {
393         Ok(self.uffd.read_event()?)
394     }
395 
396     /// Try to clone [Userfaultfd]
try_clone(&self) -> Result<Self>397     pub fn try_clone(&self) -> Result<Self> {
398         let dup_desc = base::clone_descriptor(self).map_err(Error::Clone)?;
399         // SAFETY: no one owns dup_desc.
400         let uffd = Self::from(unsafe { Uffd::from_raw_fd(dup_desc.into_raw_descriptor()) });
401         Ok(uffd)
402     }
403 }
404 
405 impl From<Uffd> for Userfaultfd {
from(uffd: Uffd) -> Self406     fn from(uffd: Uffd) -> Self {
407         Self { uffd }
408     }
409 }
410 
411 impl FromRawDescriptor for Userfaultfd {
from_raw_descriptor(descriptor: RawDescriptor) -> Self412     unsafe fn from_raw_descriptor(descriptor: RawDescriptor) -> Self {
413         Self::from(Uffd::from_raw_fd(descriptor))
414     }
415 }
416 
417 impl AsRawDescriptor for Userfaultfd {
as_raw_descriptor(&self) -> RawDescriptor418     fn as_raw_descriptor(&self) -> RawDescriptor {
419         self.uffd.as_raw_fd()
420     }
421 }
422 
/// Checks whether the process for the [Userfaultfd] is dead or not.
pub trait DeadUffdChecker {
    /// Registers the [Userfaultfd] to the checker.
    fn register(&self, uffd: &Userfaultfd) -> anyhow::Result<()>;
    /// Checks whether the [Userfaultfd] is dead or not. Returns `true` if it is dead.
    fn is_dead(&self, uffd: &Userfaultfd) -> bool;
    /// Frees the internal state.
    fn reset(&self) -> anyhow::Result<()>;
}
432 
/// Checks whether the process for the [Userfaultfd] is dead or not.
///
/// [DeadUffdCheckerImpl] uses `UFFDIO_ZEROPAGE` on a dummy mmap page to check the liveness.
///
/// This must be kept alive on the main process to make the dummy mmap present in all descendant
/// processes.
pub struct DeadUffdCheckerImpl {
    // One-page mapping used only as the target of the liveness probe.
    dummy_mmap: MemoryMapping,
}
442 
443 impl DeadUffdCheckerImpl {
444     /// Creates [DeadUffdCheckerImpl].
new() -> anyhow::Result<Self>445     pub fn new() -> anyhow::Result<Self> {
446         Ok(Self {
447             dummy_mmap: MemoryMappingBuilder::new(pages_to_bytes(1))
448                 .build()
449                 .context("create dummy mmap")?,
450         })
451     }
452 }
453 
454 impl DeadUffdChecker for DeadUffdCheckerImpl {
register(&self, uffd: &Userfaultfd) -> anyhow::Result<()>455     fn register(&self, uffd: &Userfaultfd) -> anyhow::Result<()> {
456         // SAFETY: no one except DeadUffdCheckerImpl access dummy_mmap.
457         unsafe { uffd.register(self.dummy_mmap.as_ptr() as usize, pages_to_bytes(1)) }
458             .map(|_| ())
459             .context("register to dummy mmap")
460     }
461 
is_dead(&self, uffd: &Userfaultfd) -> bool462     fn is_dead(&self, uffd: &Userfaultfd) -> bool {
463         // UFFDIO_ZEROPAGE returns ESRCH for dead uffd.
464         matches!(
465             uffd.zero(self.dummy_mmap.as_ptr() as usize, pages_to_bytes(1), false),
466             Err(Error::UffdClosed)
467         )
468     }
469 
reset(&self) -> anyhow::Result<()>470     fn reset(&self) -> anyhow::Result<()> {
471         self.dummy_mmap
472             .remove_range(0, pages_to_bytes(1))
473             .context("free dummy mmap")
474     }
475 }
476