xref: /aosp_15_r20/external/crosvm/swap/src/page_handler.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1*bb4ee6a4SAndroid Build Coastguard Worker // Copyright 2022 The ChromiumOS Authors
2*bb4ee6a4SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*bb4ee6a4SAndroid Build Coastguard Worker // found in the LICENSE file.
4*bb4ee6a4SAndroid Build Coastguard Worker 
5*bb4ee6a4SAndroid Build Coastguard Worker //! PageHandler manages the page states of multiple regions.
6*bb4ee6a4SAndroid Build Coastguard Worker 
7*bb4ee6a4SAndroid Build Coastguard Worker #![deny(missing_docs)]
8*bb4ee6a4SAndroid Build Coastguard Worker 
9*bb4ee6a4SAndroid Build Coastguard Worker use std::fs::File;
10*bb4ee6a4SAndroid Build Coastguard Worker use std::mem;
11*bb4ee6a4SAndroid Build Coastguard Worker use std::ops::Range;
12*bb4ee6a4SAndroid Build Coastguard Worker use std::sync::Arc;
13*bb4ee6a4SAndroid Build Coastguard Worker 
14*bb4ee6a4SAndroid Build Coastguard Worker use anyhow::Context;
15*bb4ee6a4SAndroid Build Coastguard Worker use base::error;
16*bb4ee6a4SAndroid Build Coastguard Worker use base::linux::FileDataIterator;
17*bb4ee6a4SAndroid Build Coastguard Worker use base::AsRawDescriptor;
18*bb4ee6a4SAndroid Build Coastguard Worker use base::SharedMemory;
19*bb4ee6a4SAndroid Build Coastguard Worker use base::VolatileSlice;
20*bb4ee6a4SAndroid Build Coastguard Worker use sync::Mutex;
21*bb4ee6a4SAndroid Build Coastguard Worker use thiserror::Error as ThisError;
22*bb4ee6a4SAndroid Build Coastguard Worker 
23*bb4ee6a4SAndroid Build Coastguard Worker use crate::file::Error as FileError;
24*bb4ee6a4SAndroid Build Coastguard Worker use crate::file::SwapFile;
25*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::addr_to_page_idx;
26*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::bytes_to_pages;
27*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::is_hugepage_aligned;
28*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::is_page_aligned;
29*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::page_base_addr;
30*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::page_idx_to_addr;
31*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::pages_to_bytes;
32*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::round_up_hugepage_size;
33*bb4ee6a4SAndroid Build Coastguard Worker use crate::pagesize::THP_SIZE;
34*bb4ee6a4SAndroid Build Coastguard Worker use crate::staging::CopyOp;
35*bb4ee6a4SAndroid Build Coastguard Worker use crate::staging::Error as StagingError;
36*bb4ee6a4SAndroid Build Coastguard Worker use crate::staging::StagingMemory;
37*bb4ee6a4SAndroid Build Coastguard Worker use crate::userfaultfd::Error as UffdError;
38*bb4ee6a4SAndroid Build Coastguard Worker use crate::userfaultfd::Userfaultfd;
39*bb4ee6a4SAndroid Build Coastguard Worker use crate::worker::Channel;
40*bb4ee6a4SAndroid Build Coastguard Worker use crate::worker::Task;
41*bb4ee6a4SAndroid Build Coastguard Worker use crate::SwapMetrics;
42*bb4ee6a4SAndroid Build Coastguard Worker 
/// Total budget (in bytes) for pages that may be locked into memory at once during swap-in;
/// converted to pages for [PageHandleContext::mlock_budget_pages].
pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
// NOTE(review): not referenced in this chunk — presumably the threshold for deciding whether to
// prefetch nearby pages on swap-in. TODO confirm with the swap-in path.
const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB

/// Result for PageHandler
pub type Result<T> = std::result::Result<T, Error>;
48*bb4ee6a4SAndroid Build Coastguard Worker 
49*bb4ee6a4SAndroid Build Coastguard Worker /// Errors for PageHandler
50*bb4ee6a4SAndroid Build Coastguard Worker #[derive(ThisError, Debug)]
51*bb4ee6a4SAndroid Build Coastguard Worker pub enum Error {
52*bb4ee6a4SAndroid Build Coastguard Worker     #[error("the address is invalid {0:#018X}")]
53*bb4ee6a4SAndroid Build Coastguard Worker     /// the address is invalid
54*bb4ee6a4SAndroid Build Coastguard Worker     InvalidAddress(usize),
55*bb4ee6a4SAndroid Build Coastguard Worker     #[error("the regions {0:?} and {1:?} overlap")]
56*bb4ee6a4SAndroid Build Coastguard Worker     /// regions are overlaps on registering
57*bb4ee6a4SAndroid Build Coastguard Worker     RegionOverlap(Range<usize>, Range<usize>),
58*bb4ee6a4SAndroid Build Coastguard Worker     #[error("failed to create page handler {0:?}")]
59*bb4ee6a4SAndroid Build Coastguard Worker     /// failed to create page handler
60*bb4ee6a4SAndroid Build Coastguard Worker     CreateFailed(anyhow::Error),
61*bb4ee6a4SAndroid Build Coastguard Worker     #[error("file operation failed : {0:?}")]
62*bb4ee6a4SAndroid Build Coastguard Worker     /// file operation failed
63*bb4ee6a4SAndroid Build Coastguard Worker     File(#[from] FileError),
64*bb4ee6a4SAndroid Build Coastguard Worker     #[error("staging operation failed : {0:?}")]
65*bb4ee6a4SAndroid Build Coastguard Worker     /// staging operation failed
66*bb4ee6a4SAndroid Build Coastguard Worker     Staging(#[from] StagingError),
67*bb4ee6a4SAndroid Build Coastguard Worker     #[error("userfaultfd failed : {0:?}")]
68*bb4ee6a4SAndroid Build Coastguard Worker     /// userfaultfd operation failed
69*bb4ee6a4SAndroid Build Coastguard Worker     Userfaultfd(#[from] UffdError),
70*bb4ee6a4SAndroid Build Coastguard Worker     #[error("failed to iterate data ranges: {0:?}")]
71*bb4ee6a4SAndroid Build Coastguard Worker     /// FileDataIterator failed
72*bb4ee6a4SAndroid Build Coastguard Worker     FileDataIterator(#[from] base::Error),
73*bb4ee6a4SAndroid Build Coastguard Worker }
74*bb4ee6a4SAndroid Build Coastguard Worker 
75*bb4ee6a4SAndroid Build Coastguard Worker /// Remove the memory range on the guest memory.
76*bb4ee6a4SAndroid Build Coastguard Worker ///
77*bb4ee6a4SAndroid Build Coastguard Worker /// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
78*bb4ee6a4SAndroid Build Coastguard Worker /// addresses instead of guest addresses.
79*bb4ee6a4SAndroid Build Coastguard Worker ///
80*bb4ee6a4SAndroid Build Coastguard Worker /// # Safety
81*bb4ee6a4SAndroid Build Coastguard Worker ///
82*bb4ee6a4SAndroid Build Coastguard Worker /// The memory range must be on the guest memory.
83*bb4ee6a4SAndroid Build Coastguard Worker #[deny(unsafe_op_in_unsafe_fn)]
remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error>84*bb4ee6a4SAndroid Build Coastguard Worker unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
85*bb4ee6a4SAndroid Build Coastguard Worker     // SAFETY:
86*bb4ee6a4SAndroid Build Coastguard Worker     // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
87*bb4ee6a4SAndroid Build Coastguard Worker     // managed memory.
88*bb4ee6a4SAndroid Build Coastguard Worker     let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
89*bb4ee6a4SAndroid Build Coastguard Worker     if ret < 0 {
90*bb4ee6a4SAndroid Build Coastguard Worker         base::errno_result()
91*bb4ee6a4SAndroid Build Coastguard Worker     } else {
92*bb4ee6a4SAndroid Build Coastguard Worker         Ok(())
93*bb4ee6a4SAndroid Build Coastguard Worker     }
94*bb4ee6a4SAndroid Build Coastguard Worker }
95*bb4ee6a4SAndroid Build Coastguard Worker 
uffd_copy_all( uffd: &Userfaultfd, mut page_addr: usize, mut data_slice: VolatileSlice, wake: bool, ) -> std::result::Result<(), UffdError>96*bb4ee6a4SAndroid Build Coastguard Worker fn uffd_copy_all(
97*bb4ee6a4SAndroid Build Coastguard Worker     uffd: &Userfaultfd,
98*bb4ee6a4SAndroid Build Coastguard Worker     mut page_addr: usize,
99*bb4ee6a4SAndroid Build Coastguard Worker     mut data_slice: VolatileSlice,
100*bb4ee6a4SAndroid Build Coastguard Worker     wake: bool,
101*bb4ee6a4SAndroid Build Coastguard Worker ) -> std::result::Result<(), UffdError> {
102*bb4ee6a4SAndroid Build Coastguard Worker     loop {
103*bb4ee6a4SAndroid Build Coastguard Worker         let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
104*bb4ee6a4SAndroid Build Coastguard Worker         match result {
105*bb4ee6a4SAndroid Build Coastguard Worker             Err(UffdError::PartiallyCopied(copied)) => {
106*bb4ee6a4SAndroid Build Coastguard Worker                 page_addr += copied;
107*bb4ee6a4SAndroid Build Coastguard Worker                 data_slice.advance(copied);
108*bb4ee6a4SAndroid Build Coastguard Worker             }
109*bb4ee6a4SAndroid Build Coastguard Worker             other => {
110*bb4ee6a4SAndroid Build Coastguard Worker                 // Even EEXIST for copy operation should be an error for page fault handling. If
111*bb4ee6a4SAndroid Build Coastguard Worker                 // the page was swapped in before, the page should be cleared from the swap file
112*bb4ee6a4SAndroid Build Coastguard Worker                 // and do `Userfaultfd::zero()` instead.
113*bb4ee6a4SAndroid Build Coastguard Worker                 return other.map(|_| ());
114*bb4ee6a4SAndroid Build Coastguard Worker             }
115*bb4ee6a4SAndroid Build Coastguard Worker         }
116*bb4ee6a4SAndroid Build Coastguard Worker     }
117*bb4ee6a4SAndroid Build Coastguard Worker }
118*bb4ee6a4SAndroid Build Coastguard Worker 
/// [Region] represents a memory region and corresponding [SwapFile].
struct Region {
    /// the head page index of the region.
    head_page_idx: usize,
    /// the index within the swap file of this region's first page.
    base_page_idx_in_file: usize,
    /// the number of pages in the region.
    num_pages: usize,
    /// the staging memory backing this region's moved-out page contents.
    staging_memory: StagingMemory,
    /// the count of pages copied back from the swap file on page faults.
    copied_from_file_pages: usize,
    /// the count of pages copied back from the staging memory on page faults.
    copied_from_staging_pages: usize,
    /// the count of pages filled with zeros on page faults.
    zeroed_pages: usize,
    // NOTE(review): not written in this chunk — presumably counts pages restored by a
    // background swap-in path. TODO confirm against the swap-in code.
    swap_in_pages: usize,
    /// the amount of pages which were already initialized on page faults.
    redundant_pages: usize,
}
133*bb4ee6a4SAndroid Build Coastguard Worker 
/// MoveToStaging copies chunks of consecutive pages next to each other on the guest memory to the
/// staging memory and removes the chunks on the guest memory.
pub struct MoveToStaging {
    /// guest-memory address range to be released (madvise MADV_REMOVE) once all copies complete.
    remove_area: Range<usize>,
    /// copy operations that move page contents into the staging memory.
    copies: Vec<CopyOp>,
}
141*bb4ee6a4SAndroid Build Coastguard Worker impl Task for MoveToStaging {
execute(self)142*bb4ee6a4SAndroid Build Coastguard Worker     fn execute(self) {
143*bb4ee6a4SAndroid Build Coastguard Worker         for copy_op in self.copies {
144*bb4ee6a4SAndroid Build Coastguard Worker             copy_op.execute();
145*bb4ee6a4SAndroid Build Coastguard Worker         }
146*bb4ee6a4SAndroid Build Coastguard Worker         // Remove chunks of pages at once to reduce madvise(2) syscall.
147*bb4ee6a4SAndroid Build Coastguard Worker         // SAFETY:
148*bb4ee6a4SAndroid Build Coastguard Worker         // Safe because the region is already backed by the file and the content will be
149*bb4ee6a4SAndroid Build Coastguard Worker         // swapped in on a page fault.
150*bb4ee6a4SAndroid Build Coastguard Worker         let result = unsafe {
151*bb4ee6a4SAndroid Build Coastguard Worker             remove_memory(
152*bb4ee6a4SAndroid Build Coastguard Worker                 self.remove_area.start,
153*bb4ee6a4SAndroid Build Coastguard Worker                 self.remove_area.end - self.remove_area.start,
154*bb4ee6a4SAndroid Build Coastguard Worker             )
155*bb4ee6a4SAndroid Build Coastguard Worker         };
156*bb4ee6a4SAndroid Build Coastguard Worker         if let Err(e) = result {
157*bb4ee6a4SAndroid Build Coastguard Worker             panic!("failed to remove memory: {:?}", e);
158*bb4ee6a4SAndroid Build Coastguard Worker         }
159*bb4ee6a4SAndroid Build Coastguard Worker     }
160*bb4ee6a4SAndroid Build Coastguard Worker }
161*bb4ee6a4SAndroid Build Coastguard Worker 
/// Mutable state shared by all [PageHandler] operations, guarded by a single [Mutex].
struct PageHandleContext<'a> {
    /// the swap file holding swapped-out page contents.
    file: SwapFile<'a>,
    /// all registered memory regions.
    regions: Vec<Region>,
    /// remaining page budget derived from [MLOCK_BUDGET]; replenished when the swap file
    /// reports pages munlocked by clear/free operations.
    mlock_budget_pages: usize,
}
167*bb4ee6a4SAndroid Build Coastguard Worker 
/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap out requests.
/// All the addresses and sizes in bytes are converted to page id internally.
pub struct PageHandler<'a> {
    /// all mutable state (swap file, regions, mlock budget) behind one lock.
    ctx: Mutex<PageHandleContext<'a>>,
    /// channel used to submit [MoveToStaging] tasks to a worker.
    channel: Arc<Channel<MoveToStaging>>,
}
176*bb4ee6a4SAndroid Build Coastguard Worker 
177*bb4ee6a4SAndroid Build Coastguard Worker impl<'a> PageHandler<'a> {
178*bb4ee6a4SAndroid Build Coastguard Worker     /// Creates [PageHandler] for the given region.
179*bb4ee6a4SAndroid Build Coastguard Worker     ///
180*bb4ee6a4SAndroid Build Coastguard Worker     /// If any of regions overlaps, this returns [Error::RegionOverlap].
181*bb4ee6a4SAndroid Build Coastguard Worker     ///
182*bb4ee6a4SAndroid Build Coastguard Worker     /// # Arguments
183*bb4ee6a4SAndroid Build Coastguard Worker     ///
184*bb4ee6a4SAndroid Build Coastguard Worker     /// * `swap_file` - The swap file.
185*bb4ee6a4SAndroid Build Coastguard Worker     /// * `staging_shmem` - The staging memory. It must have enough size to hold guest memory.
186*bb4ee6a4SAndroid Build Coastguard Worker     ///   Otherwise monitor process crashes on creating a mmap.
187*bb4ee6a4SAndroid Build Coastguard Worker     /// * `address_ranges` - The list of address range of the regions. the start address must align
188*bb4ee6a4SAndroid Build Coastguard Worker     ///   with page. the size must be multiple of pagesize.
create( swap_file: &'a File, staging_shmem: &'a SharedMemory, address_ranges: &[Range<usize>], stating_move_context: Arc<Channel<MoveToStaging>>, ) -> Result<Self>189*bb4ee6a4SAndroid Build Coastguard Worker     pub fn create(
190*bb4ee6a4SAndroid Build Coastguard Worker         swap_file: &'a File,
191*bb4ee6a4SAndroid Build Coastguard Worker         staging_shmem: &'a SharedMemory,
192*bb4ee6a4SAndroid Build Coastguard Worker         address_ranges: &[Range<usize>],
193*bb4ee6a4SAndroid Build Coastguard Worker         stating_move_context: Arc<Channel<MoveToStaging>>,
194*bb4ee6a4SAndroid Build Coastguard Worker     ) -> Result<Self> {
195*bb4ee6a4SAndroid Build Coastguard Worker         // Truncate the file into the size to hold all regions, otherwise access beyond the end of
196*bb4ee6a4SAndroid Build Coastguard Worker         // file may cause SIGBUS.
197*bb4ee6a4SAndroid Build Coastguard Worker         swap_file
198*bb4ee6a4SAndroid Build Coastguard Worker             .set_len(
199*bb4ee6a4SAndroid Build Coastguard Worker                 address_ranges
200*bb4ee6a4SAndroid Build Coastguard Worker                     .iter()
201*bb4ee6a4SAndroid Build Coastguard Worker                     .map(|r| (r.end.saturating_sub(r.start)) as u64)
202*bb4ee6a4SAndroid Build Coastguard Worker                     .sum(),
203*bb4ee6a4SAndroid Build Coastguard Worker             )
204*bb4ee6a4SAndroid Build Coastguard Worker             .context("truncate swap file")
205*bb4ee6a4SAndroid Build Coastguard Worker             .map_err(Error::CreateFailed)?;
206*bb4ee6a4SAndroid Build Coastguard Worker 
207*bb4ee6a4SAndroid Build Coastguard Worker         let mut regions: Vec<Region> = Vec::new();
208*bb4ee6a4SAndroid Build Coastguard Worker         let mut offset_pages = 0;
209*bb4ee6a4SAndroid Build Coastguard Worker         for address_range in address_ranges {
210*bb4ee6a4SAndroid Build Coastguard Worker             let head_page_idx = addr_to_page_idx(address_range.start);
211*bb4ee6a4SAndroid Build Coastguard Worker             if address_range.end < address_range.start {
212*bb4ee6a4SAndroid Build Coastguard Worker                 return Err(Error::CreateFailed(anyhow::anyhow!(
213*bb4ee6a4SAndroid Build Coastguard Worker                     "invalid region end < start"
214*bb4ee6a4SAndroid Build Coastguard Worker                 )));
215*bb4ee6a4SAndroid Build Coastguard Worker             }
216*bb4ee6a4SAndroid Build Coastguard Worker             let region_size = address_range.end - address_range.start;
217*bb4ee6a4SAndroid Build Coastguard Worker             let num_pages = bytes_to_pages(region_size);
218*bb4ee6a4SAndroid Build Coastguard Worker 
219*bb4ee6a4SAndroid Build Coastguard Worker             // Find an overlapping region
220*bb4ee6a4SAndroid Build Coastguard Worker             match regions.iter().position(|region| {
221*bb4ee6a4SAndroid Build Coastguard Worker                 if region.head_page_idx < head_page_idx {
222*bb4ee6a4SAndroid Build Coastguard Worker                     region.head_page_idx + region.num_pages > head_page_idx
223*bb4ee6a4SAndroid Build Coastguard Worker                 } else {
224*bb4ee6a4SAndroid Build Coastguard Worker                     region.head_page_idx < head_page_idx + num_pages
225*bb4ee6a4SAndroid Build Coastguard Worker                 }
226*bb4ee6a4SAndroid Build Coastguard Worker             }) {
227*bb4ee6a4SAndroid Build Coastguard Worker                 Some(i) => {
228*bb4ee6a4SAndroid Build Coastguard Worker                     let region = &regions[i];
229*bb4ee6a4SAndroid Build Coastguard Worker 
230*bb4ee6a4SAndroid Build Coastguard Worker                     return Err(Error::RegionOverlap(
231*bb4ee6a4SAndroid Build Coastguard Worker                         address_range.clone(),
232*bb4ee6a4SAndroid Build Coastguard Worker                         page_idx_to_addr(region.head_page_idx)
233*bb4ee6a4SAndroid Build Coastguard Worker                             ..(page_idx_to_addr(region.head_page_idx + region.num_pages)),
234*bb4ee6a4SAndroid Build Coastguard Worker                     ));
235*bb4ee6a4SAndroid Build Coastguard Worker                 }
236*bb4ee6a4SAndroid Build Coastguard Worker                 None => {
237*bb4ee6a4SAndroid Build Coastguard Worker                     let base_addr = address_range.start;
238*bb4ee6a4SAndroid Build Coastguard Worker                     assert!(is_page_aligned(base_addr));
239*bb4ee6a4SAndroid Build Coastguard Worker                     assert!(is_page_aligned(region_size));
240*bb4ee6a4SAndroid Build Coastguard Worker 
241*bb4ee6a4SAndroid Build Coastguard Worker                     let staging_memory = StagingMemory::new(
242*bb4ee6a4SAndroid Build Coastguard Worker                         staging_shmem,
243*bb4ee6a4SAndroid Build Coastguard Worker                         pages_to_bytes(offset_pages) as u64,
244*bb4ee6a4SAndroid Build Coastguard Worker                         num_pages,
245*bb4ee6a4SAndroid Build Coastguard Worker                     )?;
246*bb4ee6a4SAndroid Build Coastguard Worker                     regions.push(Region {
247*bb4ee6a4SAndroid Build Coastguard Worker                         head_page_idx,
248*bb4ee6a4SAndroid Build Coastguard Worker                         base_page_idx_in_file: offset_pages,
249*bb4ee6a4SAndroid Build Coastguard Worker                         num_pages,
250*bb4ee6a4SAndroid Build Coastguard Worker                         staging_memory,
251*bb4ee6a4SAndroid Build Coastguard Worker                         copied_from_file_pages: 0,
252*bb4ee6a4SAndroid Build Coastguard Worker                         copied_from_staging_pages: 0,
253*bb4ee6a4SAndroid Build Coastguard Worker                         zeroed_pages: 0,
254*bb4ee6a4SAndroid Build Coastguard Worker                         swap_in_pages: 0,
255*bb4ee6a4SAndroid Build Coastguard Worker                         redundant_pages: 0,
256*bb4ee6a4SAndroid Build Coastguard Worker                     });
257*bb4ee6a4SAndroid Build Coastguard Worker                     offset_pages += num_pages;
258*bb4ee6a4SAndroid Build Coastguard Worker                 }
259*bb4ee6a4SAndroid Build Coastguard Worker             }
260*bb4ee6a4SAndroid Build Coastguard Worker         }
261*bb4ee6a4SAndroid Build Coastguard Worker 
262*bb4ee6a4SAndroid Build Coastguard Worker         let file = SwapFile::new(swap_file, offset_pages)?;
263*bb4ee6a4SAndroid Build Coastguard Worker 
264*bb4ee6a4SAndroid Build Coastguard Worker         Ok(Self {
265*bb4ee6a4SAndroid Build Coastguard Worker             ctx: Mutex::new(PageHandleContext {
266*bb4ee6a4SAndroid Build Coastguard Worker                 file,
267*bb4ee6a4SAndroid Build Coastguard Worker                 regions,
268*bb4ee6a4SAndroid Build Coastguard Worker                 mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
269*bb4ee6a4SAndroid Build Coastguard Worker             }),
270*bb4ee6a4SAndroid Build Coastguard Worker             channel: stating_move_context,
271*bb4ee6a4SAndroid Build Coastguard Worker         })
272*bb4ee6a4SAndroid Build Coastguard Worker     }
273*bb4ee6a4SAndroid Build Coastguard Worker 
find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region>274*bb4ee6a4SAndroid Build Coastguard Worker     fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> {
275*bb4ee6a4SAndroid Build Coastguard Worker         // sequential search the corresponding page map from the list. It should be fast enough
276*bb4ee6a4SAndroid Build Coastguard Worker         // because there are a few regions (usually only 1).
277*bb4ee6a4SAndroid Build Coastguard Worker         regions.iter_mut().find(|region| {
278*bb4ee6a4SAndroid Build Coastguard Worker             region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages
279*bb4ee6a4SAndroid Build Coastguard Worker         })
280*bb4ee6a4SAndroid Build Coastguard Worker     }
281*bb4ee6a4SAndroid Build Coastguard Worker 
    /// Fills the faulted page with zero if the page is not initialized, with the content in the
    /// swap file if the page is swapped out.
    ///
    /// The three sources are tried in order: staging memory, then swap file, then a zero page.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
    /// * `address` - the address that triggered the page fault.
    pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
        let page_idx = addr_to_page_idx(address);
        // the head address of the page.
        let page_addr = page_base_addr(address);
        let page_size = pages_to_bytes(1);
        let mut ctx = self.ctx.lock();
        // Destructure so `regions` and `file` can be borrowed mutably at the same time.
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?;

        let idx_in_region = page_idx - region.head_page_idx;
        let idx_in_file = idx_in_region + region.base_page_idx_in_file;
        if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
            // The page is still in staging memory: copy it back (waking the faulting thread)
            // and drop the staging copy, since the guest copy is now the live one.
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            region
                .staging_memory
                .clear_range(idx_in_region..idx_in_region + 1)?;
            region.copied_from_staging_pages += 1;
            Ok(())
        } else if let Some(page_slice) = file.page_content(idx_in_file, false)? {
            // TODO(kawasin): Unlock regions to proceed swap-in operation background.
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            // Do not erase the page from the disk for trimming optimization on next swap out.
            let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?;
            region.copied_from_file_pages += 1;
            // Pages munlocked by the clear return to the budget for future swap-in work.
            ctx.mlock_budget_pages += munlocked_pages;
            Ok(())
        } else {
            // Map a zero page since no swap file has been created yet but the fault
            // happened.
            // safe because the fault page is notified by uffd.
            let result = uffd.zero(page_addr, page_size, true);
            match result {
                Ok(_) => {
                    region.zeroed_pages += 1;
                    Ok(())
                }
                Err(UffdError::PageExist) => {
                    // This case can happen if page faults on the same page happen on different
                    // processes.
                    uffd.wake(page_addr, page_size)?;
                    region.redundant_pages += 1;
                    Ok(())
                }
                Err(e) => Err(e.into()),
            }
        }
    }
338*bb4ee6a4SAndroid Build Coastguard Worker 
    /// Clear the internal state for the pages.
    ///
    /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
    /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
    ///
    /// In crosvm, balloon frees the guest memory and cause `UFFD_EVENT_REMOVE`.
    ///
    /// # Arguments
    ///
    /// * `start_addr` - the head address of the memory area to be freed.
    /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` tells the
    ///   head address of the next memory area of the freed area. (i.e. the exact tail address of
    ///   the memory area is `end_addr - 1`.)
    pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
        // Both boundaries must be page-aligned; report whichever one is not.
        if !is_page_aligned(start_addr) {
            return Err(Error::InvalidAddress(start_addr));
        } else if !is_page_aligned(end_addr) {
            return Err(Error::InvalidAddress(end_addr));
        }
        let start_page_idx = addr_to_page_idx(start_addr);
        // `end_addr` is exclusive, so this is one past the last removed page.
        let last_page_idx = addr_to_page_idx(end_addr);
        let mut ctx = self.ctx.lock();
        // TODO(b/269983521): Clear multiple pages in the same region at once.
        for page_idx in start_page_idx..(last_page_idx) {
            let page_addr = page_idx_to_addr(page_idx);
            // TODO(kawasin): Cache the position if the range does not span multiple regions.
            let region = Self::find_region(&mut ctx.regions, page_idx)
                .ok_or(Error::InvalidAddress(page_addr))?;
            let idx_in_region = page_idx - region.head_page_idx;
            let idx_range = idx_in_region..idx_in_region + 1;
            // Best-effort: failing to drop the staging copy is logged but not propagated,
            // so the remaining pages in the range are still processed.
            if let Err(e) = region.staging_memory.clear_range(idx_range) {
                error!("failed to clear removed page from staging: {:?}", e);
            }
            let idx_in_file = idx_in_region + region.base_page_idx_in_file;
            let idx_range = idx_in_file..idx_in_file + 1;
            // Erase the pages from the disk because the pages are removed from the guest memory.
            let munlocked_pages = ctx.file.free_range(idx_range)?;
            // Any pages munlocked by the free go back to the mlock budget.
            ctx.mlock_budget_pages += munlocked_pages;
        }
        Ok(())
    }
380*bb4ee6a4SAndroid Build Coastguard Worker 
    /// Move active pages in the memory region to the staging memory.
    ///
    /// It only moves active contents in the guest memory to the staging memory and skips empty
    /// pages (e.g. pages not touched, freed by balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
    ///
    /// Returns the count of moved out pages.
    ///
    /// # Arguments
    ///
    /// * `base_addr` - the head address of the memory region.
    /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
    /// * `base_offset` - the offset of the memory region in the memfd.
    ///
    /// # Safety
    ///
    /// The region must have been registered to all userfaultfd of processes which may touch the
    /// region.
    ///
    /// The memory must be protected not to be updated while moving.
    ///
    /// The page fault events for the region from the userfaultfd must be handled by
    /// [Self::handle_page_fault()].
    ///
    /// Must call [Channel::wait_complete()] to wait all the copy operation complete within the
    /// memory protection period.
    #[deny(unsafe_op_in_unsafe_fn)]
    pub unsafe fn move_to_staging<T>(
        &self,
        base_addr: usize,
        memfd: &T,
        base_offset: u64,
    ) -> Result<usize>
    where
        T: AsRawDescriptor,
    {
        let hugepage_size = *THP_SIZE;
        let mut ctx = self.ctx.lock();
        let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
            .ok_or(Error::InvalidAddress(base_addr))?;

        // `base_addr` must point at the head of a registered region, not into its middle.
        if page_idx_to_addr(region.head_page_idx) != base_addr {
            return Err(Error::InvalidAddress(base_addr));
        }
        let region_size = pages_to_bytes(region.num_pages);
        // Iterates only the populated (non-hole) extents of the memfd.
        let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
        let mut moved_size = 0;
        // Pending copy operations for the current batch. Each batch is pushed to `self.channel`
        // as a single `MoveToStaging` together with the one area to remove from guest memory.
        let mut copies = Vec::new();
        // A batch accumulates up to `hugepage_size` bytes of data before it is flushed.
        let mut remaining_batch_size = hugepage_size;
        let mut batch_head_offset = 0;
        // Holds the unconsumed remainder of a chunk split below, so the next iteration resumes
        // from it without issuing another lseek(2) through `file_data`.
        let mut cur_data = None;
        while let Some(data_range) = cur_data
            .take()
            .map(Ok)
            .or_else(|| file_data.next())
            .transpose()
            .map_err(Error::FileDataIterator)?
        {
            // Assert offset is page aligned
            let offset = (data_range.start - base_offset) as usize;
            assert!(is_page_aligned(offset));

            // The chunk size must be within usize since the chunk is within the guest memory.
            let chunk_size = (data_range.end - data_range.start) as usize;
            let data_range = if chunk_size > remaining_batch_size {
                // Split the chunk if it is bigger than remaining_batch_size.

                let split_size = if chunk_size >= hugepage_size {
                    // If the chunk size is bigger than or equal to huge page size, the chunk may
                    // contain a huge page. If we MADV_REMOVE a huge page partially, it can cause
                    // inconsistency between the actual page table and vmm-swap internal state.
                    let chunk_addr = base_addr + offset;
                    if !is_hugepage_aligned(chunk_addr) {
                        // Split the chunk before where a huge page could start.
                        std::cmp::min(
                            round_up_hugepage_size(chunk_addr) - chunk_addr,
                            remaining_batch_size,
                        )
                    } else {
                        if remaining_batch_size < hugepage_size {
                            // Remove the batch since it does not have enough room for a huge page.
                            self.channel.push(MoveToStaging {
                                remove_area: base_addr + batch_head_offset..base_addr + offset,
                                copies: mem::take(&mut copies),
                            });
                            remaining_batch_size = hugepage_size;
                            batch_head_offset = offset;
                        }
                        hugepage_size
                    }
                } else {
                    remaining_batch_size
                };
                // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
                cur_data = Some(data_range.start + split_size as u64..data_range.end);
                data_range.start..data_range.start + split_size as u64
            } else {
                data_range
            };

            let size = (data_range.end - data_range.start) as usize;
            assert!(is_page_aligned(size));

            // SAFETY:
            // Safe because:
            // * src_addr is aligned with page size
            // * the data_range starting from src_addr is on the guest memory.
            let copy_op = unsafe {
                region.staging_memory.copy(
                    (base_addr + offset) as *const u8,
                    bytes_to_pages(offset),
                    bytes_to_pages(size),
                )?
            };
            copies.push(copy_op);

            moved_size += size;
            // The size must be smaller than or equal to remaining_batch_size.
            remaining_batch_size -= size;

            if remaining_batch_size == 0 {
                // Remove the batch of pages at once to reduce madvise(2) syscall.
                self.channel.push(MoveToStaging {
                    remove_area: base_addr + batch_head_offset..base_addr + offset + size,
                    copies: mem::take(&mut copies),
                });
                remaining_batch_size = hugepage_size;
                batch_head_offset = offset + size;
            }
        }
        // Remove the final batch of pages.
        self.channel.push(MoveToStaging {
            remove_area: base_addr + batch_head_offset..base_addr + region_size,
            copies,
        });

        // The whole region has been moved out; reset all the per-region counters.
        region.copied_from_file_pages = 0;
        region.copied_from_staging_pages = 0;
        region.zeroed_pages = 0;
        region.swap_in_pages = 0;
        region.redundant_pages = 0;

        Ok(bytes_to_pages(moved_size))
    }
524*bb4ee6a4SAndroid Build Coastguard Worker 
525*bb4ee6a4SAndroid Build Coastguard Worker     /// Write a chunk of consecutive pages in the staging memory to the swap file.
526*bb4ee6a4SAndroid Build Coastguard Worker     ///
527*bb4ee6a4SAndroid Build Coastguard Worker     /// If there is no active pages in the staging memory, this returns `Ok(0)`.
528*bb4ee6a4SAndroid Build Coastguard Worker     ///
529*bb4ee6a4SAndroid Build Coastguard Worker     /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
530*bb4ee6a4SAndroid Build Coastguard Worker     ///
531*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns the count of swapped out pages.
532*bb4ee6a4SAndroid Build Coastguard Worker     ///
533*bb4ee6a4SAndroid Build Coastguard Worker     /// Even if swap_out fails on any internal steps, it does not break the page state management
534*bb4ee6a4SAndroid Build Coastguard Worker     /// and `PageHandler` can continue working with a little pages leaking in staging memory or swap
535*bb4ee6a4SAndroid Build Coastguard Worker     /// file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is dropped.
536*bb4ee6a4SAndroid Build Coastguard Worker     ///
537*bb4ee6a4SAndroid Build Coastguard Worker     /// # Arguments
538*bb4ee6a4SAndroid Build Coastguard Worker     ///
539*bb4ee6a4SAndroid Build Coastguard Worker     /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
540*bb4ee6a4SAndroid Build Coastguard Worker     ///   chunk is splitted if it is bigger than `max_size`.
swap_out(&self, max_size: usize) -> Result<usize>541*bb4ee6a4SAndroid Build Coastguard Worker     pub fn swap_out(&self, max_size: usize) -> Result<usize> {
542*bb4ee6a4SAndroid Build Coastguard Worker         let max_pages = bytes_to_pages(max_size);
543*bb4ee6a4SAndroid Build Coastguard Worker         let mut ctx = self.ctx.lock();
544*bb4ee6a4SAndroid Build Coastguard Worker         let PageHandleContext { regions, file, .. } = &mut *ctx;
545*bb4ee6a4SAndroid Build Coastguard Worker         for region in regions.iter_mut() {
546*bb4ee6a4SAndroid Build Coastguard Worker             if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
547*bb4ee6a4SAndroid Build Coastguard Worker                 let idx_range_in_file = idx_range.start + region.base_page_idx_in_file
548*bb4ee6a4SAndroid Build Coastguard Worker                     ..idx_range.end + region.base_page_idx_in_file;
549*bb4ee6a4SAndroid Build Coastguard Worker                 let pages = idx_range.end - idx_range.start;
550*bb4ee6a4SAndroid Build Coastguard Worker                 let slice = region.staging_memory.get_slice(idx_range.clone())?;
551*bb4ee6a4SAndroid Build Coastguard Worker                 // Convert VolatileSlice to &[u8]
552*bb4ee6a4SAndroid Build Coastguard Worker                 // SAFETY:
553*bb4ee6a4SAndroid Build Coastguard Worker                 // Safe because the range of volatile slice is already validated.
554*bb4ee6a4SAndroid Build Coastguard Worker                 let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
555*bb4ee6a4SAndroid Build Coastguard Worker                 file.write_to_file(idx_range_in_file.start, slice)?;
556*bb4ee6a4SAndroid Build Coastguard Worker                 // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunk at
557*bb4ee6a4SAndroid Build Coastguard Worker                 // once.
558*bb4ee6a4SAndroid Build Coastguard Worker                 region.staging_memory.clear_range(idx_range)?;
559*bb4ee6a4SAndroid Build Coastguard Worker                 // TODO(kawasin): free the page cache of the swap file.
560*bb4ee6a4SAndroid Build Coastguard Worker                 // TODO(kawasin): use writev() to swap_out several small chunks at once.
561*bb4ee6a4SAndroid Build Coastguard Worker                 return Ok(pages);
562*bb4ee6a4SAndroid Build Coastguard Worker             }
563*bb4ee6a4SAndroid Build Coastguard Worker         }
564*bb4ee6a4SAndroid Build Coastguard Worker         Ok(0)
565*bb4ee6a4SAndroid Build Coastguard Worker     }
566*bb4ee6a4SAndroid Build Coastguard Worker 
567*bb4ee6a4SAndroid Build Coastguard Worker     /// Create a new [SwapInContext].
start_swap_in(&'a self) -> SwapInContext<'a>568*bb4ee6a4SAndroid Build Coastguard Worker     pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
569*bb4ee6a4SAndroid Build Coastguard Worker         SwapInContext {
570*bb4ee6a4SAndroid Build Coastguard Worker             ctx: &self.ctx,
571*bb4ee6a4SAndroid Build Coastguard Worker             cur_staging: 0,
572*bb4ee6a4SAndroid Build Coastguard Worker         }
573*bb4ee6a4SAndroid Build Coastguard Worker     }
574*bb4ee6a4SAndroid Build Coastguard Worker 
575*bb4ee6a4SAndroid Build Coastguard Worker     /// Create a new [TrimContext].
start_trim(&'a self) -> TrimContext<'a>576*bb4ee6a4SAndroid Build Coastguard Worker     pub fn start_trim(&'a self) -> TrimContext<'a> {
577*bb4ee6a4SAndroid Build Coastguard Worker         TrimContext {
578*bb4ee6a4SAndroid Build Coastguard Worker             ctx: &self.ctx,
579*bb4ee6a4SAndroid Build Coastguard Worker             cur_page: 0,
580*bb4ee6a4SAndroid Build Coastguard Worker             cur_region: 0,
581*bb4ee6a4SAndroid Build Coastguard Worker             next_data_in_file: 0..0,
582*bb4ee6a4SAndroid Build Coastguard Worker             clean_pages: 0,
583*bb4ee6a4SAndroid Build Coastguard Worker             zero_pages: 0,
584*bb4ee6a4SAndroid Build Coastguard Worker         }
585*bb4ee6a4SAndroid Build Coastguard Worker     }
586*bb4ee6a4SAndroid Build Coastguard Worker 
587*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages copied from vmm-swap file to the guest memory.
compute_copied_from_file_pages(&self) -> usize588*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_copied_from_file_pages(&self) -> usize {
589*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx
590*bb4ee6a4SAndroid Build Coastguard Worker             .lock()
591*bb4ee6a4SAndroid Build Coastguard Worker             .regions
592*bb4ee6a4SAndroid Build Coastguard Worker             .iter()
593*bb4ee6a4SAndroid Build Coastguard Worker             .map(|r| r.copied_from_file_pages)
594*bb4ee6a4SAndroid Build Coastguard Worker             .sum()
595*bb4ee6a4SAndroid Build Coastguard Worker     }
596*bb4ee6a4SAndroid Build Coastguard Worker 
597*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages copied from staging memory to the guest memory.
compute_copied_from_staging_pages(&self) -> usize598*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_copied_from_staging_pages(&self) -> usize {
599*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx
600*bb4ee6a4SAndroid Build Coastguard Worker             .lock()
601*bb4ee6a4SAndroid Build Coastguard Worker             .regions
602*bb4ee6a4SAndroid Build Coastguard Worker             .iter()
603*bb4ee6a4SAndroid Build Coastguard Worker             .map(|r| r.copied_from_staging_pages)
604*bb4ee6a4SAndroid Build Coastguard Worker             .sum()
605*bb4ee6a4SAndroid Build Coastguard Worker     }
606*bb4ee6a4SAndroid Build Coastguard Worker 
607*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages initialized with zero.
compute_zeroed_pages(&self) -> usize608*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_zeroed_pages(&self) -> usize {
609*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
610*bb4ee6a4SAndroid Build Coastguard Worker     }
611*bb4ee6a4SAndroid Build Coastguard Worker 
612*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages which were already initialized on page faults.
compute_redundant_pages(&self) -> usize613*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_redundant_pages(&self) -> usize {
614*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx
615*bb4ee6a4SAndroid Build Coastguard Worker             .lock()
616*bb4ee6a4SAndroid Build Coastguard Worker             .regions
617*bb4ee6a4SAndroid Build Coastguard Worker             .iter()
618*bb4ee6a4SAndroid Build Coastguard Worker             .map(|r| r.redundant_pages)
619*bb4ee6a4SAndroid Build Coastguard Worker             .sum()
620*bb4ee6a4SAndroid Build Coastguard Worker     }
621*bb4ee6a4SAndroid Build Coastguard Worker 
622*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages present in the staging memory.
compute_staging_pages(&self) -> usize623*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_staging_pages(&self) -> usize {
624*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx
625*bb4ee6a4SAndroid Build Coastguard Worker             .lock()
626*bb4ee6a4SAndroid Build Coastguard Worker             .regions
627*bb4ee6a4SAndroid Build Coastguard Worker             .iter()
628*bb4ee6a4SAndroid Build Coastguard Worker             .map(|r| r.staging_memory.present_pages())
629*bb4ee6a4SAndroid Build Coastguard Worker             .sum()
630*bb4ee6a4SAndroid Build Coastguard Worker     }
631*bb4ee6a4SAndroid Build Coastguard Worker 
632*bb4ee6a4SAndroid Build Coastguard Worker     /// Returns count of pages present in the swap files.
compute_swap_pages(&self) -> usize633*bb4ee6a4SAndroid Build Coastguard Worker     fn compute_swap_pages(&self) -> usize {
634*bb4ee6a4SAndroid Build Coastguard Worker         self.ctx.lock().file.present_pages()
635*bb4ee6a4SAndroid Build Coastguard Worker     }
636*bb4ee6a4SAndroid Build Coastguard Worker 
637*bb4ee6a4SAndroid Build Coastguard Worker     /// Fill [SwapMetrics] with page handler metrics.
load_metrics(&self, metrics: &mut SwapMetrics)638*bb4ee6a4SAndroid Build Coastguard Worker     pub fn load_metrics(&self, metrics: &mut SwapMetrics) {
639*bb4ee6a4SAndroid Build Coastguard Worker         metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64;
640*bb4ee6a4SAndroid Build Coastguard Worker         metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64;
641*bb4ee6a4SAndroid Build Coastguard Worker         metrics.zeroed_pages = self.compute_zeroed_pages() as u64;
642*bb4ee6a4SAndroid Build Coastguard Worker         metrics.redundant_pages = self.compute_redundant_pages() as u64;
643*bb4ee6a4SAndroid Build Coastguard Worker         metrics.staging_pages = self.compute_staging_pages() as u64;
644*bb4ee6a4SAndroid Build Coastguard Worker         metrics.swap_pages = self.compute_swap_pages() as u64;
645*bb4ee6a4SAndroid Build Coastguard Worker     }
646*bb4ee6a4SAndroid Build Coastguard Worker }
647*bb4ee6a4SAndroid Build Coastguard Worker 
/// Context for swap-in operation.
///
/// This holds cursor of indices in the regions for each step for optimization.
pub struct SwapInContext<'a> {
    /// The page handler state shared with [PageHandler] (regions and the swap file).
    ctx: &'a Mutex<PageHandleContext<'a>>,
    /// Index of the first region whose staging memory may still contain pages to swap in;
    /// regions before this index have already been fully drained.
    cur_staging: usize,
}
655*bb4ee6a4SAndroid Build Coastguard Worker 
impl SwapInContext<'_> {
    /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
    ///
    /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
    ///
    /// Returns the count of swapped in pages.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the main [Userfaultfd].
    /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The
    ///   chunk is split if it is bigger than `max_size`.
    pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
        let mut ctx = self.ctx.lock();
        // Request the kernel to pre-populate the present pages in the swap file to page cache
        // background. At most 16MB of pages will be populated.
        // The threshold is to apply MADV_WILLNEED to bigger chunk of pages. The kernel populates
        // consecutive pages at once on MADV_WILLNEED.
        if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
            let mlock_budget_pages = ctx.mlock_budget_pages;
            let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?;
            ctx.mlock_budget_pages -= locked_pages;
        }

        let max_pages = bytes_to_pages(max_size);
        // First, drain the staging memory. `cur_staging` remembers the first region which may
        // still have data in its staging memory, so regions already drained by earlier calls are
        // not rescanned.
        for region in ctx.regions[self.cur_staging..].iter_mut() {
            // TODO(kawasin): swap_in multiple chunks less than max_size at once.
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Clear the staging memory to avoid memory spike.
                // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
                // at once.
                region.staging_memory.clear_range(idx_range)?;
                region.swap_in_pages += pages;
                return Ok(pages);
            }
            self.cur_staging += 1;
        }

        // All staging memory is drained; swap in the remaining pages from the swap file.
        if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) {
            let PageHandleContext { regions, file, .. } = &mut *ctx;
            for region in regions.iter_mut() {
                // Find the region that owns the head page of this file chunk.
                let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages;
                if idx_range_in_file.start >= region_tail_idx_in_file {
                    continue;
                } else if idx_range_in_file.start < region.base_page_idx_in_file {
                    return Err(Error::File(FileError::OutOfRange));
                } else if idx_range_in_file.end > region_tail_idx_in_file {
                    // The consecutive pages can be across regions. Swap-in pages in a region at
                    // once.
                    idx_range_in_file.end = region_tail_idx_in_file;
                }
                let pages = idx_range_in_file.end - idx_range_in_file.start;
                let page_addr = page_idx_to_addr(
                    idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx,
                );
                let slice = file.get_slice(idx_range_in_file.clone())?;
                // TODO(kawasin): Unlock regions to proceed page fault handling on the main thread.
                //                We also need to handle the EEXIST error from UFFD_COPY.
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Do not erase each chunk of pages from disk on swap_in. The whole file will be
                // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
                // disk contents help the trimming optimization on swap_out.
                let munlocked_pages = file.clear_range(idx_range_in_file)?;
                region.swap_in_pages += pages;
                // Pages released from the prefetch mlock area replenish the budget.
                ctx.mlock_budget_pages += munlocked_pages;
                return Ok(pages);
            }
            // File has remaining pages, but regions has been consumed.
            return Err(Error::File(FileError::OutOfRange));
        }

        Ok(0)
    }
}
734*bb4ee6a4SAndroid Build Coastguard Worker 
735*bb4ee6a4SAndroid Build Coastguard Worker impl Drop for SwapInContext<'_> {
drop(&mut self)736*bb4ee6a4SAndroid Build Coastguard Worker     fn drop(&mut self) {
737*bb4ee6a4SAndroid Build Coastguard Worker         let mut ctx = self.ctx.lock();
738*bb4ee6a4SAndroid Build Coastguard Worker         if let Err(e) = ctx.file.clear_mlock() {
739*bb4ee6a4SAndroid Build Coastguard Worker             panic!("failed to clear mlock: {:?}", e);
740*bb4ee6a4SAndroid Build Coastguard Worker         }
741*bb4ee6a4SAndroid Build Coastguard Worker         ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
742*bb4ee6a4SAndroid Build Coastguard Worker     }
743*bb4ee6a4SAndroid Build Coastguard Worker }
744*bb4ee6a4SAndroid Build Coastguard Worker 
745*bb4ee6a4SAndroid Build Coastguard Worker /// Context for trim operation.
746*bb4ee6a4SAndroid Build Coastguard Worker ///
747*bb4ee6a4SAndroid Build Coastguard Worker /// This drops 2 types of pages in the staging memory to reduce disk write.
748*bb4ee6a4SAndroid Build Coastguard Worker ///
749*bb4ee6a4SAndroid Build Coastguard Worker /// * Clean pages
750*bb4ee6a4SAndroid Build Coastguard Worker ///   * The pages which have been swapped out to the disk and have not been changed.
751*bb4ee6a4SAndroid Build Coastguard Worker ///   * Drop the pages in the staging memory and mark it as present on the swap file.
752*bb4ee6a4SAndroid Build Coastguard Worker /// * Zero pages
753*bb4ee6a4SAndroid Build Coastguard Worker ///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
754*bb4ee6a4SAndroid Build Coastguard Worker pub struct TrimContext<'a> {
755*bb4ee6a4SAndroid Build Coastguard Worker     ctx: &'a Mutex<PageHandleContext<'a>>,
756*bb4ee6a4SAndroid Build Coastguard Worker     cur_region: usize,
757*bb4ee6a4SAndroid Build Coastguard Worker     cur_page: usize,
758*bb4ee6a4SAndroid Build Coastguard Worker     /// The page idx range of pages which have been stored in the swap file.
759*bb4ee6a4SAndroid Build Coastguard Worker     next_data_in_file: Range<usize>,
760*bb4ee6a4SAndroid Build Coastguard Worker     clean_pages: usize,
761*bb4ee6a4SAndroid Build Coastguard Worker     zero_pages: usize,
762*bb4ee6a4SAndroid Build Coastguard Worker }
763*bb4ee6a4SAndroid Build Coastguard Worker 
764*bb4ee6a4SAndroid Build Coastguard Worker impl TrimContext<'_> {
765*bb4ee6a4SAndroid Build Coastguard Worker     /// Trim pages in the staging memory.
766*bb4ee6a4SAndroid Build Coastguard Worker     ///
767*bb4ee6a4SAndroid Build Coastguard Worker     /// This returns the pages trimmed. This returns `None` if it traversed all pages in the staging
768*bb4ee6a4SAndroid Build Coastguard Worker     /// memory.
769*bb4ee6a4SAndroid Build Coastguard Worker     ///
770*bb4ee6a4SAndroid Build Coastguard Worker     /// # Arguments
771*bb4ee6a4SAndroid Build Coastguard Worker     ///
772*bb4ee6a4SAndroid Build Coastguard Worker     /// `max_size` - The maximum pages to be compared.
trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>>773*bb4ee6a4SAndroid Build Coastguard Worker     pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
774*bb4ee6a4SAndroid Build Coastguard Worker         let mut ctx = self.ctx.lock();
775*bb4ee6a4SAndroid Build Coastguard Worker         if self.cur_region >= ctx.regions.len() {
776*bb4ee6a4SAndroid Build Coastguard Worker             return Ok(None);
777*bb4ee6a4SAndroid Build Coastguard Worker         }
778*bb4ee6a4SAndroid Build Coastguard Worker         let PageHandleContext { regions, file, .. } = &mut *ctx;
779*bb4ee6a4SAndroid Build Coastguard Worker         let region = &mut regions[self.cur_region];
780*bb4ee6a4SAndroid Build Coastguard Worker         let mut n_trimmed = 0;
781*bb4ee6a4SAndroid Build Coastguard Worker 
782*bb4ee6a4SAndroid Build Coastguard Worker         for _ in 0..max_pages {
783*bb4ee6a4SAndroid Build Coastguard Worker             if let Some(slice_in_staging) = region
784*bb4ee6a4SAndroid Build Coastguard Worker                 .staging_memory
785*bb4ee6a4SAndroid Build Coastguard Worker                 .page_content(self.cur_page)
786*bb4ee6a4SAndroid Build Coastguard Worker                 .context("get page of staging memory")?
787*bb4ee6a4SAndroid Build Coastguard Worker             {
788*bb4ee6a4SAndroid Build Coastguard Worker                 let idx_range = self.cur_page..self.cur_page + 1;
789*bb4ee6a4SAndroid Build Coastguard Worker                 let idx_in_file = idx_range.start + region.base_page_idx_in_file;
790*bb4ee6a4SAndroid Build Coastguard Worker 
791*bb4ee6a4SAndroid Build Coastguard Worker                 // Check zero page on the staging memory first. If the page is non-zero and have not
792*bb4ee6a4SAndroid Build Coastguard Worker                 // been changed, zero checking is useless, but less cost than file I/O for the pages
793*bb4ee6a4SAndroid Build Coastguard Worker                 // which were in the swap file and now is zero.
794*bb4ee6a4SAndroid Build Coastguard Worker                 // Check 2 types of page in the same loop to utilize CPU cache for staging memory.
795*bb4ee6a4SAndroid Build Coastguard Worker                 if slice_in_staging.is_all_zero() {
796*bb4ee6a4SAndroid Build Coastguard Worker                     region
797*bb4ee6a4SAndroid Build Coastguard Worker                         .staging_memory
798*bb4ee6a4SAndroid Build Coastguard Worker                         .clear_range(idx_range.clone())
799*bb4ee6a4SAndroid Build Coastguard Worker                         .context("clear a page in staging memory")?;
800*bb4ee6a4SAndroid Build Coastguard Worker                     // The page is on the swap file as well.
801*bb4ee6a4SAndroid Build Coastguard Worker                     let munlocked_pages = file
802*bb4ee6a4SAndroid Build Coastguard Worker                         .free_range(idx_in_file..idx_in_file + 1)
803*bb4ee6a4SAndroid Build Coastguard Worker                         .context("clear a page in swap file")?;
804*bb4ee6a4SAndroid Build Coastguard Worker                     if munlocked_pages != 0 {
805*bb4ee6a4SAndroid Build Coastguard Worker                         // Only either of swap-in or trimming runs at the same time. This is not
806*bb4ee6a4SAndroid Build Coastguard Worker                         // expected path. Just logging an error because leaking
807*bb4ee6a4SAndroid Build Coastguard Worker                         // mlock_budget_pages is not fatal.
808*bb4ee6a4SAndroid Build Coastguard Worker                         error!("pages are mlock(2)ed while trimming");
809*bb4ee6a4SAndroid Build Coastguard Worker                     }
810*bb4ee6a4SAndroid Build Coastguard Worker                     n_trimmed += 1;
811*bb4ee6a4SAndroid Build Coastguard Worker                     self.zero_pages += 1;
812*bb4ee6a4SAndroid Build Coastguard Worker                 } else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? {
813*bb4ee6a4SAndroid Build Coastguard Worker                     // Compare the page with the previous content of the page on the disk.
814*bb4ee6a4SAndroid Build Coastguard Worker                     if slice_in_staging == slice_in_file {
815*bb4ee6a4SAndroid Build Coastguard Worker                         region
816*bb4ee6a4SAndroid Build Coastguard Worker                             .staging_memory
817*bb4ee6a4SAndroid Build Coastguard Worker                             .clear_range(idx_range.clone())
818*bb4ee6a4SAndroid Build Coastguard Worker                             .context("clear a page in staging memory")?;
819*bb4ee6a4SAndroid Build Coastguard Worker                         file.mark_as_present(idx_in_file)?;
820*bb4ee6a4SAndroid Build Coastguard Worker                         n_trimmed += 1;
821*bb4ee6a4SAndroid Build Coastguard Worker                         self.clean_pages += 1;
822*bb4ee6a4SAndroid Build Coastguard Worker                     }
823*bb4ee6a4SAndroid Build Coastguard Worker                 }
824*bb4ee6a4SAndroid Build Coastguard Worker             }
825*bb4ee6a4SAndroid Build Coastguard Worker 
826*bb4ee6a4SAndroid Build Coastguard Worker             self.cur_page += 1;
827*bb4ee6a4SAndroid Build Coastguard Worker             if self.cur_page >= region.num_pages {
828*bb4ee6a4SAndroid Build Coastguard Worker                 self.cur_region += 1;
829*bb4ee6a4SAndroid Build Coastguard Worker                 self.cur_page = 0;
830*bb4ee6a4SAndroid Build Coastguard Worker                 self.next_data_in_file = 0..0;
831*bb4ee6a4SAndroid Build Coastguard Worker                 break;
832*bb4ee6a4SAndroid Build Coastguard Worker             }
833*bb4ee6a4SAndroid Build Coastguard Worker         }
834*bb4ee6a4SAndroid Build Coastguard Worker 
835*bb4ee6a4SAndroid Build Coastguard Worker         Ok(Some(n_trimmed))
836*bb4ee6a4SAndroid Build Coastguard Worker     }
837*bb4ee6a4SAndroid Build Coastguard Worker 
838*bb4ee6a4SAndroid Build Coastguard Worker     /// Total trimmed clean pages.
trimmed_clean_pages(&self) -> usize839*bb4ee6a4SAndroid Build Coastguard Worker     pub fn trimmed_clean_pages(&self) -> usize {
840*bb4ee6a4SAndroid Build Coastguard Worker         self.clean_pages
841*bb4ee6a4SAndroid Build Coastguard Worker     }
842*bb4ee6a4SAndroid Build Coastguard Worker 
843*bb4ee6a4SAndroid Build Coastguard Worker     /// Total trimmed zero pages.
trimmed_zero_pages(&self) -> usize844*bb4ee6a4SAndroid Build Coastguard Worker     pub fn trimmed_zero_pages(&self) -> usize {
845*bb4ee6a4SAndroid Build Coastguard Worker         self.zero_pages
846*bb4ee6a4SAndroid Build Coastguard Worker     }
847*bb4ee6a4SAndroid Build Coastguard Worker }
848