1 // Copyright 2021, The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //! Functions for running instances of `crosvm`.
16 
17 use crate::aidl::{remove_temporary_files, Cid, GLOBAL_SERVICE, VirtualMachineCallbacks};
18 use crate::atom::{get_num_cpus, write_vm_exited_stats_sync};
19 use crate::debug_config::DebugConfig;
20 use anyhow::{anyhow, bail, Context, Error, Result};
21 use binder::ParcelFileDescriptor;
22 use command_fds::CommandFdExt;
23 use libc::{sysconf, _SC_CLK_TCK};
24 use log::{debug, error, info};
25 use semver::{Version, VersionReq};
26 use nix::{fcntl::OFlag, unistd::pipe2, unistd::Uid, unistd::User};
27 use regex::{Captures, Regex};
28 use rustutils::system_properties;
29 use shared_child::SharedChild;
30 use std::borrow::Cow;
31 use std::cmp::max;
32 use std::ffi::CString;
33 use std::fmt;
34 use std::fs::{read_to_string, File};
35 use std::io::{self, Read};
36 use std::mem;
37 use std::num::{NonZeroU16, NonZeroU32};
38 use std::os::unix::io::{AsRawFd, OwnedFd};
39 use std::os::unix::process::CommandExt;
40 use std::os::unix::process::ExitStatusExt;
41 use std::path::{Path, PathBuf};
42 use std::process::{Command, ExitStatus};
43 use std::sync::{Arc, Condvar, Mutex, LazyLock};
44 use std::time::{Duration, SystemTime};
45 use std::thread::{self, JoinHandle};
46 use android_system_virtualizationcommon::aidl::android::system::virtualizationcommon::DeathReason::DeathReason;
47 use android_system_virtualizationservice::aidl::android::system::virtualizationservice::{
48     VirtualMachineAppConfig::DebugLevel::DebugLevel,
49     AudioConfig::AudioConfig as AudioConfigParcelable,
50     DisplayConfig::DisplayConfig as DisplayConfigParcelable,
51     GpuConfig::GpuConfig as GpuConfigParcelable,
52     UsbConfig::UsbConfig as UsbConfigParcelable,
53 };
54 use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IGlobalVmContext::IGlobalVmContext;
55 use android_system_virtualizationservice_internal::aidl::android::system::virtualizationservice_internal::IBoundDevice::IBoundDevice;
56 use binder::Strong;
57 use android_system_virtualmachineservice::aidl::android::system::virtualmachineservice::IVirtualMachineService::IVirtualMachineService;
58 use tombstoned_client::{TombstonedConnection, DebuggerdDumpType};
59 use rpcbinder::RpcServer;
60 
/// Absolute path of the `crosvm` executable inside the virt APEX.
const CROSVM_PATH: &str = "/apex/com.android.virt/bin/crosvm";

/// SemVer-formatted version of the platform that crosvm currently implements. Bump it whenever
/// the platform changes on the crosvm side. Keeping the value here is fine because
/// virtualizationservice and crosvm are supposed to be updated together in the virt APEX.
const CROSVM_PLATFORM_VERSION: &str = "1.0.0";

/// Exit status returned by crosvm when it has an error starting a VM.
const CROSVM_START_ERROR_STATUS: i32 = 1;
/// Exit status returned by crosvm when a VM requests a reboot.
const CROSVM_REBOOT_STATUS: i32 = 32;
/// Exit status returned by crosvm when it crashes due to an error.
const CROSVM_CRASH_STATUS: i32 = 33;
/// Exit status returned by crosvm when a vcpu is stalled.
const CROSVM_WATCHDOG_REBOOT_STATUS: i32 = 36;
/// Amount of memory (in MiB) reserved for ramdump.
const RAMDUMP_RESERVED_MIB: u32 = 17;

/// Number of milliseconds in one second.
const MILLIS_PER_SEC: i64 = 1000;

/// Name of a system property.
// NOTE(review): presumably it points at a custom pvmfw image; its use is outside this excerpt.
const SYSPROP_CUSTOM_PVMFW_PATH: &str = "hypervisor.pvmfw.path";

/// Device name for VM console input: the hypervisor console (virtio-console).
const CONSOLE_HVC0: &str = "hvc0";
/// Device name for VM console input: the serial port (emulated uart).
const CONSOLE_TTYS0: &str = "ttyS0";
89 
90 /// If the VM doesn't move to the Started state within this amount time, a hang-up error is
91 /// triggered.
92 static BOOT_HANGUP_TIMEOUT: LazyLock<Duration> = LazyLock::new(|| {
93     if nested_virt::is_nested_virtualization().unwrap() {
94         // Nested virtualization is slow, so we need a longer timeout.
95         Duration::from_secs(300)
96     } else {
97         Duration::from_secs(30)
98     }
99 });
100 
101 /// Configuration for a VM to run with crosvm.
102 #[derive(Debug)]
103 pub struct CrosvmConfig {
104     pub cid: Cid,
105     pub name: String,
106     pub bootloader: Option<File>,
107     pub kernel: Option<File>,
108     pub initrd: Option<File>,
109     pub disks: Vec<DiskFile>,
110     pub shared_paths: Vec<SharedPathConfig>,
111     pub params: Option<String>,
112     pub protected: bool,
113     pub debug_config: DebugConfig,
114     pub memory_mib: NonZeroU32,
115     pub cpus: Option<NonZeroU32>,
116     pub host_cpu_topology: bool,
117     pub console_out_fd: Option<File>,
118     pub console_in_fd: Option<File>,
119     pub log_fd: Option<File>,
120     pub ramdump: Option<File>,
121     pub indirect_files: Vec<File>,
122     pub platform_version: VersionReq,
123     pub detect_hangup: bool,
124     pub gdb_port: Option<NonZeroU16>,
125     pub vfio_devices: Vec<VfioDevice>,
126     pub dtbo: Option<File>,
127     pub device_tree_overlay: Option<File>,
128     pub display_config: Option<DisplayConfig>,
129     pub input_device_options: Vec<InputDeviceOption>,
130     pub hugepages: bool,
131     pub tap: Option<File>,
132     pub console_input_device: Option<String>,
133     pub boost_uclamp: bool,
134     pub gpu_config: Option<GpuConfig>,
135     pub audio_config: Option<AudioConfig>,
136     pub no_balloon: bool,
137     pub usb_config: UsbConfig,
138     pub dump_dt_fd: Option<File>,
139 }
140 
141 #[derive(Debug)]
142 pub struct AudioConfig {
143     pub use_microphone: bool,
144     pub use_speaker: bool,
145 }
146 
147 impl AudioConfig {
new(raw_config: &AudioConfigParcelable) -> Self148     pub fn new(raw_config: &AudioConfigParcelable) -> Self {
149         AudioConfig { use_microphone: raw_config.useMicrophone, use_speaker: raw_config.useSpeaker }
150     }
151 }
152 
153 #[derive(Debug)]
154 pub struct UsbConfig {
155     pub controller: bool,
156 }
157 
158 impl UsbConfig {
new(raw_config: &UsbConfigParcelable) -> Result<UsbConfig>159     pub fn new(raw_config: &UsbConfigParcelable) -> Result<UsbConfig> {
160         Ok(UsbConfig { controller: raw_config.controller })
161     }
162 }
163 
164 #[derive(Debug)]
165 pub struct DisplayConfig {
166     pub width: NonZeroU32,
167     pub height: NonZeroU32,
168     pub horizontal_dpi: NonZeroU32,
169     pub vertical_dpi: NonZeroU32,
170     pub refresh_rate: NonZeroU32,
171 }
172 
173 impl DisplayConfig {
new(raw_config: &DisplayConfigParcelable) -> Result<DisplayConfig>174     pub fn new(raw_config: &DisplayConfigParcelable) -> Result<DisplayConfig> {
175         let width = try_into_non_zero_u32(raw_config.width)?;
176         let height = try_into_non_zero_u32(raw_config.height)?;
177         let horizontal_dpi = try_into_non_zero_u32(raw_config.horizontalDpi)?;
178         let vertical_dpi = try_into_non_zero_u32(raw_config.verticalDpi)?;
179         let refresh_rate = try_into_non_zero_u32(raw_config.refreshRate)?;
180         Ok(DisplayConfig { width, height, horizontal_dpi, vertical_dpi, refresh_rate })
181     }
182 }
183 
184 #[derive(Debug)]
185 pub struct GpuConfig {
186     pub backend: Option<String>,
187     pub context_types: Option<Vec<String>>,
188     pub pci_address: Option<String>,
189     pub renderer_features: Option<String>,
190     pub renderer_use_egl: Option<bool>,
191     pub renderer_use_gles: Option<bool>,
192     pub renderer_use_glx: Option<bool>,
193     pub renderer_use_surfaceless: Option<bool>,
194     pub renderer_use_vulkan: Option<bool>,
195 }
196 
197 impl GpuConfig {
new(raw_config: &GpuConfigParcelable) -> Result<GpuConfig>198     pub fn new(raw_config: &GpuConfigParcelable) -> Result<GpuConfig> {
199         Ok(GpuConfig {
200             backend: raw_config.backend.clone(),
201             context_types: raw_config.contextTypes.clone().map(|context_types| {
202                 context_types.iter().filter_map(|context_type| context_type.clone()).collect()
203             }),
204             pci_address: raw_config.pciAddress.clone(),
205             renderer_features: raw_config.rendererFeatures.clone(),
206             renderer_use_egl: Some(raw_config.rendererUseEgl),
207             renderer_use_gles: Some(raw_config.rendererUseGles),
208             renderer_use_glx: Some(raw_config.rendererUseGlx),
209             renderer_use_surfaceless: Some(raw_config.rendererUseSurfaceless),
210             renderer_use_vulkan: Some(raw_config.rendererUseVulkan),
211         })
212     }
213 }
214 
try_into_non_zero_u32(value: i32) -> Result<NonZeroU32>215 fn try_into_non_zero_u32(value: i32) -> Result<NonZeroU32> {
216     let u32_value = value.try_into()?;
217     NonZeroU32::new(u32_value).ok_or(anyhow!("value should be greater than 0"))
218 }
219 
/// One disk image handed to crosvm for a VM.
#[derive(Debug)]
pub struct DiskFile {
    /// Open file backing the disk.
    pub image: File,
    /// Whether the disk is writable.
    pub writable: bool,
}
226 
/// Description of a path shared between the host and the guest VM.
// NOTE(review): the consumer of these fields is outside this excerpt; the field meanings are only
// implied by their names and should be confirmed there.
#[derive(Debug)]
pub struct SharedPathConfig {
    pub path: String,
    pub host_uid: i32,
    pub host_gid: i32,
    pub guest_uid: i32,
    pub guest_gid: i32,
    pub mask: i32,
    pub tag: String,
    pub socket_path: String,
    pub socket_fd: Option<File>,
    pub app_domain: bool,
}
241 
/// Configuration of one virtio-input device, modelled on
/// `external/crosvm/src/crosvm/config.rs`.
///
/// Every variant owns the file backing the device; the touch variants additionally carry a
/// width, a height and an optional name.
#[derive(Debug)]
#[allow(dead_code)]
pub enum InputDeviceOption {
    EvDev(File),
    SingleTouch { file: File, width: u32, height: u32, name: Option<String> },
    Keyboard(File),
    Mouse(File),
    Switches(File),
    MultiTouchTrackpad { file: File, width: u32, height: u32, name: Option<String> },
    MultiTouch { file: File, width: u32, height: u32, name: Option<String> },
}
254 
255 type VfioDevice = Strong<dyn IBoundDevice>;
256 
/// Lifecycle state that the payload in the VM has reported for itself.
///
/// The declaration order of the variants matters: the derived ordering is what
/// [`VmInstance::update_payload_state`] uses to accept forward transitions only.
#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum PayloadState {
    Starting,
    Started,
    Ready,
    Finished,
    /// The payload did not reach `Started` before the boot hang-up timeout expired.
    Hangup,
}
269 
270 /// The current state of the VM itself.
271 #[derive(Debug)]
272 pub enum VmState {
273     /// The VM has not yet tried to start.
274     NotStarted {
275         ///The configuration needed to start the VM, if it has not yet been started.
276         config: Box<CrosvmConfig>,
277     },
278     /// The VM has been started.
279     Running {
280         /// The crosvm child process.
281         child: Arc<SharedChild>,
282         /// The thread waiting for crosvm to finish.
283         monitor_vm_exit_thread: Option<JoinHandle<()>>,
284     },
285     /// The VM died or was killed.
286     Dead,
287     /// The VM failed to start.
288     Failed,
289 }
290 
/// Pair of RSS values: one for the VM and one for the crosvm process itself.
#[derive(Copy, Clone, Debug, Default)]
pub struct Rss {
    /// RSS attributed to the VM.
    pub vm: i64,
    /// RSS attributed to crosvm.
    pub crosvm: i64,
}
297 
298 /// Metrics regarding the VM.
299 #[derive(Debug, Default)]
300 pub struct VmMetric {
301     /// Recorded timestamp when the VM is started.
302     pub start_timestamp: Option<SystemTime>,
303     /// Update most recent guest_time periodically from /proc/[crosvm pid]/stat while VM is
304     /// running.
305     pub cpu_guest_time: Option<i64>,
306     /// Update maximum RSS values periodically from /proc/[crosvm pid]/smaps while VM is running.
307     pub rss: Option<Rss>,
308 }
309 
310 impl VmState {
311     /// Tries to start the VM, if it is in the `NotStarted` state.
312     ///
313     /// Returns an error if the VM is in the wrong state, or fails to start.
start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error>314     fn start(&mut self, instance: Arc<VmInstance>) -> Result<(), Error> {
315         let state = mem::replace(self, VmState::Failed);
316         if let VmState::NotStarted { config } = state {
317             let config = *config;
318             let detect_hangup = config.detect_hangup;
319             let (failure_pipe_read, failure_pipe_write) = create_pipe()?;
320             let vfio_devices = config.vfio_devices.clone();
321             let tap =
322                 if let Some(tap_file) = &config.tap { Some(tap_file.try_clone()?) } else { None };
323 
324             run_virtiofs(&config)?;
325 
326             // If this fails and returns an error, `self` will be left in the `Failed` state.
327             let child =
328                 Arc::new(run_vm(config, &instance.crosvm_control_socket_path, failure_pipe_write)?);
329 
330             let instance_monitor_status = instance.clone();
331             let child_monitor_status = child.clone();
332             thread::spawn(move || {
333                 instance_monitor_status.clone().monitor_vm_status(child_monitor_status);
334             });
335 
336             let child_clone = child.clone();
337             let instance_clone = instance.clone();
338             let monitor_vm_exit_thread = Some(thread::spawn(move || {
339                 instance_clone.monitor_vm_exit(child_clone, failure_pipe_read, vfio_devices, tap);
340             }));
341 
342             if detect_hangup {
343                 let child_clone = child.clone();
344                 thread::spawn(move || {
345                     instance.monitor_payload_hangup(child_clone);
346                 });
347             }
348 
349             // If it started correctly, update the state.
350             *self = VmState::Running { child, monitor_vm_exit_thread };
351             Ok(())
352         } else {
353             *self = state;
354             bail!("VM already started or failed")
355         }
356     }
357 }
358 
359 /// Internal struct that holds the handles to globally unique resources of a VM.
360 #[derive(Debug)]
361 pub struct VmContext {
362     #[allow(dead_code)] // Keeps the global context alive
363     pub(crate) global_context: Strong<dyn IGlobalVmContext>,
364     #[allow(dead_code)] // Keeps the server alive
365     vm_server: Option<RpcServer>,
366 }
367 
368 impl VmContext {
369     /// Construct new VmContext.
new( global_context: Strong<dyn IGlobalVmContext>, vm_server: Option<RpcServer>, ) -> VmContext370     pub fn new(
371         global_context: Strong<dyn IGlobalVmContext>,
372         vm_server: Option<RpcServer>,
373     ) -> VmContext {
374         VmContext { global_context, vm_server }
375     }
376 }
377 
378 /// Information about a particular instance of a VM which may be running.
379 #[derive(Debug)]
380 pub struct VmInstance {
381     /// The current state of the VM.
382     pub vm_state: Mutex<VmState>,
383     /// Global resources allocated for this VM.
384     #[allow(dead_code)] // Keeps the context alive
385     pub(crate) vm_context: VmContext,
386     /// The CID assigned to the VM for vsock communication.
387     pub cid: Cid,
388     /// Path to crosvm control socket
389     crosvm_control_socket_path: PathBuf,
390     /// The name of the VM.
391     pub name: String,
392     /// Whether the VM is a protected VM.
393     pub protected: bool,
394     /// Directory of temporary files used by the VM while it is running.
395     pub temporary_directory: PathBuf,
396     /// The UID of the process which requested the VM.
397     pub requester_uid: u32,
398     /// The PID of the process which requested the VM. Note that this process may no longer exist
399     /// and the PID may have been reused for a different process, so this should not be trusted.
400     pub requester_debug_pid: i32,
401     /// Callbacks to clients of the VM.
402     pub callbacks: VirtualMachineCallbacks,
403     /// VirtualMachineService binder object for the VM.
404     #[allow(dead_code)]
405     pub vm_service: Mutex<Option<Strong<dyn IVirtualMachineService>>>,
406     /// Recorded metrics of VM such as timestamp or cpu / memory usage.
407     pub vm_metric: Mutex<VmMetric>,
408     /// The latest lifecycle state which the payload reported itself to be in.
409     payload_state: Mutex<PayloadState>,
410     /// Represents the condition that payload_state was updated
411     payload_state_updated: Condvar,
412     /// The human readable name of requester_uid
413     requester_uid_name: String,
414 }
415 
416 impl fmt::Display for VmInstance {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result417     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
418         let adj = if self.protected { "Protected" } else { "Non-protected" };
419         write!(
420             f,
421             "{} virtual machine \"{}\" (owner: {}, cid: {})",
422             adj, self.name, self.requester_uid_name, self.cid
423         )
424     }
425 }
426 
427 impl VmInstance {
428     /// Validates the given config and creates a new `VmInstance` but doesn't start running it.
new( config: CrosvmConfig, temporary_directory: PathBuf, requester_uid: u32, requester_debug_pid: i32, vm_context: VmContext, ) -> Result<VmInstance, Error>429     pub fn new(
430         config: CrosvmConfig,
431         temporary_directory: PathBuf,
432         requester_uid: u32,
433         requester_debug_pid: i32,
434         vm_context: VmContext,
435     ) -> Result<VmInstance, Error> {
436         validate_config(&config)?;
437         let cid = config.cid;
438         let name = config.name.clone();
439         let protected = config.protected;
440         let requester_uid_name = User::from_uid(Uid::from_raw(requester_uid))
441             .ok()
442             .flatten()
443             .map_or_else(|| format!("{}", requester_uid), |u| u.name);
444         let instance = VmInstance {
445             vm_state: Mutex::new(VmState::NotStarted { config: Box::new(config) }),
446             vm_context,
447             cid,
448             crosvm_control_socket_path: temporary_directory.join("crosvm.sock"),
449             name,
450             protected,
451             temporary_directory,
452             requester_uid,
453             requester_debug_pid,
454             callbacks: Default::default(),
455             vm_service: Mutex::new(None),
456             vm_metric: Mutex::new(Default::default()),
457             payload_state: Mutex::new(PayloadState::Starting),
458             payload_state_updated: Condvar::new(),
459             requester_uid_name,
460         };
461         info!("{} created", &instance);
462         Ok(instance)
463     }
464 
465     /// Starts an instance of `crosvm` to manage the VM. The `crosvm` instance will be killed when
466     /// the `VmInstance` is dropped.
start(self: &Arc<Self>) -> Result<(), Error>467     pub fn start(self: &Arc<Self>) -> Result<(), Error> {
468         let mut vm_metric = self.vm_metric.lock().unwrap();
469         vm_metric.start_timestamp = Some(SystemTime::now());
470         let ret = self.vm_state.lock().unwrap().start(self.clone());
471         if ret.is_ok() {
472             info!("{} started", &self);
473         }
474         ret.with_context(|| format!("{} failed to start", &self))
475     }
476 
477     /// Monitors the exit of the VM (i.e. termination of the `child` process). When that happens,
478     /// handles the event by updating the state, noityfing the event to clients by calling
479     /// callbacks, and removing temporary files for the VM.
monitor_vm_exit( &self, child: Arc<SharedChild>, failure_pipe_read: File, vfio_devices: Vec<VfioDevice>, tap: Option<File>, )480     fn monitor_vm_exit(
481         &self,
482         child: Arc<SharedChild>,
483         failure_pipe_read: File,
484         vfio_devices: Vec<VfioDevice>,
485         tap: Option<File>,
486     ) {
487         let failure_reason_thread = std::thread::spawn(move || {
488             // Read the pipe to see if any failure reason is written
489             let mut failure_reason = String::new();
490             // Arbitrary max size in case of misbehaving guest.
491             const MAX_SIZE: u64 = 50_000;
492             match failure_pipe_read.take(MAX_SIZE).read_to_string(&mut failure_reason) {
493                 Err(e) => error!("Error reading VM failure reason from pipe: {}", e),
494                 Ok(len) if len > 0 => error!("VM returned failure reason '{}'", &failure_reason),
495                 _ => (),
496             };
497             failure_reason
498         });
499 
500         let result = child.wait();
501         match &result {
502             Err(e) => error!("Error waiting for crosvm({}) instance to die: {}", child.id(), e),
503             Ok(status) => {
504                 info!("crosvm({}) exited with status {}", child.id(), status);
505                 if let Some(exit_status_code) = status.code() {
506                     if exit_status_code == CROSVM_WATCHDOG_REBOOT_STATUS {
507                         info!("detected vcpu stall on crosvm");
508                     }
509                 }
510             }
511         }
512 
513         let failure_reason = failure_reason_thread.join().expect("failure_reason_thread panic'd");
514 
515         let mut vm_state = self.vm_state.lock().unwrap();
516         *vm_state = VmState::Dead;
517         // Ensure that the mutex is released before calling the callbacks.
518         drop(vm_state);
519         info!("{} exited", &self);
520 
521         // In case of hangup, the pipe doesn't give us any information because the hangup can't be
522         // detected on the VM side (otherwise, it isn't a hangup), but in the
523         // monitor_payload_hangup function below which updates the payload state to Hangup.
524         let failure_reason =
525             if failure_reason.is_empty() && self.payload_state() == PayloadState::Hangup {
526                 Cow::from("HANGUP")
527             } else {
528                 Cow::from(failure_reason)
529             };
530 
531         self.handle_ramdump().unwrap_or_else(|e| error!("Error handling ramdump: {}", e));
532 
533         let death_reason = death_reason(&result, &failure_reason);
534         let exit_signal = exit_signal(&result);
535 
536         self.callbacks.callback_on_died(self.cid, death_reason);
537 
538         let vm_metric = self.vm_metric.lock().unwrap();
539         write_vm_exited_stats_sync(
540             self.requester_uid as i32,
541             &self.name,
542             death_reason,
543             exit_signal,
544             &vm_metric,
545         );
546 
547         // Delete temporary files. The folder itself is removed by VirtualizationServiceInternal.
548         remove_temporary_files(&self.temporary_directory).unwrap_or_else(|e| {
549             error!("Error removing temporary files from {:?}: {}", self.temporary_directory, e);
550         });
551 
552         if let Some(tap_file) = tap {
553             GLOBAL_SERVICE
554                 .deleteTapInterface(&ParcelFileDescriptor::new(OwnedFd::from(tap_file)))
555                 .unwrap_or_else(|e| {
556                     error!("Error deleting TAP interface: {e:?}");
557                 });
558         }
559 
560         drop(vfio_devices); // Cleanup devices.
561     }
562 
563     /// Waits until payload is started, or timeout expires. When timeout occurs, kill
564     /// the VM to prevent indefinite hangup and update the payload_state accordingly.
monitor_payload_hangup(&self, child: Arc<SharedChild>)565     fn monitor_payload_hangup(&self, child: Arc<SharedChild>) {
566         debug!("Starting to monitor hangup for Microdroid({})", child.id());
567         let (state, result) = self
568             .payload_state_updated
569             .wait_timeout_while(self.payload_state.lock().unwrap(), *BOOT_HANGUP_TIMEOUT, |s| {
570                 *s < PayloadState::Started
571             })
572             .unwrap();
573         drop(state); // we are not interested in state
574         let child_still_running = child.try_wait().ok() == Some(None);
575         if result.timed_out() && child_still_running {
576             error!(
577                 "Microdroid({}) failed to start payload within {} secs timeout. Shutting down.",
578                 child.id(),
579                 BOOT_HANGUP_TIMEOUT.as_secs()
580             );
581             self.update_payload_state(PayloadState::Hangup).unwrap();
582             if let Err(e) = self.kill() {
583                 error!("Error stopping timed-out VM with CID {}: {:?}", child.id(), e);
584             }
585         }
586     }
587 
monitor_vm_status(&self, child: Arc<SharedChild>)588     fn monitor_vm_status(&self, child: Arc<SharedChild>) {
589         let pid = child.id();
590 
591         loop {
592             {
593                 // Check VM state
594                 let vm_state = &*self.vm_state.lock().unwrap();
595                 if let VmState::Dead = vm_state {
596                     break;
597                 }
598 
599                 let mut vm_metric = self.vm_metric.lock().unwrap();
600 
601                 // Get CPU Information
602                 match get_guest_time(pid) {
603                     Ok(guest_time) => vm_metric.cpu_guest_time = Some(guest_time),
604                     Err(e) => error!("Failed to get guest CPU time: {e:?}"),
605                 }
606 
607                 // Get Memory Information
608                 match get_rss(pid) {
609                     Ok(rss) => {
610                         vm_metric.rss = match &vm_metric.rss {
611                             Some(x) => Some(Rss::extract_max(x, &rss)),
612                             None => Some(rss),
613                         }
614                     }
615                     Err(e) => error!("Failed to get guest RSS: {}", e),
616                 }
617             }
618 
619             thread::sleep(Duration::from_secs(1));
620         }
621     }
622 
623     /// Returns the last reported state of the VM payload.
payload_state(&self) -> PayloadState624     pub fn payload_state(&self) -> PayloadState {
625         *self.payload_state.lock().unwrap()
626     }
627 
628     /// Updates the payload state to the given value, if it is a valid state transition.
update_payload_state(&self, new_state: PayloadState) -> Result<(), Error>629     pub fn update_payload_state(&self, new_state: PayloadState) -> Result<(), Error> {
630         let mut state_locked = self.payload_state.lock().unwrap();
631         // Only allow forward transitions, e.g. from starting to started or finished, not back in
632         // the other direction.
633         if new_state > *state_locked {
634             *state_locked = new_state;
635             self.payload_state_updated.notify_all();
636             Ok(())
637         } else {
638             bail!("Invalid payload state transition from {:?} to {:?}", *state_locked, new_state)
639         }
640     }
641 
642     /// Kills the crosvm instance, if it is running.
kill(&self) -> Result<(), Error>643     pub fn kill(&self) -> Result<(), Error> {
644         let monitor_vm_exit_thread = {
645             let vm_state = &mut *self.vm_state.lock().unwrap();
646             if let VmState::Running { child, monitor_vm_exit_thread } = vm_state {
647                 let id = child.id();
648                 debug!("Killing crosvm({})", id);
649                 // TODO: Talk to crosvm to shutdown cleanly.
650                 child.kill().with_context(|| format!("Error killing crosvm({id}) instance"))?;
651                 monitor_vm_exit_thread.take()
652             } else {
653                 bail!("VM is not running")
654             }
655         };
656 
657         // Wait for monitor_vm_exit() to finish. Must release vm_state lock
658         // first, as monitor_vm_exit() takes it as well.
659         monitor_vm_exit_thread.map(JoinHandle::join);
660 
661         // Now that the VM has been killed, shut down the VirtualMachineService
662         // server to eagerly free up the server threads.
663         if let Some(vm_server) = &self.vm_context.vm_server {
664             vm_server.shutdown()?;
665         }
666 
667         Ok(())
668     }
669 
670     /// Returns current virtio-balloon size.
get_memory_balloon(&self) -> Result<u64, Error>671     pub fn get_memory_balloon(&self) -> Result<u64, Error> {
672         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
673         let mut balloon_actual = 0u64;
674         // SAFETY: Pointers are valid for the lifetime of the call. Null `stats` is valid.
675         let success = unsafe {
676             crosvm_control::crosvm_client_balloon_stats(
677                 socket_path_cstring.as_ptr(),
678                 /* stats= */ std::ptr::null_mut(),
679                 &mut balloon_actual,
680             )
681         };
682         if !success {
683             bail!("Error requesting balloon stats");
684         }
685         Ok(balloon_actual)
686     }
687 
688     /// Inflates the virtio-balloon by `num_bytes` to reclaim guest memory. Called in response to
689     /// memory-trimming notifications.
set_memory_balloon(&self, num_bytes: u64) -> Result<(), Error>690     pub fn set_memory_balloon(&self, num_bytes: u64) -> Result<(), Error> {
691         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
692         // SAFETY: Pointer is valid for the lifetime of the call.
693         let success = unsafe {
694             crosvm_control::crosvm_client_balloon_vms(socket_path_cstring.as_ptr(), num_bytes)
695         };
696         if !success {
697             bail!("Error sending balloon adjustment");
698         }
699         Ok(())
700     }
701 
702     /// Checks if ramdump has been created. If so, send it to tombstoned.
handle_ramdump(&self) -> Result<(), Error>703     fn handle_ramdump(&self) -> Result<(), Error> {
704         let ramdump_path = self.temporary_directory.join("ramdump");
705         if !ramdump_path.as_path().try_exists()? {
706             return Ok(());
707         }
708         if std::fs::metadata(&ramdump_path)?.len() > 0 {
709             Self::send_ramdump_to_tombstoned(&ramdump_path)?;
710         }
711         Ok(())
712     }
713 
send_ramdump_to_tombstoned(ramdump_path: &Path) -> Result<(), Error>714     fn send_ramdump_to_tombstoned(ramdump_path: &Path) -> Result<(), Error> {
715         let mut input = File::open(ramdump_path)
716             .context(format!("Failed to open ramdump {:?} for reading", ramdump_path))?;
717 
718         let pid = std::process::id() as i32;
719         let conn = TombstonedConnection::connect(pid, DebuggerdDumpType::Tombstone)
720             .context("Failed to connect to tombstoned")?;
721         let mut output = conn
722             .text_output
723             .as_ref()
724             .ok_or_else(|| anyhow!("Could not get file to write the tombstones on"))?;
725 
726         std::io::copy(&mut input, &mut output).context("Failed to send ramdump to tombstoned")?;
727         info!("Ramdump {:?} sent to tombstoned", ramdump_path);
728 
729         conn.notify_completion()?;
730         Ok(())
731     }
732 
733     /// Suspends the VM's vCPUs.
suspend(&self) -> Result<(), Error>734     pub fn suspend(&self) -> Result<(), Error> {
735         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
736         // SAFETY: Pointer is valid for the lifetime of the call.
737         let success =
738             unsafe { crosvm_control::crosvm_client_suspend_vm(socket_path_cstring.as_ptr()) };
739         if !success {
740             bail!("Failed to suspend VM");
741         }
742         Ok(())
743     }
744 
745     /// Resumes the VM's vCPUs.
resume(&self) -> Result<(), Error>746     pub fn resume(&self) -> Result<(), Error> {
747         let socket_path_cstring = path_to_cstring(&self.crosvm_control_socket_path);
748         // SAFETY: Pointer is valid for the lifetime of the call.
749         let success =
750             unsafe { crosvm_control::crosvm_client_resume_vm(socket_path_cstring.as_ptr()) };
751         if !success {
752             bail!("Failed to resume VM");
753         }
754         Ok(())
755     }
756 }
757 
758 impl Rss {
extract_max(x: &Rss, y: &Rss) -> Rss759     fn extract_max(x: &Rss, y: &Rss) -> Rss {
760         Rss { vm: max(x.vm, y.vm), crosvm: max(x.crosvm, y.crosvm) }
761     }
762 }
763 
764 // Get Cpus_allowed mask
check_if_all_cpus_allowed() -> Result<bool>765 fn check_if_all_cpus_allowed() -> Result<bool> {
766     let file = read_to_string("/proc/self/status")?;
767     let lines: Vec<_> = file.split('\n').collect();
768 
769     for line in lines {
770         if line.contains("Cpus_allowed_list") {
771             let prop: Vec<_> = line.split_whitespace().collect();
772             if prop.len() != 2 {
773                 return Ok(false);
774             }
775             let cpu_list: Vec<_> = prop[1].split('-').collect();
776             //Only contiguous Cpu list allowed
777             if cpu_list.len() != 2 {
778                 return Ok(false);
779             }
780             if let Some(cpus) = get_num_cpus() {
781                 let max_cpu = cpu_list[1].parse::<usize>()?;
782                 if max_cpu == cpus - 1 {
783                     return Ok(true);
784                 } else {
785                     return Ok(false);
786                 }
787             }
788         }
789     }
790     Ok(false)
791 }
792 
793 // Get guest time from /proc/[crosvm pid]/stat
get_guest_time(pid: u32) -> Result<i64>794 fn get_guest_time(pid: u32) -> Result<i64> {
795     let file = read_to_string(format!("/proc/{}/stat", pid))?;
796     let data_list: Vec<_> = file.split_whitespace().collect();
797 
798     // Information about guest_time is at 43th place of the file split with the whitespace.
799     // Example of /proc/[pid]/stat :
800     // 6603 (kworker/104:1H-kblockd) I 2 0 0 0 -1 69238880 0 0 0 0 0 88 0 0 0 -20 1 0 1845 0 0
801     // 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 104 0 0 0 0 0 0 0 0 0 0 0 0 0
802     if data_list.len() < 43 {
803         bail!("Failed to parse command result for getting guest time : {}", file);
804     }
805 
806     let guest_time_ticks = data_list[42].parse::<i64>()?;
807     // SAFETY: It just returns an integer about CPU tick information.
808     let ticks_per_sec = unsafe { sysconf(_SC_CLK_TCK) };
809     Ok(guest_time_ticks * MILLIS_PER_SEC / ticks_per_sec)
810 }
811 
812 // Get rss from /proc/[crosvm pid]/smaps
get_rss(pid: u32) -> Result<Rss>813 fn get_rss(pid: u32) -> Result<Rss> {
814     let file = read_to_string(format!("/proc/{}/smaps", pid))?;
815     let lines: Vec<_> = file.split('\n').collect();
816 
817     let mut rss_vm_total = 0i64;
818     let mut rss_crosvm_total = 0i64;
819     let mut is_vm = false;
820     for line in lines {
821         if line.contains("crosvm_guest") {
822             is_vm = true;
823         } else if line.contains("Rss:") {
824             let data_list: Vec<_> = line.split_whitespace().collect();
825             if data_list.len() < 2 {
826                 bail!("Failed to parse command result for getting rss :\n{}", line);
827             }
828             let rss = data_list[1].parse::<i64>()?;
829 
830             if is_vm {
831                 rss_vm_total += rss;
832                 is_vm = false;
833             }
834             rss_crosvm_total += rss;
835         }
836     }
837 
838     Ok(Rss { vm: rss_vm_total, crosvm: rss_crosvm_total })
839 }
840 
death_reason(result: &Result<ExitStatus, io::Error>, mut failure_reason: &str) -> DeathReason841 fn death_reason(result: &Result<ExitStatus, io::Error>, mut failure_reason: &str) -> DeathReason {
842     if let Some((reason, info)) = failure_reason.split_once('|') {
843         // Separator indicates extra context information is present after the failure name.
844         error!("Failure info: {info}");
845         failure_reason = reason;
846     }
847     if let Ok(status) = result {
848         match failure_reason {
849             "PVM_FIRMWARE_PUBLIC_KEY_MISMATCH" => {
850                 return DeathReason::PVM_FIRMWARE_PUBLIC_KEY_MISMATCH
851             }
852             "PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED" => {
853                 return DeathReason::PVM_FIRMWARE_INSTANCE_IMAGE_CHANGED
854             }
855             "MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE" => {
856                 return DeathReason::MICRODROID_FAILED_TO_CONNECT_TO_VIRTUALIZATION_SERVICE
857             }
858             "MICRODROID_PAYLOAD_HAS_CHANGED" => return DeathReason::MICRODROID_PAYLOAD_HAS_CHANGED,
859             "MICRODROID_PAYLOAD_VERIFICATION_FAILED" => {
860                 return DeathReason::MICRODROID_PAYLOAD_VERIFICATION_FAILED
861             }
862             "MICRODROID_INVALID_PAYLOAD_CONFIG" => {
863                 return DeathReason::MICRODROID_INVALID_PAYLOAD_CONFIG
864             }
865             "MICRODROID_UNKNOWN_RUNTIME_ERROR" => {
866                 return DeathReason::MICRODROID_UNKNOWN_RUNTIME_ERROR
867             }
868             "HANGUP" => return DeathReason::HANGUP,
869             _ => {}
870         }
871         match status.code() {
872             None => DeathReason::KILLED,
873             Some(0) => DeathReason::SHUTDOWN,
874             Some(CROSVM_START_ERROR_STATUS) => DeathReason::START_FAILED,
875             Some(CROSVM_REBOOT_STATUS) => DeathReason::REBOOT,
876             Some(CROSVM_CRASH_STATUS) => DeathReason::CRASH,
877             Some(CROSVM_WATCHDOG_REBOOT_STATUS) => DeathReason::WATCHDOG_REBOOT,
878             Some(_) => DeathReason::UNKNOWN,
879         }
880     } else {
881         DeathReason::INFRASTRUCTURE_ERROR
882     }
883 }
884 
/// Returns the signal that terminated the process, if the wait succeeded and the process was
/// killed by a signal.
fn exit_signal(result: &Result<ExitStatus, io::Error>) -> Option<i32> {
    result.as_ref().ok().and_then(|status| status.signal())
}
891 
892 const SYSFS_PLATFORM_DEVICES_PATH: &str = "/sys/devices/platform/";
893 const VFIO_PLATFORM_DRIVER_PATH: &str = "/sys/bus/platform/drivers/vfio-platform";
894 
vfio_argument_for_platform_device(device: &VfioDevice) -> Result<String, Error>895 fn vfio_argument_for_platform_device(device: &VfioDevice) -> Result<String, Error> {
896     // Check platform device exists
897     let path = Path::new(&device.getSysfsPath()?).canonicalize()?;
898     if !path.starts_with(SYSFS_PLATFORM_DEVICES_PATH) {
899         bail!("{path:?} is not a platform device");
900     }
901 
902     // Check platform device is bound to VFIO driver
903     let dev_driver_path = path.join("driver").canonicalize()?;
904     if dev_driver_path != Path::new(VFIO_PLATFORM_DRIVER_PATH) {
905         bail!("{path:?} is not bound to VFIO-platform driver");
906     }
907 
908     if let Some(p) = path.to_str() {
909         Ok(format!("--vfio={p},iommu=pkvm-iommu,dt-symbol={0}", device.getDtboLabel()?))
910     } else {
911         bail!("invalid path {path:?}");
912     }
913 }
914 
run_virtiofs(config: &CrosvmConfig) -> io::Result<()>915 fn run_virtiofs(config: &CrosvmConfig) -> io::Result<()> {
916     for shared_path in &config.shared_paths {
917         if shared_path.app_domain {
918             continue;
919         }
920         let ugid_map_value = format!(
921             "{} {} {} {} {} /",
922             shared_path.guest_uid,
923             shared_path.guest_gid,
924             shared_path.host_uid,
925             shared_path.host_gid,
926             shared_path.mask,
927         );
928 
929         let cfg_arg = format!("ugid_map='{}'", ugid_map_value);
930 
931         let mut command = Command::new(CROSVM_PATH);
932         command
933             .arg("device")
934             .arg("fs")
935             .arg(format!("--socket={}", &shared_path.socket_path))
936             .arg(format!("--tag={}", &shared_path.tag))
937             .arg(format!("--shared-dir={}", &shared_path.path))
938             .arg("--cfg")
939             .arg(cfg_arg.as_str())
940             .arg("--disable-sandbox")
941             .arg("--skip-pivot-root=true");
942 
943         print_crosvm_args(&command);
944 
945         let result = SharedChild::spawn(&mut command)?;
946         info!("Spawned virtiofs crosvm({})", result.id());
947     }
948 
949     Ok(())
950 }
951 
952 /// Starts an instance of `crosvm` to manage a new VM.
run_vm( config: CrosvmConfig, crosvm_control_socket_path: &Path, failure_pipe_write: File, ) -> Result<SharedChild, Error>953 fn run_vm(
954     config: CrosvmConfig,
955     crosvm_control_socket_path: &Path,
956     failure_pipe_write: File,
957 ) -> Result<SharedChild, Error> {
958     validate_config(&config)?;
959 
960     let mut command = Command::new(CROSVM_PATH);
961 
962     let vm_name = "crosvm_".to_owned() + &config.name;
963     command.arg0(vm_name.clone());
964     // TODO(qwandor): Remove --disable-sandbox.
965     command
966         .arg("--extended-status")
967         // Configure the logger for the crosvm process to silence logs from the disk crate which
968         // don't provide much information to us (but do spamming us).
969         .arg("--log-level")
970         .arg("info,disk=warn")
971         .arg("run")
972         .arg("--name")
973         .arg(vm_name)
974         .arg("--disable-sandbox")
975         .arg("--cid")
976         .arg(config.cid.to_string());
977 
978     if system_properties::read_bool("hypervisor.memory_reclaim.supported", false)?
979         && !config.no_balloon
980     {
981         command.arg("--balloon-page-reporting");
982     } else {
983         command.arg("--no-balloon");
984     }
985 
986     if !config.usb_config.controller {
987         command.arg("--no-usb");
988     }
989 
990     let mut memory_mib = config.memory_mib;
991 
992     if config.protected {
993         match system_properties::read(SYSPROP_CUSTOM_PVMFW_PATH)? {
994             Some(pvmfw_path) if !pvmfw_path.is_empty() => {
995                 if pvmfw_path == "none" {
996                     command.arg("--protected-vm-without-firmware")
997                 } else {
998                     command.arg("--protected-vm-with-firmware").arg(pvmfw_path)
999                 }
1000             }
1001             _ => command.arg("--protected-vm"),
1002         };
1003 
1004         // 3 virtio-console devices + vsock = 4.
1005         let virtio_pci_device_count = 4 + config.disks.len();
1006         // crosvm virtio queue has 256 entries, so 2 MiB per device (2 pages per entry) should be
1007         // enough.
1008         let swiotlb_size_mib = 2 * virtio_pci_device_count as u32;
1009         command.arg("--swiotlb").arg(swiotlb_size_mib.to_string());
1010 
1011         // b/346770542 for consistent "usable" memory across protected and non-protected VMs.
1012         memory_mib = memory_mib.saturating_add(swiotlb_size_mib);
1013 
1014         // Workaround to keep crash_dump from trying to read protected guest memory.
1015         // Context in b/238324526.
1016         command.arg("--unmap-guest-memory-on-fork");
1017 
1018         if config.ramdump.is_some() {
1019             // Protected VM needs to reserve memory for ramdump here. Note that we reserve more
1020             // memory for the restricted dma pool.
1021             let ramdump_reserve = RAMDUMP_RESERVED_MIB + swiotlb_size_mib;
1022             command.arg("--params").arg(format!("crashkernel={ramdump_reserve}M"));
1023         }
1024     } else if config.ramdump.is_some() {
1025         command.arg("--params").arg(format!("crashkernel={RAMDUMP_RESERVED_MIB}M"));
1026     }
1027     if config.debug_config.debug_level == DebugLevel::NONE
1028         && config.debug_config.should_prepare_console_output()
1029     {
1030         // bootconfig.normal will be used, but we need log.
1031         command.arg("--params").arg("printk.devkmsg=on");
1032         command.arg("--params").arg("console=hvc0");
1033     }
1034 
1035     // Move the PCI MMIO regions to near the end of the low-MMIO space.
1036     // This is done to accommodate a limitation in a partner's hypervisor.
1037     #[cfg(target_arch = "aarch64")]
1038     command
1039         .arg("--pci")
1040         .arg("mem=[start=0x70000000,size=0x2000000],cam=[start=0x72000000,size=0x1000000]");
1041 
1042     command.arg("--mem").arg(memory_mib.to_string());
1043 
1044     if let Some(cpus) = config.cpus {
1045         command.arg("--cpus").arg(cpus.to_string());
1046     }
1047 
1048     if config.host_cpu_topology {
1049         if cfg!(virt_cpufreq) && check_if_all_cpus_allowed()? {
1050             command.arg("--host-cpu-topology");
1051             cfg_if::cfg_if! {
1052                 if #[cfg(any(target_arch = "aarch64"))] {
1053                     command.arg("--virt-cpufreq");
1054                 }
1055             }
1056         } else if let Some(cpus) = get_num_cpus() {
1057             command.arg("--cpus").arg(cpus.to_string());
1058         } else {
1059             bail!("Could not determine the number of CPUs in the system");
1060         }
1061     }
1062 
1063     if let Some(gdb_port) = config.gdb_port {
1064         command.arg("--gdb").arg(gdb_port.to_string());
1065     }
1066 
1067     // Keep track of what file descriptors should be mapped to the crosvm process.
1068     let mut preserved_fds = config.indirect_files.into_iter().map(|f| f.into()).collect();
1069 
1070     if let Some(dump_dt_fd) = config.dump_dt_fd {
1071         let dump_dt_fd = add_preserved_fd(&mut preserved_fds, dump_dt_fd);
1072         command.arg("--dump-device-tree-blob").arg(dump_dt_fd);
1073     }
1074 
1075     // Setup the serial devices.
1076     // 1. uart device: used as the output device by bootloaders and as early console by linux
1077     // 2. uart device: used to report the reason for the VM failing.
1078     // 3. virtio-console device: used as the console device where kmsg is redirected to
1079     // 4. virtio-console device: used as the ramdump output
1080     // 5. virtio-console device: used as the logcat output
1081     //
1082     // When [console|log]_fd is not specified, the devices are attached to sink, which means what's
1083     // written there is discarded.
1084     let console_out_arg = format_serial_out_arg(&mut preserved_fds, config.console_out_fd);
1085     let console_in_arg = config
1086         .console_in_fd
1087         .map(|fd| format!(",input={}", add_preserved_fd(&mut preserved_fds, fd)))
1088         .unwrap_or_default();
1089     let log_arg = format_serial_out_arg(&mut preserved_fds, config.log_fd);
1090     let failure_serial_path = add_preserved_fd(&mut preserved_fds, failure_pipe_write);
1091     let ramdump_arg = format_serial_out_arg(&mut preserved_fds, config.ramdump);
1092     let console_input_device = config.console_input_device.as_deref().unwrap_or(CONSOLE_HVC0);
1093     match console_input_device {
1094         CONSOLE_HVC0 | CONSOLE_TTYS0 => {}
1095         _ => bail!("Unsupported serial device {console_input_device}"),
1096     };
1097 
1098     // Warning: Adding more serial devices requires you to shift the PCI device ID of the boot
1099     // disks in bootconfig.x86_64. This is because x86 crosvm puts serial devices and the block
1100     // devices in the same PCI bus and serial devices comes before the block devices. Arm crosvm
1101     // doesn't have the issue.
1102     // /dev/ttyS0
1103     command.arg(format!(
1104         "--serial={}{},hardware=serial,num=1",
1105         &console_out_arg,
1106         if console_input_device == CONSOLE_TTYS0 { &console_in_arg } else { "" }
1107     ));
1108     // /dev/ttyS1
1109     command.arg(format!("--serial=type=file,path={},hardware=serial,num=2", &failure_serial_path));
1110     // /dev/hvc0
1111     command.arg(format!(
1112         "--serial={}{},hardware=virtio-console,num=1",
1113         &console_out_arg,
1114         if console_input_device == CONSOLE_HVC0 { &console_in_arg } else { "" }
1115     ));
1116     // /dev/hvc1
1117     command.arg(format!("--serial={},hardware=virtio-console,num=2", &ramdump_arg));
1118     // /dev/hvc2
1119     command.arg(format!("--serial={},hardware=virtio-console,num=3", &log_arg));
1120 
1121     if let Some(bootloader) = config.bootloader {
1122         command.arg("--bios").arg(add_preserved_fd(&mut preserved_fds, bootloader));
1123     }
1124 
1125     if let Some(initrd) = config.initrd {
1126         command.arg("--initrd").arg(add_preserved_fd(&mut preserved_fds, initrd));
1127     }
1128 
1129     if let Some(params) = &config.params {
1130         command.arg("--params").arg(params);
1131     }
1132 
1133     for disk in config.disks {
1134         // Disk file locking is disabled because of missing SELinux policies.
1135         command.arg("--block").arg(format!(
1136             "path={},ro={},lock=false",
1137             add_preserved_fd(&mut preserved_fds, disk.image),
1138             !disk.writable,
1139         ));
1140     }
1141 
1142     if let Some(kernel) = config.kernel {
1143         command.arg(add_preserved_fd(&mut preserved_fds, kernel));
1144     }
1145 
1146     #[cfg(target_arch = "aarch64")]
1147     command.arg("--no-pmu");
1148 
1149     let control_sock = create_crosvm_control_listener(crosvm_control_socket_path)
1150         .context("failed to create control listener")?;
1151     command.arg("--socket").arg(add_preserved_fd(&mut preserved_fds, control_sock));
1152 
1153     if let Some(dt_overlay) = config.device_tree_overlay {
1154         command.arg("--device-tree-overlay").arg(add_preserved_fd(&mut preserved_fds, dt_overlay));
1155     }
1156 
1157     if cfg!(paravirtualized_devices) {
1158         if let Some(gpu_config) = &config.gpu_config {
1159             let mut gpu_args = Vec::new();
1160             if let Some(backend) = &gpu_config.backend {
1161                 gpu_args.push(format!("backend={}", backend));
1162             }
1163             if let Some(context_types) = &gpu_config.context_types {
1164                 gpu_args.push(format!("context-types={}", context_types.join(":")));
1165             }
1166             if let Some(pci_address) = &gpu_config.pci_address {
1167                 gpu_args.push(format!("pci-address={}", pci_address));
1168             }
1169             if let Some(renderer_features) = &gpu_config.renderer_features {
1170                 gpu_args.push(format!("renderer-features={}", renderer_features));
1171             }
1172             if gpu_config.renderer_use_egl.unwrap_or(false) {
1173                 gpu_args.push("egl=true".to_string());
1174             }
1175             if gpu_config.renderer_use_gles.unwrap_or(false) {
1176                 gpu_args.push("gles=true".to_string());
1177             }
1178             if gpu_config.renderer_use_glx.unwrap_or(false) {
1179                 gpu_args.push("glx=true".to_string());
1180             }
1181             if gpu_config.renderer_use_surfaceless.unwrap_or(false) {
1182                 gpu_args.push("surfaceless=true".to_string());
1183             }
1184             if gpu_config.renderer_use_vulkan.unwrap_or(false) {
1185                 gpu_args.push("vulkan=true".to_string());
1186             }
1187             command.arg(format!("--gpu={}", gpu_args.join(",")));
1188         }
1189         if let Some(display_config) = &config.display_config {
1190             command
1191                 .arg(format!(
1192                     "--gpu-display=mode=windowed[{},{}],dpi=[{},{}],refresh-rate={}",
1193                     display_config.width,
1194                     display_config.height,
1195                     display_config.horizontal_dpi,
1196                     display_config.vertical_dpi,
1197                     display_config.refresh_rate
1198                 ))
1199                 .arg(format!("--android-display-service={}", config.name));
1200         }
1201     }
1202 
1203     if cfg!(network) {
1204         if let Some(tap) = config.tap {
1205             add_preserved_fd(&mut preserved_fds, tap);
1206             let tap_fd = preserved_fds.last().unwrap().as_raw_fd();
1207             command.arg("--net").arg(format!("tap-fd={tap_fd}"));
1208         }
1209     }
1210 
1211     if cfg!(paravirtualized_devices) {
1212         for input_device_option in config.input_device_options.into_iter() {
1213             command.arg("--input");
1214             command.arg(match input_device_option {
1215                 InputDeviceOption::EvDev(file) => {
1216                     format!("evdev[path={}]", add_preserved_fd(&mut preserved_fds, file))
1217                 }
1218                 InputDeviceOption::Keyboard(file) => {
1219                     format!("keyboard[path={}]", add_preserved_fd(&mut preserved_fds, file))
1220                 }
1221                 InputDeviceOption::Mouse(file) => {
1222                     format!("mouse[path={}]", add_preserved_fd(&mut preserved_fds, file))
1223                 }
1224                 InputDeviceOption::SingleTouch { file, width, height, name } => format!(
1225                     "single-touch[path={},width={},height={}{}]",
1226                     add_preserved_fd(&mut preserved_fds, file),
1227                     width,
1228                     height,
1229                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1230                 ),
1231                 InputDeviceOption::Switches(file) => {
1232                     format!("switches[path={}]", add_preserved_fd(&mut preserved_fds, file))
1233                 }
1234                 InputDeviceOption::MultiTouchTrackpad { file, width, height, name } => format!(
1235                     "multi-touch-trackpad[path={},width={},height={}{}]",
1236                     add_preserved_fd(&mut preserved_fds, file),
1237                     width,
1238                     height,
1239                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1240                 ),
1241                 InputDeviceOption::MultiTouch { file, width, height, name } => format!(
1242                     "multi-touch[path={},width={},height={}{}]",
1243                     add_preserved_fd(&mut preserved_fds, file),
1244                     width,
1245                     height,
1246                     name.as_ref().map_or("".into(), |n| format!(",name={}", n))
1247                 ),
1248             });
1249         }
1250     }
1251 
1252     if config.hugepages {
1253         command.arg("--hugepages");
1254     }
1255 
1256     if config.boost_uclamp {
1257         command.arg("--boost-uclamp");
1258     }
1259 
1260     if !config.vfio_devices.is_empty() {
1261         if let Some(dtbo) = config.dtbo {
1262             command.arg(format!(
1263                 "--device-tree-overlay={},filter",
1264                 add_preserved_fd(&mut preserved_fds, dtbo)
1265             ));
1266         } else {
1267             bail!("VFIO devices assigned but no DTBO available");
1268         }
1269     };
1270     for device in config.vfio_devices {
1271         command.arg(vfio_argument_for_platform_device(&device)?);
1272     }
1273 
1274     for shared_path in &config.shared_paths {
1275         if shared_path.app_domain {
1276             if let Some(socket_fd) = &shared_path.socket_fd {
1277                 let socket_path =
1278                     add_preserved_fd(&mut preserved_fds, socket_fd.try_clone().unwrap());
1279                 let raw_fd: i32 = socket_path.rsplit_once('/').unwrap().1.parse().unwrap();
1280                 command
1281                     .arg("--vhost-user-fs")
1282                     .arg(format!("tag={},socket-fd={}", &shared_path.tag, raw_fd));
1283             }
1284         } else {
1285             if let Err(e) = wait_for_file(&shared_path.socket_path, 5) {
1286                 bail!("Error waiting for file: {}", e);
1287             }
1288             command
1289                 .arg("--vhost-user-fs")
1290                 .arg(format!("{},tag={}", &shared_path.socket_path, &shared_path.tag));
1291         }
1292     }
1293 
1294     debug!("Preserving FDs {:?}", preserved_fds);
1295     command.preserved_fds(preserved_fds);
1296 
1297     if cfg!(paravirtualized_devices) {
1298         if let Some(audio_config) = &config.audio_config {
1299             command.arg("--virtio-snd").arg(format!(
1300                 "backend=aaudio,num_input_devices={},num_output_devices={}",
1301                 if audio_config.use_microphone { 1 } else { 0 },
1302                 if audio_config.use_speaker { 1 } else { 0 }
1303             ));
1304         }
1305     }
1306 
1307     print_crosvm_args(&command);
1308 
1309     let result = SharedChild::spawn(&mut command)?;
1310     debug!("Spawned crosvm({}).", result.id());
1311     Ok(result)
1312 }
1313 
/// Polls every 100 ms until `path` exists or `timeout_secs` seconds have elapsed.
///
/// The path is always probed at least once, so an already existing path is reported as found
/// even when `timeout_secs` is 0, and it is probed one final time after the last sleep.
///
/// # Errors
///
/// Returns an error of kind `NotFound` if the path did not show up within the timeout.
fn wait_for_file(path: &str, timeout_secs: u64) -> Result<(), std::io::Error> {
    const POLL_INTERVAL: Duration = Duration::from_millis(100);

    let start_time = std::time::Instant::now();
    let timeout = Duration::from_secs(timeout_secs);

    loop {
        if std::fs::metadata(path).is_ok() {
            return Ok(()); // File exists
        }
        // Check the deadline only after probing, so the last sleep is followed by a probe.
        if start_time.elapsed() >= timeout {
            break;
        }
        thread::sleep(POLL_INTERVAL);
    }

    Err(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        format!("File not found within {} seconds: {}", timeout_secs, path),
    ))
}
1330 
1331 /// Ensure that the configuration has a valid combination of fields set, or return an error if not.
validate_config(config: &CrosvmConfig) -> Result<(), Error>1332 fn validate_config(config: &CrosvmConfig) -> Result<(), Error> {
1333     if config.bootloader.is_none() && config.kernel.is_none() {
1334         bail!("VM must have either a bootloader or a kernel image.");
1335     }
1336     if config.bootloader.is_some() && (config.kernel.is_some() || config.initrd.is_some()) {
1337         bail!("Can't have both bootloader and kernel/initrd image.");
1338     }
1339     let version = Version::parse(CROSVM_PLATFORM_VERSION).unwrap();
1340     if !config.platform_version.matches(&version) {
1341         bail!(
1342             "Incompatible platform version. The config is compatible with platform version(s) \
1343               {}, but the actual platform version is {}",
1344             config.platform_version,
1345             version
1346         );
1347     }
1348 
1349     Ok(())
1350 }
1351 
1352 /// Print arguments of the crosvm command. In doing so, /proc/self/fd/XX is annotated with the
1353 /// actual file path if the FD is backed by a regular file. If not, the /proc path is printed
1354 /// unmodified.
print_crosvm_args(command: &Command)1355 fn print_crosvm_args(command: &Command) {
1356     let re = Regex::new(r"/proc/self/fd/[\d]+").unwrap();
1357     info!(
1358         "Running crosvm with args: {:?}",
1359         command
1360             .get_args()
1361             .map(|s| s.to_string_lossy())
1362             .map(|s| {
1363                 re.replace_all(&s, |caps: &Captures| {
1364                     let path = &caps[0];
1365                     if let Ok(realpath) = std::fs::canonicalize(path) {
1366                         format!("{} ({})", path, realpath.to_string_lossy())
1367                     } else {
1368                         path.to_owned()
1369                     }
1370                 })
1371                 .into_owned()
1372             })
1373             .collect::<Vec<_>>()
1374     );
1375 }
1376 
/// Takes ownership of `file`, appends its file descriptor to `preserved_fds` and returns the
/// path "/proc/self/fd/N", where N is the number of that file descriptor.
fn add_preserved_fd<F: Into<OwnedFd>>(preserved_fds: &mut Vec<OwnedFd>, file: F) -> String {
    let owned: OwnedFd = file.into();
    let proc_path = format!("/proc/self/fd/{}", owned.as_raw_fd());
    preserved_fds.push(owned);
    proc_path
}
1385 
1386 /// Adds the file descriptor for `file` (if any) to `preserved_fds`, and returns the appropriate
1387 /// string for a crosvm `--serial` flag. If `file` is none, creates a dummy sink device.
format_serial_out_arg(preserved_fds: &mut Vec<OwnedFd>, file: Option<File>) -> String1388 fn format_serial_out_arg(preserved_fds: &mut Vec<OwnedFd>, file: Option<File>) -> String {
1389     if let Some(file) = file {
1390         format!("type=file,path={}", add_preserved_fd(preserved_fds, file))
1391     } else {
1392         "type=sink".to_string()
1393     }
1394 }
1395 
1396 /// Creates a new pipe with the `O_CLOEXEC` flag set, and returns the read side and write side.
create_pipe() -> Result<(File, File), Error>1397 fn create_pipe() -> Result<(File, File), Error> {
1398     let (read_fd, write_fd) = pipe2(OFlag::O_CLOEXEC)?;
1399     Ok((read_fd.into(), write_fd.into()))
1400 }
1401 
1402 /// Creates and binds a unix seqpacket listening socket to be passed as crosvm's `--socket`
1403 /// argument. See `UnixSeqpacketListener::bind` in crosvm's code for reference.
create_crosvm_control_listener(crosvm_control_socket_path: &Path) -> Result<OwnedFd>1404 fn create_crosvm_control_listener(crosvm_control_socket_path: &Path) -> Result<OwnedFd> {
1405     use nix::sys::socket;
1406     let fd = socket::socket(
1407         socket::AddressFamily::Unix,
1408         socket::SockType::SeqPacket,
1409         socket::SockFlag::empty(),
1410         None,
1411     )
1412     .context("socket failed")?;
1413     socket::bind(fd.as_raw_fd(), &socket::UnixAddr::new(crosvm_control_socket_path)?)
1414         .context("bind failed")?;
1415     // The exact backlog size isn't imporant. crosvm uses 128 internally. We use 127 here
1416     // because of a `nix` bug.
1417     socket::listen(&fd, socket::Backlog::new(127).unwrap()).context("listen failed")?;
1418     Ok(fd)
1419 }
1420 
/// Converts `path` into a `CString`.
///
/// # Panics
///
/// Panics if the path is not valid UTF-8 or contains a NUL byte, which should never happen.
fn path_to_cstring(path: &Path) -> CString {
    path.to_str()
        .and_then(|utf8| CString::new(utf8).ok())
        .unwrap_or_else(|| panic!("bad path: {path:?}"))
}
1430