xref: /aosp_15_r20/external/crosvm/src/sys/windows.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // TODO(b:240716507): There is a huge chunk of code that depends on haxm, whpx, or gvm being
6 // enabled but isn't marked as such. Remove this allow once it is.
7 #![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
8 
9 pub(crate) mod control_server;
10 pub(crate) mod irq_wait;
11 pub(crate) mod main;
12 #[cfg(not(feature = "crash-report"))]
13 mod panic_hook;
14 
15 mod generic;
16 use generic as product;
17 pub(crate) mod run_vcpu;
18 
19 #[cfg(feature = "whpx")]
20 use std::arch::x86_64::__cpuid;
21 #[cfg(feature = "whpx")]
22 use std::arch::x86_64::__cpuid_count;
23 use std::cmp::Reverse;
24 use std::collections::BTreeMap;
25 use std::collections::HashMap;
26 use std::fs::File;
27 use std::fs::OpenOptions;
28 use std::io::stdin;
29 use std::iter;
30 use std::mem;
31 use std::os::windows::fs::OpenOptionsExt;
32 use std::path::PathBuf;
33 use std::sync::mpsc;
34 use std::sync::Arc;
35 
36 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
37 use aarch64::AArch64 as Arch;
38 use acpi_tables::sdt::SDT;
39 use anyhow::anyhow;
40 use anyhow::bail;
41 use anyhow::Context;
42 use anyhow::Result;
43 use arch::CpuConfigArch;
44 use arch::DtbOverlay;
45 use arch::IrqChipArch;
46 use arch::LinuxArch;
47 use arch::RunnableLinuxVm;
48 use arch::VcpuArch;
49 use arch::VirtioDeviceStub;
50 use arch::VmArch;
51 use arch::VmComponents;
52 use arch::VmImage;
53 use base::enable_high_res_timers;
54 use base::error;
55 use base::info;
56 use base::open_file_or_duplicate;
57 use base::warn;
58 use base::AsRawDescriptor;
59 #[cfg(feature = "gpu")]
60 use base::BlockingMode;
61 use base::CloseNotifier;
62 use base::Event;
63 use base::EventToken;
64 use base::EventType;
65 use base::FlushOnDropTube;
66 #[cfg(feature = "gpu")]
67 use base::FramingMode;
68 use base::FromRawDescriptor;
69 use base::ProtoTube;
70 use base::RawDescriptor;
71 use base::ReadNotifier;
72 use base::RecvTube;
73 use base::SendTube;
74 #[cfg(feature = "gpu")]
75 use base::StreamChannel;
76 use base::Terminal;
77 use base::TriggeredEvent;
78 use base::Tube;
79 use base::TubeError;
80 use base::VmEventType;
81 use base::WaitContext;
82 use broker_ipc::common_child_setup;
83 use broker_ipc::CommonChildStartupArgs;
84 use control_server::ControlServer;
85 use crosvm_cli::sys::windows::exit::Exit;
86 use crosvm_cli::sys::windows::exit::ExitContext;
87 use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
88 use crosvm_cli::sys::windows::exit::ExitContextOption;
89 use devices::create_devices_worker_thread;
90 use devices::serial_device::SerialHardware;
91 use devices::serial_device::SerialParameters;
92 use devices::tsc::get_tsc_sync_mitigations;
93 use devices::tsc::standard_deviation;
94 use devices::tsc::TscSyncMitigations;
95 use devices::virtio;
96 use devices::virtio::block::DiskOption;
97 #[cfg(feature = "audio")]
98 use devices::virtio::snd::common_backend::VirtioSnd;
99 #[cfg(feature = "audio")]
100 use devices::virtio::snd::parameters::Parameters as SndParameters;
101 #[cfg(feature = "gpu")]
102 use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
103 #[cfg(feature = "gpu")]
104 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
105 #[cfg(feature = "gpu")]
106 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
107 #[cfg(feature = "gpu")]
108 use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
109 #[cfg(feature = "gpu")]
110 use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
111 #[cfg(feature = "audio")]
112 use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
113 #[cfg(feature = "audio")]
114 use devices::virtio::vhost::user::snd::sys::windows::run_snd_device_worker;
115 #[cfg(feature = "audio")]
116 use devices::virtio::vhost::user::snd::sys::windows::SndSplitConfig;
117 #[cfg(feature = "balloon")]
118 use devices::virtio::BalloonFeatures;
119 use devices::virtio::Console;
120 #[cfg(feature = "gpu")]
121 use devices::virtio::GpuParameters;
122 use devices::BusDeviceObj;
123 use devices::BusResumeDevice;
124 #[cfg(feature = "gvm")]
125 use devices::GvmIrqChip;
126 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
127 use devices::IrqChip;
128 use devices::UserspaceIrqChip;
129 use devices::VcpuRunState;
130 use devices::VirtioPciDevice;
131 #[cfg(feature = "whpx")]
132 use devices::WhpxSplitIrqChip;
133 #[cfg(feature = "gpu")]
134 use gpu_display::EventDevice;
135 #[cfg(feature = "gpu")]
136 use gpu_display::WindowProcedureThread;
137 #[cfg(feature = "gpu")]
138 use gpu_display::WindowProcedureThreadBuilder;
139 #[cfg(feature = "gvm")]
140 use hypervisor::gvm::Gvm;
141 #[cfg(feature = "gvm")]
142 use hypervisor::gvm::GvmVcpu;
143 #[cfg(feature = "gvm")]
144 use hypervisor::gvm::GvmVersion;
145 #[cfg(feature = "gvm")]
146 use hypervisor::gvm::GvmVm;
147 #[cfg(feature = "haxm")]
148 use hypervisor::haxm::get_use_ghaxm;
149 #[cfg(feature = "haxm")]
150 use hypervisor::haxm::set_use_ghaxm;
151 #[cfg(feature = "haxm")]
152 use hypervisor::haxm::Haxm;
153 #[cfg(feature = "haxm")]
154 use hypervisor::haxm::HaxmVcpu;
155 #[cfg(feature = "haxm")]
156 use hypervisor::haxm::HaxmVm;
157 #[cfg(feature = "whpx")]
158 use hypervisor::whpx::Whpx;
159 #[cfg(feature = "whpx")]
160 use hypervisor::whpx::WhpxFeature;
161 #[cfg(feature = "whpx")]
162 use hypervisor::whpx::WhpxVcpu;
163 #[cfg(feature = "whpx")]
164 use hypervisor::whpx::WhpxVm;
165 use hypervisor::Hypervisor;
166 #[cfg(feature = "whpx")]
167 use hypervisor::HypervisorCap;
168 #[cfg(feature = "whpx")]
169 use hypervisor::HypervisorX86_64;
170 use hypervisor::ProtectionType;
171 use hypervisor::Vm;
172 use irq_wait::IrqWaitWorker;
173 use jail::FakeMinijailStub as Minijail;
174 #[cfg(not(feature = "crash-report"))]
175 pub(crate) use panic_hook::set_panic_hook;
176 use product::create_snd_mute_tube_pair;
177 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
178 use product::create_snd_state_tube;
179 #[cfg(feature = "pvclock")]
180 use product::handle_pvclock_request;
181 use product::merge_session_invariants;
182 use product::run_ime_thread;
183 use product::set_package_name;
184 pub(crate) use product::setup_metrics_reporting;
185 use product::start_service_ipc_listener;
186 use product::RunControlArgs;
187 use product::ServiceVmState;
188 use product::Token;
189 use resources::SystemAllocator;
190 use run_vcpu::run_all_vcpus;
191 use run_vcpu::VcpuRunMode;
192 use rutabaga_gfx::RutabagaGralloc;
193 use rutabaga_gfx::RutabagaGrallocBackendFlags;
194 use smallvec::SmallVec;
195 use sync::Mutex;
196 use tube_transporter::TubeToken;
197 use tube_transporter::TubeTransporterReader;
198 use vm_control::api::VmMemoryClient;
199 #[cfg(feature = "balloon")]
200 use vm_control::BalloonControlCommand;
201 #[cfg(feature = "balloon")]
202 use vm_control::BalloonTube;
203 use vm_control::DeviceControlCommand;
204 use vm_control::InitialAudioSessionState;
205 use vm_control::IrqHandlerRequest;
206 use vm_control::PvClockCommand;
207 use vm_control::VcpuControl;
208 use vm_control::VmMemoryRegionState;
209 use vm_control::VmMemoryRequest;
210 use vm_control::VmRequest;
211 use vm_control::VmResponse;
212 use vm_control::VmRunMode;
213 use vm_memory::GuestAddress;
214 use vm_memory::GuestMemory;
215 use vmm_vhost::Connection;
216 use vmm_vhost::FrontendReq;
217 use win_util::ProcessType;
218 #[cfg(feature = "whpx")]
219 use x86_64::cpuid::adjust_cpuid;
220 #[cfg(feature = "whpx")]
221 use x86_64::cpuid::CpuIdContext;
222 #[cfg(target_arch = "x86_64")]
223 use x86_64::X8664arch as Arch;
224 
225 use crate::crosvm::config::Config;
226 use crate::crosvm::config::Executable;
227 use crate::crosvm::config::InputDeviceOption;
228 #[cfg(any(feature = "gvm", feature = "whpx"))]
229 use crate::crosvm::config::IrqChipKind;
230 #[cfg(feature = "gpu")]
231 use crate::crosvm::config::TouchDeviceOption;
232 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
233 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
234 use crate::crosvm::sys::config::HypervisorKind;
235 use crate::crosvm::sys::windows::broker::BrokerTubes;
236 #[cfg(feature = "stats")]
237 use crate::crosvm::sys::windows::stats::StatisticsCollector;
238 #[cfg(feature = "gpu")]
239 pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
240 #[cfg(feature = "audio")]
241 pub(crate) use crate::sys::windows::product::get_snd_product_configs;
242 #[cfg(feature = "gpu")]
243 pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
244 use crate::sys::windows::product::log_descriptor;
245 #[cfg(feature = "audio")]
246 pub(crate) use crate::sys::windows::product::num_input_sound_devices;
247 #[cfg(feature = "audio")]
248 pub(crate) use crate::sys::windows::product::num_input_sound_streams;
249 use crate::sys::windows::product::spawn_anti_tamper_thread;
250 use crate::sys::windows::product::MetricEventType;
251 
252 const DEFAULT_GUEST_CID: u64 = 3;
253 
254 // By default, if enabled, the balloon working set (WS) feature will use 4 bins.
255 const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
256 
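/// Control tubes serviced by the main run loop, tagged by origin: the core VM control tube or a
/// product-specific tube.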
257 enum TaggedControlTube {
258     Vm(FlushOnDropTube),
259     Product(product::TaggedControlTube),
260 }
261 
262 impl ReadNotifier for TaggedControlTube {
263     fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
264         match self {
265             Self::Vm(tube) => tube.0.get_read_notifier(),
266             Self::Product(tube) => tube.get_read_notifier(),
267         }
268     }
269 }
270 
271 impl CloseNotifier for TaggedControlTube {
272     fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
273         match self {
274             Self::Vm(tube) => tube.0.get_close_notifier(),
275             Self::Product(tube) => tube.get_close_notifier(),
276         }
277     }
278 }
279 
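/// Final states the run loop can report once the VM stops running.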
280 pub enum ExitState {
281     Reset,
282     Stop,
283     Crash,
284     #[allow(dead_code)]
285     GuestPanic,
286     WatchdogReset,
287 }
288 
289 type DeviceResult<T = VirtioDeviceStub> = Result<T>;
290 
291 fn create_vhost_user_block_device(
292     cfg: &Config,
293     connection: Connection<FrontendReq>,
294 ) -> DeviceResult {
295     let dev = virtio::VhostUserFrontend::new(
296         virtio::DeviceType::Block,
297         virtio::base_features(cfg.protection_type),
298         connection,
299         None,
300         None,
301     )
302     .exit_context(
303         Exit::VhostUserBlockDeviceNew,
304         "failed to set up vhost-user block device",
305     )?;
306 
307     Ok(VirtioDeviceStub {
308         dev: Box::new(dev),
309         jail: None,
310     })
311 }
312 
313 fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
314     let features = virtio::base_features(cfg.protection_type);
315     let dev = virtio::BlockAsync::new(
316         features,
317         disk.open()?,
318         disk,
319         Some(disk_device_tube),
320         None,
321         None,
322     )
323     .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
324 
325     Ok(VirtioDeviceStub {
326         dev: Box::new(dev),
327         jail: None,
328     })
329 }
330 
331 #[cfg(feature = "gpu")]
332 fn create_vhost_user_gpu_device(
333     base_features: u64,
334     connection: Connection<FrontendReq>,
335 ) -> DeviceResult {
336     let dev = virtio::VhostUserFrontend::new(
337         virtio::DeviceType::Gpu,
338         base_features,
339         connection,
340         None,
341         None,
342     )
343     .exit_context(
344         Exit::VhostUserGpuDeviceNew,
345         "failed to set up vhost-user gpu device",
346     )?;
347 
348     Ok(VirtioDeviceStub {
349         dev: Box::new(dev),
350         jail: None,
351     })
352 }
353 
354 #[cfg(feature = "audio")]
355 fn create_vhost_user_snd_device(
356     base_features: u64,
357     connection: Connection<FrontendReq>,
358 ) -> DeviceResult {
359     let dev = virtio::VhostUserFrontend::new(
360         virtio::DeviceType::Sound,
361         base_features,
362         connection,
363         None,
364         None,
365     )
366     .exit_context(
367         Exit::VhostUserSndDeviceNew,
368         "failed to set up vhost-user snd device",
369     )?;
370 
371     Ok(VirtioDeviceStub {
372         dev: Box::new(dev),
373         jail: None,
374     })
375 }
376 
377 #[cfg(feature = "gpu")]
378 fn create_multi_touch_device(
379     cfg: &Config,
380     event_pipe: StreamChannel,
381     width: u32,
382     height: u32,
383     name: Option<&str>,
384     idx: u32,
385 ) -> DeviceResult {
386     let dev = virtio::input::new_multi_touch(
387         idx,
388         event_pipe,
389         width,
390         height,
391         name,
392         virtio::base_features(cfg.protection_type),
393     )
394     .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
395     Ok(VirtioDeviceStub {
396         dev: Box::new(dev),
397         jail: None,
398     })
399 }
400 
401 #[cfg(feature = "gpu")]
402 fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
403     let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
404         .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
405     Ok(VirtioDeviceStub {
406         dev: Box::new(dev),
407         jail: None,
408     })
409 }
410 
411 #[cfg(feature = "slirp")]
412 fn create_vhost_user_net_device(cfg: &Config, connection: Connection<FrontendReq>) -> DeviceResult {
413     let features = virtio::base_features(cfg.protection_type);
414     let dev =
415         virtio::VhostUserFrontend::new(virtio::DeviceType::Net, features, connection, None, None)
416             .exit_context(
417             Exit::VhostUserNetDeviceNew,
418             "failed to set up vhost-user net device",
419         )?;
420 
421     Ok(VirtioDeviceStub {
422         dev: Box::new(dev),
423         jail: None,
424     })
425 }
426 
427 fn create_rng_device(cfg: &Config) -> DeviceResult {
428     let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
429         .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
430 
431     Ok(VirtioDeviceStub {
432         dev: Box::new(dev),
433         jail: None,
434     })
435 }
436 
437 fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
438     let mut keep_rds = Vec::new();
439     let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
440     let dev = param
441         .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
442         .exit_context(Exit::CreateConsole, "failed to create console device")?;
443 
444     Ok(VirtioDeviceStub {
445         dev: Box::new(dev),
446         jail: None,
447     })
448 }
449 
450 #[cfg(feature = "balloon")]
451 fn create_balloon_device(
452     cfg: &Config,
453     balloon_device_tube: Tube,
454     dynamic_mapping_device_tube: Tube,
455     inflate_tube: Option<Tube>,
456     init_balloon_size: u64,
457 ) -> DeviceResult {
458     let balloon_features =
459         (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
460     let dev = virtio::Balloon::new(
461         virtio::base_features(cfg.protection_type),
462         balloon_device_tube,
463         VmMemoryClient::new(dynamic_mapping_device_tube),
464         inflate_tube,
465         init_balloon_size,
466         balloon_features,
467         #[cfg(feature = "registered_events")]
468         None,
469         VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
470     )
471     .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
472 
473     Ok(VirtioDeviceStub {
474         dev: Box::new(dev),
475         jail: None,
476     })
477 }
478 
479 fn create_vsock_device(cfg: &Config) -> DeviceResult {
480     // We only support a single guest, so we can confidently assign a default
481     // CID if one isn't provided. We choose the lowest non-reserved value.
482     let dev = virtio::vsock::Vsock::new(
483         cfg.vsock
484             .as_ref()
485             .map(|cfg| cfg.cid)
486             .unwrap_or(DEFAULT_GUEST_CID),
487         cfg.host_guid.clone(),
488         virtio::base_features(cfg.protection_type),
489     )
490     .exit_context(
491         Exit::UserspaceVsockDeviceNew,
492         "failed to create userspace vsock device",
493     )?;
494 
495     Ok(VirtioDeviceStub {
496         dev: Box::new(dev),
497         jail: None,
498     })
499 }
500 
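/// Builds the stubs for all configured virtio devices (block, console, snd, pvclock, rng, net,
/// balloon, vsock, input, and gpu, as enabled), taking ownership of the tubes each device needs.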
501 fn create_virtio_devices(
502     cfg: &mut Config,
503     vm_evt_wrtube: &SendTube,
504     #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
505     disk_device_tubes: &mut Vec<Tube>,
506     initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
507     balloon_device_tube: Option<Tube>,
508     #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
509     dynamic_mapping_device_tube: Option<Tube>,
510     inflate_tube: Option<Tube>,
511     init_balloon_size: u64,
512     tsc_frequency: u64,
513     virtio_snd_state_device_tube: Option<Tube>,
514     virtio_snd_control_device_tube: Option<Tube>,
515 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
516     let mut devs = Vec::new();
517 
518     if cfg.block_vhost_user_tube.is_empty() {
519         // Disk devices must precede virtio-console devices or the kernel does not boot.
520         // TODO(b/171215421): figure out why this ordering is required and fix it.
521         for disk in &cfg.disks {
522             let disk_device_tube = disk_device_tubes.remove(0);
523             devs.push(create_block_device(cfg, disk, disk_device_tube)?);
524         }
525     } else {
526         info!("Starting up vhost user block backends...");
527         for _disk in &cfg.disks {
528             let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
529             let connection = Connection::<FrontendReq>::from(disk_device_tube);
530             devs.push(create_vhost_user_block_device(cfg, connection)?);
531         }
532     }
533 
534     for (_, param) in cfg
535         .serial_parameters
536         .iter()
537         .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
538     {
539         let dev = create_console_device(cfg, param)?;
540         devs.push(dev);
541     }
542 
543     #[cfg(feature = "audio")]
544     {
545         let snd_split_configs = std::mem::take(&mut cfg.snd_split_configs);
546         for mut snd_split_cfg in snd_split_configs.into_iter() {
547             devs.push(create_virtio_snd_device(
548                 cfg,
549                 &mut snd_split_cfg,
550                 control_tubes,
551             )?);
552             if let Some(vmm_config) = snd_split_cfg.vmm_config {
553                 let initial_audio_session_state = InitialAudioSessionState {
554                     audio_client_guid: vmm_config.audio_client_guid,
555                     card_index: vmm_config.card_index,
556                 };
557                 initial_audio_session_states.push(initial_audio_session_state);
558             }
559         }
560     }
561 
562     #[cfg(feature = "pvclock")]
563     if let Some(tube) = pvclock_device_tube {
564         product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
565     }
566 
567     devs.push(create_rng_device(cfg)?);
568 
569     #[cfg(feature = "slirp")]
570     if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
571         let connection = Connection::<FrontendReq>::from(net_vhost_user_tube);
572         devs.push(create_vhost_user_net_device(cfg, connection)?);
573     }
574 
575     #[cfg(feature = "balloon")]
576     if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
577         (balloon_device_tube, dynamic_mapping_device_tube)
578     {
579         devs.push(create_balloon_device(
580             cfg,
581             balloon_device_tube,
582             dynamic_mapping_device_tube,
583             inflate_tube,
584             init_balloon_size,
585         )?);
586     }
587 
588     devs.push(create_vsock_device(cfg)?);
589 
590     #[cfg(feature = "gpu")]
591     let event_devices = if let Some(InputEventSplitConfig {
592         backend_config,
593         vmm_config,
594     }) = cfg.input_event_split_config.take()
595     {
596         devs.extend(
597             create_virtio_input_event_devices(cfg, vmm_config)
598                 .context("create input event devices")?,
599         );
600         backend_config.map(|cfg| cfg.event_devices)
601     } else {
602         None
603     };
604 
605     #[cfg(feature = "gpu")]
606     if let Some(wndproc_thread_vmm_config) = cfg
607         .window_procedure_thread_split_config
608         .as_mut()
609         .map(|split_cfg| &mut split_cfg.vmm_config)
610     {
611         product::push_window_procedure_thread_control_tubes(
612             control_tubes,
613             wndproc_thread_vmm_config,
614         );
615     }
616 
617     #[cfg(feature = "gpu")]
618     let mut wndproc_thread = cfg
619         .window_procedure_thread_split_config
620         .as_mut()
621         .and_then(|cfg| cfg.wndproc_thread_builder.take())
622         .map(WindowProcedureThreadBuilder::start_thread)
623         .transpose()
624         .context("Failed to start the window procedure thread.")?;
625 
626     #[cfg(feature = "gpu")]
627     if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
628         devs.push(create_virtio_gpu_device(
629             cfg,
630             gpu_vmm_config,
631             event_devices,
632             &mut wndproc_thread,
633             control_tubes,
634         )?);
635     }
636 
637     Ok(devs)
638 }
639 
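/// Creates the VMM-side virtio-input devices (multi-touch, mouse, and keyboard) from the pipes in
/// the input event VMM config.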
640 #[cfg(feature = "gpu")]
641 fn create_virtio_input_event_devices(
642     cfg: &Config,
643     mut input_event_vmm_config: InputEventVmmConfig,
644 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
645     let mut devs = Vec::new();
646 
647     // Iterate over the input event devices and create the VMM end of each.
648     let mut multi_touch_pipes = input_event_vmm_config
649         .multi_touch_pipes
650         .drain(..)
651         .enumerate();
652     for input in &cfg.virtio_input {
653         match input {
654             InputDeviceOption::SingleTouch { .. } => {
655                 unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
656             }
657             InputDeviceOption::MultiTouch {
658                 width,
659                 height,
660                 name,
661                 ..
662             } => {
663                 let Some((idx, pipe)) = multi_touch_pipes.next() else {
664                     break;
665                 };
666                 let mut width = *width;
667                 let mut height = *height;
668                 if idx == 0 {
669                     if width.is_none() {
670                         width = cfg.display_input_width;
671                     }
672                     if height.is_none() {
673                         height = cfg.display_input_height;
674                     }
675                 }
676                 devs.push(create_multi_touch_device(
677                     cfg,
678                     pipe,
679                     width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
680                     height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
681                     name.as_deref(),
682                     idx as u32,
683                 )?);
684             }
685             _ => {}
686         }
687     }
688     drop(multi_touch_pipes);
689 
690     product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
691 
692     for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
693         devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
694     }
695 
696     let keyboard_pipe = input_event_vmm_config
697         .keyboard_pipes
698         .pop()
699         .expect("at least one keyboard should be in GPU VMM config");
700     let dev = virtio::input::new_keyboard(
701         /* idx= */ 0,
702         keyboard_pipe,
703         virtio::base_features(cfg.protection_type),
704     )
705     .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
706 
707     devs.push(VirtioDeviceStub {
708         dev: Box::new(dev),
709         jail: None,
710     });
711 
712     Ok(devs)
713 }
714 
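/// Creates the vhost-user GPU frontend; if a GPU backend config is present, the backend worker is
/// spawned in this process as well.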
715 #[cfg(feature = "gpu")]
716 fn create_virtio_gpu_device(
717     cfg: &mut Config,
718     mut gpu_vmm_config: GpuVmmConfig,
719     event_devices: Option<Vec<EventDevice>>,
720     wndproc_thread: &mut Option<WindowProcedureThread>,
721     #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
722 ) -> DeviceResult<VirtioDeviceStub> {
723     let resource_bridges = Vec::<Tube>::new();
724 
725     product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
726 
727     // If a GPU backend config is present, start the vhost-user worker in the main process.
728     if let Some(backend_config) = cfg.gpu_backend_config.take() {
729         let event_devices = event_devices.ok_or_else(|| {
730             anyhow!("event devices are missing when creating virtio-gpu in the current process.")
731         })?;
732         let wndproc_thread = wndproc_thread
733             .take()
734             .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
735 
736         std::thread::spawn(move || {
737             run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
738         });
739     }
740 
741     // The GPU is always vhost-user, even if running in the main process.
742     let gpu_device_tube = gpu_vmm_config
743         .main_vhost_user_tube
744         .take()
745         .expect("GPU VMM vhost-user tube should be set");
746     let connection = Connection::<FrontendReq>::from(gpu_device_tube);
747 
748     create_vhost_user_gpu_device(virtio::base_features(cfg.protection_type), connection)
749         .context("create vhost-user GPU device")
750 }
751 
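/// Creates the vhost-user SND frontend; if an SND backend config is present, the backend worker is
/// spawned in this process as well.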
752 #[cfg(feature = "audio")]
753 fn create_virtio_snd_device(
754     cfg: &mut Config,
755     snd_split_config: &mut SndSplitConfig,
756     #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
757 ) -> DeviceResult<VirtioDeviceStub> {
758     let snd_vmm_config = snd_split_config
759         .vmm_config
760         .as_mut()
761         .expect("snd_vmm_config must exist");
762     product::push_snd_control_tubes(control_tubes, snd_vmm_config);
763 
764     // If an SND backend config is present, start the vhost-user worker in the main process.
765     if let Some(backend_config) = snd_split_config.backend_config.take() {
766         std::thread::spawn(move || run_snd_device_worker(backend_config));
767     }
768 
769     // The SND is always vhost-user, even if running in the main process.
770     let snd_device_tube = snd_vmm_config
771         .main_vhost_user_tube
772         .take()
773         .expect("Snd VMM vhost-user tube should be set");
774     let connection = Connection::<FrontendReq>::from(snd_device_tube);
775 
776     create_vhost_user_snd_device(virtio::base_features(cfg.protection_type), connection)
777         .context("create vhost-user SND device")
778 }
779 
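/// Wraps each virtio device stub from create_virtio_devices in a VirtioPciDevice, creating the
/// MSI, shared memory, ioevent, and VM control tubes that connect it to the main process.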
780 fn create_devices(
781     cfg: &mut Config,
782     mem: &GuestMemory,
783     exit_evt_wrtube: &SendTube,
784     irq_control_tubes: &mut Vec<Tube>,
785     vm_memory_control_tubes: &mut Vec<Tube>,
786     control_tubes: &mut Vec<TaggedControlTube>,
787     disk_device_tubes: &mut Vec<Tube>,
788     initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
789     balloon_device_tube: Option<Tube>,
790     #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
791     dynamic_mapping_device_tube: Option<Tube>,
792     inflate_tube: Option<Tube>,
793     init_balloon_size: u64,
794     tsc_frequency: u64,
795     virtio_snd_state_device_tube: Option<Tube>,
796     virtio_snd_control_device_tube: Option<Tube>,
797 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
798     let stubs = create_virtio_devices(
799         cfg,
800         exit_evt_wrtube,
801         control_tubes,
802         disk_device_tubes,
803         initial_audio_session_states,
804         balloon_device_tube,
805         #[cfg(feature = "pvclock")]
806         pvclock_device_tube,
807         dynamic_mapping_device_tube,
808         inflate_tube,
809         init_balloon_size,
810         tsc_frequency,
811         virtio_snd_state_device_tube,
812         virtio_snd_control_device_tube,
813     )?;
814 
815     let mut pci_devices = Vec::new();
816 
817     for stub in stubs {
818         let (msi_host_tube, msi_device_tube) =
819             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
820         irq_control_tubes.push(msi_host_tube);
821 
822         let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
823             let (host_tube, device_tube) =
824                 Tube::pair().context("failed to create VVU proxy tube")?;
825             vm_memory_control_tubes.push(host_tube);
826             Some(device_tube)
827         } else {
828             None
829         };
830 
831         let (ioevent_host_tube, ioevent_device_tube) =
832             Tube::pair().context("failed to create ioevent tube")?;
833         vm_memory_control_tubes.push(ioevent_host_tube);
834 
835         let (vm_control_host_tube, vm_control_device_tube) =
836             Tube::pair().context("failed to create vm_control tube")?;
837         control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
838             vm_control_host_tube,
839         )));
840 
841         let dev = Box::new(
842             VirtioPciDevice::new(
843                 mem.clone(),
844                 stub.dev,
845                 msi_device_tube,
846                 cfg.disable_virtio_intx,
847                 shared_memory_tube.map(VmMemoryClient::new),
848                 VmMemoryClient::new(ioevent_device_tube),
849                 vm_control_device_tube,
850             )
851             .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
852         ) as Box<dyn BusDeviceObj>;
853         pci_devices.push((dev, stub.jail));
854     }
855 
856     Ok(pci_devices)
857 }
858 
859 #[derive(Debug)]
860 struct PvClockError(String);
861 
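/// Handles a single readable event from the main run loop's WaitContext. Returns Ok(Some(_)) when
/// the event requires the VM to stop, and Ok(None) otherwise.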
862 fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
863     event: &TriggeredEvent<Token>,
864     vm_control_ids_to_remove: &mut Vec<usize>,
865     next_control_id: &mut usize,
866     service_vm_state: &mut ServiceVmState,
867     disk_host_tubes: &[Tube],
868     ipc_main_loop_tube: Option<&Tube>,
869     #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
870     vm_evt_rdtube: &RecvTube,
871     control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
872     guest_os: &mut RunnableLinuxVm<V, Vcpu>,
873     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
874     virtio_snd_host_mute_tubes: &mut [Tube],
875     proto_main_loop_tube: Option<&ProtoTube>,
876     anti_tamper_main_thread_tube: &Option<ProtoTube>,
877     #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
878     memory_size_mb: u64,
879     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
880     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
881     run_mode_arc: &VcpuRunMode,
882     region_state: &mut VmMemoryRegionState,
883     vm_control_server: Option<&mut ControlServer>,
884     irq_handler_control: &Tube,
885     device_ctrl_tube: &Tube,
886     wait_ctx: &WaitContext<Token>,
887     force_s2idle: bool,
888     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
889     suspended_pvclock_state: &mut Option<hypervisor::ClockState>,
890 ) -> Result<Option<ExitState>> {
891     let mut execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
892         if let VmRequest::Exit = request {
893             return (VmResponse::Ok, Some(VmRunMode::Exiting));
894         }
895         let vcpu_size = vcpu_boxes.lock().len();
896         let resp = request.execute(
897             &guest_os.vm,
898             disk_host_tubes,
899             &mut guest_os.pm,
900             #[cfg(feature = "gpu")]
901             gpu_control_tube,
902             #[cfg(not(feature = "gpu"))]
903             None,
904             None,
905             &mut None,
906             |msg| {
907                 kick_all_vcpus(
908                     run_mode_arc,
909                     vcpu_control_channels,
910                     vcpu_boxes,
911                     guest_os.irq_chip.as_ref(),
912                     #[cfg(feature = "pvclock")]
913                     pvclock_host_tube,
914                     &guest_os.resume_notify_devices,
915                     msg,
916                 );
917             },
918             force_s2idle,
919             #[cfg(feature = "swap")]
920             None,
921             device_ctrl_tube,
922             vcpu_size,
923             irq_handler_control,
924             || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
925             suspended_pvclock_state,
926         );
927         (resp, None)
928     };
929 
930     match event.token {
931         Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
932             Ok(vm_event) => {
933                 let exit_state = match vm_event {
934                     VmEventType::Exit => {
935                         info!("vcpu requested shutdown");
936                         Some(ExitState::Stop)
937                     }
938                     VmEventType::Reset => {
939                         info!("vcpu requested reset");
940                         Some(ExitState::Reset)
941                     }
942                     VmEventType::Crash => {
943                         info!("vcpu crashed");
944                         Some(ExitState::Crash)
945                     }
946                     VmEventType::Panic(_) => {
947                         error!("got pvpanic event. this event is not expected on Windows.");
948                         None
949                     }
950                     VmEventType::WatchdogReset => {
951                         info!("vcpu stall detected");
952                         Some(ExitState::WatchdogReset)
953                     }
954                 };
955                 return Ok(exit_state);
956             }
957             Err(e) => {
958                 warn!("failed to recv VmEvent: {}", e);
959             }
960         },
961         Token::BrokerShutdown => {
962             info!("main loop got broker shutdown event");
963             return Ok(Some(ExitState::Stop));
964         }
965         Token::VmControlServer => {
966             let server =
967                 vm_control_server.expect("control server must exist if this event triggers");
968             let client = server.accept();
969             let id = *next_control_id;
970             *next_control_id += 1;
971             wait_ctx
972                 .add(client.0.get_read_notifier(), Token::VmControl { id })
973                 .exit_context(
974                     Exit::WaitContextAdd,
975                     "failed to add trigger to wait context",
976                 )?;
977             wait_ctx
978                 .add(client.0.get_close_notifier(), Token::VmControl { id })
979                 .exit_context(
980                     Exit::WaitContextAdd,
981                     "failed to add trigger to wait context",
982                 )?;
983             control_tubes.insert(id, TaggedControlTube::Vm(client));
984         }
985         #[allow(clippy::collapsible_match)]
986         Token::VmControl { id } => {
987             if let Some(tube) = control_tubes.get(&id) {
988                 #[allow(clippy::single_match)]
989                 match tube {
990                     TaggedControlTube::Product(product_tube) => {
991                         product::handle_tagged_control_tube_event(
992                             product_tube,
993                             virtio_snd_host_mute_tubes,
994                             service_vm_state,
995                             ipc_main_loop_tube,
996                         )
997                     }
998                     TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
999                         Ok(request) => {
1000                             let mut run_mode_opt = None;
1001                             let response = match request {
1002                                 VmRequest::HotPlugVfioCommand { device, add } => {
1003                                     // Suppress warnings.
1004                                     let _ = (device, add);
1005                                     unimplemented!("not implemented on Windows");
1006                                 }
1007                                 #[cfg(feature = "registered_events")]
1008                                 VmRequest::RegisterListener { socket_addr, event } => {
1009                                     unimplemented!("not implemented on Windows");
1010                                 }
1011                                 #[cfg(feature = "registered_events")]
1012                                 VmRequest::UnregisterListener { socket_addr, event } => {
1013                                     unimplemented!("not implemented on Windows");
1014                                 }
1015                                 #[cfg(feature = "registered_events")]
1016                                 VmRequest::Unregister { socket_addr } => {
1017                                     unimplemented!("not implemented on Windows");
1018                                 }
1019                                 #[cfg(feature = "balloon")]
1020                                 VmRequest::BalloonCommand(cmd) => {
1021                                     if let Some(balloon_tube) = balloon_tube {
1022                                         if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
1023                                         {
1024                                             if key != id {
1025                                                 unimplemented!("not implemented on Windows");
1026                                             }
1027                                             Some(r)
1028                                         } else {
1029                                             None
1030                                         }
1031                                     } else {
1032                                         error!("balloon not enabled");
1033                                         None
1034                                     }
1035                                 }
1036                                 _ => {
1037                                     let (resp, run_mode_ret) =
1038                                         execute_vm_request(request, guest_os);
1039                                     run_mode_opt = run_mode_ret;
1040                                     Some(resp)
1041                                 }
1042                             };
1043 
1044                             if let Some(response) = response {
1045                                 if let Err(e) = tube.0.send(&response) {
1046                                     error!("failed to send VmResponse: {}", e);
1047                                 }
1048                             }
1049                             if let Some(exit_state) =
1050                                 handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1051                             {
1052                                 return Ok(Some(exit_state));
1053                             }
1054                         }
1055                         Err(e) => {
1056                             if let TubeError::Disconnected = e {
1057                                 vm_control_ids_to_remove.push(id);
1058                             } else {
1059                                 error!("failed to recv VmRequest: {}", e);
1060                             }
1061                         }
1062                     },
1063                 }
1064             }
1065         }
1066         #[cfg(feature = "balloon")]
1067         Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
1068             Ok(resp) => {
1069                 for (resp, idx) in resp {
1070                     if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
1071                         if let Err(e) = tube.0.send(&resp) {
1072                             error!("failed to send VmResponse: {}", e);
1073                         }
1074                     } else {
1075                         error!("Bad tube index {}", idx);
1076                     }
1077                 }
1078             }
1079             Err(err) => {
1080                 error!("Error processing balloon tube {:?}", err)
1081             }
1082         },
1083         #[cfg(not(feature = "balloon"))]
1084         Token::BalloonTube => unreachable!("balloon tube not registered"),
1085         #[allow(unreachable_patterns)]
1086         _ => {
1087             let run_mode_opt = product::handle_received_token(
1088                 &event.token,
1089                 anti_tamper_main_thread_tube,
1090                 #[cfg(feature = "balloon")]
1091                 balloon_tube,
1092                 control_tubes,
1093                 guest_os,
1094                 ipc_main_loop_tube,
1095                 memory_size_mb,
1096                 proto_main_loop_tube,
1097                 #[cfg(feature = "pvclock")]
1098                 pvclock_host_tube,
1099                 run_mode_arc,
1100                 service_vm_state,
1101                 vcpu_boxes,
1102                 virtio_snd_host_mute_tubes,
1103                 execute_vm_request,
1104             );
1105             if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1106             {
1107                 return Ok(Some(exit_state));
1108             }
1109         }
1110     };
1111     Ok(None)
1112 }
1113 
1114 /// Handles a run mode change, if one is pending as a result of a VmRequest.
1115 /// The parameter, run_mode_opt, is the run mode change proposed by the
1116 /// VmRequest's execution.
1117 ///
1118 /// Returns the new exit state if the run mode change requires one, or None
1119 /// otherwise.
1120 fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1121     run_mode_opt: &Option<VmRunMode>,
1122     guest_os: &mut RunnableLinuxVm<V, Vcpu>,
1123 ) -> Option<ExitState> {
1124     if let Some(run_mode) = run_mode_opt {
1125         info!("control socket changed run mode to {}", run_mode);
1126         match run_mode {
1127             VmRunMode::Exiting => return Some(ExitState::Stop),
1128             _ => unreachable!(),
1129         }
1130     }
1131     // No exit state change.
1132     None
1133 }
1134 
1135 /// Commands to control the VM Memory handler thread.
1136 #[derive(serde::Serialize, serde::Deserialize)]
1137 pub enum VmMemoryHandlerRequest {
1138     /// No response is sent for this command.
1139     Exit,
1140 }
1141 
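/// Body of the VM memory handler thread: services VmMemoryRequests from the device control tubes
/// against the VM and system allocator until an Exit request arrives or the handler control tube
/// hangs up.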
1142 fn vm_memory_handler_thread(
1143     control_tubes: Vec<Tube>,
1144     mut vm: impl Vm,
1145     sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
1146     mut gralloc: RutabagaGralloc,
1147     handler_control: Tube,
1148 ) -> anyhow::Result<()> {
1149     #[derive(EventToken)]
1150     enum Token {
1151         VmControl { id: usize },
1152         HandlerControl,
1153     }
1154 
1155     let wait_ctx =
1156         WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
1157             .context("failed to build wait context")?;
1158     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1159     for (id, socket) in control_tubes.iter() {
1160         wait_ctx
1161             .add(socket.get_read_notifier(), Token::VmControl { id: *id })
1162             .context("failed to add descriptor to wait context")?;
1163     }
1164 
1165     let mut region_state: VmMemoryRegionState = Default::default();
1166 
1167     'wait: loop {
1168         let events = {
1169             match wait_ctx.wait() {
1170                 Ok(v) => v,
1171                 Err(e) => {
1172                     error!("failed to poll: {}", e);
1173                     break;
1174                 }
1175             }
1176         };
1177 
1178         let mut vm_control_ids_to_remove = Vec::new();
1179         for event in events.iter().filter(|e| e.is_readable) {
1180             match event.token {
1181                 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
1182                     Ok(request) => match request {
1183                         VmMemoryHandlerRequest::Exit => break 'wait,
1184                     },
1185                     Err(e) => {
1186                         if let TubeError::Disconnected = e {
1187                             panic!("vm memory control tube disconnected.");
1188                         } else {
1189                             error!("failed to recv VmMemoryHandlerRequest: {}", e);
1190                         }
1191                     }
1192                 },
1193 
1194                 Token::VmControl { id } => {
1195                     if let Some(tube) = control_tubes.get(&id) {
1196                         match tube.recv::<VmMemoryRequest>() {
1197                             Ok(request) => {
1198                                 let response = request.execute(
1199                                     &mut vm,
1200                                     &mut sys_allocator_mutex.lock(),
1201                                     &mut gralloc,
1202                                     None,
1203                                     &mut region_state,
1204                                 );
1205                                 if let Err(e) = tube.send(&response) {
1206                                     error!("failed to send VmMemoryControlResponse: {}", e);
1207                                 }
1208                             }
1209                             Err(e) => {
1210                                 if let TubeError::Disconnected = e {
1211                                     vm_control_ids_to_remove.push(id);
1212                                 } else {
1213                                     error!("failed to recv VmMemoryControlRequest: {}", e);
1214                                 }
1215                             }
1216                         }
1217                     }
1218                 }
1219             }
1220         }
1221 
1222         remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1223         if events
1224             .iter()
1225             .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
1226         {
1227             error!("vm memory handler control hung up but did not request an exit.");
1228             break 'wait;
1229         }
1230     }
1231     Ok(())
1232 }
1233 
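/// Creates the control server (non prod-build only) if a path was provided and registers it with
/// the wait context so that new clients surface as Token::VmControlServer events.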
1234 fn create_control_server(
1235     control_server_path: Option<PathBuf>,
1236     wait_ctx: &WaitContext<Token>,
1237 ) -> Result<Option<ControlServer>> {
1238     #[cfg(not(feature = "prod-build"))]
1239     {
1240         if let Some(path) = control_server_path {
1241             let server =
1242                 ControlServer::new(path.to_str().expect("control socket path must be a string"))
1243                     .exit_context(
1244                         Exit::FailedToCreateControlServer,
1245                         "failed to create control server",
1246                     )?;
1247             wait_ctx
1248                 .add(server.client_waiting(), Token::VmControlServer)
1249                 .exit_context(
1250                     Exit::WaitContextAdd,
1251                     "failed to add control server to wait context",
1252                 )?;
1253             return Ok(Some(server));
1254         }
1255     }
1256     Ok::<Option<ControlServer>, anyhow::Error>(None)
1257 }
1258 
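/// Top level of the VM run loop: spawns the IRQ wait worker, the VM memory handler thread, and the
/// device worker thread, then waits on VM events and control tubes until an exit state is reached.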
1259 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1260     mut guest_os: RunnableLinuxVm<V, Vcpu>,
1261     sys_allocator: SystemAllocator,
1262     control_tubes: Vec<TaggedControlTube>,
1263     irq_control_tubes: Vec<Tube>,
1264     vm_memory_control_tubes: Vec<Tube>,
1265     vm_evt_rdtube: RecvTube,
1266     vm_evt_wrtube: SendTube,
1267     #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1268     broker_shutdown_evt: Option<Event>,
1269     balloon_host_tube: Option<Tube>,
1270     #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
1271     disk_host_tubes: Vec<Tube>,
1272     initial_audio_session_states: Vec<InitialAudioSessionState>,
1273     gralloc: RutabagaGralloc,
1274     #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1275     service_pipe_name: Option<String>,
1276     memory_size_mb: u64,
1277     host_cpu_topology: bool,
1278     tsc_sync_mitigations: TscSyncMitigations,
1279     force_calibrated_tsc_leaf: bool,
1280     mut product_args: RunControlArgs,
1281     mut virtio_snd_host_mute_tubes: Vec<Tube>,
1282     restore_path: Option<PathBuf>,
1283     control_server_path: Option<PathBuf>,
1284     force_s2idle: bool,
1285     suspended: bool,
1286 ) -> Result<ExitState> {
1287     let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1288         start_service_ipc_listener(service_pipe_name)?;
1289 
1290     let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1291 
1292     let service_audio_states = product::create_service_audio_states_and_send_to_service(
1293         initial_audio_session_states,
1294         &ipc_main_loop_tube,
1295     )?;
1296 
1297     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1298 
1299     let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1300     let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1301         Exit::CreateTube,
1302         "failed to create IRQ handler control Tube",
1303     )?;
1304 
1305     // Create a separate thread to wait on IRQ events. This is a natural division
1306     // because IRQ handling has no dependencies on other events, and this lets
1307     // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1308     let irq_join_handle = IrqWaitWorker::start(
1309         irq_handler_control_for_worker,
1310         guest_os
1311             .irq_chip
1312             .try_box_clone()
1313             .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1314         irq_control_tubes,
1315         sys_allocator_mutex.clone(),
1316     );
1317 
1318     let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1319     product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1320     let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1321         Exit::WaitContextAdd,
1322         "failed to add trigger to wait context",
1323     )?;
1324 
1325     #[cfg(feature = "balloon")]
1326     let mut balloon_tube = balloon_host_tube
1327         .map(|tube| -> Result<BalloonTube> {
1328             wait_ctx
1329                 .add(tube.get_read_notifier(), Token::BalloonTube)
1330                 .context("failed to add trigger to wait context")?;
1331             Ok(BalloonTube::new(tube))
1332         })
1333         .transpose()
1334         .context("failed to create balloon tube")?;
1335 
1336     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1337     let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1338         .name("vm_memory_handler_thread".into())
1339         .spawn({
1340             let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1341             let sys_allocator_mutex = sys_allocator_mutex.clone();
1342             move || {
1343                 vm_memory_handler_thread(
1344                     vm_memory_control_tubes,
1345                     vm,
1346                     sys_allocator_mutex,
1347                     gralloc,
1348                     vm_memory_handler_control_for_thread,
1349                 )
1350             }
1351         })
1352         .unwrap();
1353 
1354     if let Some(evt) = broker_shutdown_evt.as_ref() {
1355         wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1356             Exit::WaitContextAdd,
1357             "failed to add trigger to wait context",
1358         )?;
1359     }
1360 
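    // Control tubes are keyed by an integer id so that individual tubes can be added to and
    // removed from the wait context at runtime; next_control_id tracks the next unused id.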
1361     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1362     let mut next_control_id = control_tubes.len();
1363     for (id, control_tube) in control_tubes.iter() {
1364         #[allow(clippy::single_match)]
1365         match control_tube {
1366             TaggedControlTube::Product(product_tube) => wait_ctx
1367                 .add(
1368                     product_tube.get_read_notifier(),
1369                     Token::VmControl { id: *id },
1370                 )
1371                 .exit_context(
1372                     Exit::WaitContextAdd,
1373                     "failed to add trigger to wait context",
1374                 )?,
1375             _ => (),
1376         }
1377     }
1378 
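    // The device control tube is used below to send commands (e.g. sleep and exit) to the
    // devices worker thread.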
1379     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1380     guest_os.devices_thread = match create_devices_worker_thread(
1381         guest_os.vm.get_memory().clone(),
1382         guest_os.io_bus.clone(),
1383         guest_os.mmio_bus.clone(),
1384         device_ctrl_resp,
1385     ) {
1386         Ok(join_handle) => Some(join_handle),
1387         Err(e) => {
1388             return Err(anyhow!("Failed to start devices thread: {}", e));
1389         }
1390     };
1391 
1392     let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1393         Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
1394         None => iter::repeat_with(|| None)
1395             .take(guest_os.vcpu_count)
1396             .collect(),
1397     };
1398 
1399     let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1400 
1401     let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1402 
1403     let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1404 
1405     let original_terminal_mode = stdin().set_raw_mode().ok();
1406 
1407     let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1408     let run_mode_arc = Arc::new(VcpuRunMode::default());
1409 
1410     let run_mode_state = if suspended {
1411         // Sleep devices before creating vcpus.
1412         device_ctrl_tube
1413             .send(&DeviceControlCommand::SleepDevices)
1414             .context("send command to devices control socket")?;
1415         match device_ctrl_tube
1416             .recv()
1417             .context("receive from devices control socket")?
1418         {
1419             VmResponse::Ok => (),
1420             resp => bail!("device sleep failed: {}", resp),
1421         }
1422         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1423         VmRunMode::Suspending
1424     } else {
1425         VmRunMode::Running
1426     };
1427 
1428     // If we are restoring from a snapshot, then start suspended.
1429     if restore_path.is_some() {
1430         run_mode_arc.set_and_notify(VmRunMode::Suspending);
1431     }
1432 
1433     let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1434         vcpus,
1435         vcpu_boxes.clone(),
1436         &guest_os,
1437         &exit_evt,
1438         &vm_evt_wrtube,
1439         #[cfg(feature = "stats")]
1440         &stats,
1441         host_cpu_topology,
1442         run_mode_arc.clone(),
1443         tsc_sync_mitigations,
1444         force_calibrated_tsc_leaf,
1445     )?;
1446 
1447     // See comment on `VmRequest::execute`.
1448     let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
1449 
1450     // Restore VM (if applicable).
1451     if let Some(path) = restore_path {
1452         vm_control::do_restore(
1453             &path,
1454             |msg| {
1455                 kick_all_vcpus(
1456                     run_mode_arc.as_ref(),
1457                     &vcpu_control_channels,
1458                     vcpu_boxes.as_ref(),
1459                     guest_os.irq_chip.as_ref(),
1460                     #[cfg(feature = "pvclock")]
1461                     &pvclock_host_tube,
1462                     &guest_os.resume_notify_devices,
1463                     msg,
1464                 )
1465             },
1466             |msg, index| {
1467                 kick_vcpu(
1468                     run_mode_arc.as_ref(),
1469                     &vcpu_control_channels,
1470                     vcpu_boxes.as_ref(),
1471                     guest_os.irq_chip.as_ref(),
1472                     index,
1473                     msg,
1474                 )
1475             },
1476             &irq_handler_control,
1477             &device_ctrl_tube,
1478             guest_os.vcpu_count,
1479             |image| {
1480                 guest_os
1481                     .irq_chip
1482                     .try_box_clone()?
1483                     .restore(image, guest_os.vcpu_count)
1484             },
1485             /* require_encrypted= */ false,
1486             &mut suspended_pvclock_state,
1487         )?;
1488         // Allow the vCPUs to start for real.
1489         kick_all_vcpus(
1490             run_mode_arc.as_ref(),
1491             &vcpu_control_channels,
1492             vcpu_boxes.as_ref(),
1493             guest_os.irq_chip.as_ref(),
1494             #[cfg(feature = "pvclock")]
1495             &pvclock_host_tube,
1496             &guest_os.resume_notify_devices,
1497             // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1498             // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1499             // to enter that mode here rather than VmRunMode::Running.
1500             VcpuControl::RunState(run_mode_state),
1501         );
1502     }
1503 
1504     let mut exit_state = ExitState::Stop;
1505     let mut region_state: VmMemoryRegionState = Default::default();
1506 
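    // Main event loop: wait on the registered triggers and dispatch readable events until a
    // handler produces an exit state.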
1507     'poll: loop {
1508         let events = {
1509             match wait_ctx.wait() {
1510                 Ok(v) => v,
1511                 Err(e) => {
1512                     error!("failed to wait: {}", e);
1513                     break;
1514                 }
1515             }
1516         };
1517 
1518         let mut vm_control_ids_to_remove = Vec::new();
1519         for event in events.iter().filter(|e| e.is_readable) {
1520             let state = handle_readable_event(
1521                 event,
1522                 &mut vm_control_ids_to_remove,
1523                 &mut next_control_id,
1524                 &mut service_vm_state,
1525                 disk_host_tubes.as_slice(),
1526                 ipc_main_loop_tube.as_ref(),
1527                 #[cfg(feature = "gpu")]
1528                 gpu_control_tube.as_ref(),
1529                 &vm_evt_rdtube,
1530                 &mut control_tubes,
1531                 &mut guest_os,
1532                 &sys_allocator_mutex,
1533                 &mut virtio_snd_host_mute_tubes,
1534                 proto_main_loop_tube.as_ref(),
1535                 &anti_tamper_main_thread_tube,
1536                 #[cfg(feature = "balloon")]
1537                 balloon_tube.as_mut(),
1538                 memory_size_mb,
1539                 vcpu_boxes.as_ref(),
1540                 #[cfg(feature = "pvclock")]
1541                 &pvclock_host_tube,
1542                 run_mode_arc.as_ref(),
1543                 &mut region_state,
1544                 vm_control_server.as_mut(),
1545                 &irq_handler_control,
1546                 &device_ctrl_tube,
1547                 &wait_ctx,
1548                 force_s2idle,
1549                 &vcpu_control_channels,
1550                 &mut suspended_pvclock_state,
1551             )?;
1552             if let Some(state) = state {
1553                 exit_state = state;
1554                 break 'poll;
1555             }
1556         }
1557 
1558         remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1559     }
1560 
1561     info!("run_control poll loop completed, forcing vCPUs to exit...");
1562 
1563     // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
1564     run_mode_arc.set_and_notify(VmRunMode::Exiting);
1565 
1566     // Force all vcpus to exit from the hypervisor
1567     for vcpu in vcpu_boxes.lock().iter() {
1568         vcpu.set_immediate_exit(true);
1569     }
1570 
1571     let mut res = Ok(exit_state);
1572     guest_os.irq_chip.kick_halted_vcpus();
1573     let _ = exit_evt.signal();
1574 
1575     if guest_os.devices_thread.is_some() {
1576         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1577             error!("failed to stop device control loop: {}", e);
1578         };
1579         if let Some(thread) = guest_os.devices_thread.take() {
1580             if let Err(e) = thread.join() {
1581                 error!("failed to exit devices thread: {:?}", e);
1582             }
1583         }
1584     }
1585 
1586     // Shut down the VM memory handler thread.
1587     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1588         error!(
1589             "failed to request exit from VM memory handler thread: {}",
1590             e
1591         );
1592     }
1593     if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1594         error!("failed to exit VM Memory handler thread: {:?}", e);
1595     }
1596 
1597     // Shut down the IRQ handler thread.
1598     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1599         error!("failed to request exit from IRQ handler thread: {}", e);
1600     }
1601 
1602     // Make sure any child threads have ended by sending the Exit vm event (possibly again) so that
1603     // their run loops are aborted.
1604     let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1605     for (i, thread) in vcpu_threads.into_iter().enumerate() {
1606         // Wait until all the threads exit, so that the guest_os.vm Arc reference count drops to 1.
1607         // Otherwise, we would hit a memory leak if we force-killed the thread with terminate.
1608         match thread.join() {
1609             Ok(Err(e)) => {
1610                 error!("vcpu thread {} exited with an error: {}", i, e);
1611                 res = Err(e);
1612             }
1613             Ok(_) => {}
1614             Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1615         }
1616     }
1617 
1618     info!("vCPU threads have exited.");
1619 
1620     if let Some(ime) = ime_thread {
1621         match ime.join() {
1622             Ok(Err(e)) => {
1623                 error!("ime thread exited with an error: {}", e);
1624                 if res.is_ok() {
1625                     // Prioritize earlier errors: return this error only if no earlier error
1626                     // was recorded; otherwise just log it.
1627                     res = Err(e)
1628                 }
1629             }
1630             Ok(_) => {}
1631             Err(e) => error!("ime thread panicked: {:?}", e),
1632         }
1633     }
1634     info!("IME thread has exited.");
1635 
1636     // This cancels all the outstanding and any future blocking operations.
1637     // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Given that we are using a
1638     // global executor, for a cleaner shutdown we have to call disarm so that all incoming requests
1639     // are run and then cancelled. If we called shutdown instead, all blocking threads would go away,
1640     // incoming operations would not be scheduled and would be dropped, leading to a panic. The ideal
1641     // place to call shutdown is when we drop a non-global executor.
1642     cros_async::unblock_disarm();
1643     info!("blocking async pool has shut down.");
1644 
1645     let _ = irq_join_handle.join();
1646     info!("IrqWaitWorker has shut down.");
1647 
1648     #[cfg(feature = "stats")]
1649     if let Some(stats) = stats {
1650         println!("Statistics Collected:\n{}", stats.lock());
1651         println!("Statistics JSON:\n{}", stats.lock().json());
1652     }
1653 
1654     if let Some(mode) = original_terminal_mode {
1655         if let Err(e) = stdin().restore_mode(mode) {
1656             warn!("failed to restore terminal mode: {}", e);
1657         }
1658     }
1659 
1660     // Explicitly drop the VM structure here to allow the devices to clean up before the
1661     // control tubes are closed when this function exits.
1662     mem::drop(guest_os);
1663 
1664     info!("guest_os dropped, run_control is done.");
1665 
1666     res
1667 }
1668 
1669 /// Remove Tubes that have been closed from the WaitContext.
1670 fn remove_closed_tubes<T, U>(
1671     wait_ctx: &WaitContext<T>,
1672     tubes: &mut BTreeMap<usize, U>,
1673     mut tube_ids_to_remove: Vec<usize>,
1674 ) -> anyhow::Result<()>
1675 where
1676     T: EventToken,
1677     U: ReadNotifier + CloseNotifier,
1678 {
1679     tube_ids_to_remove.dedup();
1680     for id in tube_ids_to_remove {
1681         if let Some(socket) = tubes.remove(&id) {
1682             wait_ctx
1683                 .delete(socket.get_read_notifier())
1684                 .context("failed to remove descriptor from wait context")?;
1685 
1686             // There may be a close notifier registered for this Tube. If there isn't one
1687             // registered, we just ignore the error.
1688             let _ = wait_ctx.delete(socket.get_close_notifier());
1689         }
1690     }
1691     Ok(())
1692 }
1693 
1694 /// Sends a message to all VCPUs.
1695 fn kick_all_vcpus(
1696     run_mode: &VcpuRunMode,
1697     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1698     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1699     irq_chip: &dyn IrqChipArch,
1700     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1701     resume_notify_devices: &[Arc<Mutex<dyn BusResumeDevice>>],
1702     msg: VcpuControl,
1703 ) {
1704     // On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
1705     // like unix does.
1706     match &msg {
1707         VcpuControl::RunState(VmRunMode::Suspending) => {
1708             suspend_all_vcpus(
1709                 run_mode,
1710                 vcpu_boxes,
1711                 irq_chip,
1712                 #[cfg(feature = "pvclock")]
1713                 pvclock_host_tube,
1714             );
1715             return;
1716         }
1717         VcpuControl::RunState(VmRunMode::Running) => {
1718             for device in resume_notify_devices {
1719                 device.lock().resume_imminent();
1720             }
1721             resume_all_vcpus(
1722                 run_mode,
1723                 vcpu_boxes,
1724                 irq_chip,
1725                 #[cfg(feature = "pvclock")]
1726                 pvclock_host_tube,
1727             );
1728             return;
1729         }
1730         _ => (),
1731     }
1732 
1733     // For non-RunState commands, we dispatch just as unix would.
1734     for vcpu in vcpu_control_channels {
1735         if let Err(e) = vcpu.send(msg.clone()) {
1736             error!("failed to send VcpuControl message: {}", e);
1737         }
1738     }
1739 
1740     // Now that we've sent a message, we need VCPUs to exit so they can process it.
1741     for vcpu in vcpu_boxes.lock().iter() {
1742         vcpu.set_immediate_exit(true);
1743     }
1744     irq_chip.kick_halted_vcpus();
1745 
1746     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1747     // the control message.
1748     let current_run_mode = run_mode.get_mode();
1749     if current_run_mode != VmRunMode::Running {
1750         run_mode.set_and_notify(current_run_mode);
1751     }
1752 }
1753 
1754 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1755 /// VCPU.
1756 fn kick_vcpu(
1757     run_mode: &VcpuRunMode,
1758     vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1759     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1760     irq_chip: &dyn IrqChipArch,
1761     index: usize,
1762     msg: VcpuControl,
1763 ) {
1764     assert!(
1765         !matches!(msg, VcpuControl::RunState(_)),
1766         "Windows does not support RunState changes on a per VCPU basis"
1767     );
1768 
1769     let vcpu = vcpu_control_channels
1770         .get(index)
1771         .expect("invalid vcpu index specified");
1772     if let Err(e) = vcpu.send(msg) {
1773         error!("failed to send VcpuControl message: {}", e);
1774     }
1775 
1776     // Now that we've sent a message, we need the VCPU to exit so it can
1777     // process the message.
1778     vcpu_boxes
1779         .lock()
1780         .get(index)
1781         .expect("invalid vcpu index specified")
1782         .set_immediate_exit(true);
1783     irq_chip.kick_halted_vcpus();
1784 
1785     // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1786     // the control message. (Technically this wakes all VCPUs, but those without messages will go
1787     // back to sleep.)
1788     let current_run_mode = run_mode.get_mode();
1789     if current_run_mode != VmRunMode::Running {
1790         run_mode.set_and_notify(current_run_mode);
1791     }
1792 }
1793 
1794 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1795 /// though devices on the host will continue to run.
1796 pub(crate) fn suspend_all_vcpus(
1797     run_mode: &VcpuRunMode,
1798     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1799     irq_chip: &dyn IrqChipArch,
1800     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1801 ) {
1802     // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1803     // they may re-enter the VM.
1804     run_mode.set_and_notify(VmRunMode::Suspending);
1805 
1806     // Force all vcpus to exit from the hypervisor
1807     for vcpu in vcpu_boxes.lock().iter() {
1808         vcpu.set_immediate_exit(true);
1809     }
1810     irq_chip.kick_halted_vcpus();
1811 
1812     #[cfg(feature = "pvclock")]
1813     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1814         .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1815 }
1816 
1817 /// Resumes all VCPUs.
1818 pub(crate) fn resume_all_vcpus(
1819     run_mode: &VcpuRunMode,
1820     vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1821     irq_chip: &dyn IrqChipArch,
1822     #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1823 ) {
1824     #[cfg(feature = "pvclock")]
1825     handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1826         .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1827 
1828     // Make sure any immediate exit bits are disabled
1829     for vcpu in vcpu_boxes.lock().iter() {
1830         vcpu.set_immediate_exit(false);
1831     }
1832 
1833     run_mode.set_and_notify(VmRunMode::Running);
1834 }
1835 
1836 #[cfg(feature = "gvm")]
1837 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1838     major: 1,
1839     minor: 4,
1840     patch: 1,
1841 };
1842 
1843 #[cfg(feature = "gvm")]
1844 fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1845     match gvm.get_full_version() {
1846         Ok(version) => {
1847             if version < GVM_MINIMUM_VERSION {
1848                 error!(
1849                     "GVM version {} is below minimum version {}",
1850                     version, GVM_MINIMUM_VERSION
1851                 );
1852                 return Err(base::Error::new(libc::ENXIO).into());
1853             } else {
1854                 info!("Using GVM version {}.", version)
1855             }
1856         }
1857         Err(e) => {
1858             error!("unable to determine gvm version: {}", e);
1859             return Err(base::Error::new(libc::ENXIO).into());
1860         }
1861     }
1862     let vm = GvmVm::new(&gvm, mem)?;
1863     Ok(vm)
1864 }
1865 
1866 #[cfg(feature = "haxm")]
1867 fn create_haxm_vm(
1868     haxm: Haxm,
1869     mem: GuestMemory,
1870     kernel_log_file: &Option<String>,
1871 ) -> Result<HaxmVm> {
1872     let vm = HaxmVm::new(&haxm, mem)?;
1873     if let Some(path) = kernel_log_file {
1874         use hypervisor::haxm::HAX_CAP_VM_LOG;
1875         if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1876             match vm.register_log_file(path) {
1877                 Ok(_) => {}
1878                 Err(e) => match e.errno() {
1879                     libc::E2BIG => {
1880                         error!(
1881                             "kernel_log_file path is too long, kernel log file will not be written"
1882                         );
1883                     }
1884                     _ => return Err(e.into()),
1885                 },
1886             }
1887         } else {
1888             warn!(
1889                 "kernel_log_file specified but this version of HAXM does not support kernel log \
1890                   files"
1891             );
1892         }
1893     }
1894     Ok(vm)
1895 }
1896 
1897 #[cfg(feature = "whpx")]
1898 #[cfg(target_arch = "x86_64")]
1899 fn create_whpx_vm(
1900     whpx: Whpx,
1901     mem: GuestMemory,
1902     cpu_count: usize,
1903     no_smt: bool,
1904     apic_emulation: bool,
1905     force_calibrated_tsc_leaf: bool,
1906     vm_evt_wrtube: SendTube,
1907 ) -> Result<WhpxVm> {
1908     let cpu_config = hypervisor::CpuConfigX86_64::new(
1909         force_calibrated_tsc_leaf,
1910         false, /* host_cpu_topology */
1911         false, /* enable_hwp */
1912         no_smt,
1913         false, /* itmt */
1914         None,  /* hybrid_type */
1915     );
1916 
1917     // context for non-cpu-specific cpuid results
1918     let ctx = CpuIdContext::new(
1919         0,
1920         cpu_count,
1921         None,
1922         cpu_config,
1923         whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1924         __cpuid_count,
1925         __cpuid,
1926     );
1927 
1928     // Get all cpuid entries that we should pre-set
1929     let mut cpuid = whpx.get_supported_cpuid()?;
1930 
1931     // Adjust them for crosvm
1932     for entry in cpuid.cpu_id_entries.iter_mut() {
1933         adjust_cpuid(entry, &ctx);
1934     }
1935 
1936     let vm = WhpxVm::new(
1937         &whpx,
1938         cpu_count,
1939         mem,
1940         cpuid,
1941         apic_emulation,
1942         Some(vm_evt_wrtube),
1943     )
1944     .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1945 
1946     Ok(vm)
1947 }
1948 
1949 #[cfg(feature = "gvm")]
1950 fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1951     info!("Creating GVM irqchip");
1952     let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1953     Ok(irq_chip)
1954 }
1955 
1956 #[cfg(feature = "whpx")]
1957 #[cfg(target_arch = "x86_64")]
1958 fn create_whpx_split_irq_chip(
1959     vm: &WhpxVm,
1960     ioapic_device_tube: Tube,
1961 ) -> base::Result<WhpxSplitIrqChip> {
1962     info!("Creating WHPX split irqchip");
1963     WhpxSplitIrqChip::new(
1964         vm.try_clone()?,
1965         ioapic_device_tube,
1966         None, // ioapic_pins
1967     )
1968 }
1969 
1970 fn create_userspace_irq_chip<Vcpu>(
1971     vcpu_count: usize,
1972     ioapic_device_tube: Tube,
1973 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1974 where
1975     Vcpu: VcpuArch + 'static,
1976 {
1977     info!("Creating userspace irqchip");
1978     let irq_chip =
1979         UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1980     Ok(irq_chip)
1981 }
1982 
1983 pub fn get_default_hypervisor() -> Option<HypervisorKind> {
1984     // The ordering here matters: hypervisors are listed from most preferable to least.
1985     #[cfg(feature = "whpx")]
1986     match hypervisor::whpx::Whpx::is_enabled() {
1987         true => return Some(HypervisorKind::Whpx),
1988         false => warn!("Whpx not enabled."),
1989     };
1990 
1991     #[cfg(feature = "haxm")]
1992     match Haxm::new() {
1993         Ok(_) => return Some(HypervisorKind::Ghaxm),
1994         Err(e) => warn!("Cannot initialize HAXM: {}", e),
1995     };
1996 
1997     #[cfg(feature = "gvm")]
1998     // Make sure Gvm device can be opened before selecting it.
1999     match Gvm::new() {
2000         Ok(_) => return Some(HypervisorKind::Gvm),
2001         Err(e) => warn!("Cannot initialize GVM: {}", e),
2002     };
2003 
2004     None
2005 }
2006 
2007 fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2008     let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2009         Some(
2010             File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
2011                 format!("failed to open initrd {}", initrd_path.display())
2012             })?,
2013         )
2014     } else {
2015         None
2016     };
2017 
2018     let vm_image = match cfg.executable_path {
2019         Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2020             File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
2021                 format!("failed to open kernel image {}", kernel_path.display(),)
2022             })?,
2023         ),
2024         Some(Executable::Bios(ref bios_path)) => {
2025             VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
2026                 format!("failed to open bios {}", bios_path.display())
2027             })?)
2028         }
2029         _ => panic!("Did not receive a bios or kernel, should be impossible."),
2030     };
2031 
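    // swiotlb is specified on the command line in MiB. When it is not specified, protected VMs
    // default to 64 MiB of swiotlb and unprotected VMs get none.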
2032     let swiotlb = if let Some(size) = cfg.swiotlb {
2033         Some(
2034             size.checked_mul(1024 * 1024)
2035                 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2036         )
2037     } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2038         None
2039     } else {
2040         Some(64 * 1024 * 1024)
2041     };
2042 
2043     let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2044     {
2045         (
2046             Some(
2047                 open_file_or_duplicate(
2048                     &pflash_parameters.path,
2049                     OpenOptions::new().read(true).write(true),
2050                 )
2051                 .with_context(|| {
2052                     format!("failed to open pflash {}", pflash_parameters.path.display())
2053                 })?,
2054             ),
2055             pflash_parameters.block_size,
2056         )
2057     } else {
2058         (None, 0)
2059     };
2060 
2061     Ok(VmComponents {
2062         memory_size: cfg
2063             .memory
2064             .unwrap_or(256)
2065             .checked_mul(1024 * 1024)
2066             .ok_or_else(|| anyhow!("requested memory size too large"))?,
2067         swiotlb,
2068         vcpu_count: cfg.vcpu_count.unwrap_or(1),
2069         fw_cfg_enable: false,
2070         bootorder_fw_cfg_blob: Vec::new(),
2071         vcpu_affinity: cfg.vcpu_affinity.clone(),
2072         cpu_clusters: cfg.cpu_clusters.clone(),
2073         cpu_capacity: cfg.cpu_capacity.clone(),
2074         no_smt: cfg.no_smt,
2075         hugepages: cfg.hugepages,
2076         hv_cfg: hypervisor::Config {
2077             protection_type: cfg.protection_type,
2078         },
2079         vm_image,
2080         android_fstab: cfg
2081             .android_fstab
2082             .as_ref()
2083             .map(|x| {
2084                 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2085                     format!("failed to open android fstab file {}", x.display())
2086                 })
2087             })
2088             .map_or(Ok(None), |v| v.map(Some))?,
2089         pstore: cfg.pstore.clone(),
2090         pflash_block_size,
2091         pflash_image,
2092         initrd_image,
2093         extra_kernel_params: cfg.params.clone(),
2094         acpi_sdts: cfg
2095             .acpi_tables
2096             .iter()
2097             .map(|path| {
2098                 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2099                     format!("failed to open ACPI file {}", path.display())
2100                 })
2101             })
2102             .collect::<Result<Vec<SDT>>>()?,
2103         rt_cpus: cfg.rt_cpus.clone(),
2104         delay_rt: cfg.delay_rt,
2105         no_i8042: cfg.no_i8042,
2106         no_rtc: cfg.no_rtc,
2107         host_cpu_topology: cfg.host_cpu_topology,
2108         #[cfg(target_arch = "x86_64")]
2109         force_s2idle: cfg.force_s2idle,
2110         fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2111         itmt: false,
2112         pvm_fw: None,
2113         pci_config: cfg.pci_config,
2114         #[cfg(target_arch = "x86_64")]
2115         smbios: cfg.smbios.clone(),
2116         dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2117         #[cfg(target_arch = "x86_64")]
2118         break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2119         boot_cpu: cfg.boot_cpu,
2120     })
2121 }
2122 
2123 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
2124 enum WindowsIrqChip<V: VcpuArch> {
2125     Userspace(UserspaceIrqChip<V>),
2126     #[cfg(feature = "gvm")]
2127     Gvm(GvmIrqChip),
2128     #[cfg(feature = "whpx")]
2129     WhpxSplit(WhpxSplitIrqChip),
2130 }
2131 
2132 impl<V: VcpuArch> WindowsIrqChip<V> {
2133     // Convert our enum to a &mut dyn IrqChipArch
2134     fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2135         match self {
2136             WindowsIrqChip::Userspace(i) => i,
2137             #[cfg(feature = "gvm")]
2138             WindowsIrqChip::Gvm(i) => i,
2139             #[cfg(feature = "whpx")]
2140             WindowsIrqChip::WhpxSplit(i) => i,
2141         }
2142     }
2143 }
2144 
2145 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2146 /// need access to it when tracing is enabled.
2147 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2148 
2149 /// Save the TSC offset for a particular vcpu.
2150 ///
2151 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2152 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2153 /// it can cause clock issues in the guest.
2154 pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2155     let offsets_copy = {
2156         let mut offsets = TSC_OFFSETS.lock();
2157         // make sure offsets vec is large enough before inserting
2158         let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2159         offsets.resize(newlen, None);
2160         offsets[vcpu_id] = Some(offset);
2161 
2162         offsets.clone()
2163     };
2164 
2165     // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2166     info!(
2167         "TSC offset standard deviation is: {}",
2168         standard_deviation(
2169             &offsets_copy
2170                 .iter()
2171                 .filter(|x| x.is_some())
2172                 .map(|x| x.unwrap() as u128)
2173                 .collect::<Vec<u128>>()
2174         )
2175     );
2176 }
2177 
2178 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2179 #[cfg(feature = "perfetto")]
2180 pub fn get_vcpu_tsc_offset() -> u64 {
2181     if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2182         return *offset;
2183     }
2184     0
2185 }
2186 
2187 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2188 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2189 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2190 /// host TSC. Redundant snapshots should not be a problem for perfetto.
2191 #[cfg(feature = "perfetto")]
2192 fn set_tsc_clock_snapshot() {
2193     let freq = match devices::tsc::tsc_frequency() {
2194         Err(e) => {
2195             error!(
2196                 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2197                 e
2198             );
2199             return;
2200         }
2201         Ok(freq) => freq,
2202     };
2203 
2204     // The offset is the value added to the host TSC to produce the guest TSC.
2205     let offset = get_vcpu_tsc_offset();
2206     // Safe because _rdtsc takes no arguments.
2207     let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2208     perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2209         // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2210         // support floating point multipliers yet. So for now we set the freq in Hz and rely
2211         // on the merge tool to fix it.
2212         perfetto::Clock::new(
2213             perfetto::BuiltinClock::Tsc as u32,
2214             host_tsc.wrapping_add(offset),
2215         )
2216         .set_multiplier(freq as u64),
2217         perfetto::Clock::new(
2218             // The host builtin clock ids are all offset from the guest ids by
2219             // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2220             // contains both a guest and host clock, we need to offset it before merge.
2221             perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2222             host_tsc,
2223         )
2224         .set_multiplier(freq as u64),
2225     ));
2226 }
2227 
2228 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
2229 pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2230     let tube_transporter =
2231         // SAFETY:
2232         // Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
2233         // the blocking & framing modes are accurate because we create them ourselves in the broker.
2234         unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2235 
2236     let mut tube_data_list = tube_transporter
2237         .read_tubes()
2238         .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2239 
2240     let bootstrap_tube = tube_data_list
2241         .get_tube(TubeToken::Bootstrap)
2242         .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2243 
2244     let mut cfg: Config = bootstrap_tube
2245         .recv::<Config>()
2246         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2247 
2248     let startup_args: CommonChildStartupArgs = bootstrap_tube
2249         .recv::<CommonChildStartupArgs>()
2250         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2251     let _child_cleanup = common_child_setup(startup_args).exit_context(
2252         Exit::CommonChildSetupError,
2253         "failed to perform common child setup",
2254     )?;
2255 
2256     cfg.broker_shutdown_event = Some(
2257         bootstrap_tube
2258             .recv::<Event>()
2259             .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2260     );
2261     #[cfg(feature = "crash-report")]
2262     let crash_tube_map = bootstrap_tube
2263         .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2264         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2265     #[cfg(feature = "crash-report")]
2266     crash_report::set_crash_tube_map(crash_tube_map);
2267 
2268     let BrokerTubes {
2269         vm_evt_wrtube,
2270         vm_evt_rdtube,
2271     } = bootstrap_tube
2272         .recv::<BrokerTubes>()
2273         .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2274 
2275     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2276 }
2277 
2278 pub fn run_config(cfg: Config) -> Result<ExitState> {
2279     let _raise_timer_resolution = enable_high_res_timers()
2280         .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2281 
2282     // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2283     let (vm_evt_wrtube, vm_evt_rdtube) =
2284         Tube::directional_pair().context("failed to create vm event tube")?;
2285 
2286     run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2287 }
2288 
2289 fn create_guest_memory(
2290     components: &VmComponents,
2291     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2292     hypervisor: &impl Hypervisor,
2293 ) -> Result<GuestMemory> {
2294     let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
2295         .exit_context(
2296             Exit::GuestMemoryLayout,
2297             "failed to create guest memory layout",
2298         )?;
2299     GuestMemory::new_with_options(&guest_mem_layout)
2300         .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2301 }
2302 
2303 fn run_config_inner(
2304     cfg: Config,
2305     vm_evt_wrtube: SendTube,
2306     vm_evt_rdtube: RecvTube,
2307 ) -> Result<ExitState> {
2308     product::setup_common_metric_invariants(&cfg);
2309 
2310     #[cfg(feature = "perfetto")]
2311     cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2312 
2313     let components: VmComponents = setup_vm_components(&cfg)?;
2314     let arch_memory_layout = Arch::arch_memory_layout(&components)?;
2315 
2316     #[allow(unused_mut)]
2317     let mut hypervisor = cfg
2318         .hypervisor
2319         .or_else(get_default_hypervisor)
2320         .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2321 
2322     #[cfg(feature = "whpx")]
2323     if hypervisor::whpx::Whpx::is_enabled() {
2324         // If WHPX is enabled, no other hypervisor can be used, so just override it
2325         hypervisor = HypervisorKind::Whpx;
2326     }
2327 
2328     match hypervisor {
2329         #[cfg(feature = "haxm")]
2330         HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2331             if hypervisor == HypervisorKind::Haxm {
2332                 set_use_ghaxm(false);
2333             }
2334             info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2335             let haxm = Haxm::new()?;
2336             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &haxm)?;
2337             let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2338             let (ioapic_host_tube, ioapic_device_tube) =
2339                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2340             let irq_chip =
2341                 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2342             run_vm::<HaxmVcpu, HaxmVm>(
2343                 cfg,
2344                 components,
2345                 &arch_memory_layout,
2346                 vm,
2347                 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2348                 Some(ioapic_host_tube),
2349                 vm_evt_wrtube,
2350                 vm_evt_rdtube,
2351             )
2352         }
2353         #[cfg(feature = "whpx")]
2354         HypervisorKind::Whpx => {
2355             let apic_emulation_supported =
2356                 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2357                     .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2358 
2359             let no_smt = cfg.no_smt;
2360 
2361             // Default to WhpxSplitIrqChip if it's supported because it's more performant
2362             let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2363                 IrqChipKind::Split
2364             } else {
2365                 IrqChipKind::Userspace
2366             });
2367 
2368             // Both WHPX irq chips use a userspace IOAPIC
2369             let (ioapic_host_tube, ioapic_device_tube) =
2370                 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2371 
2372             info!("Creating Whpx");
2373             let whpx = Whpx::new()?;
2374             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &whpx)?;
2375             let vm = create_whpx_vm(
2376                 whpx,
2377                 guest_mem,
2378                 components.vcpu_count,
2379                 no_smt,
2380                 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2381                 cfg.force_calibrated_tsc_leaf,
2382                 vm_evt_wrtube
2383                     .try_clone()
2384                     .expect("could not clone vm_evt_wrtube"),
2385             )?;
2386 
2387             let mut irq_chip = match irq_chip {
2388                 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2389                 IrqChipKind::Split => {
2390                     if !apic_emulation_supported {
2391                         panic!(
2392                             "split irqchip specified but your WHPX version does not support \
2393                                local apic emulation"
2394                         );
2395                     }
2396                     WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2397                 }
2398                 IrqChipKind::Userspace => {
2399                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2400                         components.vcpu_count,
2401                         ioapic_device_tube,
2402                     )?)
2403                 }
2404             };
2405             run_vm::<WhpxVcpu, WhpxVm>(
2406                 cfg,
2407                 components,
2408                 &arch_memory_layout,
2409                 vm,
2410                 irq_chip.as_mut(),
2411                 Some(ioapic_host_tube),
2412                 vm_evt_wrtube,
2413                 vm_evt_rdtube,
2414             )
2415         }
2416         #[cfg(feature = "gvm")]
2417         HypervisorKind::Gvm => {
2418             info!("Creating GVM");
2419             let gvm = Gvm::new()?;
2420             let guest_mem = create_guest_memory(&components, &arch_memory_layout, &gvm)?;
2421             let vm = create_gvm_vm(gvm, guest_mem)?;
2422             let ioapic_host_tube;
2423             let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2424                 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2425                 IrqChipKind::Kernel => {
2426                     ioapic_host_tube = None;
2427                     WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2428                 }
2429                 IrqChipKind::Userspace => {
2430                     let (host_tube, ioapic_device_tube) =
2431                         Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2432                     ioapic_host_tube = Some(host_tube);
2433                     WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2434                         components.vcpu_count,
2435                         ioapic_device_tube,
2436                     )?)
2437                 }
2438             };
2439             run_vm::<GvmVcpu, GvmVm>(
2440                 cfg,
2441                 components,
2442                 &arch_memory_layout,
2443                 vm,
2444                 irq_chip.as_mut(),
2445                 ioapic_host_tube,
2446                 vm_evt_wrtube,
2447                 vm_evt_rdtube,
2448             )
2449         }
2450     }
2451 }
2452 
2453 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
2454 fn run_vm<Vcpu, V>(
2455     #[allow(unused_mut)] mut cfg: Config,
2456     #[allow(unused_mut)] mut components: VmComponents,
2457     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2458     mut vm: V,
2459     irq_chip: &mut dyn IrqChipArch,
2460     ioapic_host_tube: Option<Tube>,
2461     vm_evt_wrtube: SendTube,
2462     vm_evt_rdtube: RecvTube,
2463 ) -> Result<ExitState>
2464 where
2465     Vcpu: VcpuArch + 'static,
2466     V: VmArch + 'static,
2467 {
2468     let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2469     let mut control_tubes = Vec::new();
2470     let mut irq_control_tubes = Vec::new();
2471     let mut vm_memory_control_tubes = Vec::new();
2472     // Create one control tube per disk.
2473     let mut disk_device_tubes = Vec::new();
2474     let mut disk_host_tubes = Vec::new();
2475     let disk_count = cfg.disks.len();
2476     for _ in 0..disk_count {
2477         let (disk_host_tube, disk_device_tube) =
2478             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2479         disk_host_tubes.push(disk_host_tube);
2480         disk_device_tubes.push(disk_device_tube);
2481     }
2482 
2483     if let Some(ioapic_host_tube) = ioapic_host_tube {
2484         irq_control_tubes.push(ioapic_host_tube);
2485     }
2486 
2487     // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2488     let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2489         let (balloon_host_tube, balloon_device_tube) =
2490             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2491         (Some(balloon_host_tube), Some(balloon_device_tube))
2492     } else {
2493         (None, None)
2494     };
2495     // The balloon device also needs a tube to communicate back to the main process to
2496     // handle remapping memory dynamically.
2497     let dynamic_mapping_device_tube = if cfg.balloon {
2498         let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2499             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2500         vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2501         Some(dynamic_mapping_device_tube)
2502     } else {
2503         None
2504     };
2505 
2506     // PvClock gets a tube for handling suspend/resume requests from the main thread.
2507     #[cfg(feature = "pvclock")]
2508     let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2509         let (host, device) =
2510             Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2511         (Some(host), Some(device))
2512     } else {
2513         (None, None)
2514     };
2515 
2516     let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2517         .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2518 
2519     let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2520     let mut sys_allocator = SystemAllocator::new(
2521         Arch::get_system_allocator_config(&vm, arch_memory_layout),
2522         pstore_size,
2523         &cfg.mmio_address_ranges,
2524     )
2525     .context("failed to create system allocator")?;
2526 
2527     // Allocate the ramoops region first.
2528     let ramoops_region = match &components.pstore {
2529         Some(pstore) => Some(
2530             arch::pstore::create_memory_region(
2531                 &mut vm,
2532                 sys_allocator.reserved_region().unwrap(),
2533                 pstore,
2534             )
2535             .exit_context(
2536                 Exit::Pstore,
2537                 format!("failed to allocate pstore region {:?}", &components.pstore),
2538             )?,
2539         ),
2540         None => None,
2541     };
2542 
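    // The initial balloon size is the total memory size minus the requested initial memory
    // (init_memory, specified in MiB); if init_memory is unset, the balloon starts empty.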
2543     let init_balloon_size = components
2544         .memory_size
2545         .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2546             m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2547         }))
2548         .context("failed to calculate init balloon size")?;
2549 
2550     let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2551     let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2552 
2553     if tsc_state.core_grouping.size() > 1 {
2554         // Host TSCs are not in sync, log a metric about it.
2555         warn!(
2556             "Host TSCs are not in sync, applying the following mitigations: {:?}",
2557             tsc_sync_mitigations
2558         );
2559         log_descriptor(
2560             MetricEventType::TscCoresOutOfSync,
2561             // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2562             tsc_state.core_grouping.core_grouping_bitmask() as i64,
2563         );
2564     }
2565 
2566     #[cfg(feature = "gpu")]
2567     let gpu_control_tube = cfg
2568         .gpu_vmm_config
2569         .as_mut()
2570         .and_then(|config| config.gpu_control_host_tube.take());
2571     let product_args = product::get_run_control_args(&mut cfg);
2572 
2573     // We open these files before lowering the token, as in the future a stricter policy may
2574     // prevent it.
2575     let dt_overlays = cfg
2576         .device_tree_overlay
2577         .iter()
2578         .map(|o| {
2579             Ok(DtbOverlay {
2580                 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2581                     .with_context(|| {
2582                         format!("failed to open device tree overlay {}", o.path.display())
2583                     })?,
2584             })
2585         })
2586         .collect::<Result<Vec<DtbOverlay>>>()?;
2587 
2588     // Lower the token, locking the main process down to a stricter security policy.
2589     //
2590     // WARNING:
2591     //
2592     // Windows system calls can behave in unusual ways if they happen concurrently to the token
2593     // lowering. For example, access denied can happen if Tube pairs are created in another thread
2594     // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2595     // not privileged resources, but can be broken due to the token changing unexpectedly.
2596     //
2597     // We explicitly lower the token here and *then* call run_control to make it clear that any
2598     // resources that require a privileged token should be created on the main thread & passed into
2599     // run_control, to follow the correct order:
2600     // - Privileged resources are created.
2601     // - Token is lowered.
2602     // - Threads are spawned & may create more non-privileged resources (without fear of the token
2603     //   changing at an undefined time).
2604     //
2605     // Recommendation: If you find your code doesnt work in run_control because of the sandbox, you
2606     // Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
2607     // run_control. Don't move the token lowering somewhere else without considering multi-threaded
2608     // effects.
2609     #[cfg(feature = "sandbox")]
2610     if sandbox::is_sandbox_target() {
2611         sandbox::TargetServices::get()
2612             .exit_code_from_err("failed to create sandbox")?
2613             .expect("Could not create sandbox!")
2614             .lower_token();
2615     }
2616 
2617     let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2618 
2619     let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2620 
2621     let mut initial_audio_session_states: Vec<InitialAudioSessionState> = Vec::new();
2622 
2623     let pci_devices = create_devices(
2624         &mut cfg,
2625         vm.get_memory(),
2626         &vm_evt_wrtube,
2627         &mut irq_control_tubes,
2628         &mut vm_memory_control_tubes,
2629         &mut control_tubes,
2630         &mut disk_device_tubes,
2631         &mut initial_audio_session_states,
2632         balloon_device_tube,
2633         #[cfg(feature = "pvclock")]
2634         pvclock_device_tube,
2635         dynamic_mapping_device_tube,
2636         /* inflate_tube= */ None,
2637         init_balloon_size,
2638         tsc_state.frequency,
2639         virtio_snd_state_device_tube,
2640         virtio_snd_device_mute_tube,
2641     )?;
2642 
2643     let mut vcpu_ids = Vec::new();
2644 
2645     let (vwmdt_host_tube, vmwdt_device_tube) = Tube::pair().context("failed to create tube")?;
2646     let windows = Arch::build_vm::<V, Vcpu>(
2647         components,
2648         arch_memory_layout,
2649         &vm_evt_wrtube,
2650         &mut sys_allocator,
2651         &cfg.serial_parameters,
2652         None,
2653         (cfg.battery_config.as_ref().map(|t| t.type_), None),
2654         vm,
2655         ramoops_region,
2656         pci_devices,
2657         irq_chip,
2658         &mut vcpu_ids,
2659         cfg.dump_device_tree_blob.clone(),
2660         /* debugcon_jail= */ None,
2661         None,
2662         None,
2663         /* guest_suspended_cvar= */ None,
2664         dt_overlays,
2665         cfg.fdt_position,
2666         cfg.no_pmu,
2667     )
2668     .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2669 
2670     #[cfg(feature = "stats")]
2671     let stats = if cfg.exit_stats {
2672         Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2673     } else {
2674         None
2675     };
2676 
2677     run_control(
2678         windows,
2679         sys_allocator,
2680         control_tubes,
2681         irq_control_tubes,
2682         vm_memory_control_tubes,
2683         vm_evt_rdtube,
2684         vm_evt_wrtube,
2685         #[cfg(feature = "gpu")]
2686         gpu_control_tube,
2687         cfg.broker_shutdown_event.take(),
2688         balloon_host_tube,
2689         #[cfg(feature = "pvclock")]
2690         pvclock_host_tube,
2691         disk_host_tubes,
2692         initial_audio_session_states,
2693         gralloc,
2694         #[cfg(feature = "stats")]
2695         stats,
2696         cfg.service_pipe_name,
2697         vm_memory_size_mb,
2698         cfg.host_cpu_topology,
2699         tsc_sync_mitigations,
2700         cfg.force_calibrated_tsc_leaf,
2701         product_args,
2702         match virtio_snd_host_mute_tube {
2703             Some(virtio_snd_host_mute_tube) => vec![virtio_snd_host_mute_tube],
2704             None => vec![],
2705         },
2706         cfg.restore_path,
2707         cfg.socket_path,
2708         cfg.force_s2idle,
2709         cfg.suspended,
2710     )
2711 }
2712 
2713 #[cfg(test)]
2714 mod tests {
2715     use tempfile::TempDir;
2716 
2717     use super::*;
2718 
2719     fn create_config(test_dir: &TempDir) -> Config {
2720         let mut config = Config::default();
2721 
2722         let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2723         OpenOptions::new()
2724             .create_new(true)
2725             .write(true)
2726             .open(&dummy_kernel_path)
2727             .expect("Could not open file!");
2728         config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2729 
2730         config
2731     }
2732 
2733     #[test]
2734     #[should_panic(expected = "Did not receive a bios or kernel")]
2735     fn setup_vm_components_panics_when_no_kernel_provided() {
2736         let mut config =
2737             create_config(&TempDir::new().expect("Could not create temporary directory!"));
2738         config.executable_path = None;
2739         let _ = setup_vm_components(&config);
2740     }
2741 
2742     #[test]
2743     fn setup_vm_components_stores_memory_in_bytes() {
2744         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2745         let mut config = create_config(&tempdir);
2746         config.memory = Some(1);
2747         let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2748         assert_eq!(vm_components.memory_size, 1024 * 1024);
2749     }
2750 
2751     #[test]
2752     fn setup_vm_components_fails_when_memory_too_large() {
2753         let tempdir = TempDir::new().expect("Could not create temporary directory!");
2754         let mut config = create_config(&tempdir);
2755         // One MiB more than a u64 can hold in bytes (see the arithmetic note after this test).
2756         config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2757         setup_vm_components(&config).err().expect("expected error");
2758     }
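    // Illustrative arithmetic for the test above (not part of the original source):
    // config.memory is given in MiB, so the largest representable value is
    // u64::MAX / 2^20 = 17_592_186_044_415 MiB; one more (2^44 MiB) is exactly 2^64 bytes,
    // which no longer fits in a u64, so setup_vm_components must return an error.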
2759 }
2760