1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
// TODO(b:240716507): There is a huge chunk of code which depends on haxm, whpx, or gvm being
// enabled but isn't marked as such. Remove this once that code is properly gated.
7 #![allow(dead_code, unused_imports, unused_variables, unreachable_code)]
8
9 pub(crate) mod control_server;
10 pub(crate) mod irq_wait;
11 pub(crate) mod main;
12 #[cfg(not(feature = "crash-report"))]
13 mod panic_hook;
14
15 mod generic;
16 use generic as product;
17 pub(crate) mod run_vcpu;
18
19 #[cfg(feature = "whpx")]
20 use std::arch::x86_64::__cpuid;
21 #[cfg(feature = "whpx")]
22 use std::arch::x86_64::__cpuid_count;
23 use std::cmp::Reverse;
24 use std::collections::BTreeMap;
25 use std::collections::HashMap;
26 use std::fs::File;
27 use std::fs::OpenOptions;
28 use std::io::stdin;
29 use std::iter;
30 use std::mem;
31 use std::os::windows::fs::OpenOptionsExt;
32 use std::path::PathBuf;
33 use std::sync::mpsc;
34 use std::sync::Arc;
35
36 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
37 use aarch64::AArch64 as Arch;
38 use acpi_tables::sdt::SDT;
39 use anyhow::anyhow;
40 use anyhow::bail;
41 use anyhow::Context;
42 use anyhow::Result;
43 use arch::CpuConfigArch;
44 use arch::DtbOverlay;
45 use arch::IrqChipArch;
46 use arch::LinuxArch;
47 use arch::RunnableLinuxVm;
48 use arch::VcpuArch;
49 use arch::VirtioDeviceStub;
50 use arch::VmArch;
51 use arch::VmComponents;
52 use arch::VmImage;
53 use base::enable_high_res_timers;
54 use base::error;
55 use base::info;
56 use base::open_file_or_duplicate;
57 use base::warn;
58 use base::AsRawDescriptor;
59 #[cfg(feature = "gpu")]
60 use base::BlockingMode;
61 use base::CloseNotifier;
62 use base::Event;
63 use base::EventToken;
64 use base::EventType;
65 use base::FlushOnDropTube;
66 #[cfg(feature = "gpu")]
67 use base::FramingMode;
68 use base::FromRawDescriptor;
69 use base::ProtoTube;
70 use base::RawDescriptor;
71 use base::ReadNotifier;
72 use base::RecvTube;
73 use base::SendTube;
74 #[cfg(feature = "gpu")]
75 use base::StreamChannel;
76 use base::Terminal;
77 use base::TriggeredEvent;
78 use base::Tube;
79 use base::TubeError;
80 use base::VmEventType;
81 use base::WaitContext;
82 use broker_ipc::common_child_setup;
83 use broker_ipc::CommonChildStartupArgs;
84 use control_server::ControlServer;
85 use crosvm_cli::sys::windows::exit::Exit;
86 use crosvm_cli::sys::windows::exit::ExitContext;
87 use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
88 use crosvm_cli::sys::windows::exit::ExitContextOption;
89 use devices::create_devices_worker_thread;
90 use devices::serial_device::SerialHardware;
91 use devices::serial_device::SerialParameters;
92 use devices::tsc::get_tsc_sync_mitigations;
93 use devices::tsc::standard_deviation;
94 use devices::tsc::TscSyncMitigations;
95 use devices::virtio;
96 use devices::virtio::block::DiskOption;
97 #[cfg(feature = "audio")]
98 use devices::virtio::snd::common_backend::VirtioSnd;
99 #[cfg(feature = "audio")]
100 use devices::virtio::snd::parameters::Parameters as SndParameters;
101 #[cfg(feature = "gpu")]
102 use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
103 #[cfg(feature = "gpu")]
104 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
105 #[cfg(feature = "gpu")]
106 use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
107 #[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::run_gpu_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::device::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::device::snd::sys::windows::run_snd_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::device::snd::sys::windows::SndSplitConfig;
117 #[cfg(feature = "balloon")]
118 use devices::virtio::BalloonFeatures;
119 use devices::virtio::Console;
120 #[cfg(feature = "gpu")]
121 use devices::virtio::GpuParameters;
122 use devices::BusDeviceObj;
123 use devices::BusResumeDevice;
124 #[cfg(feature = "gvm")]
125 use devices::GvmIrqChip;
126 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
127 use devices::IrqChip;
128 use devices::UserspaceIrqChip;
129 use devices::VcpuRunState;
130 use devices::VirtioPciDevice;
131 #[cfg(feature = "whpx")]
132 use devices::WhpxSplitIrqChip;
133 #[cfg(feature = "gpu")]
134 use gpu_display::EventDevice;
135 #[cfg(feature = "gpu")]
136 use gpu_display::WindowProcedureThread;
137 #[cfg(feature = "gpu")]
138 use gpu_display::WindowProcedureThreadBuilder;
139 #[cfg(feature = "gvm")]
140 use hypervisor::gvm::Gvm;
141 #[cfg(feature = "gvm")]
142 use hypervisor::gvm::GvmVcpu;
143 #[cfg(feature = "gvm")]
144 use hypervisor::gvm::GvmVersion;
145 #[cfg(feature = "gvm")]
146 use hypervisor::gvm::GvmVm;
147 #[cfg(feature = "haxm")]
148 use hypervisor::haxm::get_use_ghaxm;
149 #[cfg(feature = "haxm")]
150 use hypervisor::haxm::set_use_ghaxm;
151 #[cfg(feature = "haxm")]
152 use hypervisor::haxm::Haxm;
153 #[cfg(feature = "haxm")]
154 use hypervisor::haxm::HaxmVcpu;
155 #[cfg(feature = "haxm")]
156 use hypervisor::haxm::HaxmVm;
157 #[cfg(feature = "whpx")]
158 use hypervisor::whpx::Whpx;
159 #[cfg(feature = "whpx")]
160 use hypervisor::whpx::WhpxFeature;
161 #[cfg(feature = "whpx")]
162 use hypervisor::whpx::WhpxVcpu;
163 #[cfg(feature = "whpx")]
164 use hypervisor::whpx::WhpxVm;
165 use hypervisor::Hypervisor;
166 #[cfg(feature = "whpx")]
167 use hypervisor::HypervisorCap;
168 #[cfg(feature = "whpx")]
169 use hypervisor::HypervisorX86_64;
170 use hypervisor::ProtectionType;
171 use hypervisor::Vm;
172 use irq_wait::IrqWaitWorker;
173 use jail::FakeMinijailStub as Minijail;
174 #[cfg(not(feature = "crash-report"))]
175 pub(crate) use panic_hook::set_panic_hook;
176 use product::create_snd_mute_tube_pair;
177 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
178 use product::create_snd_state_tube;
179 #[cfg(feature = "pvclock")]
180 use product::handle_pvclock_request;
181 use product::merge_session_invariants;
182 use product::run_ime_thread;
183 use product::set_package_name;
184 pub(crate) use product::setup_metrics_reporting;
185 use product::start_service_ipc_listener;
186 use product::RunControlArgs;
187 use product::ServiceVmState;
188 use product::Token;
189 use resources::SystemAllocator;
190 use run_vcpu::run_all_vcpus;
191 use run_vcpu::VcpuRunMode;
192 use rutabaga_gfx::RutabagaGralloc;
193 use rutabaga_gfx::RutabagaGrallocBackendFlags;
194 use smallvec::SmallVec;
195 use sync::Mutex;
196 use tube_transporter::TubeToken;
197 use tube_transporter::TubeTransporterReader;
198 use vm_control::api::VmMemoryClient;
199 #[cfg(feature = "balloon")]
200 use vm_control::BalloonControlCommand;
201 #[cfg(feature = "balloon")]
202 use vm_control::BalloonTube;
203 use vm_control::DeviceControlCommand;
204 use vm_control::InitialAudioSessionState;
205 use vm_control::IrqHandlerRequest;
206 use vm_control::PvClockCommand;
207 use vm_control::VcpuControl;
208 use vm_control::VmMemoryRegionState;
209 use vm_control::VmMemoryRequest;
210 use vm_control::VmRequest;
211 use vm_control::VmResponse;
212 use vm_control::VmRunMode;
213 use vm_memory::GuestAddress;
214 use vm_memory::GuestMemory;
215 use vmm_vhost::Connection;
216 use vmm_vhost::FrontendReq;
217 use win_util::ProcessType;
218 #[cfg(feature = "whpx")]
219 use x86_64::cpuid::adjust_cpuid;
220 #[cfg(feature = "whpx")]
221 use x86_64::cpuid::CpuIdContext;
222 #[cfg(target_arch = "x86_64")]
223 use x86_64::X8664arch as Arch;
224
225 use crate::crosvm::config::Config;
226 use crate::crosvm::config::Executable;
227 use crate::crosvm::config::InputDeviceOption;
228 #[cfg(any(feature = "gvm", feature = "whpx"))]
229 use crate::crosvm::config::IrqChipKind;
230 #[cfg(feature = "gpu")]
231 use crate::crosvm::config::TouchDeviceOption;
232 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
233 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
234 use crate::crosvm::sys::config::HypervisorKind;
235 use crate::crosvm::sys::windows::broker::BrokerTubes;
236 #[cfg(feature = "stats")]
237 use crate::crosvm::sys::windows::stats::StatisticsCollector;
238 #[cfg(feature = "gpu")]
239 pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
240 #[cfg(feature = "audio")]
241 pub(crate) use crate::sys::windows::product::get_snd_product_configs;
242 #[cfg(feature = "gpu")]
243 pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
244 use crate::sys::windows::product::log_descriptor;
245 #[cfg(feature = "audio")]
246 pub(crate) use crate::sys::windows::product::num_input_sound_devices;
247 #[cfg(feature = "audio")]
248 pub(crate) use crate::sys::windows::product::num_input_sound_streams;
249 use crate::sys::windows::product::spawn_anti_tamper_thread;
250 use crate::sys::windows::product::MetricEventType;
251
252 const DEFAULT_GUEST_CID: u64 = 3;
253
// By default, if enabled, the balloon working set (WS) feature will use 4 bins.
255 const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;
256
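/// Control tubes serviced by the main event loop, tagged by the kind of messages they carry.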
257 enum TaggedControlTube {
258 Vm(FlushOnDropTube),
259 Product(product::TaggedControlTube),
260 }
261
262 impl ReadNotifier for TaggedControlTube {
    fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
264 match self {
265 Self::Vm(tube) => tube.0.get_read_notifier(),
266 Self::Product(tube) => tube.get_read_notifier(),
267 }
268 }
269 }
270
271 impl CloseNotifier for TaggedControlTube {
    fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
273 match self {
274 Self::Vm(tube) => tube.0.get_close_notifier(),
275 Self::Product(tube) => tube.get_close_notifier(),
276 }
277 }
278 }
279
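/// The overall outcome of the VM run, as reported when the main run loop exits.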
280 pub enum ExitState {
281 Reset,
282 Stop,
283 Crash,
284 #[allow(dead_code)]
285 GuestPanic,
286 WatchdogReset,
287 }
288
289 type DeviceResult<T = VirtioDeviceStub> = Result<T>;
290
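/// Creates a vhost-user block device frontend over the given vhost-user connection.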
fn create_vhost_user_block_device(
292 cfg: &Config,
293 connection: Connection<FrontendReq>,
294 ) -> DeviceResult {
295 let dev = virtio::VhostUserFrontend::new(
296 virtio::DeviceType::Block,
297 virtio::base_features(cfg.protection_type),
298 connection,
299 None,
300 None,
301 )
302 .exit_context(
303 Exit::VhostUserBlockDeviceNew,
304 "failed to set up vhost-user block device",
305 )?;
306
307 Ok(VirtioDeviceStub {
308 dev: Box::new(dev),
309 jail: None,
310 })
311 }
312
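/// Creates an in-process virtio-block device backed by the given disk option.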
fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
314 let features = virtio::base_features(cfg.protection_type);
315 let dev = virtio::BlockAsync::new(
316 features,
317 disk.open()?,
318 disk,
319 Some(disk_device_tube),
320 None,
321 None,
322 )
323 .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
324
325 Ok(VirtioDeviceStub {
326 dev: Box::new(dev),
327 jail: None,
328 })
329 }
330
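/// Creates a vhost-user GPU device frontend over the given vhost-user connection.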
331 #[cfg(feature = "gpu")]
fn create_vhost_user_gpu_device(
333 base_features: u64,
334 connection: Connection<FrontendReq>,
335 ) -> DeviceResult {
336 let dev = virtio::VhostUserFrontend::new(
337 virtio::DeviceType::Gpu,
338 base_features,
339 connection,
340 None,
341 None,
342 )
343 .exit_context(
344 Exit::VhostUserGpuDeviceNew,
345 "failed to set up vhost-user gpu device",
346 )?;
347
348 Ok(VirtioDeviceStub {
349 dev: Box::new(dev),
350 jail: None,
351 })
352 }
353
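/// Creates a vhost-user sound device frontend over the given vhost-user connection.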
354 #[cfg(feature = "audio")]
fn create_vhost_user_snd_device(
356 base_features: u64,
357 connection: Connection<FrontendReq>,
358 ) -> DeviceResult {
359 let dev = virtio::VhostUserFrontend::new(
360 virtio::DeviceType::Sound,
361 base_features,
362 connection,
363 None,
364 None,
365 )
366 .exit_context(
367 Exit::VhostUserSndDeviceNew,
368 "failed to set up vhost-user snd device",
369 )?;
370
371 Ok(VirtioDeviceStub {
372 dev: Box::new(dev),
373 jail: None,
374 })
375 }
376
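/// Creates a virtio multi-touch input device that reads events from the given pipe.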
377 #[cfg(feature = "gpu")]
fn create_multi_touch_device(
379 cfg: &Config,
380 event_pipe: StreamChannel,
381 width: u32,
382 height: u32,
383 name: Option<&str>,
384 idx: u32,
385 ) -> DeviceResult {
386 let dev = virtio::input::new_multi_touch(
387 idx,
388 event_pipe,
389 width,
390 height,
391 name,
392 virtio::base_features(cfg.protection_type),
393 )
394 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
395 Ok(VirtioDeviceStub {
396 dev: Box::new(dev),
397 jail: None,
398 })
399 }
400
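/// Creates a virtio mouse input device that reads events from the given pipe.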
401 #[cfg(feature = "gpu")]
fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
403 let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
404 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
405 Ok(VirtioDeviceStub {
406 dev: Box::new(dev),
407 jail: None,
408 })
409 }
410
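/// Creates a vhost-user net device frontend over the given vhost-user connection.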
411 #[cfg(feature = "slirp")]
fn create_vhost_user_net_device(cfg: &Config, connection: Connection<FrontendReq>) -> DeviceResult {
413 let features = virtio::base_features(cfg.protection_type);
414 let dev =
415 virtio::VhostUserFrontend::new(virtio::DeviceType::Net, features, connection, None, None)
416 .exit_context(
417 Exit::VhostUserNetDeviceNew,
418 "failed to set up vhost-user net device",
419 )?;
420
421 Ok(VirtioDeviceStub {
422 dev: Box::new(dev),
423 jail: None,
424 })
425 }
426
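/// Creates a virtio-rng device.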
fn create_rng_device(cfg: &Config) -> DeviceResult {
428 let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
429 .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
430
431 Ok(VirtioDeviceStub {
432 dev: Box::new(dev),
433 jail: None,
434 })
435 }
436
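/// Creates a virtio-console device from the given serial parameters.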
fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
438 let mut keep_rds = Vec::new();
439 let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
440 let dev = param
441 .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
442 .exit_context(Exit::CreateConsole, "failed to create console device")?;
443
444 Ok(VirtioDeviceStub {
445 dev: Box::new(dev),
446 jail: None,
447 })
448 }
449
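/// Creates a virtio-balloon device driven by the given balloon control and dynamic mapping tubes.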
450 #[cfg(feature = "balloon")]
fn create_balloon_device(
452 cfg: &Config,
453 balloon_device_tube: Tube,
454 dynamic_mapping_device_tube: Tube,
455 inflate_tube: Option<Tube>,
456 init_balloon_size: u64,
457 ) -> DeviceResult {
458 let balloon_features =
459 (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
460 let dev = virtio::Balloon::new(
461 virtio::base_features(cfg.protection_type),
462 balloon_device_tube,
463 VmMemoryClient::new(dynamic_mapping_device_tube),
464 inflate_tube,
465 init_balloon_size,
466 balloon_features,
467 #[cfg(feature = "registered_events")]
468 None,
469 VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
470 )
471 .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
472
473 Ok(VirtioDeviceStub {
474 dev: Box::new(dev),
475 jail: None,
476 })
477 }
478
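/// Creates a userspace virtio-vsock device, defaulting to DEFAULT_GUEST_CID if no CID is configured.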
fn create_vsock_device(cfg: &Config) -> DeviceResult {
480 // We only support a single guest, so we can confidently assign a default
481 // CID if one isn't provided. We choose the lowest non-reserved value.
482 let dev = virtio::vsock::Vsock::new(
483 cfg.vsock
484 .as_ref()
485 .map(|cfg| cfg.cid)
486 .unwrap_or(DEFAULT_GUEST_CID),
487 cfg.host_guid.clone(),
488 virtio::base_features(cfg.protection_type),
489 )
490 .exit_context(
491 Exit::UserspaceVsockDeviceNew,
492 "failed to create userspace vsock device",
493 )?;
494
495 Ok(VirtioDeviceStub {
496 dev: Box::new(dev),
497 jail: None,
498 })
499 }
500
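/// Creates the full set of virtio devices configured for this VM and returns their stubs.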
fn create_virtio_devices(
502 cfg: &mut Config,
503 vm_evt_wrtube: &SendTube,
504 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
505 disk_device_tubes: &mut Vec<Tube>,
506 initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
507 balloon_device_tube: Option<Tube>,
508 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
509 dynamic_mapping_device_tube: Option<Tube>,
510 inflate_tube: Option<Tube>,
511 init_balloon_size: u64,
512 tsc_frequency: u64,
513 virtio_snd_state_device_tube: Option<Tube>,
514 virtio_snd_control_device_tube: Option<Tube>,
515 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
516 let mut devs = Vec::new();
517
518 if cfg.block_vhost_user_tube.is_empty() {
519 // Disk devices must precede virtio-console devices or the kernel does not boot.
520 // TODO(b/171215421): figure out why this ordering is required and fix it.
521 for disk in &cfg.disks {
522 let disk_device_tube = disk_device_tubes.remove(0);
523 devs.push(create_block_device(cfg, disk, disk_device_tube)?);
524 }
525 } else {
526 info!("Starting up vhost user block backends...");
527 for _disk in &cfg.disks {
528 let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
529 let connection = Connection::<FrontendReq>::from(disk_device_tube);
530 devs.push(create_vhost_user_block_device(cfg, connection)?);
531 }
532 }
533
534 for (_, param) in cfg
535 .serial_parameters
536 .iter()
537 .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
538 {
539 let dev = create_console_device(cfg, param)?;
540 devs.push(dev);
541 }
542
543 #[cfg(feature = "audio")]
544 {
545 let snd_split_configs = std::mem::take(&mut cfg.snd_split_configs);
546 for mut snd_split_cfg in snd_split_configs.into_iter() {
547 devs.push(create_virtio_snd_device(
548 cfg,
549 &mut snd_split_cfg,
550 control_tubes,
551 )?);
552 if let Some(vmm_config) = snd_split_cfg.vmm_config {
553 let initial_audio_session_state = InitialAudioSessionState {
554 audio_client_guid: vmm_config.audio_client_guid,
555 card_index: vmm_config.card_index,
556 };
557 initial_audio_session_states.push(initial_audio_session_state);
558 }
559 }
560 }
561
562 #[cfg(feature = "pvclock")]
563 if let Some(tube) = pvclock_device_tube {
564 product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
565 }
566
567 devs.push(create_rng_device(cfg)?);
568
569 #[cfg(feature = "slirp")]
570 if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
571 let connection = Connection::<FrontendReq>::from(net_vhost_user_tube);
572 devs.push(create_vhost_user_net_device(cfg, connection)?);
573 }
574
575 #[cfg(feature = "balloon")]
576 if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
577 (balloon_device_tube, dynamic_mapping_device_tube)
578 {
579 devs.push(create_balloon_device(
580 cfg,
581 balloon_device_tube,
582 dynamic_mapping_device_tube,
583 inflate_tube,
584 init_balloon_size,
585 )?);
586 }
587
588 devs.push(create_vsock_device(cfg)?);
589
590 #[cfg(feature = "gpu")]
591 let event_devices = if let Some(InputEventSplitConfig {
592 backend_config,
593 vmm_config,
594 }) = cfg.input_event_split_config.take()
595 {
596 devs.extend(
597 create_virtio_input_event_devices(cfg, vmm_config)
598 .context("create input event devices")?,
599 );
600 backend_config.map(|cfg| cfg.event_devices)
601 } else {
602 None
603 };
604
605 #[cfg(feature = "gpu")]
606 if let Some(wndproc_thread_vmm_config) = cfg
607 .window_procedure_thread_split_config
608 .as_mut()
609 .map(|split_cfg| &mut split_cfg.vmm_config)
610 {
611 product::push_window_procedure_thread_control_tubes(
612 control_tubes,
613 wndproc_thread_vmm_config,
614 );
615 }
616
617 #[cfg(feature = "gpu")]
618 let mut wndproc_thread = cfg
619 .window_procedure_thread_split_config
620 .as_mut()
621 .and_then(|cfg| cfg.wndproc_thread_builder.take())
622 .map(WindowProcedureThreadBuilder::start_thread)
623 .transpose()
624 .context("Failed to start the window procedure thread.")?;
625
626 #[cfg(feature = "gpu")]
627 if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
628 devs.push(create_virtio_gpu_device(
629 cfg,
630 gpu_vmm_config,
631 event_devices,
632 &mut wndproc_thread,
633 control_tubes,
634 )?);
635 }
636
637 Ok(devs)
638 }
639
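/// Creates the VMM-side virtio input devices (multi-touch, mouse, and keyboard) from the input event config.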
640 #[cfg(feature = "gpu")]
fn create_virtio_input_event_devices(
642 cfg: &Config,
643 mut input_event_vmm_config: InputEventVmmConfig,
644 ) -> DeviceResult<Vec<VirtioDeviceStub>> {
645 let mut devs = Vec::new();
646
647 // Iterate event devices, create the VMM end.
648 let mut multi_touch_pipes = input_event_vmm_config
649 .multi_touch_pipes
650 .drain(..)
651 .enumerate();
652 for input in &cfg.virtio_input {
653 match input {
654 InputDeviceOption::SingleTouch { .. } => {
655 unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
656 }
657 InputDeviceOption::MultiTouch {
658 width,
659 height,
660 name,
661 ..
662 } => {
663 let Some((idx, pipe)) = multi_touch_pipes.next() else {
664 break;
665 };
666 let mut width = *width;
667 let mut height = *height;
668 if idx == 0 {
669 if width.is_none() {
670 width = cfg.display_input_width;
671 }
672 if height.is_none() {
673 height = cfg.display_input_height;
674 }
675 }
676 devs.push(create_multi_touch_device(
677 cfg,
678 pipe,
679 width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
680 height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
681 name.as_deref(),
682 idx as u32,
683 )?);
684 }
685 _ => {}
686 }
687 }
688 drop(multi_touch_pipes);
689
690 product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;
691
692 for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
693 devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
694 }
695
696 let keyboard_pipe = input_event_vmm_config
697 .keyboard_pipes
698 .pop()
699 .expect("at least one keyboard should be in GPU VMM config");
700 let dev = virtio::input::new_keyboard(
701 /* idx= */ 0,
702 keyboard_pipe,
703 virtio::base_features(cfg.protection_type),
704 )
705 .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
706
707 devs.push(VirtioDeviceStub {
708 dev: Box::new(dev),
709 jail: None,
710 });
711
712 Ok(devs)
713 }
714
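/// Creates the virtio-gpu device. If a GPU backend config is present, the vhost-user GPU worker is
/// started in this process.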
715 #[cfg(feature = "gpu")]
fn create_virtio_gpu_device(
717 cfg: &mut Config,
718 mut gpu_vmm_config: GpuVmmConfig,
719 event_devices: Option<Vec<EventDevice>>,
720 wndproc_thread: &mut Option<WindowProcedureThread>,
721 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
722 ) -> DeviceResult<VirtioDeviceStub> {
723 let resource_bridges = Vec::<Tube>::new();
724
725 product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);
726
727 // If the GPU backend is passed, start up the vhost-user worker in the main process.
728 if let Some(backend_config) = cfg.gpu_backend_config.take() {
729 let event_devices = event_devices.ok_or_else(|| {
730 anyhow!("event devices are missing when creating virtio-gpu in the current process.")
731 })?;
732 let wndproc_thread = wndproc_thread
733 .take()
734 .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;
735
736 std::thread::spawn(move || {
737 run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
738 });
739 }
740
741 // The GPU is always vhost-user, even if running in the main process.
742 let gpu_device_tube = gpu_vmm_config
743 .main_vhost_user_tube
744 .take()
745 .expect("GPU VMM vhost-user tube should be set");
746 let connection = Connection::<FrontendReq>::from(gpu_device_tube);
747
748 create_vhost_user_gpu_device(virtio::base_features(cfg.protection_type), connection)
749 .context("create vhost-user GPU device")
750 }
751
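/// Creates a virtio-snd device. If a sound backend config is present, the vhost-user sound worker
/// is started in this process.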
752 #[cfg(feature = "audio")]
fn create_virtio_snd_device(
754 cfg: &mut Config,
755 snd_split_config: &mut SndSplitConfig,
756 #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
757 ) -> DeviceResult<VirtioDeviceStub> {
758 let snd_vmm_config = snd_split_config
759 .vmm_config
760 .as_mut()
761 .expect("snd_vmm_config must exist");
762 product::push_snd_control_tubes(control_tubes, snd_vmm_config);
763
764 // If the SND backend is passed, start up the vhost-user worker in the main process.
765 if let Some(backend_config) = snd_split_config.backend_config.take() {
766 std::thread::spawn(move || run_snd_device_worker(backend_config));
767 }
768
769 // The SND is always vhost-user, even if running in the main process.
770 let snd_device_tube = snd_vmm_config
771 .main_vhost_user_tube
772 .take()
773 .expect("Snd VMM vhost-user tube should be set");
774 let connection = Connection::<FrontendReq>::from(snd_device_tube);
775
776 create_vhost_user_snd_device(virtio::base_features(cfg.protection_type), connection)
777 .context("create vhost-user SND device")
778 }
779
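/// Wraps each configured virtio device in a VirtioPciDevice and returns the resulting PCI devices.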
fn create_devices(
781 cfg: &mut Config,
782 mem: &GuestMemory,
783 exit_evt_wrtube: &SendTube,
784 irq_control_tubes: &mut Vec<Tube>,
785 vm_memory_control_tubes: &mut Vec<Tube>,
786 control_tubes: &mut Vec<TaggedControlTube>,
787 disk_device_tubes: &mut Vec<Tube>,
788 initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
789 balloon_device_tube: Option<Tube>,
790 #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
791 dynamic_mapping_device_tube: Option<Tube>,
792 inflate_tube: Option<Tube>,
793 init_balloon_size: u64,
794 tsc_frequency: u64,
795 virtio_snd_state_device_tube: Option<Tube>,
796 virtio_snd_control_device_tube: Option<Tube>,
797 ) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
798 let stubs = create_virtio_devices(
799 cfg,
800 exit_evt_wrtube,
801 control_tubes,
802 disk_device_tubes,
803 initial_audio_session_states,
804 balloon_device_tube,
805 #[cfg(feature = "pvclock")]
806 pvclock_device_tube,
807 dynamic_mapping_device_tube,
808 inflate_tube,
809 init_balloon_size,
810 tsc_frequency,
811 virtio_snd_state_device_tube,
812 virtio_snd_control_device_tube,
813 )?;
814
815 let mut pci_devices = Vec::new();
816
817 for stub in stubs {
818 let (msi_host_tube, msi_device_tube) =
819 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
820 irq_control_tubes.push(msi_host_tube);
821
822 let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
823 let (host_tube, device_tube) =
824 Tube::pair().context("failed to create VVU proxy tube")?;
825 vm_memory_control_tubes.push(host_tube);
826 Some(device_tube)
827 } else {
828 None
829 };
830
831 let (ioevent_host_tube, ioevent_device_tube) =
832 Tube::pair().context("failed to create ioevent tube")?;
833 vm_memory_control_tubes.push(ioevent_host_tube);
834
835 let (vm_control_host_tube, vm_control_device_tube) =
836 Tube::pair().context("failed to create vm_control tube")?;
837 control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
838 vm_control_host_tube,
839 )));
840
841 let dev = Box::new(
842 VirtioPciDevice::new(
843 mem.clone(),
844 stub.dev,
845 msi_device_tube,
846 cfg.disable_virtio_intx,
847 shared_memory_tube.map(VmMemoryClient::new),
848 VmMemoryClient::new(ioevent_device_tube),
849 vm_control_device_tube,
850 )
851 .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
852 ) as Box<dyn BusDeviceObj>;
853 pci_devices.push((dev, stub.jail));
854 }
855
856 Ok(pci_devices)
857 }
858
859 #[derive(Debug)]
860 struct PvClockError(String);
861
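/// Handles a single readable event from the main loop's WaitContext. Returns Some(ExitState) when
/// the event means the run loop should terminate.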
fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
863 event: &TriggeredEvent<Token>,
864 vm_control_ids_to_remove: &mut Vec<usize>,
865 next_control_id: &mut usize,
866 service_vm_state: &mut ServiceVmState,
867 disk_host_tubes: &[Tube],
868 ipc_main_loop_tube: Option<&Tube>,
869 #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
870 vm_evt_rdtube: &RecvTube,
871 control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
872 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
873 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
874 virtio_snd_host_mute_tubes: &mut [Tube],
875 proto_main_loop_tube: Option<&ProtoTube>,
876 anti_tamper_main_thread_tube: &Option<ProtoTube>,
877 #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
878 memory_size_mb: u64,
879 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
880 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
881 run_mode_arc: &VcpuRunMode,
882 region_state: &mut VmMemoryRegionState,
883 vm_control_server: Option<&mut ControlServer>,
884 irq_handler_control: &Tube,
885 device_ctrl_tube: &Tube,
886 wait_ctx: &WaitContext<Token>,
887 force_s2idle: bool,
888 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
889 suspended_pvclock_state: &mut Option<hypervisor::ClockState>,
890 ) -> Result<Option<ExitState>> {
891 let mut execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
892 if let VmRequest::Exit = request {
893 return (VmResponse::Ok, Some(VmRunMode::Exiting));
894 }
895 let vcpu_size = vcpu_boxes.lock().len();
896 let resp = request.execute(
897 &guest_os.vm,
898 disk_host_tubes,
899 &mut guest_os.pm,
900 #[cfg(feature = "gpu")]
901 gpu_control_tube,
902 #[cfg(not(feature = "gpu"))]
903 None,
904 None,
905 &mut None,
906 |msg| {
907 kick_all_vcpus(
908 run_mode_arc,
909 vcpu_control_channels,
910 vcpu_boxes,
911 guest_os.irq_chip.as_ref(),
912 #[cfg(feature = "pvclock")]
913 pvclock_host_tube,
914 &guest_os.resume_notify_devices,
915 msg,
916 );
917 },
918 force_s2idle,
919 #[cfg(feature = "swap")]
920 None,
921 device_ctrl_tube,
922 vcpu_size,
923 irq_handler_control,
924 || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
925 suspended_pvclock_state,
926 );
927 (resp, None)
928 };
929
930 match event.token {
931 Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
932 Ok(vm_event) => {
933 let exit_state = match vm_event {
934 VmEventType::Exit => {
935 info!("vcpu requested shutdown");
936 Some(ExitState::Stop)
937 }
938 VmEventType::Reset => {
939 info!("vcpu requested reset");
940 Some(ExitState::Reset)
941 }
942 VmEventType::Crash => {
943 info!("vcpu crashed");
944 Some(ExitState::Crash)
945 }
946 VmEventType::Panic(_) => {
947 error!("got pvpanic event. this event is not expected on Windows.");
948 None
949 }
950 VmEventType::WatchdogReset => {
951 info!("vcpu stall detected");
952 Some(ExitState::WatchdogReset)
953 }
954 };
955 return Ok(exit_state);
956 }
957 Err(e) => {
958 warn!("failed to recv VmEvent: {}", e);
959 }
960 },
961 Token::BrokerShutdown => {
962 info!("main loop got broker shutdown event");
963 return Ok(Some(ExitState::Stop));
964 }
965 Token::VmControlServer => {
966 let server =
967 vm_control_server.expect("control server must exist if this event triggers");
968 let client = server.accept();
969 let id = *next_control_id;
970 *next_control_id += 1;
971 wait_ctx
972 .add(client.0.get_read_notifier(), Token::VmControl { id })
973 .exit_context(
974 Exit::WaitContextAdd,
975 "failed to add trigger to wait context",
976 )?;
977 wait_ctx
978 .add(client.0.get_close_notifier(), Token::VmControl { id })
979 .exit_context(
980 Exit::WaitContextAdd,
981 "failed to add trigger to wait context",
982 )?;
983 control_tubes.insert(id, TaggedControlTube::Vm(client));
984 }
985 #[allow(clippy::collapsible_match)]
986 Token::VmControl { id } => {
987 if let Some(tube) = control_tubes.get(&id) {
988 #[allow(clippy::single_match)]
989 match tube {
990 TaggedControlTube::Product(product_tube) => {
991 product::handle_tagged_control_tube_event(
992 product_tube,
993 virtio_snd_host_mute_tubes,
994 service_vm_state,
995 ipc_main_loop_tube,
996 )
997 }
998 TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
999 Ok(request) => {
1000 let mut run_mode_opt = None;
1001 let response = match request {
1002 VmRequest::HotPlugVfioCommand { device, add } => {
1003 // Suppress warnings.
1004 let _ = (device, add);
1005 unimplemented!("not implemented on Windows");
1006 }
1007 #[cfg(feature = "registered_events")]
1008 VmRequest::RegisterListener { socket_addr, event } => {
1009 unimplemented!("not implemented on Windows");
1010 }
1011 #[cfg(feature = "registered_events")]
1012 VmRequest::UnregisterListener { socket_addr, event } => {
1013 unimplemented!("not implemented on Windows");
1014 }
1015 #[cfg(feature = "registered_events")]
1016 VmRequest::Unregister { socket_addr } => {
1017 unimplemented!("not implemented on Windows");
1018 }
1019 #[cfg(feature = "balloon")]
1020 VmRequest::BalloonCommand(cmd) => {
1021 if let Some(balloon_tube) = balloon_tube {
1022 if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
1023 {
1024 if key != id {
1025 unimplemented!("not implemented on Windows");
1026 }
1027 Some(r)
1028 } else {
1029 None
1030 }
1031 } else {
1032 error!("balloon not enabled");
1033 None
1034 }
1035 }
1036 _ => {
1037 let (resp, run_mode_ret) =
1038 execute_vm_request(request, guest_os);
1039 run_mode_opt = run_mode_ret;
1040 Some(resp)
1041 }
1042 };
1043
1044 if let Some(response) = response {
1045 if let Err(e) = tube.0.send(&response) {
1046 error!("failed to send VmResponse: {}", e);
1047 }
1048 }
1049 if let Some(exit_state) =
1050 handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1051 {
1052 return Ok(Some(exit_state));
1053 }
1054 }
1055 Err(e) => {
1056 if let TubeError::Disconnected = e {
1057 vm_control_ids_to_remove.push(id);
1058 } else {
1059 error!("failed to recv VmRequest: {}", e);
1060 }
1061 }
1062 },
1063 }
1064 }
1065 }
1066 #[cfg(feature = "balloon")]
1067 Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
1068 Ok(resp) => {
1069 for (resp, idx) in resp {
1070 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
1071 if let Err(e) = tube.0.send(&resp) {
1072 error!("failed to send VmResponse: {}", e);
1073 }
1074 } else {
1075 error!("Bad tube index {}", idx);
1076 }
1077 }
1078 }
1079 Err(err) => {
1080 error!("Error processing balloon tube {:?}", err)
1081 }
1082 },
1083 #[cfg(not(feature = "balloon"))]
1084 Token::BalloonTube => unreachable!("balloon tube not registered"),
1085 #[allow(unreachable_patterns)]
1086 _ => {
1087 let run_mode_opt = product::handle_received_token(
1088 &event.token,
1089 anti_tamper_main_thread_tube,
1090 #[cfg(feature = "balloon")]
1091 balloon_tube,
1092 control_tubes,
1093 guest_os,
1094 ipc_main_loop_tube,
1095 memory_size_mb,
1096 proto_main_loop_tube,
1097 #[cfg(feature = "pvclock")]
1098 pvclock_host_tube,
1099 run_mode_arc,
1100 service_vm_state,
1101 vcpu_boxes,
1102 virtio_snd_host_mute_tubes,
1103 execute_vm_request,
1104 );
1105 if let Some(exit_state) = handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
1106 {
1107 return Ok(Some(exit_state));
1108 }
1109 }
1110 };
1111 Ok(None)
1112 }
1113
/// Handles a run mode change, if one is pending as a result of a VmRequest. The
/// parameter, run_mode_opt, is the run mode change proposed by the VmRequest's
/// execution.
///
/// Returns the new exit state if the run mode change requires one; None otherwise.
fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1121 run_mode_opt: &Option<VmRunMode>,
1122 guest_os: &mut RunnableLinuxVm<V, Vcpu>,
1123 ) -> Option<ExitState> {
1124 if let Some(run_mode) = run_mode_opt {
1125 info!("control socket changed run mode to {}", run_mode);
1126 match run_mode {
1127 VmRunMode::Exiting => return Some(ExitState::Stop),
1128 _ => unreachable!(),
1129 }
1130 }
1131 // No exit state change.
1132 None
1133 }
1134
1135 /// Commands to control the VM Memory handler thread.
1136 #[derive(serde::Serialize, serde::Deserialize)]
1137 pub enum VmMemoryHandlerRequest {
1138 /// No response is sent for this command.
1139 Exit,
1140 }
1141
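/// Worker thread that services VmMemoryRequests from the given control tubes until asked to exit.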
fn vm_memory_handler_thread(
1143 control_tubes: Vec<Tube>,
1144 mut vm: impl Vm,
1145 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
1146 mut gralloc: RutabagaGralloc,
1147 handler_control: Tube,
1148 ) -> anyhow::Result<()> {
1149 #[derive(EventToken)]
1150 enum Token {
1151 VmControl { id: usize },
1152 HandlerControl,
1153 }
1154
1155 let wait_ctx =
1156 WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
1157 .context("failed to build wait context")?;
1158 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1159 for (id, socket) in control_tubes.iter() {
1160 wait_ctx
1161 .add(socket.get_read_notifier(), Token::VmControl { id: *id })
1162 .context("failed to add descriptor to wait context")?;
1163 }
1164
1165 let mut region_state: VmMemoryRegionState = Default::default();
1166
1167 'wait: loop {
1168 let events = {
1169 match wait_ctx.wait() {
1170 Ok(v) => v,
1171 Err(e) => {
1172 error!("failed to poll: {}", e);
1173 break;
1174 }
1175 }
1176 };
1177
1178 let mut vm_control_ids_to_remove = Vec::new();
1179 for event in events.iter().filter(|e| e.is_readable) {
1180 match event.token {
1181 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
1182 Ok(request) => match request {
1183 VmMemoryHandlerRequest::Exit => break 'wait,
1184 },
1185 Err(e) => {
1186 if let TubeError::Disconnected = e {
1187 panic!("vm memory control tube disconnected.");
1188 } else {
1189 error!("failed to recv VmMemoryHandlerRequest: {}", e);
1190 }
1191 }
1192 },
1193
1194 Token::VmControl { id } => {
1195 if let Some(tube) = control_tubes.get(&id) {
1196 match tube.recv::<VmMemoryRequest>() {
1197 Ok(request) => {
1198 let response = request.execute(
1199 &mut vm,
1200 &mut sys_allocator_mutex.lock(),
1201 &mut gralloc,
1202 None,
1203 &mut region_state,
1204 );
1205 if let Err(e) = tube.send(&response) {
1206 error!("failed to send VmMemoryControlResponse: {}", e);
1207 }
1208 }
1209 Err(e) => {
1210 if let TubeError::Disconnected = e {
1211 vm_control_ids_to_remove.push(id);
1212 } else {
1213 error!("failed to recv VmMemoryControlRequest: {}", e);
1214 }
1215 }
1216 }
1217 }
1218 }
1219 }
1220 }
1221
1222 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1223 if events
1224 .iter()
1225 .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
1226 {
1227 error!("vm memory handler control hung up but did not request an exit.");
1228 break 'wait;
1229 }
1230 }
1231 Ok(())
1232 }
1233
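/// Creates the VM control server listening at control_server_path, if one was requested (not
/// available in prod builds).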
fn create_control_server(
1235 control_server_path: Option<PathBuf>,
1236 wait_ctx: &WaitContext<Token>,
1237 ) -> Result<Option<ControlServer>> {
1238 #[cfg(not(feature = "prod-build"))]
1239 {
1240 if let Some(path) = control_server_path {
1241 let server =
1242 ControlServer::new(path.to_str().expect("control socket path must be a string"))
1243 .exit_context(
1244 Exit::FailedToCreateControlServer,
1245 "failed to create control server",
1246 )?;
1247 wait_ctx
1248 .add(server.client_waiting(), Token::VmControlServer)
1249 .exit_context(
1250 Exit::WaitContextAdd,
1251 "failed to add control server to wait context",
1252 )?;
1253 return Ok(Some(server));
1254 }
1255 }
1256 Ok::<Option<ControlServer>, anyhow::Error>(None)
1257 }
1258
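/// The main VM control loop: spawns the vCPU and worker threads, services control and VM events
/// until an exit is requested, and then tears the VM down.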
fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
1260 mut guest_os: RunnableLinuxVm<V, Vcpu>,
1261 sys_allocator: SystemAllocator,
1262 control_tubes: Vec<TaggedControlTube>,
1263 irq_control_tubes: Vec<Tube>,
1264 vm_memory_control_tubes: Vec<Tube>,
1265 vm_evt_rdtube: RecvTube,
1266 vm_evt_wrtube: SendTube,
1267 #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
1268 broker_shutdown_evt: Option<Event>,
1269 balloon_host_tube: Option<Tube>,
1270 #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
1271 disk_host_tubes: Vec<Tube>,
1272 initial_audio_session_states: Vec<InitialAudioSessionState>,
1273 gralloc: RutabagaGralloc,
1274 #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
1275 service_pipe_name: Option<String>,
1276 memory_size_mb: u64,
1277 host_cpu_topology: bool,
1278 tsc_sync_mitigations: TscSyncMitigations,
1279 force_calibrated_tsc_leaf: bool,
1280 mut product_args: RunControlArgs,
1281 mut virtio_snd_host_mute_tubes: Vec<Tube>,
1282 restore_path: Option<PathBuf>,
1283 control_server_path: Option<PathBuf>,
1284 force_s2idle: bool,
1285 suspended: bool,
1286 ) -> Result<ExitState> {
1287 let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
1288 start_service_ipc_listener(service_pipe_name)?;
1289
1290 let mut service_vm_state = product::create_service_vm_state(memory_size_mb);
1291
1292 let service_audio_states = product::create_service_audio_states_and_send_to_service(
1293 initial_audio_session_states,
1294 &ipc_main_loop_tube,
1295 )?;
1296
1297 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
1298
1299 let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
1300 let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
1301 Exit::CreateTube,
1302 "failed to create IRQ handler control Tube",
1303 )?;
1304
1305 // Create a separate thread to wait on IRQ events. This is a natural division
1306 // because IRQ interrupts have no dependencies on other events, and this lets
1307 // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
1308 let irq_join_handle = IrqWaitWorker::start(
1309 irq_handler_control_for_worker,
1310 guest_os
1311 .irq_chip
1312 .try_box_clone()
1313 .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
1314 irq_control_tubes,
1315 sys_allocator_mutex.clone(),
1316 );
1317
1318 let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
1319 product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
1320 let wait_ctx = WaitContext::build_with(&triggers).exit_context(
1321 Exit::WaitContextAdd,
1322 "failed to add trigger to wait context",
1323 )?;
1324
1325 #[cfg(feature = "balloon")]
1326 let mut balloon_tube = balloon_host_tube
1327 .map(|tube| -> Result<BalloonTube> {
1328 wait_ctx
1329 .add(tube.get_read_notifier(), Token::BalloonTube)
1330 .context("failed to add trigger to wait context")?;
1331 Ok(BalloonTube::new(tube))
1332 })
1333 .transpose()
1334 .context("failed to create balloon tube")?;
1335
1336 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
1337 let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
1338 .name("vm_memory_handler_thread".into())
1339 .spawn({
1340 let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
1341 let sys_allocator_mutex = sys_allocator_mutex.clone();
1342 move || {
1343 vm_memory_handler_thread(
1344 vm_memory_control_tubes,
1345 vm,
1346 sys_allocator_mutex,
1347 gralloc,
1348 vm_memory_handler_control_for_thread,
1349 )
1350 }
1351 })
1352 .unwrap();
1353
1354 if let Some(evt) = broker_shutdown_evt.as_ref() {
1355 wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
1356 Exit::WaitContextAdd,
1357 "failed to add trigger to wait context",
1358 )?;
1359 }
1360
1361 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
1362 let mut next_control_id = control_tubes.len();
1363 for (id, control_tube) in control_tubes.iter() {
1364 #[allow(clippy::single_match)]
1365 match control_tube {
1366 TaggedControlTube::Product(product_tube) => wait_ctx
1367 .add(
1368 product_tube.get_read_notifier(),
1369 Token::VmControl { id: *id },
1370 )
1371 .exit_context(
1372 Exit::WaitContextAdd,
1373 "failed to add trigger to wait context",
1374 )?,
1375 _ => (),
1376 }
1377 }
1378
1379 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
1380 guest_os.devices_thread = match create_devices_worker_thread(
1381 guest_os.vm.get_memory().clone(),
1382 guest_os.io_bus.clone(),
1383 guest_os.mmio_bus.clone(),
1384 device_ctrl_resp,
1385 ) {
1386 Ok(join_handle) => Some(join_handle),
1387 Err(e) => {
1388 return Err(anyhow!("Failed to start devices thread: {}", e));
1389 }
1390 };
1391
1392 let vcpus: Vec<Option<_>> = match guest_os.vcpus.take() {
1393 Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
1394 None => iter::repeat_with(|| None)
1395 .take(guest_os.vcpu_count)
1396 .collect(),
1397 };
1398
1399 let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);
1400
1401 let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;
1402
1403 let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;
1404
1405 let original_terminal_mode = stdin().set_raw_mode().ok();
1406
1407 let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
1408 let run_mode_arc = Arc::new(VcpuRunMode::default());
1409
1410 let run_mode_state = if suspended {
1411 // Sleep devices before creating vcpus.
1412 device_ctrl_tube
1413 .send(&DeviceControlCommand::SleepDevices)
1414 .context("send command to devices control socket")?;
1415 match device_ctrl_tube
1416 .recv()
1417 .context("receive from devices control socket")?
1418 {
1419 VmResponse::Ok => (),
1420 resp => bail!("device sleep failed: {}", resp),
1421 }
1422 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1423 VmRunMode::Suspending
1424 } else {
1425 VmRunMode::Running
1426 };
1427
1428 // If we are restoring from a snapshot, then start suspended.
1429 if restore_path.is_some() {
1430 run_mode_arc.set_and_notify(VmRunMode::Suspending);
1431 }
1432
1433 let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
1434 vcpus,
1435 vcpu_boxes.clone(),
1436 &guest_os,
1437 &exit_evt,
1438 &vm_evt_wrtube,
1439 #[cfg(feature = "stats")]
1440 &stats,
1441 host_cpu_topology,
1442 run_mode_arc.clone(),
1443 tsc_sync_mitigations,
1444 force_calibrated_tsc_leaf,
1445 )?;
1446
1447 // See comment on `VmRequest::execute`.
1448 let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
1449
1450 // Restore VM (if applicable).
1451 if let Some(path) = restore_path {
1452 vm_control::do_restore(
1453 &path,
1454 |msg| {
1455 kick_all_vcpus(
1456 run_mode_arc.as_ref(),
1457 &vcpu_control_channels,
1458 vcpu_boxes.as_ref(),
1459 guest_os.irq_chip.as_ref(),
1460 #[cfg(feature = "pvclock")]
1461 &pvclock_host_tube,
1462 &guest_os.resume_notify_devices,
1463 msg,
1464 )
1465 },
1466 |msg, index| {
1467 kick_vcpu(
1468 run_mode_arc.as_ref(),
1469 &vcpu_control_channels,
1470 vcpu_boxes.as_ref(),
1471 guest_os.irq_chip.as_ref(),
1472 index,
1473 msg,
1474 )
1475 },
1476 &irq_handler_control,
1477 &device_ctrl_tube,
1478 guest_os.vcpu_count,
1479 |image| {
1480 guest_os
1481 .irq_chip
1482 .try_box_clone()?
1483 .restore(image, guest_os.vcpu_count)
1484 },
1485 /* require_encrypted= */ false,
1486 &mut suspended_pvclock_state,
1487 )?;
1488 // Allow the vCPUs to start for real.
1489 kick_all_vcpus(
1490 run_mode_arc.as_ref(),
1491 &vcpu_control_channels,
1492 vcpu_boxes.as_ref(),
1493 guest_os.irq_chip.as_ref(),
1494 #[cfg(feature = "pvclock")]
1495 &pvclock_host_tube,
1496 &guest_os.resume_notify_devices,
1497 // Other platforms (unix) have multiple modes they could start in (e.g. starting for
1498 // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
1499 // to enter that mode here rather than VmRunMode::Running.
1500 VcpuControl::RunState(run_mode_state),
1501 );
1502 }
1503
1504 let mut exit_state = ExitState::Stop;
1505 let mut region_state: VmMemoryRegionState = Default::default();
1506
1507 'poll: loop {
1508 let events = {
1509 match wait_ctx.wait() {
1510 Ok(v) => v,
1511 Err(e) => {
1512 error!("failed to wait: {}", e);
1513 break;
1514 }
1515 }
1516 };
1517
1518 let mut vm_control_ids_to_remove = Vec::new();
1519 for event in events.iter().filter(|e| e.is_readable) {
1520 let state = handle_readable_event(
1521 event,
1522 &mut vm_control_ids_to_remove,
1523 &mut next_control_id,
1524 &mut service_vm_state,
1525 disk_host_tubes.as_slice(),
1526 ipc_main_loop_tube.as_ref(),
1527 #[cfg(feature = "gpu")]
1528 gpu_control_tube.as_ref(),
1529 &vm_evt_rdtube,
1530 &mut control_tubes,
1531 &mut guest_os,
1532 &sys_allocator_mutex,
1533 &mut virtio_snd_host_mute_tubes,
1534 proto_main_loop_tube.as_ref(),
1535 &anti_tamper_main_thread_tube,
1536 #[cfg(feature = "balloon")]
1537 balloon_tube.as_mut(),
1538 memory_size_mb,
1539 vcpu_boxes.as_ref(),
1540 #[cfg(feature = "pvclock")]
1541 &pvclock_host_tube,
1542 run_mode_arc.as_ref(),
1543 &mut region_state,
1544 vm_control_server.as_mut(),
1545 &irq_handler_control,
1546 &device_ctrl_tube,
1547 &wait_ctx,
1548 force_s2idle,
1549 &vcpu_control_channels,
1550 &mut suspended_pvclock_state,
1551 )?;
1552 if let Some(state) = state {
1553 exit_state = state;
1554 break 'poll;
1555 }
1556 }
1557
1558 remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
1559 }
1560
1561 info!("run_control poll loop completed, forcing vCPUs to exit...");
1562
1563 // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
1564 run_mode_arc.set_and_notify(VmRunMode::Exiting);
1565
1566 // Force all vcpus to exit from the hypervisor
1567 for vcpu in vcpu_boxes.lock().iter() {
1568 vcpu.set_immediate_exit(true);
1569 }
1570
1571 let mut res = Ok(exit_state);
1572 guest_os.irq_chip.kick_halted_vcpus();
1573 let _ = exit_evt.signal();
1574
1575 if guest_os.devices_thread.is_some() {
1576 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
1577 error!("failed to stop device control loop: {}", e);
1578 };
1579 if let Some(thread) = guest_os.devices_thread.take() {
1580 if let Err(e) = thread.join() {
1581 error!("failed to exit devices thread: {:?}", e);
1582 }
1583 }
1584 }
1585
1586 // Shut down the VM memory handler thread.
1587 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
1588 error!(
1589 "failed to request exit from VM memory handler thread: {}",
1590 e
1591 );
1592 }
1593 if let Err(e) = vm_memory_handler_thread_join_handle.join() {
1594 error!("failed to exit VM Memory handler thread: {:?}", e);
1595 }
1596
1597 // Shut down the IRQ handler thread.
1598 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
1599 error!("failed to request exit from IRQ handler thread: {}", e);
1600 }
1601
    // Make sure any child threads have ended by sending the Exit vm event (possibly again) so that
    // their run loops are aborted.
1604 let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
1605 for (i, thread) in vcpu_threads.into_iter().enumerate() {
        // Wait until all the threads exit so that the Arc refcount on guest_os.vm drops to 1;
        // otherwise, we would leak memory if we force-killed the thread with terminate.
1608 match thread.join() {
1609 Ok(Err(e)) => {
1610 error!("vcpu thread {} exited with an error: {}", i, e);
1611 res = Err(e);
1612 }
1613 Ok(_) => {}
1614 Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
1615 }
1616 }
1617
1618 info!("vCPU threads have exited.");
1619
1620 if let Some(ime) = ime_thread {
1621 match ime.join() {
1622 Ok(Err(e)) => {
1623 error!("ime thread exited with an error: {}", e);
1624 if res.is_ok() {
1625 // Prioritize past errors, but return this error if it is unique, otherwise just
1626 // log it.
1627 res = Err(e)
1628 }
1629 }
1630 Ok(_) => {}
1631 Err(e) => error!("ime thread panicked: {:?}", e),
1632 }
1633 }
1634 info!("IME thread has exited.");
1635
    // This cancels all outstanding and any future blocking operations.
    // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Since we are using the
    // global executor, a cleaner shutdown requires calling disarm so that all incoming requests
    // are run and then cancelled. If we call shutdown instead, all blocking threads go away and
    // incoming operations are dropped without being scheduled, leading to a panic. The ideal
    // place to call shutdown is when we drop a non-global executor.
1642 cros_async::unblock_disarm();
1643 info!("blocking async pool has shut down.");
1644
1645 let _ = irq_join_handle.join();
1646 info!("IrqWaitWorker has shut down.");
1647
1648 #[cfg(feature = "stats")]
1649 if let Some(stats) = stats {
1650 println!("Statistics Collected:\n{}", stats.lock());
1651 println!("Statistics JSON:\n{}", stats.lock().json());
1652 }
1653
1654 if let Some(mode) = original_terminal_mode {
1655 if let Err(e) = stdin().restore_mode(mode) {
1656 warn!("failed to restore terminal mode: {}", e);
1657 }
1658 }
1659
1660 // Explicitly drop the VM structure here to allow the devices to clean up before the
1661 // control tubes are closed when this function exits.
1662 mem::drop(guest_os);
1663
1664 info!("guest_os dropped, run_control is done.");
1665
1666 res
1667 }
1668
1669 /// Remove Tubes that have been closed from the WaitContext.
fn remove_closed_tubes<T, U>(
1671 wait_ctx: &WaitContext<T>,
1672 tubes: &mut BTreeMap<usize, U>,
1673 mut tube_ids_to_remove: Vec<usize>,
1674 ) -> anyhow::Result<()>
1675 where
1676 T: EventToken,
1677 U: ReadNotifier + CloseNotifier,
1678 {
1679 tube_ids_to_remove.dedup();
1680 for id in tube_ids_to_remove {
1681 if let Some(socket) = tubes.remove(&id) {
1682 wait_ctx
1683 .delete(socket.get_read_notifier())
1684 .context("failed to remove descriptor from wait context")?;
1685
1686 // There may be a close notifier registered for this Tube. If there isn't one
1687 // registered, we just ignore the error.
1688 let _ = wait_ctx.delete(socket.get_close_notifier());
1689 }
1690 }
1691 Ok(())
1692 }
1693
1694 /// Sends a message to all VCPUs.
fn kick_all_vcpus(
1696 run_mode: &VcpuRunMode,
1697 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1698 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1699 irq_chip: &dyn IrqChipArch,
1700 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1701 resume_notify_devices: &[Arc<Mutex<dyn BusResumeDevice>>],
1702 msg: VcpuControl,
1703 ) {
// On Windows, we handle run mode switching directly rather than delegating to the VCPU thread
// as Unix does.
1706 match &msg {
1707 VcpuControl::RunState(VmRunMode::Suspending) => {
1708 suspend_all_vcpus(
1709 run_mode,
1710 vcpu_boxes,
1711 irq_chip,
1712 #[cfg(feature = "pvclock")]
1713 pvclock_host_tube,
1714 );
1715 return;
1716 }
1717 VcpuControl::RunState(VmRunMode::Running) => {
1718 for device in resume_notify_devices {
1719 device.lock().resume_imminent();
1720 }
1721 resume_all_vcpus(
1722 run_mode,
1723 vcpu_boxes,
1724 irq_chip,
1725 #[cfg(feature = "pvclock")]
1726 pvclock_host_tube,
1727 );
1728 return;
1729 }
1730 _ => (),
1731 }
1732
// For non-RunState commands, we dispatch just as Unix would.
1734 for vcpu in vcpu_control_channels {
1735 if let Err(e) = vcpu.send(msg.clone()) {
1736 error!("failed to send VcpuControl message: {}", e);
1737 }
1738 }
1739
1740 // Now that we've sent a message, we need VCPUs to exit so they can process it.
1741 for vcpu in vcpu_boxes.lock().iter() {
1742 vcpu.set_immediate_exit(true);
1743 }
1744 irq_chip.kick_halted_vcpus();
1745
1746 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1747 // the control message.
1748 let current_run_mode = run_mode.get_mode();
1749 if current_run_mode != VmRunMode::Running {
1750 run_mode.set_and_notify(current_run_mode);
1751 }
1752 }
1753
1754 /// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a single
1755 /// VCPU.
fn kick_vcpu(
1757 run_mode: &VcpuRunMode,
1758 vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
1759 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1760 irq_chip: &dyn IrqChipArch,
1761 index: usize,
1762 msg: VcpuControl,
1763 ) {
1764 assert!(
1765 !matches!(msg, VcpuControl::RunState(_)),
1766 "Windows does not support RunState changes on a per VCPU basis"
1767 );
1768
1769 let vcpu = vcpu_control_channels
1770 .get(index)
1771 .expect("invalid vcpu index specified");
1772 if let Err(e) = vcpu.send(msg) {
1773 error!("failed to send VcpuControl message: {}", e);
1774 }
1775
1776 // Now that we've sent a message, we need the VCPU to exit so it can
1777 // process the message.
1778 vcpu_boxes
1779 .lock()
1780 .get(index)
1781 .expect("invalid vcpu index specified")
1782 .set_immediate_exit(true);
1783 irq_chip.kick_halted_vcpus();
1784
1785 // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it processes
1786 // the control message. (Technically this wakes all VCPUs, but those without messages will go
1787 // back to sleep.)
1788 let current_run_mode = run_mode.get_mode();
1789 if current_run_mode != VmRunMode::Running {
1790 run_mode.set_and_notify(current_run_mode);
1791 }
1792 }
1793
1794 /// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
1795 /// though devices on the host will continue to run.
pub(crate) fn suspend_all_vcpus(
1797 run_mode: &VcpuRunMode,
1798 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1799 irq_chip: &dyn IrqChipArch,
1800 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1801 ) {
1802 // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
1803 // they may re-enter the VM.
1804 run_mode.set_and_notify(VmRunMode::Suspending);
1805
1806 // Force all vcpus to exit from the hypervisor
1807 for vcpu in vcpu_boxes.lock().iter() {
1808 vcpu.set_immediate_exit(true);
1809 }
1810 irq_chip.kick_halted_vcpus();
1811
1812 #[cfg(feature = "pvclock")]
1813 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
1814 .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
1815 }
1816
1817 /// Resumes all VCPUs.
pub(crate) fn resume_all_vcpus(
1819 run_mode: &VcpuRunMode,
1820 vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
1821 irq_chip: &dyn IrqChipArch,
1822 #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
1823 ) {
1824 #[cfg(feature = "pvclock")]
1825 handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
1826 .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));
1827
1828 // Make sure any immediate exit bits are disabled
1829 for vcpu in vcpu_boxes.lock().iter() {
1830 vcpu.set_immediate_exit(false);
1831 }
1832
1833 run_mode.set_and_notify(VmRunMode::Running);
1834 }
1835
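/// The minimum GVM driver version supported by crosvm; older versions are rejected at VM creation.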
1836 #[cfg(feature = "gvm")]
1837 const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
1838 major: 1,
1839 minor: 4,
1840 patch: 1,
1841 };
1842
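/// Creates a GVM VM after verifying that the installed GVM driver meets `GVM_MINIMUM_VERSION`.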
1843 #[cfg(feature = "gvm")]
fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
1845 match gvm.get_full_version() {
1846 Ok(version) => {
1847 if version < GVM_MINIMUM_VERSION {
1848 error!(
1849 "GVM version {} is below minimum version {}",
1850 version, GVM_MINIMUM_VERSION
1851 );
1852 return Err(base::Error::new(libc::ENXIO).into());
1853 } else {
1854 info!("Using GVM version {}.", version)
1855 }
1856 }
1857 Err(e) => {
1858 error!("unable to determine gvm version: {}", e);
1859 return Err(base::Error::new(libc::ENXIO).into());
1860 }
1861 }
1862 let vm = GvmVm::new(&gvm, mem)?;
1863 Ok(vm)
1864 }
1865
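/// Creates a HAXM VM, registering a kernel log file if one was requested and the HAXM version
/// supports it.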
1866 #[cfg(feature = "haxm")]
fn create_haxm_vm(
1868 haxm: Haxm,
1869 mem: GuestMemory,
1870 kernel_log_file: &Option<String>,
1871 ) -> Result<HaxmVm> {
1872 let vm = HaxmVm::new(&haxm, mem)?;
1873 if let Some(path) = kernel_log_file {
1874 use hypervisor::haxm::HAX_CAP_VM_LOG;
1875 if vm.check_raw_capability(HAX_CAP_VM_LOG) {
1876 match vm.register_log_file(path) {
1877 Ok(_) => {}
1878 Err(e) => match e.errno() {
1879 libc::E2BIG => {
1880 error!(
1881 "kernel_log_file path is too long, kernel log file will not be written"
1882 );
1883 }
1884 _ => return Err(e.into()),
1885 },
1886 }
1887 } else {
1888 warn!(
1889 "kernel_log_file specified but this version of HAXM does not support kernel log \
1890 files"
1891 );
1892 }
1893 }
1894 Ok(vm)
1895 }
1896
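/// Creates a WHPX VM with CPUID entries adjusted for crosvm, optionally enabling WHPX local APIC
/// emulation.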
1897 #[cfg(feature = "whpx")]
1898 #[cfg(target_arch = "x86_64")]
fn create_whpx_vm(
1900 whpx: Whpx,
1901 mem: GuestMemory,
1902 cpu_count: usize,
1903 no_smt: bool,
1904 apic_emulation: bool,
1905 force_calibrated_tsc_leaf: bool,
1906 vm_evt_wrtube: SendTube,
1907 ) -> Result<WhpxVm> {
1908 let cpu_config = hypervisor::CpuConfigX86_64::new(
1909 force_calibrated_tsc_leaf,
1910 false, /* host_cpu_topology */
1911 false, /* enable_hwp */
1912 no_smt,
1913 false, /* itmt */
1914 None, /* hybrid_type */
1915 );
1916
1917 // context for non-cpu-specific cpuid results
1918 let ctx = CpuIdContext::new(
1919 0,
1920 cpu_count,
1921 None,
1922 cpu_config,
1923 whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
1924 __cpuid_count,
1925 __cpuid,
1926 );
1927
1928 // Get all cpuid entries that we should pre-set
1929 let mut cpuid = whpx.get_supported_cpuid()?;
1930
1931 // Adjust them for crosvm
1932 for entry in cpuid.cpu_id_entries.iter_mut() {
1933 adjust_cpuid(entry, &ctx);
1934 }
1935
1936 let vm = WhpxVm::new(
1937 &whpx,
1938 cpu_count,
1939 mem,
1940 cpuid,
1941 apic_emulation,
1942 Some(vm_evt_wrtube),
1943 )
1944 .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
1945
1946 Ok(vm)
1947 }
1948
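/// Creates an irqchip backed by the GVM kernel driver.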
1949 #[cfg(feature = "gvm")]
fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
1951 info!("Creating GVM irqchip");
1952 let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
1953 Ok(irq_chip)
1954 }
1955
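/// Creates a WHPX split irqchip: the local APIC is emulated by WHPX while the IOAPIC is emulated
/// in userspace.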
1956 #[cfg(feature = "whpx")]
1957 #[cfg(target_arch = "x86_64")]
fn create_whpx_split_irq_chip(
1959 vm: &WhpxVm,
1960 ioapic_device_tube: Tube,
1961 ) -> base::Result<WhpxSplitIrqChip> {
1962 info!("Creating WHPX split irqchip");
1963 WhpxSplitIrqChip::new(
1964 vm.try_clone()?,
1965 ioapic_device_tube,
1966 None, // ioapic_pins
1967 )
1968 }
1969
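/// Creates an irqchip that is emulated entirely in userspace.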
fn create_userspace_irq_chip<Vcpu>(
1971 vcpu_count: usize,
1972 ioapic_device_tube: Tube,
1973 ) -> base::Result<UserspaceIrqChip<Vcpu>>
1974 where
1975 Vcpu: VcpuArch + 'static,
1976 {
1977 info!("Creating userspace irqchip");
1978 let irq_chip =
1979 UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
1980 Ok(irq_chip)
1981 }
1982
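/// Picks the default hypervisor by probing the enabled backends in order of preference (WHPX,
/// then HAXM, then GVM); returns `None` if none are usable.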
pub fn get_default_hypervisor() -> Option<HypervisorKind> {
// The ordering here matters: hypervisors are listed from most preferable to least.
1985 #[cfg(feature = "whpx")]
1986 match hypervisor::whpx::Whpx::is_enabled() {
1987 true => return Some(HypervisorKind::Whpx),
1988 false => warn!("Whpx not enabled."),
1989 };
1990
1991 #[cfg(feature = "haxm")]
1992 match Haxm::new() {
1993 Ok(_) => return Some(HypervisorKind::Ghaxm),
1994 Err(e) => warn!("Cannot initialize HAXM: {}", e),
1995 };
1996
1997 #[cfg(feature = "gvm")]
1998 // Make sure Gvm device can be opened before selecting it.
1999 match Gvm::new() {
2000 Ok(_) => return Some(HypervisorKind::Gvm),
2001 Err(e) => warn!("Cannot initialize GVM: {}", e),
2002 };
2003
2004 None
2005 }
2006
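/// Builds the `VmComponents` (memory size, vCPU settings, kernel/BIOS image, initrd, pflash,
/// ACPI tables, etc.) described by the parsed `Config`.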
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
2008 let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
2009 Some(
2010 File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
2011 format!("failed to open initrd {}", initrd_path.display())
2012 })?,
2013 )
2014 } else {
2015 None
2016 };
2017
2018 let vm_image = match cfg.executable_path {
2019 Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
2020 File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
2021 format!("failed to open kernel image {}", kernel_path.display(),)
2022 })?,
2023 ),
2024 Some(Executable::Bios(ref bios_path)) => {
2025 VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
2026 format!("failed to open bios {}", bios_path.display())
2027 })?)
2028 }
2029 _ => panic!("Did not receive a bios or kernel, should be impossible."),
2030 };
2031
2032 let swiotlb = if let Some(size) = cfg.swiotlb {
2033 Some(
2034 size.checked_mul(1024 * 1024)
2035 .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
2036 )
2037 } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
2038 None
2039 } else {
2040 Some(64 * 1024 * 1024)
2041 };
2042
2043 let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
2044 {
2045 (
2046 Some(
2047 open_file_or_duplicate(
2048 &pflash_parameters.path,
2049 OpenOptions::new().read(true).write(true),
2050 )
2051 .with_context(|| {
2052 format!("failed to open pflash {}", pflash_parameters.path.display())
2053 })?,
2054 ),
2055 pflash_parameters.block_size,
2056 )
2057 } else {
2058 (None, 0)
2059 };
2060
2061 Ok(VmComponents {
2062 memory_size: cfg
2063 .memory
2064 .unwrap_or(256)
2065 .checked_mul(1024 * 1024)
2066 .ok_or_else(|| anyhow!("requested memory size too large"))?,
2067 swiotlb,
2068 vcpu_count: cfg.vcpu_count.unwrap_or(1),
2069 fw_cfg_enable: false,
2070 bootorder_fw_cfg_blob: Vec::new(),
2071 vcpu_affinity: cfg.vcpu_affinity.clone(),
2072 cpu_clusters: cfg.cpu_clusters.clone(),
2073 cpu_capacity: cfg.cpu_capacity.clone(),
2074 no_smt: cfg.no_smt,
2075 hugepages: cfg.hugepages,
2076 hv_cfg: hypervisor::Config {
2077 protection_type: cfg.protection_type,
2078 },
2079 vm_image,
2080 android_fstab: cfg
2081 .android_fstab
2082 .as_ref()
2083 .map(|x| {
2084 File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
2085 format!("failed to open android fstab file {}", x.display())
2086 })
2087 })
2088 .map_or(Ok(None), |v| v.map(Some))?,
2089 pstore: cfg.pstore.clone(),
2090 pflash_block_size,
2091 pflash_image,
2092 initrd_image,
2093 extra_kernel_params: cfg.params.clone(),
2094 acpi_sdts: cfg
2095 .acpi_tables
2096 .iter()
2097 .map(|path| {
2098 SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
2099 format!("failed to open ACPI file {}", path.display())
2100 })
2101 })
2102 .collect::<Result<Vec<SDT>>>()?,
2103 rt_cpus: cfg.rt_cpus.clone(),
2104 delay_rt: cfg.delay_rt,
2105 no_i8042: cfg.no_i8042,
2106 no_rtc: cfg.no_rtc,
2107 host_cpu_topology: cfg.host_cpu_topology,
2108 #[cfg(target_arch = "x86_64")]
2109 force_s2idle: cfg.force_s2idle,
2110 fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
2111 itmt: false,
2112 pvm_fw: None,
2113 pci_config: cfg.pci_config,
2114 #[cfg(target_arch = "x86_64")]
2115 smbios: cfg.smbios.clone(),
2116 dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
2117 #[cfg(target_arch = "x86_64")]
2118 break_linux_pci_config_io: cfg.break_linux_pci_config_io,
2119 boot_cpu: cfg.boot_cpu,
2120 })
2121 }
2122
2123 // Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
2124 enum WindowsIrqChip<V: VcpuArch> {
2125 Userspace(UserspaceIrqChip<V>),
2126 #[cfg(feature = "gvm")]
2127 Gvm(GvmIrqChip),
2128 #[cfg(feature = "whpx")]
2129 WhpxSplit(WhpxSplitIrqChip),
2130 }
2131
2132 impl<V: VcpuArch> WindowsIrqChip<V> {
2133 // Convert our enum to a &mut dyn IrqChipArch
fn as_mut(&mut self) -> &mut dyn IrqChipArch {
2135 match self {
2136 WindowsIrqChip::Userspace(i) => i,
2137 #[cfg(feature = "gvm")]
2138 WindowsIrqChip::Gvm(i) => i,
2139 #[cfg(feature = "whpx")]
2140 WindowsIrqChip::WhpxSplit(i) => i,
2141 }
2142 }
2143 }
2144
2145 /// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
2146 /// need access to it when tracing is enabled.
2147 static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
2148
2149 /// Save the TSC offset for a particular vcpu.
2150 ///
2151 /// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
2152 /// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
2153 /// it can cause clock issues in the guest.
pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
2155 let offsets_copy = {
2156 let mut offsets = TSC_OFFSETS.lock();
2157 // make sure offsets vec is large enough before inserting
2158 let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
2159 offsets.resize(newlen, None);
2160 offsets[vcpu_id] = Some(offset);
2161
2162 offsets.clone()
2163 };
2164
2165 // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
2166 info!(
2167 "TSC offset standard deviation is: {}",
2168 standard_deviation(
2169 &offsets_copy
2170 .iter()
2171 .filter(|x| x.is_some())
2172 .map(|x| x.unwrap() as u128)
2173 .collect::<Vec<u128>>()
2174 )
2175 );
2176 }
2177
2178 /// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
2179 #[cfg(feature = "perfetto")]
pub fn get_vcpu_tsc_offset() -> u64 {
2181 if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
2182 return *offset;
2183 }
2184 0
2185 }
2186
2187 /// Callback that is registered with tracing crate, and will be called by the tracing thread when
2188 /// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for
2189 /// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the
2190 /// host TSC. Redundant snapshots should not be a problem for perfetto.
2191 #[cfg(feature = "perfetto")]
fn set_tsc_clock_snapshot() {
2193 let freq = match devices::tsc::tsc_frequency() {
2194 Err(e) => {
2195 error!(
2196 "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
2197 e
2198 );
2199 return;
2200 }
2201 Ok(freq) => freq,
2202 };
2203
// The offset is the guest TSC relative to the host TSC (guest = host + offset).
2205 let offset = get_vcpu_tsc_offset();
// SAFETY: _rdtsc takes no arguments.
2207 let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
2208 perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
2209 // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
2210 // support floating point multipliers yet. So for now we set the freq in Hz and rely
2211 // on the merge tool to fix it.
2212 perfetto::Clock::new(
2213 perfetto::BuiltinClock::Tsc as u32,
2214 host_tsc.wrapping_add(offset),
2215 )
2216 .set_multiplier(freq as u64),
2217 perfetto::Clock::new(
2218 // The host builtin clock ids are all offset from the guest ids by
2219 // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
2220 // contains both a guest and host clock, we need to offset it before merge.
2221 perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
2222 host_tsc,
2223 )
2224 .set_multiplier(freq as u64),
2225 ));
2226 }
2227
2228 /// Launches run_config for the broker, reading configuration from a TubeTransporter.
pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
2230 let tube_transporter =
2231 // SAFETY:
// Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
// the blocking & framing modes are accurate because we create them ourselves in the broker.
2234 unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
2235
2236 let mut tube_data_list = tube_transporter
2237 .read_tubes()
2238 .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
2239
2240 let bootstrap_tube = tube_data_list
2241 .get_tube(TubeToken::Bootstrap)
2242 .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
2243
2244 let mut cfg: Config = bootstrap_tube
2245 .recv::<Config>()
2246 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2247
2248 let startup_args: CommonChildStartupArgs = bootstrap_tube
2249 .recv::<CommonChildStartupArgs>()
2250 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2251 let _child_cleanup = common_child_setup(startup_args).exit_context(
2252 Exit::CommonChildSetupError,
2253 "failed to perform common child setup",
2254 )?;
2255
2256 cfg.broker_shutdown_event = Some(
2257 bootstrap_tube
2258 .recv::<Event>()
2259 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
2260 );
2261 #[cfg(feature = "crash-report")]
2262 let crash_tube_map = bootstrap_tube
2263 .recv::<HashMap<ProcessType, Vec<SendTube>>>()
2264 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2265 #[cfg(feature = "crash-report")]
2266 crash_report::set_crash_tube_map(crash_tube_map);
2267
2268 let BrokerTubes {
2269 vm_evt_wrtube,
2270 vm_evt_rdtube,
2271 } = bootstrap_tube
2272 .recv::<BrokerTubes>()
2273 .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
2274
2275 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2276 }
2277
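/// Launches a VM from `cfg` without a broker process; the VM event tubes are created locally.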
pub fn run_config(cfg: Config) -> Result<ExitState> {
2279 let _raise_timer_resolution = enable_high_res_timers()
2280 .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
2281
2282 // There is no broker when using run_config(), so the vm_evt tubes need to be created.
2283 let (vm_evt_wrtube, vm_evt_rdtube) =
2284 Tube::directional_pair().context("failed to create vm event tube")?;
2285
2286 run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
2287 }
2288
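/// Creates guest memory using the layout required by the architecture and the given hypervisor.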
fn create_guest_memory(
2290 components: &VmComponents,
2291 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2292 hypervisor: &impl Hypervisor,
2293 ) -> Result<GuestMemory> {
2294 let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
2295 .exit_context(
2296 Exit::GuestMemoryLayout,
2297 "failed to create guest memory layout",
2298 )?;
2299 GuestMemory::new_with_options(&guest_mem_layout)
2300 .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
2301 }
2302
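/// Instantiates the configured hypervisor, guest memory, and irqchip, then hands the VM off to
/// `run_vm`.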
fn run_config_inner(
2304 cfg: Config,
2305 vm_evt_wrtube: SendTube,
2306 vm_evt_rdtube: RecvTube,
2307 ) -> Result<ExitState> {
2308 product::setup_common_metric_invariants(&cfg);
2309
2310 #[cfg(feature = "perfetto")]
2311 cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);
2312
2313 let components: VmComponents = setup_vm_components(&cfg)?;
2314 let arch_memory_layout = Arch::arch_memory_layout(&components)?;
2315
2316 #[allow(unused_mut)]
2317 let mut hypervisor = cfg
2318 .hypervisor
2319 .or_else(get_default_hypervisor)
2320 .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
2321
2322 #[cfg(feature = "whpx")]
2323 if hypervisor::whpx::Whpx::is_enabled() {
2324 // If WHPX is enabled, no other hypervisor can be used, so just override it
2325 hypervisor = HypervisorKind::Whpx;
2326 }
2327
2328 match hypervisor {
2329 #[cfg(feature = "haxm")]
2330 HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
2331 if hypervisor == HypervisorKind::Haxm {
2332 set_use_ghaxm(false);
2333 }
2334 info!("Creating HAXM ghaxm={}", get_use_ghaxm());
2335 let haxm = Haxm::new()?;
2336 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &haxm)?;
2337 let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
2338 let (ioapic_host_tube, ioapic_device_tube) =
2339 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2340 let irq_chip =
2341 create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
2342 run_vm::<HaxmVcpu, HaxmVm>(
2343 cfg,
2344 components,
2345 &arch_memory_layout,
2346 vm,
2347 WindowsIrqChip::Userspace(irq_chip).as_mut(),
2348 Some(ioapic_host_tube),
2349 vm_evt_wrtube,
2350 vm_evt_rdtube,
2351 )
2352 }
2353 #[cfg(feature = "whpx")]
2354 HypervisorKind::Whpx => {
2355 let apic_emulation_supported =
2356 Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
2357 .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;
2358
2359 let no_smt = cfg.no_smt;
2360
2361 // Default to WhpxSplitIrqChip if it's supported because it's more performant
2362 let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
2363 IrqChipKind::Split
2364 } else {
2365 IrqChipKind::Userspace
2366 });
2367
2368 // Both WHPX irq chips use a userspace IOAPIC
2369 let (ioapic_host_tube, ioapic_device_tube) =
2370 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2371
2372 info!("Creating Whpx");
2373 let whpx = Whpx::new()?;
2374 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &whpx)?;
2375 let vm = create_whpx_vm(
2376 whpx,
2377 guest_mem,
2378 components.vcpu_count,
2379 no_smt,
2380 apic_emulation_supported && irq_chip == IrqChipKind::Split,
2381 cfg.force_calibrated_tsc_leaf,
2382 vm_evt_wrtube
2383 .try_clone()
2384 .expect("could not clone vm_evt_wrtube"),
2385 )?;
2386
2387 let mut irq_chip = match irq_chip {
2388 IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
2389 IrqChipKind::Split => {
2390 if !apic_emulation_supported {
2391 panic!(
2392 "split irqchip specified but your WHPX version does not support \
2393 local apic emulation"
2394 );
2395 }
2396 WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
2397 }
2398 IrqChipKind::Userspace => {
2399 WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
2400 components.vcpu_count,
2401 ioapic_device_tube,
2402 )?)
2403 }
2404 };
2405 run_vm::<WhpxVcpu, WhpxVm>(
2406 cfg,
2407 components,
2408 &arch_memory_layout,
2409 vm,
2410 irq_chip.as_mut(),
2411 Some(ioapic_host_tube),
2412 vm_evt_wrtube,
2413 vm_evt_rdtube,
2414 )
2415 }
2416 #[cfg(feature = "gvm")]
2417 HypervisorKind::Gvm => {
2418 info!("Creating GVM");
2419 let gvm = Gvm::new()?;
2420 let guest_mem = create_guest_memory(&components, &arch_memory_layout, &gvm)?;
2421 let vm = create_gvm_vm(gvm, guest_mem)?;
2422 let ioapic_host_tube;
2423 let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
2424 IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
2425 IrqChipKind::Kernel => {
2426 ioapic_host_tube = None;
2427 WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
2428 }
2429 IrqChipKind::Userspace => {
2430 let (host_tube, ioapic_device_tube) =
2431 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2432 ioapic_host_tube = Some(host_tube);
2433 WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
2434 components.vcpu_count,
2435 ioapic_device_tube,
2436 )?)
2437 }
2438 };
2439 run_vm::<GvmVcpu, GvmVm>(
2440 cfg,
2441 components,
2442 &arch_memory_layout,
2443 vm,
2444 irq_chip.as_mut(),
2445 ioapic_host_tube,
2446 vm_evt_wrtube,
2447 vm_evt_rdtube,
2448 )
2449 }
2450 }
2451 }
2452
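/// Builds the VM for the chosen hypervisor (devices, control tubes, system allocator, etc.) and
/// then runs the control loop until the VM exits.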
2453 #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
fn run_vm<Vcpu, V>(
2455 #[allow(unused_mut)] mut cfg: Config,
2456 #[allow(unused_mut)] mut components: VmComponents,
2457 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
2458 mut vm: V,
2459 irq_chip: &mut dyn IrqChipArch,
2460 ioapic_host_tube: Option<Tube>,
2461 vm_evt_wrtube: SendTube,
2462 vm_evt_rdtube: RecvTube,
2463 ) -> Result<ExitState>
2464 where
2465 Vcpu: VcpuArch + 'static,
2466 V: VmArch + 'static,
2467 {
2468 let vm_memory_size_mb = components.memory_size / (1024 * 1024);
2469 let mut control_tubes = Vec::new();
2470 let mut irq_control_tubes = Vec::new();
2471 let mut vm_memory_control_tubes = Vec::new();
2472 // Create one control tube per disk.
2473 let mut disk_device_tubes = Vec::new();
2474 let mut disk_host_tubes = Vec::new();
2475 let disk_count = cfg.disks.len();
2476 for _ in 0..disk_count {
2477 let (disk_host_tube, disk_device_tube) =
2478 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2479 disk_host_tubes.push(disk_host_tube);
2480 disk_device_tubes.push(disk_device_tube);
2481 }
2482
2483 if let Some(ioapic_host_tube) = ioapic_host_tube {
2484 irq_control_tubes.push(ioapic_host_tube);
2485 }
2486
2487 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
2488 let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
2489 let (balloon_host_tube, balloon_device_tube) =
2490 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2491 (Some(balloon_host_tube), Some(balloon_device_tube))
2492 } else {
2493 (None, None)
2494 };
2495 // The balloon device also needs a tube to communicate back to the main process to
2496 // handle remapping memory dynamically.
2497 let dynamic_mapping_device_tube = if cfg.balloon {
2498 let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
2499 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2500 vm_memory_control_tubes.push(dynamic_mapping_host_tube);
2501 Some(dynamic_mapping_device_tube)
2502 } else {
2503 None
2504 };
2505
2506 // PvClock gets a tube for handling suspend/resume requests from the main thread.
2507 #[cfg(feature = "pvclock")]
2508 let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
2509 let (host, device) =
2510 Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
2511 (Some(host), Some(device))
2512 } else {
2513 (None, None)
2514 };
2515
2516 let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
2517 .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;
2518
2519 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
2520 let mut sys_allocator = SystemAllocator::new(
2521 Arch::get_system_allocator_config(&vm, arch_memory_layout),
2522 pstore_size,
2523 &cfg.mmio_address_ranges,
2524 )
2525 .context("failed to create system allocator")?;
2526
2527 // Allocate the ramoops region first.
2528 let ramoops_region = match &components.pstore {
2529 Some(pstore) => Some(
2530 arch::pstore::create_memory_region(
2531 &mut vm,
2532 sys_allocator.reserved_region().unwrap(),
2533 pstore,
2534 )
2535 .exit_context(
2536 Exit::Pstore,
2537 format!("failed to allocate pstore region {:?}", &components.pstore),
2538 )?,
2539 ),
2540 None => None,
2541 };
2542
2543 let init_balloon_size = components
2544 .memory_size
2545 .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
2546 m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
2547 }))
2548 .context("failed to calculate init balloon size")?;
2549
2550 let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
2551 let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);
2552
2553 if tsc_state.core_grouping.size() > 1 {
2554 // Host TSCs are not in sync, log a metric about it.
2555 warn!(
2556 "Host TSCs are not in sync, applying the following mitigations: {:?}",
2557 tsc_sync_mitigations
2558 );
2559 log_descriptor(
2560 MetricEventType::TscCoresOutOfSync,
2561 // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
2562 tsc_state.core_grouping.core_grouping_bitmask() as i64,
2563 );
2564 }
2565
2566 #[cfg(feature = "gpu")]
2567 let gpu_control_tube = cfg
2568 .gpu_vmm_config
2569 .as_mut()
2570 .and_then(|config| config.gpu_control_host_tube.take());
2571 let product_args = product::get_run_control_args(&mut cfg);
2572
2573 // We open these files before lowering the token, as in the future a stricter policy may
2574 // prevent it.
2575 let dt_overlays = cfg
2576 .device_tree_overlay
2577 .iter()
2578 .map(|o| {
2579 Ok(DtbOverlay {
2580 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2581 .with_context(|| {
2582 format!("failed to open device tree overlay {}", o.path.display())
2583 })?,
2584 })
2585 })
2586 .collect::<Result<Vec<DtbOverlay>>>()?;
2587
2588 // Lower the token, locking the main process down to a stricter security policy.
2589 //
2590 // WARNING:
2591 //
2592 // Windows system calls can behave in unusual ways if they happen concurrently to the token
2593 // lowering. For example, access denied can happen if Tube pairs are created in another thread
2594 // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
2595 // not privileged resources, but can be broken due to the token changing unexpectedly.
2596 //
2597 // We explicitly lower the token here and *then* call run_control to make it clear that any
2598 // resources that require a privileged token should be created on the main thread & passed into
2599 // run_control, to follow the correct order:
2600 // - Privileged resources are created.
2601 // - Token is lowered.
2602 // - Threads are spawned & may create more non-privileged resources (without fear of the token
2603 // changing at an undefined time).
2604 //
// Recommendation: If you find your code doesn't work in run_control because of the sandbox, you
// should move any resource creation to before this token lowering & pass the resources into
2607 // run_control. Don't move the token lowering somewhere else without considering multi-threaded
2608 // effects.
2609 #[cfg(feature = "sandbox")]
2610 if sandbox::is_sandbox_target() {
2611 sandbox::TargetServices::get()
2612 .exit_code_from_err("failed to create sandbox")?
2613 .expect("Could not create sandbox!")
2614 .lower_token();
2615 }
2616
2617 let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;
2618
2619 let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;
2620
2621 let mut initial_audio_session_states: Vec<InitialAudioSessionState> = Vec::new();
2622
2623 let pci_devices = create_devices(
2624 &mut cfg,
2625 vm.get_memory(),
2626 &vm_evt_wrtube,
2627 &mut irq_control_tubes,
2628 &mut vm_memory_control_tubes,
2629 &mut control_tubes,
2630 &mut disk_device_tubes,
2631 &mut initial_audio_session_states,
2632 balloon_device_tube,
2633 #[cfg(feature = "pvclock")]
2634 pvclock_device_tube,
2635 dynamic_mapping_device_tube,
2636 /* inflate_tube= */ None,
2637 init_balloon_size,
2638 tsc_state.frequency,
2639 virtio_snd_state_device_tube,
2640 virtio_snd_device_mute_tube,
2641 )?;
2642
2643 let mut vcpu_ids = Vec::new();
2644
2645 let (vwmdt_host_tube, vmwdt_device_tube) = Tube::pair().context("failed to create tube")?;
2646 let windows = Arch::build_vm::<V, Vcpu>(
2647 components,
2648 arch_memory_layout,
2649 &vm_evt_wrtube,
2650 &mut sys_allocator,
2651 &cfg.serial_parameters,
2652 None,
2653 (cfg.battery_config.as_ref().map(|t| t.type_), None),
2654 vm,
2655 ramoops_region,
2656 pci_devices,
2657 irq_chip,
2658 &mut vcpu_ids,
2659 cfg.dump_device_tree_blob.clone(),
2660 /* debugcon_jail= */ None,
2661 None,
2662 None,
2663 /* guest_suspended_cvar= */ None,
2664 dt_overlays,
2665 cfg.fdt_position,
2666 cfg.no_pmu,
2667 )
2668 .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;
2669
2670 #[cfg(feature = "stats")]
2671 let stats = if cfg.exit_stats {
2672 Some(Arc::new(Mutex::new(StatisticsCollector::new())))
2673 } else {
2674 None
2675 };
2676
2677 run_control(
2678 windows,
2679 sys_allocator,
2680 control_tubes,
2681 irq_control_tubes,
2682 vm_memory_control_tubes,
2683 vm_evt_rdtube,
2684 vm_evt_wrtube,
2685 #[cfg(feature = "gpu")]
2686 gpu_control_tube,
2687 cfg.broker_shutdown_event.take(),
2688 balloon_host_tube,
2689 #[cfg(feature = "pvclock")]
2690 pvclock_host_tube,
2691 disk_host_tubes,
2692 initial_audio_session_states,
2693 gralloc,
2694 #[cfg(feature = "stats")]
2695 stats,
2696 cfg.service_pipe_name,
2697 vm_memory_size_mb,
2698 cfg.host_cpu_topology,
2699 tsc_sync_mitigations,
2700 cfg.force_calibrated_tsc_leaf,
2701 product_args,
2702 match virtio_snd_host_mute_tube {
2703 Some(virtio_snd_host_mute_tube) => vec![virtio_snd_host_mute_tube],
2704 None => vec![],
2705 },
2706 cfg.restore_path,
2707 cfg.socket_path,
2708 cfg.force_s2idle,
2709 cfg.suspended,
2710 )
2711 }
2712
2713 #[cfg(test)]
2714 mod tests {
2715 use tempfile::TempDir;
2716
2717 use super::*;
2718
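/// Builds a minimal test `Config` whose kernel path points at a dummy file inside `test_dir`.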
fn create_config(test_dir: &TempDir) -> Config {
2720 let mut config = Config::default();
2721
2722 let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
2723 OpenOptions::new()
2724 .create_new(true)
2725 .write(true)
2726 .open(&dummy_kernel_path)
2727 .expect("Could not open file!");
2728 config.executable_path = Some(Executable::Kernel(dummy_kernel_path));
2729
2730 config
2731 }
2732
2733 #[test]
2734 #[should_panic(expected = "Did not receive a bios or kernel")]
fn setup_vm_components_panics_when_no_kernel_provided() {
2736 let mut config =
2737 create_config(&TempDir::new().expect("Could not create temporary directory!"));
2738 config.executable_path = None;
2739 let _ = setup_vm_components(&config);
2740 }
2741
2742 #[test]
fn setup_vm_components_stores_memory_in_bytes() {
2744 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2745 let mut config = create_config(&tempdir);
2746 config.memory = Some(1);
2747 let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
2748 assert_eq!(vm_components.memory_size, 1024 * 1024);
2749 }
2750
2751 #[test]
fn setup_vm_components_fails_when_memory_too_large() {
2753 let tempdir = TempDir::new().expect("Could not create temporary directory!");
2754 let mut config = create_config(&tempdir);
// One MiB more than a u64 can hold in bytes
2756 config.memory = Some((u64::MAX / 1024 / 1024) + 1);
2757 setup_vm_components(&config).err().expect("expected error");
2758 }
2759 }
2760