// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// TODO(b:240716507): There is a huge chunk of code which depends on haxm, whpx, or gvm being
// enabled but isn't marked as such. Remove this when we mark it.
#![allow(dead_code, unused_imports, unused_variables, unreachable_code)]

pub(crate) mod control_server;
pub(crate) mod irq_wait;
pub(crate) mod main;
#[cfg(not(feature = "crash-report"))]
mod panic_hook;

mod generic;
use generic as product;
pub(crate) mod run_vcpu;

#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid;
#[cfg(feature = "whpx")]
use std::arch::x86_64::__cpuid_count;
use std::cmp::Reverse;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::File;
use std::fs::OpenOptions;
use std::io::stdin;
use std::iter;
use std::mem;
use std::os::windows::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::sync::mpsc;
use std::sync::Arc;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::DtbOverlay;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuArch;
use arch::VirtioDeviceStub;
use arch::VmArch;
use arch::VmComponents;
use arch::VmImage;
use base::enable_high_res_timers;
use base::error;
use base::info;
use base::open_file_or_duplicate;
use base::warn;
use base::AsRawDescriptor;
#[cfg(feature = "gpu")]
use base::BlockingMode;
use base::CloseNotifier;
use base::Event;
use base::EventToken;
use base::EventType;
use base::FlushOnDropTube;
#[cfg(feature = "gpu")]
use base::FramingMode;
use base::FromRawDescriptor;
use base::ProtoTube;
use base::RawDescriptor;
use base::ReadNotifier;
use base::RecvTube;
use base::SendTube;
#[cfg(feature = "gpu")]
use base::StreamChannel;
use base::Terminal;
use base::TriggeredEvent;
use base::Tube;
use base::TubeError;
use base::VmEventType;
use base::WaitContext;
use broker_ipc::common_child_setup;
use broker_ipc::CommonChildStartupArgs;
use control_server::ControlServer;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use crosvm_cli::sys::windows::exit::ExitContextOption;
use devices::create_devices_worker_thread;
use devices::serial_device::SerialHardware;
use devices::serial_device::SerialParameters;
use devices::tsc::get_tsc_sync_mitigations;
use devices::tsc::standard_deviation;
use devices::tsc::TscSyncMitigations;
use devices::virtio;
use devices::virtio::block::DiskOption;
#[cfg(feature = "audio")]
use devices::virtio::snd::common_backend::VirtioSnd;
#[cfg(feature = "audio")]
use devices::virtio::snd::parameters::Parameters as SndParameters;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::GpuVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventSplitConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::device::gpu::sys::windows::InputEventVmmConfig;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::product::GpuBackendConfig as GpuBackendConfigProduct;
#[cfg(feature = "gpu")]
use devices::virtio::vhost::user::gpu::sys::windows::run_gpu_device_worker;
#[cfg(feature = "audio")]
use devices::virtio::vhost::user::snd::sys::windows::product::SndBackendConfig as SndBackendConfigProduct;
"audio")] use devices::virtio::vhost::user::snd::sys::windows::run_snd_device_worker; #[cfg(feature = "audio")] use devices::virtio::vhost::user::snd::sys::windows::SndSplitConfig; #[cfg(feature = "balloon")] use devices::virtio::BalloonFeatures; use devices::virtio::Console; #[cfg(feature = "gpu")] use devices::virtio::GpuParameters; use devices::BusDeviceObj; use devices::BusResumeDevice; #[cfg(feature = "gvm")] use devices::GvmIrqChip; #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] use devices::IrqChip; use devices::UserspaceIrqChip; use devices::VcpuRunState; use devices::VirtioPciDevice; #[cfg(feature = "whpx")] use devices::WhpxSplitIrqChip; #[cfg(feature = "gpu")] use gpu_display::EventDevice; #[cfg(feature = "gpu")] use gpu_display::WindowProcedureThread; #[cfg(feature = "gpu")] use gpu_display::WindowProcedureThreadBuilder; #[cfg(feature = "gvm")] use hypervisor::gvm::Gvm; #[cfg(feature = "gvm")] use hypervisor::gvm::GvmVcpu; #[cfg(feature = "gvm")] use hypervisor::gvm::GvmVersion; #[cfg(feature = "gvm")] use hypervisor::gvm::GvmVm; #[cfg(feature = "haxm")] use hypervisor::haxm::get_use_ghaxm; #[cfg(feature = "haxm")] use hypervisor::haxm::set_use_ghaxm; #[cfg(feature = "haxm")] use hypervisor::haxm::Haxm; #[cfg(feature = "haxm")] use hypervisor::haxm::HaxmVcpu; #[cfg(feature = "haxm")] use hypervisor::haxm::HaxmVm; #[cfg(feature = "whpx")] use hypervisor::whpx::Whpx; #[cfg(feature = "whpx")] use hypervisor::whpx::WhpxFeature; #[cfg(feature = "whpx")] use hypervisor::whpx::WhpxVcpu; #[cfg(feature = "whpx")] use hypervisor::whpx::WhpxVm; use hypervisor::Hypervisor; #[cfg(feature = "whpx")] use hypervisor::HypervisorCap; #[cfg(feature = "whpx")] use hypervisor::HypervisorX86_64; use hypervisor::ProtectionType; use hypervisor::Vm; use irq_wait::IrqWaitWorker; use jail::FakeMinijailStub as Minijail; #[cfg(not(feature = "crash-report"))] pub(crate) use panic_hook::set_panic_hook; use product::create_snd_mute_tube_pair; #[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))] use product::create_snd_state_tube; #[cfg(feature = "pvclock")] use product::handle_pvclock_request; use product::merge_session_invariants; use product::run_ime_thread; use product::set_package_name; pub(crate) use product::setup_metrics_reporting; use product::start_service_ipc_listener; use product::RunControlArgs; use product::ServiceVmState; use product::Token; use resources::SystemAllocator; use run_vcpu::run_all_vcpus; use run_vcpu::VcpuRunMode; use rutabaga_gfx::RutabagaGralloc; use rutabaga_gfx::RutabagaGrallocBackendFlags; use smallvec::SmallVec; use sync::Mutex; use tube_transporter::TubeToken; use tube_transporter::TubeTransporterReader; use vm_control::api::VmMemoryClient; #[cfg(feature = "balloon")] use vm_control::BalloonControlCommand; #[cfg(feature = "balloon")] use vm_control::BalloonTube; use vm_control::DeviceControlCommand; use vm_control::InitialAudioSessionState; use vm_control::IrqHandlerRequest; use vm_control::PvClockCommand; use vm_control::VcpuControl; use vm_control::VmMemoryRegionState; use vm_control::VmMemoryRequest; use vm_control::VmRequest; use vm_control::VmResponse; use vm_control::VmRunMode; use vm_memory::GuestAddress; use vm_memory::GuestMemory; use vmm_vhost::Connection; use vmm_vhost::FrontendReq; use win_util::ProcessType; #[cfg(feature = "whpx")] use x86_64::cpuid::adjust_cpuid; #[cfg(feature = "whpx")] use x86_64::cpuid::CpuIdContext; #[cfg(target_arch = "x86_64")] use x86_64::X8664arch as Arch; use crate::crosvm::config::Config; use 
use crate::crosvm::config::Executable;
use crate::crosvm::config::InputDeviceOption;
#[cfg(any(feature = "gvm", feature = "whpx"))]
use crate::crosvm::config::IrqChipKind;
#[cfg(feature = "gpu")]
use crate::crosvm::config::TouchDeviceOption;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
use crate::crosvm::sys::config::HypervisorKind;
use crate::crosvm::sys::windows::broker::BrokerTubes;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_gpu_product_configs;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::get_snd_product_configs;
#[cfg(feature = "gpu")]
pub(crate) use crate::sys::windows::product::get_window_procedure_thread_product_configs;
use crate::sys::windows::product::log_descriptor;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_devices;
#[cfg(feature = "audio")]
pub(crate) use crate::sys::windows::product::num_input_sound_streams;
use crate::sys::windows::product::spawn_anti_tamper_thread;
use crate::sys::windows::product::MetricEventType;

const DEFAULT_GUEST_CID: u64 = 3;

// By default, if enabled, the balloon WS features will use 4 bins.
const VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS: u8 = 4;

enum TaggedControlTube {
    Vm(FlushOnDropTube),
    Product(product::TaggedControlTube),
}

impl ReadNotifier for TaggedControlTube {
    fn get_read_notifier(&self) -> &dyn AsRawDescriptor {
        match self {
            Self::Vm(tube) => tube.0.get_read_notifier(),
            Self::Product(tube) => tube.get_read_notifier(),
        }
    }
}

impl CloseNotifier for TaggedControlTube {
    fn get_close_notifier(&self) -> &dyn AsRawDescriptor {
        match self {
            Self::Vm(tube) => tube.0.get_close_notifier(),
            Self::Product(tube) => tube.get_close_notifier(),
        }
    }
}

pub enum ExitState {
    Reset,
    Stop,
    Crash,
    #[allow(dead_code)]
    GuestPanic,
    WatchdogReset,
}

type DeviceResult<T = VirtioDeviceStub> = Result<T>;

fn create_vhost_user_block_device(
    cfg: &Config,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Block,
        virtio::base_features(cfg.protection_type),
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserBlockDeviceNew,
        "failed to set up vhost-user block device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
    let features = virtio::base_features(cfg.protection_type);
    let dev = virtio::BlockAsync::new(
        features,
        disk.open()?,
        disk,
        Some(disk_device_tube),
        None,
        None,
    )
    .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

#[cfg(feature = "gpu")]
fn create_vhost_user_gpu_device(
    base_features: u64,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Gpu,
        base_features,
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserGpuDeviceNew,
        "failed to set up vhost-user gpu device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

#[cfg(feature = "audio")]
fn create_vhost_user_snd_device(
    base_features: u64,
    connection: Connection<FrontendReq>,
) -> DeviceResult {
    let dev = virtio::VhostUserFrontend::new(
        virtio::DeviceType::Sound,
        base_features,
        connection,
        None,
        None,
    )
    .exit_context(
        Exit::VhostUserSndDeviceNew,
        "failed to set up vhost-user snd device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}
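/// Creates the VMM-side virtio multi-touch input device.
///
/// Illustrative sketch only (not part of the original source): a hypothetical
/// caller owns a `StreamChannel` pair, hands one end to the input event
/// backend, and passes the other end here. The dimensions and index below are
/// made-up values.
///
/// ```ignore
/// let (backend_pipe, vmm_pipe) =
///     StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)?;
/// // A 1280x720 touch surface as the first (index 0) input device.
/// let stub = create_multi_touch_device(&cfg, vmm_pipe, 1280, 720, None, 0)?;
/// ```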
#[cfg(feature = "gpu")]
fn create_multi_touch_device(
    cfg: &Config,
    event_pipe: StreamChannel,
    width: u32,
    height: u32,
    name: Option<&str>,
    idx: u32,
) -> DeviceResult {
    let dev = virtio::input::new_multi_touch(
        idx,
        event_pipe,
        width,
        height,
        name,
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

#[cfg(feature = "gpu")]
fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
    let dev = virtio::input::new_mouse(idx, event_pipe, virtio::base_features(cfg.protection_type))
        .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

#[cfg(feature = "slirp")]
fn create_vhost_user_net_device(cfg: &Config, connection: Connection<FrontendReq>) -> DeviceResult {
    let features = virtio::base_features(cfg.protection_type);
    let dev =
        virtio::VhostUserFrontend::new(virtio::DeviceType::Net, features, connection, None, None)
            .exit_context(
                Exit::VhostUserNetDeviceNew,
                "failed to set up vhost-user net device",
            )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

fn create_rng_device(cfg: &Config) -> DeviceResult {
    let dev = virtio::Rng::new(virtio::base_features(cfg.protection_type))
        .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
    let mut keep_rds = Vec::new();
    let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
    let dev = param
        .create_serial_device::<Console>(cfg.protection_type, &evt, &mut keep_rds)
        .exit_context(Exit::CreateConsole, "failed to create console device")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

#[cfg(feature = "balloon")]
fn create_balloon_device(
    cfg: &Config,
    balloon_device_tube: Tube,
    dynamic_mapping_device_tube: Tube,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
) -> DeviceResult {
    let balloon_features =
        (cfg.balloon_page_reporting as u64) << BalloonFeatures::PageReporting as u64;
    let dev = virtio::Balloon::new(
        virtio::base_features(cfg.protection_type),
        balloon_device_tube,
        VmMemoryClient::new(dynamic_mapping_device_tube),
        inflate_tube,
        init_balloon_size,
        balloon_features,
        #[cfg(feature = "registered_events")]
        None,
        VIRTIO_BALLOON_WS_DEFAULT_NUM_BINS,
    )
    .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}
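/// Creates the userspace vsock device.
///
/// The default guest CID of 3 is used when the config does not specify one:
/// the vsock spec reserves CIDs 0 through 2 (hypervisor, local, and host
/// respectively), so 3 is the lowest CID a guest may claim.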
fn create_vsock_device(cfg: &Config) -> DeviceResult {
    // We only support a single guest, so we can confidently assign a default
    // CID if one isn't provided. We choose the lowest non-reserved value.
    let dev = virtio::vsock::Vsock::new(
        cfg.vsock
            .as_ref()
            .map(|cfg| cfg.cid)
            .unwrap_or(DEFAULT_GUEST_CID),
        cfg.host_guid.clone(),
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(
        Exit::UserspaceVsockDeviceNew,
        "failed to create userspace vsock device",
    )?;

    Ok(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    })
}

fn create_virtio_devices(
    cfg: &mut Config,
    vm_evt_wrtube: &SendTube,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
    disk_device_tubes: &mut Vec<Tube>,
    initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
    balloon_device_tube: Option<Tube>,
    #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
    dynamic_mapping_device_tube: Option<Tube>,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
    tsc_frequency: u64,
    virtio_snd_state_device_tube: Option<Tube>,
    virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    if cfg.block_vhost_user_tube.is_empty() {
        // Disk devices must precede virtio-console devices or the kernel does not boot.
        // TODO(b/171215421): figure out why this ordering is required and fix it.
        for disk in &cfg.disks {
            let disk_device_tube = disk_device_tubes.remove(0);
            devs.push(create_block_device(cfg, disk, disk_device_tube)?);
        }
    } else {
        info!("Starting up vhost user block backends...");
        for _disk in &cfg.disks {
            let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
            let connection = Connection::<FrontendReq>::from(disk_device_tube);
            devs.push(create_vhost_user_block_device(cfg, connection)?);
        }
    }

    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev = create_console_device(cfg, param)?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        let snd_split_configs = std::mem::take(&mut cfg.snd_split_configs);
        for mut snd_split_cfg in snd_split_configs.into_iter() {
            devs.push(create_virtio_snd_device(
                cfg,
                &mut snd_split_cfg,
                control_tubes,
            )?);
            if let Some(vmm_config) = snd_split_cfg.vmm_config {
                let initial_audio_session_state = InitialAudioSessionState {
                    audio_client_guid: vmm_config.audio_client_guid,
                    card_index: vmm_config.card_index,
                };
                initial_audio_session_states.push(initial_audio_session_state);
            }
        }
    }

    #[cfg(feature = "pvclock")]
    if let Some(tube) = pvclock_device_tube {
        product::push_pvclock_device(cfg, &mut devs, tsc_frequency, tube);
    }

    devs.push(create_rng_device(cfg)?);

    #[cfg(feature = "slirp")]
    if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
        let connection = Connection::<FrontendReq>::from(net_vhost_user_tube);
        devs.push(create_vhost_user_net_device(cfg, connection)?);
    }

    #[cfg(feature = "balloon")]
    if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
        (balloon_device_tube, dynamic_mapping_device_tube)
    {
        devs.push(create_balloon_device(
            cfg,
            balloon_device_tube,
            dynamic_mapping_device_tube,
            inflate_tube,
            init_balloon_size,
        )?);
    }

    devs.push(create_vsock_device(cfg)?);

    #[cfg(feature = "gpu")]
    let event_devices = if let Some(InputEventSplitConfig {
        backend_config,
        vmm_config,
    }) = cfg.input_event_split_config.take()
    {
        devs.extend(
            create_virtio_input_event_devices(cfg, vmm_config)
                .context("create input event devices")?,
        );
        backend_config.map(|cfg| cfg.event_devices)
    } else {
        None
    };

    #[cfg(feature = "gpu")]
    if let Some(wndproc_thread_vmm_config) = cfg
        .window_procedure_thread_split_config
        .as_mut()
        .map(|split_cfg| &mut split_cfg.vmm_config)
    {
        product::push_window_procedure_thread_control_tubes(
            control_tubes,
            wndproc_thread_vmm_config,
        );
    }

    #[cfg(feature = "gpu")]
    let mut wndproc_thread = cfg
        .window_procedure_thread_split_config
        .as_mut()
        .and_then(|cfg| cfg.wndproc_thread_builder.take())
        .map(WindowProcedureThreadBuilder::start_thread)
        .transpose()
        .context("Failed to start the window procedure thread.")?;

    #[cfg(feature = "gpu")]
    if let Some(gpu_vmm_config) = cfg.gpu_vmm_config.take() {
        devs.push(create_virtio_gpu_device(
            cfg,
            gpu_vmm_config,
            event_devices,
            &mut wndproc_thread,
            control_tubes,
        )?);
    }

    Ok(devs)
}

#[cfg(feature = "gpu")]
fn create_virtio_input_event_devices(
    cfg: &Config,
    mut input_event_vmm_config: InputEventVmmConfig,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Iterate event devices, create the VMM end.
    let mut multi_touch_pipes = input_event_vmm_config
        .multi_touch_pipes
        .drain(..)
        .enumerate();
    for input in &cfg.virtio_input {
        match input {
            InputDeviceOption::SingleTouch { .. } => {
                unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
            }
            InputDeviceOption::MultiTouch {
                width,
                height,
                name,
                ..
            } => {
                let Some((idx, pipe)) = multi_touch_pipes.next() else {
                    break;
                };
                let mut width = *width;
                let mut height = *height;
                if idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                devs.push(create_multi_touch_device(
                    cfg,
                    pipe,
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    idx as u32,
                )?);
            }
            _ => {}
        }
    }
    drop(multi_touch_pipes);

    product::push_mouse_device(cfg, &mut input_event_vmm_config, &mut devs)?;

    for (idx, pipe) in input_event_vmm_config.mouse_pipes.drain(..).enumerate() {
        devs.push(create_mouse_device(cfg, pipe, idx as u32)?);
    }

    let keyboard_pipe = input_event_vmm_config
        .keyboard_pipes
        .pop()
        .expect("at least one keyboard should be in GPU VMM config");
    let dev = virtio::input::new_keyboard(
        /* idx= */ 0,
        keyboard_pipe,
        virtio::base_features(cfg.protection_type),
    )
    .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;

    devs.push(VirtioDeviceStub {
        dev: Box::new(dev),
        jail: None,
    });

    Ok(devs)
}
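/// Creates the virtio-gpu device.
///
/// The frontend always speaks vhost-user. When a backend config is present in
/// `cfg`, the GPU backend worker is spawned on a thread inside this process;
/// otherwise the backend is expected to live in another process that holds
/// the far end of `main_vhost_user_tube`.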
#[cfg(feature = "gpu")]
fn create_virtio_gpu_device(
    cfg: &mut Config,
    mut gpu_vmm_config: GpuVmmConfig,
    event_devices: Option<Vec<EventDevice>>,
    wndproc_thread: &mut Option<WindowProcedureThread>,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
) -> DeviceResult {
    let resource_bridges = Vec::<Tube>::new();

    product::push_gpu_control_tubes(control_tubes, &mut gpu_vmm_config);

    // If the GPU backend is passed, start up the vhost-user worker in the main process.
    if let Some(backend_config) = cfg.gpu_backend_config.take() {
        let event_devices = event_devices.ok_or_else(|| {
            anyhow!("event devices are missing when creating virtio-gpu in the current process.")
        })?;
        let wndproc_thread = wndproc_thread
            .take()
            .ok_or_else(|| anyhow!("Window procedure thread is missing."))?;

        std::thread::spawn(move || {
            run_gpu_device_worker(backend_config, event_devices, wndproc_thread)
        });
    }

    // The GPU is always vhost-user, even if running in the main process.
    let gpu_device_tube = gpu_vmm_config
        .main_vhost_user_tube
        .take()
        .expect("GPU VMM vhost-user tube should be set");
    let connection = Connection::<FrontendReq>::from(gpu_device_tube);

    create_vhost_user_gpu_device(virtio::base_features(cfg.protection_type), connection)
        .context("create vhost-user GPU device")
}

#[cfg(feature = "audio")]
fn create_virtio_snd_device(
    cfg: &mut Config,
    snd_split_config: &mut SndSplitConfig,
    #[allow(clippy::ptr_arg)] control_tubes: &mut Vec<TaggedControlTube>,
) -> DeviceResult {
    let snd_vmm_config = snd_split_config
        .vmm_config
        .as_mut()
        .expect("snd_vmm_config must exist");
    product::push_snd_control_tubes(control_tubes, snd_vmm_config);

    // If the SND backend is passed, start up the vhost-user worker in the main process.
    if let Some(backend_config) = snd_split_config.backend_config.take() {
        std::thread::spawn(move || run_snd_device_worker(backend_config));
    }

    // The SND is always vhost-user, even if running in the main process.
    let snd_device_tube = snd_vmm_config
        .main_vhost_user_tube
        .take()
        .expect("Snd VMM vhost-user tube should be set");
    let connection = Connection::<FrontendReq>::from(snd_device_tube);

    create_vhost_user_snd_device(virtio::base_features(cfg.protection_type), connection)
        .context("create vhost-user SND device")
}
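/// Wraps each virtio device stub from `create_virtio_devices` in a
/// `VirtioPciDevice`, wiring up the per-device tubes as it goes: an MSI tube
/// (pushed to `irq_control_tubes`), an optional shared-memory tube and an
/// ioevent tube (pushed to `vm_memory_control_tubes`), and a VM control tube
/// (pushed to `control_tubes`).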
fn create_devices(
    cfg: &mut Config,
    mem: &GuestMemory,
    exit_evt_wrtube: &SendTube,
    irq_control_tubes: &mut Vec<Tube>,
    vm_memory_control_tubes: &mut Vec<Tube>,
    control_tubes: &mut Vec<TaggedControlTube>,
    disk_device_tubes: &mut Vec<Tube>,
    initial_audio_session_states: &mut Vec<InitialAudioSessionState>,
    balloon_device_tube: Option<Tube>,
    #[cfg(feature = "pvclock")] pvclock_device_tube: Option<Tube>,
    dynamic_mapping_device_tube: Option<Tube>,
    inflate_tube: Option<Tube>,
    init_balloon_size: u64,
    tsc_frequency: u64,
    virtio_snd_state_device_tube: Option<Tube>,
    virtio_snd_control_device_tube: Option<Tube>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let stubs = create_virtio_devices(
        cfg,
        exit_evt_wrtube,
        control_tubes,
        disk_device_tubes,
        initial_audio_session_states,
        balloon_device_tube,
        #[cfg(feature = "pvclock")]
        pvclock_device_tube,
        dynamic_mapping_device_tube,
        inflate_tube,
        init_balloon_size,
        tsc_frequency,
        virtio_snd_state_device_tube,
        virtio_snd_control_device_tube,
    )?;

    let mut pci_devices = Vec::new();

    for stub in stubs {
        let (msi_host_tube, msi_device_tube) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        irq_control_tubes.push(msi_host_tube);

        let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
            let (host_tube, device_tube) =
                Tube::pair().context("failed to create VVU proxy tube")?;
            vm_memory_control_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };

        let (ioevent_host_tube, ioevent_device_tube) =
            Tube::pair().context("failed to create ioevent tube")?;
        vm_memory_control_tubes.push(ioevent_host_tube);

        let (vm_control_host_tube, vm_control_device_tube) =
            Tube::pair().context("failed to create vm_control tube")?;
        control_tubes.push(TaggedControlTube::Vm(FlushOnDropTube::from(
            vm_control_host_tube,
        )));

        let dev = Box::new(
            VirtioPciDevice::new(
                mem.clone(),
                stub.dev,
                msi_device_tube,
                cfg.disable_virtio_intx,
                shared_memory_tube.map(VmMemoryClient::new),
                VmMemoryClient::new(ioevent_device_tube),
                vm_control_device_tube,
            )
            .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
        ) as Box<dyn BusDeviceObj>;
        pci_devices.push((dev, stub.jail));
    }

    Ok(pci_devices)
}

#[derive(Debug)]
struct PvClockError(String);

fn handle_readable_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    event: &TriggeredEvent<Token>,
    vm_control_ids_to_remove: &mut Vec<usize>,
    next_control_id: &mut usize,
    service_vm_state: &mut ServiceVmState,
    disk_host_tubes: &[Tube],
    ipc_main_loop_tube: Option<&Tube>,
    #[cfg(feature = "gpu")] gpu_control_tube: Option<&Tube>,
    vm_evt_rdtube: &RecvTube,
    control_tubes: &mut BTreeMap<usize, TaggedControlTube>,
    guest_os: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
    virtio_snd_host_mute_tubes: &mut [Tube],
    proto_main_loop_tube: Option<&ProtoTube>,
    anti_tamper_main_thread_tube: &Option<ProtoTube>,
    #[cfg(feature = "balloon")] mut balloon_tube: Option<&mut BalloonTube>,
    memory_size_mb: u64,
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
    run_mode_arc: &VcpuRunMode,
    region_state: &mut VmMemoryRegionState,
    vm_control_server: Option<&mut ControlServer>,
    irq_handler_control: &Tube,
    device_ctrl_tube: &Tube,
    wait_ctx: &WaitContext<Token>,
    force_s2idle: bool,
    vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
    suspended_pvclock_state: &mut Option<hypervisor::ClockState>,
) -> Result<Option<ExitState>> {
    let mut execute_vm_request = |request: VmRequest, guest_os: &mut RunnableLinuxVm<V, Vcpu>| {
        if let VmRequest::Exit = request {
            return (VmResponse::Ok, Some(VmRunMode::Exiting));
        }

        let vcpu_size = vcpu_boxes.lock().len();
        let resp = request.execute(
            &guest_os.vm,
            disk_host_tubes,
            &mut guest_os.pm,
            #[cfg(feature = "gpu")]
            gpu_control_tube,
            #[cfg(not(feature = "gpu"))]
            None,
            None,
            &mut None,
            |msg| {
                kick_all_vcpus(
                    run_mode_arc,
                    vcpu_control_channels,
                    vcpu_boxes,
                    guest_os.irq_chip.as_ref(),
                    #[cfg(feature = "pvclock")]
                    pvclock_host_tube,
                    &guest_os.resume_notify_devices,
                    msg,
                );
            },
            force_s2idle,
            #[cfg(feature = "swap")]
            None,
            device_ctrl_tube,
            vcpu_size,
            irq_handler_control,
            || guest_os.irq_chip.as_ref().snapshot(vcpu_size),
            suspended_pvclock_state,
        );
        (resp, None)
    };

    match event.token {
        Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
            Ok(vm_event) => {
                let exit_state = match vm_event {
                    VmEventType::Exit => {
                        info!("vcpu requested shutdown");
                        Some(ExitState::Stop)
                    }
                    VmEventType::Reset => {
                        info!("vcpu requested reset");
                        Some(ExitState::Reset)
                    }
                    VmEventType::Crash => {
                        info!("vcpu crashed");
                        Some(ExitState::Crash)
                    }
                    VmEventType::Panic(_) => {
                        error!("got pvpanic event. this event is not expected on Windows.");
                        None
                    }
                    VmEventType::WatchdogReset => {
                        info!("vcpu stall detected");
                        Some(ExitState::WatchdogReset)
                    }
                };
                return Ok(exit_state);
            }
            Err(e) => {
                warn!("failed to recv VmEvent: {}", e);
            }
        },
        Token::BrokerShutdown => {
            info!("main loop got broker shutdown event");
            return Ok(Some(ExitState::Stop));
        }
        Token::VmControlServer => {
            let server =
                vm_control_server.expect("control server must exist if this event triggers");
            let client = server.accept();
            let id = *next_control_id;
            *next_control_id += 1;
            wait_ctx
                .add(client.0.get_read_notifier(), Token::VmControl { id })
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add trigger to wait context",
                )?;
            wait_ctx
                .add(client.0.get_close_notifier(), Token::VmControl { id })
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add trigger to wait context",
                )?;
            control_tubes.insert(id, TaggedControlTube::Vm(client));
        }
        #[allow(clippy::collapsible_match)]
        Token::VmControl { id } => {
            if let Some(tube) = control_tubes.get(&id) {
                #[allow(clippy::single_match)]
                match tube {
                    TaggedControlTube::Product(product_tube) => {
                        product::handle_tagged_control_tube_event(
                            product_tube,
                            virtio_snd_host_mute_tubes,
                            service_vm_state,
                            ipc_main_loop_tube,
                        )
                    }
                    TaggedControlTube::Vm(tube) => match tube.0.recv::<VmRequest>() {
                        Ok(request) => {
                            let mut run_mode_opt = None;
                            let response = match request {
                                VmRequest::HotPlugVfioCommand { device, add } => {
                                    // Suppress warnings.
                                    let _ = (device, add);
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::RegisterListener { socket_addr, event } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::UnregisterListener { socket_addr, event } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "registered_events")]
                                VmRequest::Unregister { socket_addr } => {
                                    unimplemented!("not implemented on Windows");
                                }
                                #[cfg(feature = "balloon")]
                                VmRequest::BalloonCommand(cmd) => {
                                    if let Some(balloon_tube) = balloon_tube {
                                        if let Some((r, key)) = balloon_tube.send_cmd(cmd, Some(id))
                                        {
                                            if key != id {
                                                unimplemented!("not implemented on Windows");
                                            }
                                            Some(r)
                                        } else {
                                            None
                                        }
                                    } else {
                                        error!("balloon not enabled");
                                        None
                                    }
                                }
                                _ => {
                                    let (resp, run_mode_ret) =
                                        execute_vm_request(request, guest_os);
                                    run_mode_opt = run_mode_ret;
                                    Some(resp)
                                }
                            };

                            if let Some(response) = response {
                                if let Err(e) = tube.0.send(&response) {
                                    error!("failed to send VmResponse: {}", e);
                                }
                            }

                            if let Some(exit_state) =
                                handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
                            {
                                return Ok(Some(exit_state));
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                vm_control_ids_to_remove.push(id);
                            } else {
                                error!("failed to recv VmRequest: {}", e);
                            }
                        }
                    },
                }
            }
        }
        #[cfg(feature = "balloon")]
        Token::BalloonTube => match balloon_tube.as_mut().expect("missing balloon tube").recv() {
            Ok(resp) => {
                for (resp, idx) in resp {
                    if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
                        if let Err(e) = tube.0.send(&resp) {
                            error!("failed to send VmResponse: {}", e);
                        }
                    } else {
                        error!("Bad tube index {}", idx);
                    }
                }
            }
            Err(err) => {
                error!("Error processing balloon tube {:?}", err)
            }
        },
        #[cfg(not(feature = "balloon"))]
        Token::BalloonTube => unreachable!("balloon tube not registered"),
        #[allow(unreachable_patterns)]
        _ => {
            let run_mode_opt = product::handle_received_token(
                &event.token,
                anti_tamper_main_thread_tube,
                #[cfg(feature = "balloon")]
                balloon_tube,
                control_tubes,
                guest_os,
                ipc_main_loop_tube,
                memory_size_mb,
                proto_main_loop_tube,
                #[cfg(feature = "pvclock")]
                pvclock_host_tube,
                run_mode_arc,
                service_vm_state,
                vcpu_boxes,
                virtio_snd_host_mute_tubes,
                execute_vm_request,
            );
            if let Some(exit_state) =
                handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
            {
                return Ok(Some(exit_state));
            }
        }
    };
    Ok(None)
}

/// Handles a run mode change, if one is pending as a result of a VmRequest. The parameter,
/// run_mode_opt, is the run mode change proposed by the VmRequest's execution.
///
/// Returns the exit state if it changed due to a run mode change, and None otherwise.
fn handle_run_mode_change_for_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    run_mode_opt: &Option<VmRunMode>,
    guest_os: &mut RunnableLinuxVm<V, Vcpu>,
) -> Option<ExitState> {
    if let Some(run_mode) = run_mode_opt {
        info!("control socket changed run mode to {}", run_mode);
        match run_mode {
            VmRunMode::Exiting => return Some(ExitState::Stop),
            _ => unreachable!(),
        }
    }
    // No exit state change.
    None
}
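// Hedged sketch of the calling pattern used above (names as in this file):
//
//     let (resp, run_mode_ret) = execute_vm_request(request, guest_os);
//     run_mode_opt = run_mode_ret;
//     // ... reply on the control tube ...
//     if let Some(exit_state) =
//         handle_run_mode_change_for_vm_request(&run_mode_opt, guest_os)
//     {
//         return Ok(Some(exit_state));
//     }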
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// No response is sent for this command.
    Exit,
}

fn vm_memory_handler_thread(
    control_tubes: Vec<Tube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    handler_control: Tube,
) -> anyhow::Result<()> {
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.get_read_notifier(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(tube) = control_tubes.get(&id) {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    None,
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }
        remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}

fn create_control_server(
    control_server_path: Option<PathBuf>,
    wait_ctx: &WaitContext<Token>,
) -> Result<Option<ControlServer>> {
    #[cfg(not(feature = "prod-build"))]
    {
        if let Some(path) = control_server_path {
            let server =
                ControlServer::new(path.to_str().expect("control socket path must be a string"))
                    .exit_context(
                        Exit::FailedToCreateControlServer,
                        "failed to create control server",
                    )?;
            wait_ctx
                .add(server.client_waiting(), Token::VmControlServer)
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add control server to wait context",
                )?;
            return Ok(Some(server));
        }
    }
    Ok::<Option<ControlServer>, anyhow::Error>(None)
}
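/// The main control loop for the VM.
///
/// Sets up the IRQ wait worker, the VM memory handler thread, the device
/// worker thread, and the vCPU threads, then services control tubes and VM
/// events on a `WaitContext` until an exit state is reached, after which it
/// tears those workers down again.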
fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    mut guest_os: RunnableLinuxVm<V, Vcpu>,
    sys_allocator: SystemAllocator,
    control_tubes: Vec<TaggedControlTube>,
    irq_control_tubes: Vec<Tube>,
    vm_memory_control_tubes: Vec<Tube>,
    vm_evt_rdtube: RecvTube,
    vm_evt_wrtube: SendTube,
    #[cfg(feature = "gpu")] gpu_control_tube: Option<Tube>,
    broker_shutdown_evt: Option<Event>,
    balloon_host_tube: Option<Tube>,
    #[cfg(feature = "pvclock")] pvclock_host_tube: Option<Tube>,
    disk_host_tubes: Vec<Tube>,
    initial_audio_session_states: Vec<InitialAudioSessionState>,
    gralloc: RutabagaGralloc,
    #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
    service_pipe_name: Option<String>,
    memory_size_mb: u64,
    host_cpu_topology: bool,
    tsc_sync_mitigations: TscSyncMitigations,
    force_calibrated_tsc_leaf: bool,
    mut product_args: RunControlArgs,
    mut virtio_snd_host_mute_tubes: Vec<Tube>,
    restore_path: Option<PathBuf>,
    control_server_path: Option<PathBuf>,
    force_s2idle: bool,
    suspended: bool,
) -> Result<ExitState> {
    let (ipc_main_loop_tube, proto_main_loop_tube, _service_ipc) =
        start_service_ipc_listener(service_pipe_name)?;

    let mut service_vm_state = product::create_service_vm_state(memory_size_mb);

    let service_audio_states = product::create_service_audio_states_and_send_to_service(
        initial_audio_session_states,
        &ipc_main_loop_tube,
    )?;

    let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));

    let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
    let (irq_handler_control, irq_handler_control_for_worker) = Tube::pair().exit_context(
        Exit::CreateTube,
        "failed to create IRQ handler control Tube",
    )?;

    // Create a separate thread to wait on IRQ events. This is a natural division
    // because IRQ interrupts have no dependencies on other events, and this lets
    // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
    let irq_join_handle = IrqWaitWorker::start(
        irq_handler_control_for_worker,
        guest_os
            .irq_chip
            .try_box_clone()
            .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
        irq_control_tubes,
        sys_allocator_mutex.clone(),
    );

    let mut triggers = vec![(vm_evt_rdtube.get_read_notifier(), Token::VmEvent)];
    product::push_triggers(&mut triggers, &ipc_main_loop_tube, &proto_main_loop_tube);
    let wait_ctx = WaitContext::build_with(&triggers).exit_context(
        Exit::WaitContextAdd,
        "failed to add trigger to wait context",
    )?;

    #[cfg(feature = "balloon")]
    let mut balloon_tube = balloon_host_tube
        .map(|tube| -> Result<BalloonTube> {
            wait_ctx
                .add(tube.get_read_notifier(), Token::BalloonTube)
                .context("failed to add trigger to wait context")?;
            Ok(BalloonTube::new(tube))
        })
        .transpose()
        .context("failed to create balloon tube")?;

    let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
    let vm_memory_handler_thread_join_handle = std::thread::Builder::new()
        .name("vm_memory_handler_thread".into())
        .spawn({
            let vm = guest_os.vm.try_clone().context("failed to clone Vm")?;
            let sys_allocator_mutex = sys_allocator_mutex.clone();
            move || {
                vm_memory_handler_thread(
                    vm_memory_control_tubes,
                    vm,
                    sys_allocator_mutex,
                    gralloc,
                    vm_memory_handler_control_for_thread,
                )
            }
        })
        .unwrap();

    if let Some(evt) = broker_shutdown_evt.as_ref() {
        wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
            Exit::WaitContextAdd,
            "failed to add trigger to wait context",
        )?;
    }

    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, control_tube) in control_tubes.iter() {
        #[allow(clippy::single_match)]
        match control_tube {
            TaggedControlTube::Product(product_tube) => wait_ctx
                .add(
                    product_tube.get_read_notifier(),
                    Token::VmControl { id: *id },
                )
                .exit_context(
                    Exit::WaitContextAdd,
                    "failed to add trigger to wait context",
                )?,
            _ => (),
        }
    }

    let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
    guest_os.devices_thread = match create_devices_worker_thread(
        guest_os.vm.get_memory().clone(),
        guest_os.io_bus.clone(),
        guest_os.mmio_bus.clone(),
        device_ctrl_resp,
    ) {
        Ok(join_handle) => Some(join_handle),
        Err(e) => {
            return Err(anyhow!("Failed to start devices thread: {}", e));
        }
    };

    let vcpus: Vec<Option<Vcpu>> = match guest_os.vcpus.take() {
        Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
        None => iter::repeat_with(|| None)
            .take(guest_os.vcpu_count)
            .collect(),
    };

    let anti_tamper_main_thread_tube = spawn_anti_tamper_thread(&wait_ctx);

    let mut vm_control_server = create_control_server(control_server_path, &wait_ctx)?;

    let ime_thread = run_ime_thread(&mut product_args, &exit_evt)?;

    let original_terminal_mode = stdin().set_raw_mode().ok();
    let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
    let run_mode_arc = Arc::new(VcpuRunMode::default());

    let run_mode_state = if suspended {
        // Sleep devices before creating vcpus.
        device_ctrl_tube
            .send(&DeviceControlCommand::SleepDevices)
            .context("send command to devices control socket")?;
        match device_ctrl_tube
            .recv()
            .context("receive from devices control socket")?
        {
            VmResponse::Ok => (),
            resp => bail!("device sleep failed: {}", resp),
        }
        run_mode_arc.set_and_notify(VmRunMode::Suspending);
        VmRunMode::Suspending
    } else {
        VmRunMode::Running
    };

    // If we are restoring from a snapshot, then start suspended.
    if restore_path.is_some() {
        run_mode_arc.set_and_notify(VmRunMode::Suspending);
    }

    let (vcpu_threads, vcpu_control_channels) = run_all_vcpus(
        vcpus,
        vcpu_boxes.clone(),
        &guest_os,
        &exit_evt,
        &vm_evt_wrtube,
        #[cfg(feature = "stats")]
        &stats,
        host_cpu_topology,
        run_mode_arc.clone(),
        tsc_sync_mitigations,
        force_calibrated_tsc_leaf,
    )?;

    // See comment on `VmRequest::execute`.
    let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;

    // Restore VM (if applicable).
    if let Some(path) = restore_path {
        vm_control::do_restore(
            &path,
            |msg| {
                kick_all_vcpus(
                    run_mode_arc.as_ref(),
                    &vcpu_control_channels,
                    vcpu_boxes.as_ref(),
                    guest_os.irq_chip.as_ref(),
                    #[cfg(feature = "pvclock")]
                    &pvclock_host_tube,
                    &guest_os.resume_notify_devices,
                    msg,
                )
            },
            |msg, index| {
                kick_vcpu(
                    run_mode_arc.as_ref(),
                    &vcpu_control_channels,
                    vcpu_boxes.as_ref(),
                    guest_os.irq_chip.as_ref(),
                    index,
                    msg,
                )
            },
            &irq_handler_control,
            &device_ctrl_tube,
            guest_os.vcpu_count,
            |image| {
                guest_os
                    .irq_chip
                    .try_box_clone()?
                    .restore(image, guest_os.vcpu_count)
            },
            /* require_encrypted= */ false,
            &mut suspended_pvclock_state,
        )?;
        // Allow the vCPUs to start for real.
        kick_all_vcpus(
            run_mode_arc.as_ref(),
            &vcpu_control_channels,
            vcpu_boxes.as_ref(),
            guest_os.irq_chip.as_ref(),
            #[cfg(feature = "pvclock")]
            &pvclock_host_tube,
            &guest_os.resume_notify_devices,
            // Other platforms (unix) have multiple modes they could start in (e.g. starting for
            // guest kernel debugging, etc). If/when we support those modes on Windows, we'll need
            // to enter that mode here rather than VmRunMode::Running.
            VcpuControl::RunState(run_mode_state),
        );
    }

    let mut exit_state = ExitState::Stop;
    let mut region_state: VmMemoryRegionState = Default::default();

    'poll: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to wait: {}", e);
                    break;
                }
            }
        };

        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            let state = handle_readable_event(
                event,
                &mut vm_control_ids_to_remove,
                &mut next_control_id,
                &mut service_vm_state,
                disk_host_tubes.as_slice(),
                ipc_main_loop_tube.as_ref(),
                #[cfg(feature = "gpu")]
                gpu_control_tube.as_ref(),
                &vm_evt_rdtube,
                &mut control_tubes,
                &mut guest_os,
                &sys_allocator_mutex,
                &mut virtio_snd_host_mute_tubes,
                proto_main_loop_tube.as_ref(),
                &anti_tamper_main_thread_tube,
                #[cfg(feature = "balloon")]
                balloon_tube.as_mut(),
                memory_size_mb,
                vcpu_boxes.as_ref(),
                #[cfg(feature = "pvclock")]
                &pvclock_host_tube,
                run_mode_arc.as_ref(),
                &mut region_state,
                vm_control_server.as_mut(),
                &irq_handler_control,
                &device_ctrl_tube,
                &wait_ctx,
                force_s2idle,
                &vcpu_control_channels,
                &mut suspended_pvclock_state,
            )?;
            if let Some(state) = state {
                exit_state = state;
                break 'poll;
            }
        }
        remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;
    }

    info!("run_control poll loop completed, forcing vCPUs to exit...");

    // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
    run_mode_arc.set_and_notify(VmRunMode::Exiting);

    // Force all vcpus to exit from the hypervisor.
    for vcpu in vcpu_boxes.lock().iter() {
        vcpu.set_immediate_exit(true);
    }

    let mut res = Ok(exit_state);
    guest_os.irq_chip.kick_halted_vcpus();
    let _ = exit_evt.signal();

    if guest_os.devices_thread.is_some() {
        if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
            error!("failed to stop device control loop: {}", e);
        };
        if let Some(thread) = guest_os.devices_thread.take() {
            if let Err(e) = thread.join() {
                error!("failed to exit devices thread: {:?}", e);
            }
        }
    }

    // Shut down the VM memory handler thread.
    if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
        error!(
            "failed to request exit from VM memory handler thread: {}",
            e
        );
    }
    if let Err(e) = vm_memory_handler_thread_join_handle.join() {
        error!("failed to exit VM Memory handler thread: {:?}", e);
    }

    // Shut down the IRQ handler thread.
    if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
        error!("failed to request exit from IRQ handler thread: {}", e);
    }

    // Ensure any child threads have ended by sending the Exit vm event (possibly again) to ensure
    // their run loops are aborted.
    let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);

    for (i, thread) in vcpu_threads.into_iter().enumerate() {
        // Wait until all the threads exit, so that the guest_os.vm Arc reference count drops to
        // 1. Otherwise, we would leak memory if we force-killed the threads with terminate.
        match thread.join() {
            Ok(Err(e)) => {
                error!("vcpu thread {} exited with an error: {}", i, e);
                res = Err(e);
            }
            Ok(_) => {}
            Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
        }
    }

    info!("vCPU threads have exited.");

    if let Some(ime) = ime_thread {
        match ime.join() {
            Ok(Err(e)) => {
                error!("ime thread exited with an error: {}", e);
                if res.is_ok() {
                    // Prioritize past errors, but return this error if it is unique, otherwise
                    // just log it.
                    res = Err(e)
                }
            }
            Ok(_) => {}
            Err(e) => error!("ime thread panicked: {:?}", e),
        }
    }
    info!("IME thread has exited.");

    // This cancels all outstanding and any future blocking operations.
    // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Given that we are using a
    // global executor, a cleaner shutdown requires calling disarm so that all incoming requests
    // are run and then cancelled. If we call shutdown instead, all blocking threads go away,
    // incoming operations won't be scheduled to run, and they will be dropped, leading to a
    // panic. The ideal place to call shutdown is when we drop a non-global executor.
    cros_async::unblock_disarm();
    info!("blocking async pool has shut down.");

    let _ = irq_join_handle.join();
    info!("IrqWaitWorker has shut down.");

    #[cfg(feature = "stats")]
    if let Some(stats) = stats {
        println!("Statistics Collected:\n{}", stats.lock());
        println!("Statistics JSON:\n{}", stats.lock().json());
    }

    if let Some(mode) = original_terminal_mode {
        if let Err(e) = stdin().restore_mode(mode) {
            warn!("failed to restore terminal mode: {}", e);
        }
    }

    // Explicitly drop the VM structure here to allow the devices to clean up before the
    // control tubes are closed when this function exits.
    mem::drop(guest_os);

    info!("guest_os dropped, run_control is done.");

    res
}
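// Hedged usage sketch for the helper below: both event loops in this file
// collect the ids of tubes that reported `TubeError::Disconnected` during a
// poll iteration and prune them in one pass, e.g.
//
//     remove_closed_tubes(&wait_ctx, &mut control_tubes, vm_control_ids_to_remove)?;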
/// Remove Tubes that have been closed from the WaitContext.
fn remove_closed_tubes<T, U>(
    wait_ctx: &WaitContext<T>,
    tubes: &mut BTreeMap<usize, U>,
    mut tube_ids_to_remove: Vec<usize>,
) -> anyhow::Result<()>
where
    T: EventToken,
    U: ReadNotifier + CloseNotifier,
{
    tube_ids_to_remove.dedup();
    for id in tube_ids_to_remove {
        if let Some(socket) = tubes.remove(&id) {
            wait_ctx
                .delete(socket.get_read_notifier())
                .context("failed to remove descriptor from wait context")?;

            // There may be a close notifier registered for this Tube. If there isn't one
            // registered, we just ignore the error.
            let _ = wait_ctx.delete(socket.get_close_notifier());
        }
    }
    Ok(())
}

/// Sends a message to all VCPUs.
fn kick_all_vcpus(
    run_mode: &VcpuRunMode,
    vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    irq_chip: &dyn IrqChipArch,
    #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
    resume_notify_devices: &[Arc<Mutex<dyn BusResumeDevice>>],
    msg: VcpuControl,
) {
    // On Windows, we handle run mode switching directly rather than delegating to the VCPU
    // thread like unix does.
    match &msg {
        VcpuControl::RunState(VmRunMode::Suspending) => {
            suspend_all_vcpus(
                run_mode,
                vcpu_boxes,
                irq_chip,
                #[cfg(feature = "pvclock")]
                pvclock_host_tube,
            );
            return;
        }
        VcpuControl::RunState(VmRunMode::Running) => {
            for device in resume_notify_devices {
                device.lock().resume_imminent();
            }
            resume_all_vcpus(
                run_mode,
                vcpu_boxes,
                irq_chip,
                #[cfg(feature = "pvclock")]
                pvclock_host_tube,
            );
            return;
        }
        _ => (),
    }

    // For non-RunState commands, we dispatch just like unix would.
    for vcpu in vcpu_control_channels {
        if let Err(e) = vcpu.send(msg.clone()) {
            error!("failed to send VcpuControl message: {}", e);
        }
    }

    // Now that we've sent a message, we need VCPUs to exit so they can process it.
    for vcpu in vcpu_boxes.lock().iter() {
        vcpu.set_immediate_exit(true);
    }
    irq_chip.kick_halted_vcpus();

    // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it
    // processes the control message.
    let current_run_mode = run_mode.get_mode();
    if current_run_mode != VmRunMode::Running {
        run_mode.set_and_notify(current_run_mode);
    }
}

/// Sends a message to a single VCPU. On Windows, `VcpuControl::RunState` cannot be sent to a
/// single VCPU.
fn kick_vcpu(
    run_mode: &VcpuRunMode,
    vcpu_control_channels: &[mpsc::Sender<VcpuControl>],
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    irq_chip: &dyn IrqChipArch,
    index: usize,
    msg: VcpuControl,
) {
    assert!(
        !matches!(msg, VcpuControl::RunState(_)),
        "Windows does not support RunState changes on a per VCPU basis"
    );

    let vcpu = vcpu_control_channels
        .get(index)
        .expect("invalid vcpu index specified");
    if let Err(e) = vcpu.send(msg) {
        error!("failed to send VcpuControl message: {}", e);
    }

    // Now that we've sent a message, we need the VCPU to exit so it can
    // process the message.
    vcpu_boxes
        .lock()
        .get(index)
        .expect("invalid vcpu index specified")
        .set_immediate_exit(true);
    irq_chip.kick_halted_vcpus();

    // If the VCPU isn't running, we have to notify the run_mode condvar to wake it so it
    // processes the control message. (Technically this wakes all VCPUs, but those without
    // messages will go back to sleep.)
    let current_run_mode = run_mode.get_mode();
    if current_run_mode != VmRunMode::Running {
        run_mode.set_and_notify(current_run_mode);
    }
}
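// Hedged sketch (not from the original source) of how the two helpers below
// pair up around a critical section that must observe a frozen guest; the
// bindings are hypothetical names for values available in run_control:
//
//     suspend_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, &pvclock_host_tube);
//     // ... snapshot or mutate guest state while the vCPUs are parked ...
//     resume_all_vcpus(&run_mode, &vcpu_boxes, irq_chip, &pvclock_host_tube);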
/// Suspends all VCPUs. The VM will be effectively frozen in time once this function is called,
/// though devices on the host will continue to run.
pub(crate) fn suspend_all_vcpus(
    run_mode: &VcpuRunMode,
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    irq_chip: &dyn IrqChipArch,
    #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
) {
    // VCPU threads MUST see the VmRunMode::Suspending flag first, otherwise
    // they may re-enter the VM.
    run_mode.set_and_notify(VmRunMode::Suspending);

    // Force all vcpus to exit from the hypervisor.
    for vcpu in vcpu_boxes.lock().iter() {
        vcpu.set_immediate_exit(true);
    }
    irq_chip.kick_halted_vcpus();

    #[cfg(feature = "pvclock")]
    handle_pvclock_request(pvclock_host_tube, PvClockCommand::Suspend)
        .unwrap_or_else(|e| error!("Error handling pvclock suspend: {:?}", e));
}

/// Resumes all VCPUs.
pub(crate) fn resume_all_vcpus(
    run_mode: &VcpuRunMode,
    vcpu_boxes: &Mutex<Vec<Box<dyn VcpuArch>>>,
    irq_chip: &dyn IrqChipArch,
    #[cfg(feature = "pvclock")] pvclock_host_tube: &Option<Tube>,
) {
    #[cfg(feature = "pvclock")]
    handle_pvclock_request(pvclock_host_tube, PvClockCommand::Resume)
        .unwrap_or_else(|e| error!("Error handling pvclock resume: {:?}", e));

    // Make sure any immediate exit bits are disabled.
    for vcpu in vcpu_boxes.lock().iter() {
        vcpu.set_immediate_exit(false);
    }

    run_mode.set_and_notify(VmRunMode::Running);
}

#[cfg(feature = "gvm")]
const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
    major: 1,
    minor: 4,
    patch: 1,
};

#[cfg(feature = "gvm")]
fn create_gvm_vm(gvm: Gvm, mem: GuestMemory) -> Result<GvmVm> {
    match gvm.get_full_version() {
        Ok(version) => {
            if version < GVM_MINIMUM_VERSION {
                error!(
                    "GVM version {} is below minimum version {}",
                    version, GVM_MINIMUM_VERSION
                );
                return Err(base::Error::new(libc::ENXIO).into());
            } else {
                info!("Using GVM version {}.", version)
            }
        }
        Err(e) => {
            error!("unable to determine gvm version: {}", e);
            return Err(base::Error::new(libc::ENXIO).into());
        }
    }
    let vm = GvmVm::new(&gvm, mem)?;
    Ok(vm)
}
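/// Creates a HAXM VM, optionally registering a kernel log file when the
/// driver advertises `HAX_CAP_VM_LOG`; on drivers without that capability the
/// request is logged and ignored.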
#[cfg(feature = "haxm")]
fn create_haxm_vm(
    haxm: Haxm,
    mem: GuestMemory,
    kernel_log_file: &Option<String>,
) -> Result<HaxmVm> {
    let vm = HaxmVm::new(&haxm, mem)?;
    if let Some(path) = kernel_log_file {
        use hypervisor::haxm::HAX_CAP_VM_LOG;
        if vm.check_raw_capability(HAX_CAP_VM_LOG) {
            match vm.register_log_file(path) {
                Ok(_) => {}
                Err(e) => match e.errno() {
                    libc::E2BIG => {
                        error!(
                            "kernel_log_file path is too long, kernel log file will not be \
                            written"
                        );
                    }
                    _ => return Err(e.into()),
                },
            }
        } else {
            warn!(
                "kernel_log_file specified but this version of HAXM does not support kernel log \
                files"
            );
        }
    }
    Ok(vm)
}

#[cfg(feature = "whpx")]
#[cfg(target_arch = "x86_64")]
fn create_whpx_vm(
    whpx: Whpx,
    mem: GuestMemory,
    cpu_count: usize,
    no_smt: bool,
    apic_emulation: bool,
    force_calibrated_tsc_leaf: bool,
    vm_evt_wrtube: SendTube,
) -> Result<WhpxVm> {
    let cpu_config = hypervisor::CpuConfigX86_64::new(
        force_calibrated_tsc_leaf,
        false, /* host_cpu_topology */
        false, /* enable_hwp */
        no_smt,
        false, /* itmt */
        None,  /* hybrid_type */
    );

    // Context for non-cpu-specific cpuid results.
    let ctx = CpuIdContext::new(
        0,
        cpu_count,
        None,
        cpu_config,
        whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
        __cpuid_count,
        __cpuid,
    );

    // Get all cpuid entries that we should pre-set.
    let mut cpuid = whpx.get_supported_cpuid()?;

    // Adjust them for crosvm.
    for entry in cpuid.cpu_id_entries.iter_mut() {
        adjust_cpuid(entry, &ctx);
    }

    let vm = WhpxVm::new(
        &whpx,
        cpu_count,
        mem,
        cpuid,
        apic_emulation,
        Some(vm_evt_wrtube),
    )
    .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
    Ok(vm)
}

#[cfg(feature = "gvm")]
fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
    info!("Creating GVM irqchip");
    let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
    Ok(irq_chip)
}

#[cfg(feature = "whpx")]
#[cfg(target_arch = "x86_64")]
fn create_whpx_split_irq_chip(
    vm: &WhpxVm,
    ioapic_device_tube: Tube,
) -> base::Result<WhpxSplitIrqChip> {
    info!("Creating WHPX split irqchip");
    WhpxSplitIrqChip::new(
        vm.try_clone()?,
        ioapic_device_tube,
        None, // ioapic_pins
    )
}

fn create_userspace_irq_chip<Vcpu>(
    vcpu_count: usize,
    ioapic_device_tube: Tube,
) -> base::Result<UserspaceIrqChip<Vcpu>>
where
    Vcpu: VcpuArch + 'static,
{
    info!("Creating userspace irqchip");
    let irq_chip =
        UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /* ioapic_pins: */ None)?;
    Ok(irq_chip)
}

pub fn get_default_hypervisor() -> Option<HypervisorKind> {
    // The ordering here matters, from most preferable to least.
    #[cfg(feature = "whpx")]
    match hypervisor::whpx::Whpx::is_enabled() {
        true => return Some(HypervisorKind::Whpx),
        false => warn!("Whpx not enabled."),
    };

    #[cfg(feature = "haxm")]
    match Haxm::new() {
        Ok(_) => return Some(HypervisorKind::Ghaxm),
        Err(e) => warn!("Cannot initialize HAXM: {}", e),
    };

    #[cfg(feature = "gvm")]
    // Make sure the GVM device can be opened before selecting it.
    match Gvm::new() {
        Ok(_) => return Some(HypervisorKind::Gvm),
        Err(e) => warn!("Cannot initialize GVM: {}", e),
    };

    None
}
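// Hedged usage sketch (this mirrors run_config_inner below): an explicitly
// configured hypervisor wins, otherwise the first available of WHPX, then
// HAXM/GHAXM, then GVM is selected:
//
//     let hypervisor = cfg
//         .hypervisor
//         .or_else(get_default_hypervisor)
//         .context("no enabled hypervisor")?;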
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
                format!("failed to open initrd {}", initrd_path.display())
            })?,
        )
    } else {
        None
    };

    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
                format!("failed to open kernel image {}", kernel_path.display(),)
            })?,
        ),
        Some(Executable::Bios(ref bios_path)) => {
            VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
                format!("failed to open bios {}", bios_path.display())
            })?)
        }
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    Ok(VmComponents {
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        fw_cfg_enable: false,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        cpu_clusters: cfg.cpu_clusters.clone(),
        cpu_capacity: cfg.cpu_capacity.clone(),
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
                    format!("failed to open android fstab file {}", x.display())
                })
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
                    format!("failed to open ACPI file {}", path.display())
                })
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        host_cpu_topology: cfg.host_cpu_topology,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        itmt: false,
        pvm_fw: None,
        pci_config: cfg.pci_config,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        boot_cpu: cfg.boot_cpu,
    })
}

// Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
enum WindowsIrqChip<V: VcpuArch> {
    Userspace(UserspaceIrqChip<V>),
    #[cfg(feature = "gvm")]
    Gvm(GvmIrqChip),
    #[cfg(feature = "whpx")]
    WhpxSplit(WhpxSplitIrqChip),
}

impl<V: VcpuArch + 'static> WindowsIrqChip<V> {
    // Convert our enum to a &mut dyn IrqChipArch.
    fn as_mut(&mut self) -> &mut dyn IrqChipArch {
        match self {
            WindowsIrqChip::Userspace(i) => i,
            #[cfg(feature = "gvm")]
            WindowsIrqChip::Gvm(i) => i,
            #[cfg(feature = "whpx")]
            WindowsIrqChip::WhpxSplit(i) => i,
        }
    }
}

/// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread
/// will need access to it when tracing is enabled.
static TSC_OFFSETS: sync::Mutex<Vec<Option<u64>>> = sync::Mutex::new(Vec::new());
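// Hedged sketch (hypothetical bindings, not from the original source): each
// vCPU thread records its own offset once it is known, e.g.
//
//     save_vcpu_tsc_offset(tsc_offset, vcpu_id);
//
// and the function below logs the standard deviation across all recorded
// offsets so that divergent vCPU clocks are visible in the logs.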
/// Save the TSC offset for a particular vcpu.
///
/// After setting the TSC offset for a vcpu, this function checks the standard deviation of
/// offsets for all the VCPUs and logs this information. If the TSC offsets differ too much
/// between vcpus it can cause clock issues in the guest.
pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
    let offsets_copy = {
        let mut offsets = TSC_OFFSETS.lock();
        // Make sure the offsets vec is large enough before inserting.
        let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
        offsets.resize(newlen, None);
        offsets[vcpu_id] = Some(offset);

        offsets.clone()
    };

    // Do statistics on a clone of the offsets so we don't hold up other vcpus at this point.
    info!(
        "TSC offset standard deviation is: {}",
        standard_deviation(
            &offsets_copy
                .iter()
                .filter(|x| x.is_some())
                .map(|x| x.unwrap() as u128)
                .collect::<Vec<u128>>()
        )
    );
}

/// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in
/// TSC_OFFSETS.
#[cfg(feature = "perfetto")]
pub fn get_vcpu_tsc_offset() -> u64 {
    if let Some(offset) = TSC_OFFSETS.lock().iter().flatten().next() {
        return *offset;
    }
    0
}

/// Callback that is registered with the tracing crate, and will be called by the tracing thread
/// when tracing is enabled or disabled. Regardless of whether tracing is being enabled or
/// disabled for a given category or instance, we just emit a clock snapshot that maps the guest
/// TSC to the host TSC. Redundant snapshots should not be a problem for perfetto.
#[cfg(feature = "perfetto")]
fn set_tsc_clock_snapshot() {
    let freq = match devices::tsc::tsc_frequency() {
        Err(e) => {
            error!(
                "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
                e
            );
            return;
        }
        Ok(freq) => freq,
    };

    // The offset is the host-guest tsc value.
    let offset = get_vcpu_tsc_offset();
    // SAFETY: _rdtsc takes no arguments.
    let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
    perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
        // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
        // support floating point multipliers yet. So for now we set the freq in Hz and rely
        // on the merge tool to fix it.
        perfetto::Clock::new(
            perfetto::BuiltinClock::Tsc as u32,
            host_tsc.wrapping_add(offset),
        )
        .set_multiplier(freq as u64),
        perfetto::Clock::new(
            // The host builtin clock ids are all offset from the guest ids by
            // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
            // contains both a guest and host clock, we need to offset it before merge.
            perfetto::BuiltinClock::Tsc as u32 + cros_tracing::HOST_GUEST_CLOCK_ID_OFFSET,
            host_tsc,
        )
        .set_multiplier(freq as u64),
    ));
}
/// Launches run_config for the broker, reading configuration from a TubeTransporter.
pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
    let tube_transporter =
        // SAFETY:
        // Safe because we know that raw_transport_tube is valid (passed by inheritance), and that
        // the blocking & framing modes are accurate because we create them ourselves in the broker.
        unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };

    let mut tube_data_list = tube_transporter
        .read_tubes()
        .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;

    let bootstrap_tube = tube_data_list
        .get_tube(TubeToken::Bootstrap)
        .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;

    let mut cfg: Config = bootstrap_tube
        .recv::<Config>()
        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;

    let startup_args: CommonChildStartupArgs = bootstrap_tube
        .recv::<CommonChildStartupArgs>()
        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;

    let _child_cleanup = common_child_setup(startup_args).exit_context(
        Exit::CommonChildSetupError,
        "failed to perform common child setup",
    )?;

    cfg.broker_shutdown_event = Some(
        bootstrap_tube
            .recv::<Event>()
            .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
    );

    #[cfg(feature = "crash-report")]
    let crash_tube_map = bootstrap_tube
        .recv::<HashMap<ProcessType, Vec<SendTube>>>()
        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
    #[cfg(feature = "crash-report")]
    crash_report::set_crash_tube_map(crash_tube_map);

    let BrokerTubes {
        vm_evt_wrtube,
        vm_evt_rdtube,
    } = bootstrap_tube
        .recv::<BrokerTubes>()
        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;

    run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
}

pub fn run_config(cfg: Config) -> Result<ExitState> {
    let _raise_timer_resolution = enable_high_res_timers()
        .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;

    // There is no broker when using run_config(), so the vm_evt tubes need to be created.
    let (vm_evt_wrtube, vm_evt_rdtube) =
        Tube::directional_pair().context("failed to create vm event tube")?;

    run_config_inner(cfg, vm_evt_wrtube, vm_evt_rdtube)
}

fn create_guest_memory(
    components: &VmComponents,
    arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
    hypervisor: &impl Hypervisor,
) -> Result<GuestMemory> {
    let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
        .exit_context(
            Exit::GuestMemoryLayout,
            "failed to create guest memory layout",
        )?;
    GuestMemory::new_with_options(&guest_mem_layout)
        .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")
}

fn run_config_inner(
    cfg: Config,
    vm_evt_wrtube: SendTube,
    vm_evt_rdtube: RecvTube,
) -> Result<ExitState> {
    product::setup_common_metric_invariants(&cfg);

    #[cfg(feature = "perfetto")]
    cros_tracing::add_per_trace_callback(set_tsc_clock_snapshot);

    let components: VmComponents = setup_vm_components(&cfg)?;
    let arch_memory_layout = Arch::arch_memory_layout(&components)?;

    #[allow(unused_mut)]
    let mut hypervisor = cfg
        .hypervisor
        .or_else(get_default_hypervisor)
        .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;

    #[cfg(feature = "whpx")]
    if hypervisor::whpx::Whpx::is_enabled() {
        // If WHPX is enabled, no other hypervisor can be used, so just override it.
        hypervisor = HypervisorKind::Whpx;
    }
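
    // Summary of the hypervisor/irqchip pairings handled by the match below:
    // - HAXM/GHAXM uses the userspace irqchip only.
    // - WHPX defaults to the split irqchip when local APIC emulation is supported, otherwise
    //   userspace; a kernel irqchip is unimplemented.
    // - GVM defaults to its kernel irqchip, with userspace as an option; split is unimplemented.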
    match hypervisor {
        #[cfg(feature = "haxm")]
        HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
            if hypervisor == HypervisorKind::Haxm {
                set_use_ghaxm(false);
            }
            info!("Creating HAXM ghaxm={}", get_use_ghaxm());
            let haxm = Haxm::new()?;
            let guest_mem = create_guest_memory(&components, &arch_memory_layout, &haxm)?;
            let vm = create_haxm_vm(haxm, guest_mem, &cfg.kernel_log_file)?;
            let (ioapic_host_tube, ioapic_device_tube) =
                Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
            let irq_chip =
                create_userspace_irq_chip::<HaxmVcpu>(components.vcpu_count, ioapic_device_tube)?;
            run_vm::<HaxmVcpu, HaxmVm>(
                cfg,
                components,
                &arch_memory_layout,
                vm,
                WindowsIrqChip::Userspace(irq_chip).as_mut(),
                Some(ioapic_host_tube),
                vm_evt_wrtube,
                vm_evt_rdtube,
            )
        }
        #[cfg(feature = "whpx")]
        HypervisorKind::Whpx => {
            let apic_emulation_supported =
                Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation)
                    .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?;

            let no_smt = cfg.no_smt;

            // Default to WhpxSplitIrqChip if it's supported because it's more performant.
            let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported {
                IrqChipKind::Split
            } else {
                IrqChipKind::Userspace
            });

            // Both WHPX irq chips use a userspace IOAPIC.
            let (ioapic_host_tube, ioapic_device_tube) =
                Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;

            info!("Creating Whpx");
            let whpx = Whpx::new()?;
            let guest_mem = create_guest_memory(&components, &arch_memory_layout, &whpx)?;
            let vm = create_whpx_vm(
                whpx,
                guest_mem,
                components.vcpu_count,
                no_smt,
                apic_emulation_supported && irq_chip == IrqChipKind::Split,
                cfg.force_calibrated_tsc_leaf,
                vm_evt_wrtube
                    .try_clone()
                    .expect("could not clone vm_evt_wrtube"),
            )?;

            let mut irq_chip = match irq_chip {
                IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"),
                IrqChipKind::Split => {
                    if !apic_emulation_supported {
                        panic!(
                            "split irqchip specified but your WHPX version does not support \
                             local apic emulation"
                        );
                    }
                    WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?)
                }
                IrqChipKind::Userspace => {
                    WindowsIrqChip::Userspace(create_userspace_irq_chip::<WhpxVcpu>(
                        components.vcpu_count,
                        ioapic_device_tube,
                    )?)
                }
            };
            run_vm::<WhpxVcpu, WhpxVm>(
                cfg,
                components,
                &arch_memory_layout,
                vm,
                irq_chip.as_mut(),
                Some(ioapic_host_tube),
                vm_evt_wrtube,
                vm_evt_rdtube,
            )
        }
        #[cfg(feature = "gvm")]
        HypervisorKind::Gvm => {
            info!("Creating GVM");
            let gvm = Gvm::new()?;
            let guest_mem = create_guest_memory(&components, &arch_memory_layout, &gvm)?;
            let vm = create_gvm_vm(gvm, guest_mem)?;
            let ioapic_host_tube;
            let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
                IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"),
                IrqChipKind::Kernel => {
                    ioapic_host_tube = None;
                    WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?)
                }
                IrqChipKind::Userspace => {
                    let (host_tube, ioapic_device_tube) =
                        Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
                    ioapic_host_tube = Some(host_tube);
                    WindowsIrqChip::Userspace(create_userspace_irq_chip::<GvmVcpu>(
                        components.vcpu_count,
                        ioapic_device_tube,
                    )?)
                }
            };
            run_vm::<GvmVcpu, GvmVm>(
                cfg,
                components,
                &arch_memory_layout,
                vm,
                irq_chip.as_mut(),
                ioapic_host_tube,
                vm_evt_wrtube,
                vm_evt_rdtube,
            )
        }
    }
}

#[cfg(any(feature = "haxm", feature = "gvm", feature = "whpx"))]
fn run_vm<Vcpu, V>(
    #[allow(unused_mut)] mut cfg: Config,
    #[allow(unused_mut)] mut components: VmComponents,
    arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
    mut vm: V,
    irq_chip: &mut dyn IrqChipArch,
    ioapic_host_tube: Option<Tube>,
    vm_evt_wrtube: SendTube,
    vm_evt_rdtube: RecvTube,
) -> Result<ExitState>
where
    Vcpu: VcpuArch + 'static,
    V: VmArch + 'static,
{
    let vm_memory_size_mb = components.memory_size / (1024 * 1024);
    let mut control_tubes = Vec::new();
    let mut irq_control_tubes = Vec::new();
    let mut vm_memory_control_tubes = Vec::new();
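
    // The control channels below all follow the same pattern (sketch, not part of the build):
    // `Tube::pair()` yields two connected ends; one stays with the main process for use in
    // run_control, and the other is handed to a device or worker:
    //
    //     let (host_tube, device_tube) = Tube::pair()?;
    //     // host_tube stays here; device_tube moves into the device.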
    // Create one control tube per disk.
    let mut disk_device_tubes = Vec::new();
    let mut disk_host_tubes = Vec::new();
    let disk_count = cfg.disks.len();
    for _ in 0..disk_count {
        let (disk_host_tube, disk_device_tube) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        disk_host_tubes.push(disk_host_tube);
        disk_device_tubes.push(disk_device_tube);
    }

    if let Some(ioapic_host_tube) = ioapic_host_tube {
        irq_control_tubes.push(ioapic_host_tube);
    }

    // Balloon gets a special socket so balloon requests can be forwarded from the main process.
    let (balloon_host_tube, balloon_device_tube) = if cfg.balloon {
        let (balloon_host_tube, balloon_device_tube) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        (Some(balloon_host_tube), Some(balloon_device_tube))
    } else {
        (None, None)
    };

    // The balloon device also needs a tube to communicate back to the main process to
    // handle remapping memory dynamically.
    let dynamic_mapping_device_tube = if cfg.balloon {
        let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        vm_memory_control_tubes.push(dynamic_mapping_host_tube);
        Some(dynamic_mapping_device_tube)
    } else {
        None
    };

    // PvClock gets a tube for handling suspend/resume requests from the main thread.
    #[cfg(feature = "pvclock")]
    let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock {
        let (host, device) =
            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
        (Some(host), Some(device))
    } else {
        (None, None)
    };

    let gralloc = RutabagaGralloc::new(RutabagaGrallocBackendFlags::new())
        .exit_context(Exit::CreateGralloc, "failed to create gralloc")?;

    let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);

    let mut sys_allocator = SystemAllocator::new(
        Arch::get_system_allocator_config(&vm, arch_memory_layout),
        pstore_size,
        &cfg.mmio_address_ranges,
    )
    .context("failed to create system allocator")?;

    // Allocate the ramoops region first.
    let ramoops_region = match &components.pstore {
        Some(pstore) => Some(
            arch::pstore::create_memory_region(
                &mut vm,
                sys_allocator.reserved_region().unwrap(),
                pstore,
            )
            .exit_context(
                Exit::Pstore,
                format!("failed to allocate pstore region {:?}", &components.pstore),
            )?,
        ),
        None => None,
    };

    let init_balloon_size = components
        .memory_size
        .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| {
            m.checked_mul(1024 * 1024).unwrap_or(u64::MAX)
        }))
        .context("failed to calculate init balloon size")?;
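
    // Worked example (illustrative numbers only): with a 4096 MiB memory_size and
    // cfg.init_memory of 1024 MiB, the balloon initially holds (4096 - 1024) MiB, in bytes.
    // When cfg.init_memory is unset, map_or returns memory_size itself, so the initial
    // balloon size is 0.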
    let tsc_state = devices::tsc::tsc_state().exit_code(Exit::TscCalibrationFailed)?;
    let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count);

    if tsc_state.core_grouping.size() > 1 {
        // Host TSCs are not in sync, log a metric about it.
        warn!(
            "Host TSCs are not in sync, applying the following mitigations: {:?}",
            tsc_sync_mitigations
        );
        log_descriptor(
            MetricEventType::TscCoresOutOfSync,
            // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask
            tsc_state.core_grouping.core_grouping_bitmask() as i64,
        );
    }

    #[cfg(feature = "gpu")]
    let gpu_control_tube = cfg
        .gpu_vmm_config
        .as_mut()
        .and_then(|config| config.gpu_control_host_tube.take());
    let product_args = product::get_run_control_args(&mut cfg);

    // We open these files before lowering the token, as in the future a stricter policy may
    // prevent it.
    let dt_overlays = cfg
        .device_tree_overlay
        .iter()
        .map(|o| {
            Ok(DtbOverlay {
                file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
                    .with_context(|| {
                        format!("failed to open device tree overlay {}", o.path.display())
                    })?,
            })
        })
        .collect::<Result<Vec<DtbOverlay>>>()?;

    // Lower the token, locking the main process down to a stricter security policy.
    //
    // WARNING:
    //
    // Windows system calls can behave in unusual ways if they happen concurrently to the token
    // lowering. For example, access denied can happen if Tube pairs are created in another thread
    // (b/281108137), and lower_token happens right before the client pipe is connected. Tubes are
    // not privileged resources, but can be broken due to the token changing unexpectedly.
    //
    // We explicitly lower the token here and *then* call run_control to make it clear that any
    // resources that require a privileged token should be created on the main thread & passed
    // into run_control, to follow the correct order:
    // - Privileged resources are created.
    // - Token is lowered.
    // - Threads are spawned & may create more non-privileged resources (without fear of the token
    //   changing at an undefined time).
    //
    // Recommendation: If you find your code doesn't work in run_control because of the sandbox,
    // you should split any resource creation to before this token lowering & pass the resources
    // into run_control. Don't move the token lowering somewhere else without considering
    // multi-threaded effects.
    #[cfg(feature = "sandbox")]
    if sandbox::is_sandbox_target() {
        sandbox::TargetServices::get()
            .exit_code_from_err("failed to create sandbox")?
            .expect("Could not create sandbox!")
            .lower_token();
    }

    let virtio_snd_state_device_tube = create_snd_state_tube(&mut control_tubes)?;

    let (virtio_snd_host_mute_tube, virtio_snd_device_mute_tube) = create_snd_mute_tube_pair()?;

    let mut initial_audio_session_states: Vec<InitialAudioSessionState> = Vec::new();

    let pci_devices = create_devices(
        &mut cfg,
        vm.get_memory(),
        &vm_evt_wrtube,
        &mut irq_control_tubes,
        &mut vm_memory_control_tubes,
        &mut control_tubes,
        &mut disk_device_tubes,
        &mut initial_audio_session_states,
        balloon_device_tube,
        #[cfg(feature = "pvclock")]
        pvclock_device_tube,
        dynamic_mapping_device_tube,
        /* inflate_tube= */ None,
        init_balloon_size,
        tsc_state.frequency,
        virtio_snd_state_device_tube,
        virtio_snd_device_mute_tube,
    )?;

    let mut vcpu_ids = Vec::new();

    let (vwmdt_host_tube, vmwdt_device_tube) = Tube::pair().context("failed to create tube")?;
    let windows = Arch::build_vm::<V, Vcpu>(
        components,
        arch_memory_layout,
        &vm_evt_wrtube,
        &mut sys_allocator,
        &cfg.serial_parameters,
        None,
        (cfg.battery_config.as_ref().map(|t| t.type_), None),
        vm,
        ramoops_region,
        pci_devices,
        irq_chip,
        &mut vcpu_ids,
        cfg.dump_device_tree_blob.clone(),
        /* debugcon_jail= */ None,
        None,
        None,
        /* guest_suspended_cvar= */ None,
        dt_overlays,
        cfg.fdt_position,
        cfg.no_pmu,
    )
    .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?;

    #[cfg(feature = "stats")]
    let stats = if cfg.exit_stats {
        Some(Arc::new(Mutex::new(StatisticsCollector::new())))
    } else {
        None
    };

    run_control(
        windows,
        sys_allocator,
        control_tubes,
        irq_control_tubes,
        vm_memory_control_tubes,
        vm_evt_rdtube,
        vm_evt_wrtube,
        #[cfg(feature = "gpu")]
        gpu_control_tube,
        cfg.broker_shutdown_event.take(),
        balloon_host_tube,
        #[cfg(feature = "pvclock")]
        pvclock_host_tube,
        disk_host_tubes,
        initial_audio_session_states,
        gralloc,
        #[cfg(feature = "stats")]
        stats,
        cfg.service_pipe_name,
        vm_memory_size_mb,
        cfg.host_cpu_topology,
        tsc_sync_mitigations,
        cfg.force_calibrated_tsc_leaf,
        product_args,
        match virtio_snd_host_mute_tube {
            Some(virtio_snd_host_mute_tube) => vec![virtio_snd_host_mute_tube],
            None => vec![],
        },
        cfg.restore_path,
        cfg.socket_path,
        cfg.force_s2idle,
        cfg.suspended,
    )
}

#[cfg(test)]
mod tests {
    use tempfile::TempDir;

    use super::*;

    fn create_config(test_dir: &TempDir) -> Config {
        let mut config = Config::default();

        let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt");
        OpenOptions::new()
            .create_new(true)
            .write(true)
            .open(&dummy_kernel_path)
            .expect("Could not open file!");
        config.executable_path = Some(Executable::Kernel(dummy_kernel_path));

        config
    }

    #[test]
    #[should_panic(expected = "Did not receive a bios or kernel")]
    fn setup_vm_components_panics_when_no_kernel_provided() {
        let mut config =
            create_config(&TempDir::new().expect("Could not create temporary directory!"));
        config.executable_path = None;
        let _ = setup_vm_components(&config);
    }

    #[test]
    fn setup_vm_components_stores_memory_in_bytes() {
        let tempdir = TempDir::new().expect("Could not create temporary directory!");
        let mut config = create_config(&tempdir);
        config.memory = Some(1);
        let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
        assert_eq!(vm_components.memory_size, 1024 * 1024);
    }
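
    // A sketch of an additional check (not in the original suite); it assumes
    // Config::default() leaves `memory` unset, so setup_vm_components() should fall back
    // to the 256 MiB default.
    #[test]
    fn setup_vm_components_defaults_memory_to_256_mib() {
        let tempdir = TempDir::new().expect("Could not create temporary directory!");
        let mut config = create_config(&tempdir);
        config.memory = None;
        let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
        assert_eq!(vm_components.memory_size, 256 * 1024 * 1024);
    }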

    #[test]
    fn setup_vm_components_fails_when_memory_too_large() {
        let tempdir = TempDir::new().expect("Could not create temporary directory!");
        let mut config = create_config(&tempdir);
        // One MiB more than a u64 can hold in bytes.
        config.memory = Some((u64::MAX / 1024 / 1024) + 1);
        setup_vm_components(&config).err().expect("expected error");
    }
}