1 // Copyright 2024 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fs::File;
6 use std::path::PathBuf;
7 use std::sync::atomic::AtomicU32;
8 use std::sync::atomic::Ordering;
9 use std::sync::Arc;
10 use std::time::Duration;
11
12 use anyhow::Context;
13 use base::sched_attr;
14 use base::sched_setattr;
15 use base::set_cpu_affinity;
16 use base::warn;
17 use base::Error;
18 use base::Event;
19 use base::EventToken;
20 use base::Timer;
21 use base::TimerTrait;
22 use base::Tube;
23 use base::WaitContext;
24 use base::WorkerThread;
25 use sync::Mutex;
26
27 use crate::pci::CrosvmDeviceId;
28 use crate::BusAccessInfo;
29 use crate::BusDevice;
30 use crate::DeviceId;
31 use crate::Suspendable;
32
/// Percentage applied to the guest-requested utilization when the host cpufreq governor is
/// anything other than `schedutil`. Also used as the divisor when applying a scale factor.
const CPUFREQ_GOV_SCALE_FACTOR_DEFAULT: u32 = 100;
/// Percentage applied to the guest-requested utilization when the host cpufreq governor is
/// `schedutil`.
const CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL: u32 = 80;

// `sched_flags` bits handed to `sched_setattr`.
// NOTE(review): these appear to mirror the SCHED_FLAG_* values from the Linux UAPI header
// `linux/sched.h` -- confirm against the kernel headers in use.
const SCHED_FLAG_RESET_ON_FORK: u64 = 1 << 0;
const SCHED_FLAG_KEEP_POLICY: u64 = 1 << 3;
const SCHED_FLAG_KEEP_PARAMS: u64 = 1 << 4;
const SCHED_FLAG_UTIL_CLAMP_MIN: u64 = 1 << 5;
const SCHED_FLAG_UTIL_CLAMP_MAX: u64 = 1 << 6;
/// Both "keep" bits combined.
const SCHED_FLAG_KEEP_ALL: u64 = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS;
/// Value used for a fully clamped utilization (`sched_util_min`/`sched_util_max`) and as the
/// denominator when converting a utilization into a percentage.
const SCHED_CAPACITY_SCALE: u32 = 1024;

// Register offsets of the device, matched against the bus access offset.
/// Read: current performance of the vCPU.
const VCPUFREQ_CUR_PERF: u32 = 0x00;
/// Write: performance level requested by the guest.
const VCPUFREQ_SET_PERF: u32 = 0x04;
/// Read: number of entries in the vCPU frequency table.
const VCPUFREQ_FREQTBL_LEN: u32 = 0x08;
/// Write: index of the frequency table entry returned by `VCPUFREQ_FREQTBL_RD`.
const VCPUFREQ_FREQTBL_SEL: u32 = 0x0c;
/// Read: frequency table entry at the selected index (0 when the index is out of range).
const VCPUFREQ_FREQTBL_RD: u32 = 0x10;
/// Read: performance domain of the vCPU.
const VCPUFREQ_PERF_DOMAIN: u32 = 0x14;

// Timer values in microseconds
/// Throttle duration sent with each throttle request; also the base of the timer period.
const MIN_TIMER_US: u32 = 75;
/// Added to `MIN_TIMER_US` when computing the throttle timer period.
const TIMER_OVERHEAD_US: u32 = 15;
55
/// Upstream linux compatible version of the virtual cpufreq interface
///
/// The device exposes a small set of 4-byte registers (the `VCPUFREQ_*` offsets) through
/// which the guest reads the current performance and the frequency table and writes the
/// performance level it requests.
pub struct VirtCpufreqV2 {
    /// Frequency table exposed to the guest through the `VCPUFREQ_FREQTBL_*` registers.
    vcpu_freq_table: Vec<u32>,
    /// Maximum frequency of `pcpu`, read from sysfs `cpufreq/cpuinfo_max_freq`.
    pcpu_fmax: u32,
    /// Capacity of `pcpu`, read from sysfs `cpu_capacity`.
    pcpu_capacity: u32,
    /// Host CPU whose sysfs attributes are consulted.
    pcpu: u32,
    /// Percentage applied to the requested utilization; chosen from the host cpufreq governor.
    util_factor: u32,
    /// Index into `vcpu_freq_table` selected by the last `VCPUFREQ_FREQTBL_SEL` write.
    freqtbl_sel: u32,
    /// Value returned for `VCPUFREQ_PERF_DOMAIN` reads; also offsets the worker's pCPU.
    vcpu_domain: u32,
    /// Handle to `cpu.uclamp.min` under the domain cgroup path, when a path was supplied.
    domain_uclamp_min: Option<File>,
    /// Handle to `cpu.uclamp.max` under the domain cgroup path, when a path was supplied.
    domain_uclamp_max: Option<File>,
    /// Largest entry of `vcpu_freq_table`.
    vcpu_fmax: u32,
    /// Capacity of the vCPU as passed to the constructor.
    vcpu_capacity: u32,
    /// `vcpu_capacity * pcpu_fmax / vcpu_fmax`, computed once in the constructor.
    vcpu_relative_capacity: u32,
    /// Throttle worker thread; started on the first `VCPUFREQ_SET_PERF` write.
    worker: Option<WorkerThread<()>>,
    /// Timer shared with the throttle worker; armed when the requested utilization is below
    /// `pcpu_min_cap` and cleared otherwise.
    timer: Arc<Mutex<Timer>>,
    /// Tube over which the throttle worker sends throttle requests.
    vm_ctrl: Arc<Mutex<Tube>>,
    /// `pcpu_min_freq * pcpu_capacity / pcpu_fmax`, i.e. the capacity of `pcpu` at its minimum
    /// frequency.
    pcpu_min_cap: u32,
    /// The largest (or the last) pCPU index to be used by all the vCPUs. This index is used to
    /// figure out the proper placement of the throttle workers, which are placed on pCPUs right
    /// after the last pCPU being used by the vCPUs. Throttle workers require their own exclusive
    /// pCPU allocation; this ensures that the workers are placed contiguously and makes it
    /// easier for the user to manage pCPU allocations when running multiple instances on a
    /// large server.
    largest_pcpu_idx: usize,
    //TODO: Put the shared_domain_members in a struct
    /// vCPU ids the throttle worker sends throttle requests for.
    shared_domain_vcpus: Vec<usize>,
    /// Last raw utilization stored by a `VCPUFREQ_SET_PERF` write; consulted by
    /// `VCPUFREQ_CUR_PERF` reads.
    shared_domain_perf: Arc<AtomicU32>,
}
85
get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error>86 fn get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error> {
87 let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
88 std::fs::read_to_string(path)?
89 .trim()
90 .parse()
91 .map_err(|_| Error::new(libc::EINVAL))
92 }
93
get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error>94 fn get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error> {
95 let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
96 std::fs::read_to_string(path).map_err(|_| Error::new(libc::EINVAL))
97 }
98
/// Returns the value of the `cpu_capacity` sysfs attribute of host CPU `cpu_id`.
///
/// Errors are those of `get_cpu_info`.
fn get_cpu_capacity(cpu_id: u32) -> Result<u32, Error> {
    get_cpu_info(cpu_id, "cpu_capacity")
}
102
/// Returns the value of the `cpufreq/cpuinfo_max_freq` sysfs attribute of host CPU `cpu_id`.
///
/// Errors are those of `get_cpu_info`.
fn get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error> {
    get_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
}
106
/// Returns the value of the `cpufreq/cpuinfo_min_freq` sysfs attribute of host CPU `cpu_id`.
///
/// Errors are those of `get_cpu_info`.
fn get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error> {
    get_cpu_info(cpu_id, "cpufreq/cpuinfo_min_freq")
}
110
/// Returns the value of the `cpufreq/scaling_cur_freq` sysfs attribute of host CPU `cpu_id`.
///
/// Errors are those of `get_cpu_info`.
fn get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error> {
    get_cpu_info(cpu_id, "cpufreq/scaling_cur_freq")
}
114
get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error>115 fn get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error> {
116 let gov = get_cpu_info_str(cpu_id, "cpufreq/scaling_governor")?;
117 match gov.trim() {
118 "schedutil" => Ok(CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL),
119 _ => Ok(CPUFREQ_GOV_SCALE_FACTOR_DEFAULT),
120 }
121 }
122
123 impl VirtCpufreqV2 {
new( pcpu: u32, vcpu_freq_table: Vec<u32>, vcpu_domain_path: Option<PathBuf>, vcpu_domain: u32, vcpu_capacity: u32, largest_pcpu_idx: usize, vm_ctrl: Arc<Mutex<Tube>>, shared_domain_vcpus: Vec<usize>, shared_domain_perf: Arc<AtomicU32>, ) -> Self124 pub fn new(
125 pcpu: u32,
126 vcpu_freq_table: Vec<u32>,
127 vcpu_domain_path: Option<PathBuf>,
128 vcpu_domain: u32,
129 vcpu_capacity: u32,
130 largest_pcpu_idx: usize,
131 vm_ctrl: Arc<Mutex<Tube>>,
132 shared_domain_vcpus: Vec<usize>,
133 shared_domain_perf: Arc<AtomicU32>,
134 ) -> Self {
135 let pcpu_capacity = get_cpu_capacity(pcpu).expect("Error reading capacity");
136 let pcpu_fmax = get_cpu_maxfreq_khz(pcpu).expect("Error reading max freq");
137 let util_factor = get_cpu_util_factor(pcpu).expect("Error getting util factor");
138 let freqtbl_sel = 0;
139 let mut domain_uclamp_min = None;
140 let mut domain_uclamp_max = None;
141 // The vcpu_capacity passed in is normalized for frequency, reverse the normalization to
142 // get the performance per clock ratio between the vCPU and the pCPU its running on. This
143 // "relative capacity" is an approximation of the delta in IPC (Instructions per Cycle)
144 // between the pCPU vs vCPU running a usecase containing a mix of instruction types.
145 let vcpu_fmax = vcpu_freq_table.clone().into_iter().max().unwrap();
146 let vcpu_relative_capacity =
147 u32::try_from(u64::from(vcpu_capacity) * u64::from(pcpu_fmax) / u64::from(vcpu_fmax))
148 .unwrap();
149 let pcpu_min_cap =
150 get_cpu_minfreq_khz(pcpu).expect("Error reading min freq") * pcpu_capacity / pcpu_fmax;
151
152 if let Some(cgroup_path) = &vcpu_domain_path {
153 domain_uclamp_min = Some(
154 File::create(cgroup_path.join("cpu.uclamp.min")).unwrap_or_else(|err| {
155 panic!(
156 "Err: {}, Unable to open: {}",
157 err,
158 cgroup_path.join("cpu.uclamp.min").display()
159 )
160 }),
161 );
162 domain_uclamp_max = Some(
163 File::create(cgroup_path.join("cpu.uclamp.max")).unwrap_or_else(|err| {
164 panic!(
165 "Err: {}, Unable to open: {}",
166 err,
167 cgroup_path.join("cpu.uclamp.max").display()
168 )
169 }),
170 );
171 }
172
173 VirtCpufreqV2 {
174 vcpu_freq_table,
175 pcpu_fmax,
176 pcpu_capacity,
177 pcpu,
178 util_factor,
179 freqtbl_sel,
180 vcpu_domain,
181 domain_uclamp_min,
182 domain_uclamp_max,
183 vcpu_fmax,
184 vcpu_capacity,
185 vcpu_relative_capacity,
186 worker: None,
187 timer: Arc::new(Mutex::new(Timer::new().expect("failed to create Timer"))),
188 vm_ctrl,
189 pcpu_min_cap,
190 largest_pcpu_idx,
191 shared_domain_vcpus,
192 shared_domain_perf,
193 }
194 }
195 }
196
197 impl BusDevice for VirtCpufreqV2 {
device_id(&self) -> DeviceId198 fn device_id(&self) -> DeviceId {
199 CrosvmDeviceId::VirtCpufreq.into()
200 }
201
debug_label(&self) -> String202 fn debug_label(&self) -> String {
203 "VirtCpufreq Device".to_owned()
204 }
205
read(&mut self, info: BusAccessInfo, data: &mut [u8])206 fn read(&mut self, info: BusAccessInfo, data: &mut [u8]) {
207 if data.len() != std::mem::size_of::<u32>() {
208 warn!(
209 "{}: unsupported read length {}, only support 4bytes read",
210 self.debug_label(),
211 data.len()
212 );
213 return;
214 }
215
216 let val = match info.offset as u32 {
217 VCPUFREQ_CUR_PERF => {
218 let shared_util = self.shared_domain_perf.load(Ordering::SeqCst);
219 if shared_util != 0 && shared_util < self.pcpu_min_cap {
220 shared_util * self.vcpu_fmax / self.vcpu_capacity
221 } else {
222 match get_cpu_curfreq_khz(self.pcpu) {
223 Ok(freq) => u32::try_from(
224 u64::from(freq) * u64::from(self.pcpu_capacity)
225 / u64::from(self.vcpu_relative_capacity),
226 )
227 .unwrap(),
228 Err(_) => 0,
229 }
230 }
231 }
232 VCPUFREQ_FREQTBL_LEN => self.vcpu_freq_table.len() as u32,
233 VCPUFREQ_PERF_DOMAIN => self.vcpu_domain,
234 VCPUFREQ_FREQTBL_RD => *self
235 .vcpu_freq_table
236 .get(self.freqtbl_sel as usize)
237 .unwrap_or(&0),
238 _ => {
239 warn!("{}: unsupported read address {}", self.debug_label(), info);
240 return;
241 }
242 };
243
244 let val_arr = val.to_ne_bytes();
245 data.copy_from_slice(&val_arr);
246 }
247
write(&mut self, info: BusAccessInfo, data: &[u8])248 fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
249 let val: u32 = match data.try_into().map(u32::from_ne_bytes) {
250 Ok(v) => v,
251 Err(e) => {
252 warn!(
253 "{}: unsupported write length {:#}, only support 4bytes write",
254 self.debug_label(),
255 e
256 );
257 return;
258 }
259 };
260
261 match info.offset as u32 {
262 VCPUFREQ_SET_PERF => {
263 // Util margin depends on the cpufreq governor on the host
264 let util_raw = match u32::try_from(
265 u64::from(self.vcpu_capacity) * u64::from(val) / u64::from(self.vcpu_fmax),
266 ) {
267 Ok(util) => util,
268 Err(e) => {
269 warn!("Potential overflow {:#}", e);
270 SCHED_CAPACITY_SCALE
271 }
272 };
273
274 let util = util_raw * self.util_factor / CPUFREQ_GOV_SCALE_FACTOR_DEFAULT;
275
276 if let (Some(domain_uclamp_min), Some(domain_uclamp_max)) =
277 (&mut self.domain_uclamp_min, &mut self.domain_uclamp_max)
278 {
279 use std::io::Write;
280 let val = util as f32 * 100.0 / SCHED_CAPACITY_SCALE as f32;
281 let val_formatted = format!("{:4}", val).into_bytes();
282
283 if self.vcpu_fmax != self.pcpu_fmax {
284 if let Err(e) = domain_uclamp_max.write(&val_formatted) {
285 warn!("Error setting uclamp_max: {:#}", e);
286 }
287 }
288 if let Err(e) = domain_uclamp_min.write(&val_formatted) {
289 warn!("Error setting uclamp_min: {:#}", e);
290 }
291 } else {
292 let mut sched_attr = sched_attr::default();
293 sched_attr.sched_flags = SCHED_FLAG_KEEP_ALL
294 | SCHED_FLAG_UTIL_CLAMP_MIN
295 | SCHED_FLAG_UTIL_CLAMP_MAX
296 | SCHED_FLAG_RESET_ON_FORK;
297 sched_attr.sched_util_min = util;
298
299 if self.vcpu_fmax != self.pcpu_fmax {
300 sched_attr.sched_util_max = util;
301 } else {
302 sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
303 }
304
305 if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
306 panic!("{}: Error setting util value: {:#}", self.debug_label(), e);
307 }
308 }
309
310 self.shared_domain_perf.store(util_raw, Ordering::SeqCst);
311 let timer = self.timer.clone();
312 if self.worker.is_none() {
313 let vcpu_id = info.id;
314 let vm_ctrl = self.vm_ctrl.clone();
315 let worker_cpu_affinity = self.largest_pcpu_idx + self.vcpu_domain as usize + 1;
316 let shared_domain_vcpus = self.shared_domain_vcpus.clone();
317
318 self.worker = Some(WorkerThread::start(
319 format!("vcpu_throttle{vcpu_id}"),
320 move |kill_evt| {
321 vcpufreq_worker_thread(
322 shared_domain_vcpus,
323 kill_evt,
324 timer,
325 vm_ctrl,
326 worker_cpu_affinity,
327 )
328 .expect("error running vpucfreq_worker")
329 },
330 ));
331 } else if util_raw < self.pcpu_min_cap {
332 // The period is porportional to the performance requested by the vCPU, we
333 // reduce the timeout period to increase the amount of throttling applied to
334 // the vCPU as the performance decreases. Ex. If vCPU requests half of the
335 // performance relatively to its pCPU@FMin, the vCPU will spend 50% of its
336 // cycles being throttled to increase time for the same workload that otherwise
337 // would've taken 1/2 of the time if ran at pCPU@FMin. We could've
338 // alternatively adjusted the workload and used some fixed period (such as
339 // 250us), but there's a floor for the minimum delay we add (cost of handling
340 // the userspace exit) and limits the range of performance we can emulate.
341 let timeout_period = (MIN_TIMER_US + TIMER_OVERHEAD_US) as f32
342 / (1.0 - (util_raw as f32 / self.pcpu_min_cap as f32));
343 let _ = timer
344 .lock()
345 .reset_repeating(Duration::from_micros(timeout_period as u64));
346 } else {
347 let _ = timer.lock().clear();
348 }
349 }
350 VCPUFREQ_FREQTBL_SEL => self.freqtbl_sel = val,
351 _ => {
352 warn!("{}: unsupported read address {}", self.debug_label(), info);
353 }
354 }
355 }
356 }
357
vcpufreq_worker_thread( shared_domain_vcpus: Vec<usize>, kill_evt: Event, timer: Arc<Mutex<Timer>>, vm_ctrl: Arc<Mutex<Tube>>, cpu_affinity: usize, ) -> anyhow::Result<()>358 pub fn vcpufreq_worker_thread(
359 shared_domain_vcpus: Vec<usize>,
360 kill_evt: Event,
361 timer: Arc<Mutex<Timer>>,
362 vm_ctrl: Arc<Mutex<Tube>>,
363 cpu_affinity: usize,
364 ) -> anyhow::Result<()> {
365 #[derive(EventToken)]
366 enum Token {
367 // The timer expired.
368 TimerExpire,
369 // The parent thread requested an exit.
370 Kill,
371 }
372
373 let wait_ctx = WaitContext::build_with(&[
374 (&*timer.lock(), Token::TimerExpire),
375 (&kill_evt, Token::Kill),
376 ])
377 .context("Failed to create wait_ctx")?;
378
379 // The vcpufreq thread has strict scheduling requirements, let's affine it away from the vCPU
380 // threads and clamp its util to high value.
381 let cpu_set: Vec<usize> = vec![cpu_affinity];
382 set_cpu_affinity(cpu_set)?;
383
384 let mut sched_attr = sched_attr::default();
385 sched_attr.sched_flags = SCHED_FLAG_KEEP_ALL
386 | SCHED_FLAG_UTIL_CLAMP_MIN
387 | SCHED_FLAG_UTIL_CLAMP_MAX
388 | SCHED_FLAG_RESET_ON_FORK;
389 sched_attr.sched_util_min = SCHED_CAPACITY_SCALE;
390 sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
391 if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
392 warn!("Error setting util value: {}", e);
393 }
394
395 loop {
396 let events = wait_ctx.wait().context("Failed to wait for events")?;
397 for event in events.iter().filter(|e| e.is_readable) {
398 match event.token {
399 Token::TimerExpire => {
400 timer
401 .lock()
402 .mark_waited()
403 .context("failed to reset timer")?;
404 let vm_ctrl_unlocked = vm_ctrl.lock();
405 for vcpu_id in &shared_domain_vcpus {
406 let msg = vm_control::VmRequest::Throttle(*vcpu_id, MIN_TIMER_US);
407 vm_ctrl_unlocked
408 .send(&msg)
409 .context("failed to stall vCPUs")?;
410 }
411 }
412 Token::Kill => {
413 return Ok(());
414 }
415 }
416 }
417 }
418 }
419
// No `Suspendable` methods are overridden here, so the trait's default implementations apply.
impl Suspendable for VirtCpufreqV2 {}
421