xref: /aosp_15_r20/external/crosvm/devices/src/virtcpufreq_v2.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2024 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::fs::File;
6 use std::path::PathBuf;
7 use std::sync::atomic::AtomicU32;
8 use std::sync::atomic::Ordering;
9 use std::sync::Arc;
10 use std::time::Duration;
11 
12 use anyhow::Context;
13 use base::sched_attr;
14 use base::sched_setattr;
15 use base::set_cpu_affinity;
16 use base::warn;
17 use base::Error;
18 use base::Event;
19 use base::EventToken;
20 use base::Timer;
21 use base::TimerTrait;
22 use base::Tube;
23 use base::WaitContext;
24 use base::WorkerThread;
25 use sync::Mutex;
26 
27 use crate::pci::CrosvmDeviceId;
28 use crate::BusAccessInfo;
29 use crate::BusDevice;
30 use crate::DeviceId;
31 use crate::Suspendable;
32 
// Percentage applied to the utilization requested by the guest, selected from the host cpufreq
// governor: `schedutil` gets the reduced factor, every other governor the default.
const CPUFREQ_GOV_SCALE_FACTOR_DEFAULT: u32 = 100;
const CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL: u32 = 80;

// Flags OR-ed into `sched_attr.sched_flags` before calling `sched_setattr`.
// NOTE(review): values presumably mirror the SCHED_FLAG_* definitions of the Linux uapi headers;
// confirm against the kernel version being targeted.
const SCHED_FLAG_RESET_ON_FORK: u64 = 0x1;
const SCHED_FLAG_KEEP_POLICY: u64 = 0x08;
const SCHED_FLAG_KEEP_PARAMS: u64 = 0x10;
const SCHED_FLAG_UTIL_CLAMP_MIN: u64 = 0x20;
const SCHED_FLAG_UTIL_CLAMP_MAX: u64 = 0x40;

// Register offsets of the device, matched against `info.offset` in the bus read/write handlers.
// Read: current performance of the vCPU.
const VCPUFREQ_CUR_PERF: u32 = 0x0;
// Write: performance requested by the guest.
const VCPUFREQ_SET_PERF: u32 = 0x4;
// Read: number of entries in the vCPU frequency table.
const VCPUFREQ_FREQTBL_LEN: u32 = 0x8;
// Write: index of the frequency table entry returned by the next `VCPUFREQ_FREQTBL_RD` read.
const VCPUFREQ_FREQTBL_SEL: u32 = 0xc;
// Read: frequency table entry at the selected index (0 when the index is out of range).
const VCPUFREQ_FREQTBL_RD: u32 = 0x10;
// Read: performance domain the vCPU belongs to.
const VCPUFREQ_PERF_DOMAIN: u32 = 0x14;

// Keep both the current scheduling policy and its parameters when updating the util clamps.
const SCHED_FLAG_KEEP_ALL: u64 = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS;
// Full-scale utilization value; used as the maximum util clamp and as the divisor when converting
// a utilization into a percentage.
const SCHED_CAPACITY_SCALE: u32 = 1024;

// Timer values in microseconds
// `MIN_TIMER_US` is the duration sent with every throttle request; together with
// `TIMER_OVERHEAD_US` it forms the base of the repeating throttle timer period.
const MIN_TIMER_US: u32 = 75;
const TIMER_OVERHEAD_US: u32 = 15;
55 
/// Upstream linux compatible version of the virtual cpufreq interface
pub struct VirtCpufreqV2 {
    /// Frequency table exposed to the guest through `VCPUFREQ_FREQTBL_LEN`,
    /// `VCPUFREQ_FREQTBL_SEL` and `VCPUFREQ_FREQTBL_RD`.
    vcpu_freq_table: Vec<u32>,
    /// Maximum frequency of the backing pCPU, read from sysfs `cpufreq/cpuinfo_max_freq`.
    pcpu_fmax: u32,
    /// Capacity of the backing pCPU, read from sysfs `cpu_capacity`.
    pcpu_capacity: u32,
    /// Index of the host CPU whose sysfs entries are queried for this vCPU.
    pcpu: u32,
    /// Percentage applied to the requested utilization, chosen from the host cpufreq governor.
    util_factor: u32,
    /// Frequency table index last written by the guest to `VCPUFREQ_FREQTBL_SEL`.
    freqtbl_sel: u32,
    /// Value reported for `VCPUFREQ_PERF_DOMAIN`; also offsets the pCPU chosen for the throttle
    /// worker.
    vcpu_domain: u32,
    /// `cpu.uclamp.min` file of the domain cgroup, present only when a cgroup path was supplied.
    domain_uclamp_min: Option<File>,
    /// `cpu.uclamp.max` file of the domain cgroup, present only when a cgroup path was supplied.
    domain_uclamp_max: Option<File>,
    /// Largest entry of `vcpu_freq_table`.
    vcpu_fmax: u32,
    /// Capacity of the vCPU as passed to `new`.
    vcpu_capacity: u32,
    /// `vcpu_capacity * pcpu_fmax / vcpu_fmax`, i.e. the vCPU capacity with the frequency
    /// normalization reversed.
    vcpu_relative_capacity: u32,
    /// Throttle worker thread; started on the first `VCPUFREQ_SET_PERF` write.
    worker: Option<WorkerThread<()>>,
    /// Timer shared with the throttle worker; each expiration triggers a round of throttle
    /// requests.
    timer: Arc<Mutex<Timer>>,
    /// Control tube over which the worker sends `VmRequest::Throttle` messages.
    vm_ctrl: Arc<Mutex<Tube>>,
    /// Capacity of the pCPU at its minimum frequency
    /// (`cpuinfo_min_freq * pcpu_capacity / pcpu_fmax`).
    pcpu_min_cap: u32,
    /// The largest (or the last) pCPU index used by any of the vCPUs. This index is used to
    /// figure out the proper placement of the throttle workers, which are placed on the pCPUs
    /// right after the last pCPU used by the vCPUs. Throttle workers require their own exclusive
    /// pCPU allocation; this ensures that the workers are placed contiguously and makes it
    /// easier for the user to manage pCPU allocations when running multiple instances on a large
    /// server.
    largest_pcpu_idx: usize,
    //TODO: Put the shared_domain_members in a struct
    /// Ids of the vCPUs that receive a throttle request whenever the timer expires.
    shared_domain_vcpus: Vec<usize>,
    /// Raw utilization stored by the most recent `VCPUFREQ_SET_PERF` write; the `Arc` is handed
    /// in by the caller of `new`.
    shared_domain_perf: Arc<AtomicU32>,
}
85 
get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error>86 fn get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error> {
87     let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
88     std::fs::read_to_string(path)?
89         .trim()
90         .parse()
91         .map_err(|_| Error::new(libc::EINVAL))
92 }
93 
get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error>94 fn get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error> {
95     let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
96     std::fs::read_to_string(path).map_err(|_| Error::new(libc::EINVAL))
97 }
98 
get_cpu_capacity(cpu_id: u32) -> Result<u32, Error>99 fn get_cpu_capacity(cpu_id: u32) -> Result<u32, Error> {
100     get_cpu_info(cpu_id, "cpu_capacity")
101 }
102 
get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error>103 fn get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error> {
104     get_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
105 }
106 
get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error>107 fn get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error> {
108     get_cpu_info(cpu_id, "cpufreq/cpuinfo_min_freq")
109 }
110 
get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error>111 fn get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error> {
112     get_cpu_info(cpu_id, "cpufreq/scaling_cur_freq")
113 }
114 
get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error>115 fn get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error> {
116     let gov = get_cpu_info_str(cpu_id, "cpufreq/scaling_governor")?;
117     match gov.trim() {
118         "schedutil" => Ok(CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL),
119         _ => Ok(CPUFREQ_GOV_SCALE_FACTOR_DEFAULT),
120     }
121 }
122 
123 impl VirtCpufreqV2 {
new( pcpu: u32, vcpu_freq_table: Vec<u32>, vcpu_domain_path: Option<PathBuf>, vcpu_domain: u32, vcpu_capacity: u32, largest_pcpu_idx: usize, vm_ctrl: Arc<Mutex<Tube>>, shared_domain_vcpus: Vec<usize>, shared_domain_perf: Arc<AtomicU32>, ) -> Self124     pub fn new(
125         pcpu: u32,
126         vcpu_freq_table: Vec<u32>,
127         vcpu_domain_path: Option<PathBuf>,
128         vcpu_domain: u32,
129         vcpu_capacity: u32,
130         largest_pcpu_idx: usize,
131         vm_ctrl: Arc<Mutex<Tube>>,
132         shared_domain_vcpus: Vec<usize>,
133         shared_domain_perf: Arc<AtomicU32>,
134     ) -> Self {
135         let pcpu_capacity = get_cpu_capacity(pcpu).expect("Error reading capacity");
136         let pcpu_fmax = get_cpu_maxfreq_khz(pcpu).expect("Error reading max freq");
137         let util_factor = get_cpu_util_factor(pcpu).expect("Error getting util factor");
138         let freqtbl_sel = 0;
139         let mut domain_uclamp_min = None;
140         let mut domain_uclamp_max = None;
141         // The vcpu_capacity passed in is normalized for frequency, reverse the normalization to
142         // get the performance per clock ratio between the vCPU and the pCPU its running on. This
143         // "relative capacity" is an approximation of the delta in IPC (Instructions per Cycle)
144         // between the pCPU vs vCPU running a usecase containing a mix of instruction types.
145         let vcpu_fmax = vcpu_freq_table.clone().into_iter().max().unwrap();
146         let vcpu_relative_capacity =
147             u32::try_from(u64::from(vcpu_capacity) * u64::from(pcpu_fmax) / u64::from(vcpu_fmax))
148                 .unwrap();
149         let pcpu_min_cap =
150             get_cpu_minfreq_khz(pcpu).expect("Error reading min freq") * pcpu_capacity / pcpu_fmax;
151 
152         if let Some(cgroup_path) = &vcpu_domain_path {
153             domain_uclamp_min = Some(
154                 File::create(cgroup_path.join("cpu.uclamp.min")).unwrap_or_else(|err| {
155                     panic!(
156                         "Err: {}, Unable to open: {}",
157                         err,
158                         cgroup_path.join("cpu.uclamp.min").display()
159                     )
160                 }),
161             );
162             domain_uclamp_max = Some(
163                 File::create(cgroup_path.join("cpu.uclamp.max")).unwrap_or_else(|err| {
164                     panic!(
165                         "Err: {}, Unable to open: {}",
166                         err,
167                         cgroup_path.join("cpu.uclamp.max").display()
168                     )
169                 }),
170             );
171         }
172 
173         VirtCpufreqV2 {
174             vcpu_freq_table,
175             pcpu_fmax,
176             pcpu_capacity,
177             pcpu,
178             util_factor,
179             freqtbl_sel,
180             vcpu_domain,
181             domain_uclamp_min,
182             domain_uclamp_max,
183             vcpu_fmax,
184             vcpu_capacity,
185             vcpu_relative_capacity,
186             worker: None,
187             timer: Arc::new(Mutex::new(Timer::new().expect("failed to create Timer"))),
188             vm_ctrl,
189             pcpu_min_cap,
190             largest_pcpu_idx,
191             shared_domain_vcpus,
192             shared_domain_perf,
193         }
194     }
195 }
196 
197 impl BusDevice for VirtCpufreqV2 {
device_id(&self) -> DeviceId198     fn device_id(&self) -> DeviceId {
199         CrosvmDeviceId::VirtCpufreq.into()
200     }
201 
debug_label(&self) -> String202     fn debug_label(&self) -> String {
203         "VirtCpufreq Device".to_owned()
204     }
205 
read(&mut self, info: BusAccessInfo, data: &mut [u8])206     fn read(&mut self, info: BusAccessInfo, data: &mut [u8]) {
207         if data.len() != std::mem::size_of::<u32>() {
208             warn!(
209                 "{}: unsupported read length {}, only support 4bytes read",
210                 self.debug_label(),
211                 data.len()
212             );
213             return;
214         }
215 
216         let val = match info.offset as u32 {
217             VCPUFREQ_CUR_PERF => {
218                 let shared_util = self.shared_domain_perf.load(Ordering::SeqCst);
219                 if shared_util != 0 && shared_util < self.pcpu_min_cap {
220                     shared_util * self.vcpu_fmax / self.vcpu_capacity
221                 } else {
222                     match get_cpu_curfreq_khz(self.pcpu) {
223                         Ok(freq) => u32::try_from(
224                             u64::from(freq) * u64::from(self.pcpu_capacity)
225                                 / u64::from(self.vcpu_relative_capacity),
226                         )
227                         .unwrap(),
228                         Err(_) => 0,
229                     }
230                 }
231             }
232             VCPUFREQ_FREQTBL_LEN => self.vcpu_freq_table.len() as u32,
233             VCPUFREQ_PERF_DOMAIN => self.vcpu_domain,
234             VCPUFREQ_FREQTBL_RD => *self
235                 .vcpu_freq_table
236                 .get(self.freqtbl_sel as usize)
237                 .unwrap_or(&0),
238             _ => {
239                 warn!("{}: unsupported read address {}", self.debug_label(), info);
240                 return;
241             }
242         };
243 
244         let val_arr = val.to_ne_bytes();
245         data.copy_from_slice(&val_arr);
246     }
247 
write(&mut self, info: BusAccessInfo, data: &[u8])248     fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
249         let val: u32 = match data.try_into().map(u32::from_ne_bytes) {
250             Ok(v) => v,
251             Err(e) => {
252                 warn!(
253                     "{}: unsupported write length {:#}, only support 4bytes write",
254                     self.debug_label(),
255                     e
256                 );
257                 return;
258             }
259         };
260 
261         match info.offset as u32 {
262             VCPUFREQ_SET_PERF => {
263                 // Util margin depends on the cpufreq governor on the host
264                 let util_raw = match u32::try_from(
265                     u64::from(self.vcpu_capacity) * u64::from(val) / u64::from(self.vcpu_fmax),
266                 ) {
267                     Ok(util) => util,
268                     Err(e) => {
269                         warn!("Potential overflow {:#}", e);
270                         SCHED_CAPACITY_SCALE
271                     }
272                 };
273 
274                 let util = util_raw * self.util_factor / CPUFREQ_GOV_SCALE_FACTOR_DEFAULT;
275 
276                 if let (Some(domain_uclamp_min), Some(domain_uclamp_max)) =
277                     (&mut self.domain_uclamp_min, &mut self.domain_uclamp_max)
278                 {
279                     use std::io::Write;
280                     let val = util as f32 * 100.0 / SCHED_CAPACITY_SCALE as f32;
281                     let val_formatted = format!("{:4}", val).into_bytes();
282 
283                     if self.vcpu_fmax != self.pcpu_fmax {
284                         if let Err(e) = domain_uclamp_max.write(&val_formatted) {
285                             warn!("Error setting uclamp_max: {:#}", e);
286                         }
287                     }
288                     if let Err(e) = domain_uclamp_min.write(&val_formatted) {
289                         warn!("Error setting uclamp_min: {:#}", e);
290                     }
291                 } else {
292                     let mut sched_attr = sched_attr::default();
293                     sched_attr.sched_flags = SCHED_FLAG_KEEP_ALL
294                         | SCHED_FLAG_UTIL_CLAMP_MIN
295                         | SCHED_FLAG_UTIL_CLAMP_MAX
296                         | SCHED_FLAG_RESET_ON_FORK;
297                     sched_attr.sched_util_min = util;
298 
299                     if self.vcpu_fmax != self.pcpu_fmax {
300                         sched_attr.sched_util_max = util;
301                     } else {
302                         sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
303                     }
304 
305                     if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
306                         panic!("{}: Error setting util value: {:#}", self.debug_label(), e);
307                     }
308                 }
309 
310                 self.shared_domain_perf.store(util_raw, Ordering::SeqCst);
311                 let timer = self.timer.clone();
312                 if self.worker.is_none() {
313                     let vcpu_id = info.id;
314                     let vm_ctrl = self.vm_ctrl.clone();
315                     let worker_cpu_affinity = self.largest_pcpu_idx + self.vcpu_domain as usize + 1;
316                     let shared_domain_vcpus = self.shared_domain_vcpus.clone();
317 
318                     self.worker = Some(WorkerThread::start(
319                         format!("vcpu_throttle{vcpu_id}"),
320                         move |kill_evt| {
321                             vcpufreq_worker_thread(
322                                 shared_domain_vcpus,
323                                 kill_evt,
324                                 timer,
325                                 vm_ctrl,
326                                 worker_cpu_affinity,
327                             )
328                             .expect("error running vpucfreq_worker")
329                         },
330                     ));
331                 } else if util_raw < self.pcpu_min_cap {
332                     // The period is porportional to the performance requested by the vCPU, we
333                     // reduce the timeout period to increase the amount of throttling applied to
334                     // the vCPU as the performance decreases. Ex. If vCPU requests half of the
335                     // performance relatively to its pCPU@FMin, the vCPU will spend 50% of its
336                     // cycles being throttled to increase time for the same workload that otherwise
337                     // would've taken 1/2 of the time if ran at pCPU@FMin. We could've
338                     // alternatively adjusted the workload and used some fixed period (such as
339                     // 250us), but there's a floor for the minimum delay we add (cost of handling
340                     // the userspace exit) and limits the range of performance we can emulate.
341                     let timeout_period = (MIN_TIMER_US + TIMER_OVERHEAD_US) as f32
342                         / (1.0 - (util_raw as f32 / self.pcpu_min_cap as f32));
343                     let _ = timer
344                         .lock()
345                         .reset_repeating(Duration::from_micros(timeout_period as u64));
346                 } else {
347                     let _ = timer.lock().clear();
348                 }
349             }
350             VCPUFREQ_FREQTBL_SEL => self.freqtbl_sel = val,
351             _ => {
352                 warn!("{}: unsupported read address {}", self.debug_label(), info);
353             }
354         }
355     }
356 }
357 
vcpufreq_worker_thread( shared_domain_vcpus: Vec<usize>, kill_evt: Event, timer: Arc<Mutex<Timer>>, vm_ctrl: Arc<Mutex<Tube>>, cpu_affinity: usize, ) -> anyhow::Result<()>358 pub fn vcpufreq_worker_thread(
359     shared_domain_vcpus: Vec<usize>,
360     kill_evt: Event,
361     timer: Arc<Mutex<Timer>>,
362     vm_ctrl: Arc<Mutex<Tube>>,
363     cpu_affinity: usize,
364 ) -> anyhow::Result<()> {
365     #[derive(EventToken)]
366     enum Token {
367         // The timer expired.
368         TimerExpire,
369         // The parent thread requested an exit.
370         Kill,
371     }
372 
373     let wait_ctx = WaitContext::build_with(&[
374         (&*timer.lock(), Token::TimerExpire),
375         (&kill_evt, Token::Kill),
376     ])
377     .context("Failed to create wait_ctx")?;
378 
379     // The vcpufreq thread has strict scheduling requirements, let's affine it away from the vCPU
380     // threads and clamp its util to high value.
381     let cpu_set: Vec<usize> = vec![cpu_affinity];
382     set_cpu_affinity(cpu_set)?;
383 
384     let mut sched_attr = sched_attr::default();
385     sched_attr.sched_flags = SCHED_FLAG_KEEP_ALL
386         | SCHED_FLAG_UTIL_CLAMP_MIN
387         | SCHED_FLAG_UTIL_CLAMP_MAX
388         | SCHED_FLAG_RESET_ON_FORK;
389     sched_attr.sched_util_min = SCHED_CAPACITY_SCALE;
390     sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
391     if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
392         warn!("Error setting util value: {}", e);
393     }
394 
395     loop {
396         let events = wait_ctx.wait().context("Failed to wait for events")?;
397         for event in events.iter().filter(|e| e.is_readable) {
398             match event.token {
399                 Token::TimerExpire => {
400                     timer
401                         .lock()
402                         .mark_waited()
403                         .context("failed to reset timer")?;
404                     let vm_ctrl_unlocked = vm_ctrl.lock();
405                     for vcpu_id in &shared_domain_vcpus {
406                         let msg = vm_control::VmRequest::Throttle(*vcpu_id, MIN_TIMER_US);
407                         vm_ctrl_unlocked
408                             .send(&msg)
409                             .context("failed to stall vCPUs")?;
410                     }
411                 }
412                 Token::Kill => {
413                     return Ok(());
414                 }
415             }
416         }
417     }
418 }
419 
// No methods are overridden here, so the device gets whatever default behavior the
// `Suspendable` trait provides.
impl Suspendable for VirtCpufreqV2 {}
421