xref: /aosp_15_r20/external/crosvm/metrics/src/sys/windows/system_metrics.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::fmt;
6 use std::fmt::Write as _;
7 use std::mem;
8 use std::sync::Arc;
9 use std::sync::Mutex;
10 use std::sync::Weak;
11 use std::thread;
12 use std::thread::JoinHandle;
13 use std::time::Duration;
14 use std::time::Instant;
15 
16 use base::error;
17 use base::AsRawDescriptor;
18 use base::Error as SysError;
19 use base::Event;
20 use base::EventToken;
21 use base::FromRawDescriptor;
22 use base::SafeDescriptor;
23 use base::WaitContext;
24 use chrono::DateTime;
25 use chrono::Utc;
26 use winapi::shared::minwindef::DWORD;
27 use winapi::shared::minwindef::FILETIME;
28 use winapi::um::processthreadsapi::GetProcessTimes;
29 use winapi::um::processthreadsapi::GetSystemTimes;
30 use winapi::um::processthreadsapi::OpenProcess;
31 use winapi::um::psapi::GetProcessMemoryInfo;
32 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS;
33 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS_EX;
34 use winapi::um::winbase::GetProcessIoCounters;
35 use winapi::um::winnt::IO_COUNTERS;
36 use winapi::um::winnt::LARGE_INTEGER;
37 use winapi::um::winnt::LONGLONG;
38 use winapi::um::winnt::PROCESS_QUERY_LIMITED_INFORMATION;
39 use winapi::um::winnt::PROCESS_VM_READ;
40 use winapi::um::winnt::SYNCHRONIZE;
41 
42 use crate::log_metric;
43 use crate::sys::windows::Error;
44 use crate::sys::windows::Result;
45 use crate::sys::windows::METRICS_UPLOAD_INTERVAL;
46 use crate::MetricEventType;
47 
48 const BYTES_PER_MB: usize = 1024 * 1024;
49 const WORKER_REPORT_INTERVAL: Duration = Duration::from_secs(1);
50 
51 type SysResult<T> = std::result::Result<T, SysError>;
52 
53 /// A worker job which periodically logs system metrics.
54 struct Worker {
55     exit_evt: Event,
56     io: Arc<Mutex<Option<ProcessIoRecord>>>,
57     measurements: Arc<Mutex<Option<Measurements>>>,
58     memory: Arc<Mutex<ProcessMemory>>,
59     memory_acc: Arc<Mutex<Option<ProcessMemoryAccumulated>>>,
60     metrics_string: Arc<Mutex<String>>,
61 }
62 
63 impl Worker {
run(&mut self)64     fn run(&mut self) {
65         #[derive(EventToken)]
66         enum Token {
67             Exit,
68         }
69         let event_ctx: WaitContext<Token> =
70             match WaitContext::build_with(&[(&self.exit_evt, Token::Exit)]) {
71                 Ok(event_ctx) => event_ctx,
72                 Err(e) => {
73                     error!("failed creating WaitContext: {}", e);
74                     return;
75                 }
76             };
77         let mut last_metric_upload_time = Instant::now();
78         'poll: loop {
79             let events = match event_ctx.wait_timeout(WORKER_REPORT_INTERVAL) {
80                 Ok(events) => events,
81                 Err(e) => {
82                     error!("failed polling for events: {}", e);
83                     return;
84                 }
85             };
86             if events.is_empty() {
87                 self.collect_metrics();
88                 // Time budget for UI thread is very limited.
89                 // We make the metric string for displaying in UI in the
90                 // worker thread for best performance.
91                 self.make_metrics_string();
92 
93                 self.upload_metrics(&mut last_metric_upload_time);
94             }
95 
96             if events.into_iter().any(|e| e.is_readable) {
97                 break 'poll;
98             }
99         }
100     }
101 
make_metrics_string(&mut self)102     fn make_metrics_string(&mut self) {
103         let mut metrics_string = self.metrics_string.lock().unwrap();
104         *metrics_string = format!(
105             "{}\n{}",
106             self.cpu_metrics_string(),
107             self.mem_metrics_string()
108         );
109     }
110 
upload_metrics(&self, last_metric_upload_time: &mut Instant)111     fn upload_metrics(&self, last_metric_upload_time: &mut Instant) {
112         if last_metric_upload_time.elapsed() >= METRICS_UPLOAD_INTERVAL {
113             let mut memory_acc = self.memory_acc.lock().unwrap();
114             if let Some(acc) = &*memory_acc {
115                 let mem = acc.accumulated.physical / acc.accumulated_count / BYTES_PER_MB;
116                 // The i64 cast will not cause overflow because the mem is at most 10TB for
117                 // Windows 10.
118                 log_metric(MetricEventType::MemoryUsage, mem as i64);
119             }
120             *memory_acc = None;
121 
122             let mut cpu_measurements = self.measurements.lock().unwrap();
123             if let Some(measurements) = &*cpu_measurements {
124                 let sys_time = measurements.current.sys_time;
125                 let process_time = measurements.current.process_time;
126                 let prev_sys_time = measurements.last_upload.sys_time;
127                 let prev_process_time = measurements.last_upload.process_time;
128 
129                 let diff_systime_kernel =
130                     compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
131                 let diff_systime_user =
132                     compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
133 
134                 let diff_processtime_kernel =
135                     compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
136                 let diff_processtime_user =
137                     compute_filetime_subtraction(process_time.user, prev_process_time.user);
138 
139                 let total_systime = diff_systime_kernel + diff_systime_user;
140                 let total_processtime = diff_processtime_kernel + diff_processtime_user;
141 
142                 if total_systime > 0 {
143                     let cpu_usage = 100 * total_processtime / total_systime;
144                     // The i64 cast will not cause overflow because the usage is at most 100.
145                     log_metric(MetricEventType::CpuUsage, cpu_usage);
146                 }
147             }
148             *cpu_measurements = None;
149 
150             let mut io = self.io.lock().unwrap();
151             if let Some(io_record) = &*io {
152                 let new_io_read_bytes =
153                     io_record.current.read_bytes - io_record.last_upload.read_bytes;
154                 let new_io_write_bytes =
155                     io_record.current.write_bytes - io_record.last_upload.write_bytes;
156 
157                 let ms_elapsed =
158                     (io_record.current_time - io_record.last_upload_time).num_milliseconds();
159                 if ms_elapsed > 0 {
160                     let io_read_bytes_per_sec =
161                         (new_io_read_bytes as f32) / (ms_elapsed as f32) * 1000.0;
162                     let io_write_bytes_per_sec =
163                         (new_io_write_bytes as f32) / (ms_elapsed as f32) * 1000.0;
164                     log_metric(MetricEventType::ReadIo, io_read_bytes_per_sec as i64);
165                     log_metric(MetricEventType::WriteIo, io_write_bytes_per_sec as i64);
166                 }
167             }
168             *io = None;
169             *last_metric_upload_time = Instant::now();
170         }
171     }
172 
collect_metrics(&mut self)173     fn collect_metrics(&mut self) {
174         match self.get_cpu_metrics() {
175             Ok(new_measurement) => {
176                 let mut measurements = self.measurements.lock().unwrap();
177                 let next_measurements = match *measurements {
178                     Some(Measurements {
179                         current,
180                         last_upload,
181                         ..
182                     }) => Measurements {
183                         current: new_measurement,
184                         previous: current,
185                         last_upload,
186                     },
187                     None => Measurements {
188                         current: new_measurement,
189                         previous: new_measurement,
190                         last_upload: new_measurement,
191                     },
192                 };
193                 *measurements = Some(next_measurements);
194             }
195             Err(e) => {
196                 // Do not panic because of cpu query related failures.
197                 error!("Get cpu measurement failed: {}", e);
198             }
199         }
200 
201         match self.get_mem_metrics() {
202             Ok(mem) => {
203                 // Keep running sum and count to calculate averages.
204                 let mut memory_acc = self.memory_acc.lock().unwrap();
205                 let updated_memory_acc = match *memory_acc {
206                     Some(acc) => accumulate_process_memory(acc, mem),
207                     None => ProcessMemoryAccumulated {
208                         accumulated: mem,
209                         accumulated_count: 1,
210                     },
211                 };
212                 *memory_acc = Some(updated_memory_acc);
213                 *self.memory.lock().unwrap() = mem
214             }
215             Err(e) => {
216                 // Do not panic because of memory query failures.
217                 error!("Get cpu measurement failed: {}", e);
218             }
219         }
220 
221         match self.get_io_metrics() {
222             Ok(new_io) => {
223                 let mut io_record = self.io.lock().unwrap();
224                 let updated_io = match *io_record {
225                     Some(io) => ProcessIoRecord {
226                         current: new_io,
227                         current_time: Utc::now(),
228                         last_upload: io.last_upload,
229                         last_upload_time: io.last_upload_time,
230                     },
231                     None => ProcessIoRecord {
232                         current: new_io,
233                         current_time: Utc::now(),
234                         last_upload: new_io,
235                         last_upload_time: Utc::now(),
236                     },
237                 };
238                 *io_record = Some(updated_io);
239             }
240             Err(e) => {
241                 // Do not panic because of io query failures.
242                 error!("Get io measurement failed: {}", e);
243             }
244         }
245     }
246 
get_mem_metrics(&self) -> SysResult<ProcessMemory>247     fn get_mem_metrics(&self) -> SysResult<ProcessMemory> {
248         let process_handle = CoreWinMetrics::get_process_handle()?;
249 
250         let mut counters = PROCESS_MEMORY_COUNTERS_EX::default();
251 
252         // SAFETY:
253         // Safe because we own the process handle and all memory was allocated.
254         let result = unsafe {
255             GetProcessMemoryInfo(
256                 process_handle.as_raw_descriptor(),
257                 // Converting is necessary because the `winapi`' GetProcessMemoryInfo
258                 // does NOT support `PROCESS_MEMORY_COUNTERS_EX`, but only
259                 // 'PROCESS_MEMORY_COUNTERS'. The casting is safe because the underlining
260                 // Windows api does `PROCESS_MEMORY_COUNTERS_EX`.
261                 &mut counters as *mut PROCESS_MEMORY_COUNTERS_EX as *mut PROCESS_MEMORY_COUNTERS,
262                 mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as DWORD,
263             )
264         };
265         if result == 0 {
266             return Err(SysError::last());
267         }
268 
269         Ok(ProcessMemory {
270             page_fault_count: counters.PageFaultCount,
271             working_set_size: counters.WorkingSetSize,
272             working_set_peak: counters.PeakWorkingSetSize,
273             page_file_usage: counters.PagefileUsage,
274             page_file_peak: counters.PeakPagefileUsage,
275             physical: counters.PrivateUsage,
276         })
277     }
278 
get_cpu_metrics(&self) -> SysResult<Measurement>279     fn get_cpu_metrics(&self) -> SysResult<Measurement> {
280         let mut sys_time: SystemCpuTime = Default::default();
281         let mut process_time: ProcessCpuTime = Default::default();
282         let sys_time_success: i32;
283 
284         // SAFETY:
285         // Safe because memory is allocated for sys_time before the windows call.
286         // And the value were initilized to 0s.
287         unsafe {
288             // First get kernel cpu time.
289             sys_time_success =
290                 GetSystemTimes(&mut sys_time.idle, &mut sys_time.kernel, &mut sys_time.user);
291         }
292         if sys_time_success == 0 {
293             error!("Systime collection failed.\n");
294             return Err(SysError::last());
295         } else {
296             // Query current process cpu time.
297             let process_handle = CoreWinMetrics::get_process_handle()?;
298             let process_time_success: i32;
299             // SAFETY:
300             // Safe because memory is allocated for process_time before the windows call.
301             // And the value were initilized to 0s.
302             unsafe {
303                 process_time_success = GetProcessTimes(
304                     process_handle.as_raw_descriptor(),
305                     &mut process_time.create,
306                     &mut process_time.exit,
307                     &mut process_time.kernel,
308                     &mut process_time.user,
309                 );
310             }
311             if process_time_success == 0 {
312                 error!("Systime collection failed.\n");
313                 return Err(SysError::last());
314             }
315         }
316         Ok(Measurement {
317             sys_time: SystemCpuTime {
318                 idle: sys_time.idle,
319                 kernel: sys_time.kernel,
320                 user: sys_time.user,
321             },
322             process_time: ProcessCpuTime {
323                 create: process_time.create,
324                 exit: process_time.exit,
325                 kernel: process_time.kernel,
326                 user: process_time.user,
327             },
328         })
329     }
330 
get_io_metrics(&self) -> SysResult<ProcessIo>331     fn get_io_metrics(&self) -> SysResult<ProcessIo> {
332         let process_handle = CoreWinMetrics::get_process_handle()?;
333         let mut io_counters = IO_COUNTERS::default();
334         // SAFETY:
335         // Safe because we own the process handle and all memory was allocated.
336         let result = unsafe {
337             GetProcessIoCounters(
338                 process_handle.as_raw_descriptor(),
339                 &mut io_counters as *mut IO_COUNTERS,
340             )
341         };
342         if result == 0 {
343             return Err(SysError::last());
344         }
345         Ok(ProcessIo {
346             read_bytes: io_counters.ReadTransferCount,
347             write_bytes: io_counters.WriteTransferCount,
348         })
349     }
350 
mem_metrics_string(&self) -> String351     fn mem_metrics_string(&self) -> String {
352         let guard = self.memory.lock().unwrap();
353         let memory: ProcessMemory = *guard;
354         let mut buf = format!(
355             "Physical memory used: {} mb.\n",
356             memory.physical / BYTES_PER_MB
357         );
358         let _ = writeln!(
359             buf,
360             "Total working memory: {} mb.",
361             memory.working_set_size / BYTES_PER_MB
362         );
363         let _ = writeln!(
364             buf,
365             "Peak working memory: {} mb.",
366             memory.working_set_peak / BYTES_PER_MB
367         );
368         let _ = writeln!(buf, "Page fault count: {}.", memory.page_fault_count);
369         let _ = writeln!(
370             buf,
371             "Page file used: {} mb.",
372             memory.page_file_usage / BYTES_PER_MB
373         );
374         let _ = writeln!(
375             buf,
376             "Peak page file used: {} mb.",
377             memory.page_file_peak / BYTES_PER_MB
378         );
379         buf
380     }
381 
cpu_metrics_string(&self) -> String382     fn cpu_metrics_string(&self) -> String {
383         let guard = self.measurements.lock().unwrap();
384         let mut buf = String::new();
385 
386         // Now we use current and last cpu measurment data to calculate cpu usage
387         // as a percentage.
388         if let Some(measurements) = &*guard {
389             let sys_time = measurements.current.sys_time;
390             let process_time = measurements.current.process_time;
391             let prev_sys_time = measurements.previous.sys_time;
392             let prev_process_time = measurements.previous.process_time;
393 
394             let diff_systime_kernel =
395                 compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
396             let diff_systime_user = compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
397 
398             let diff_processtime_kernel =
399                 compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
400             let diff_processtime_user =
401                 compute_filetime_subtraction(process_time.user, prev_process_time.user);
402 
403             let total_systime = diff_systime_kernel + diff_systime_user;
404             let total_processtime = diff_processtime_kernel + diff_processtime_user;
405 
406             let mut process_cpu = String::from("still calculating...");
407             if total_systime > 0 {
408                 process_cpu = format!("{}%", (100 * total_processtime / total_systime));
409             }
410             let _ = writeln!(buf, "Process cpu usage is: {}", process_cpu);
411 
412             #[cfg(debug_assertions)]
413             {
414                 // Show data supporting our cpu usage calculation.
415                 // Output system cpu time.
416                 let _ = writeln!(
417                     buf,
418                     "Systime Idle: low {} / high {}",
419                     sys_time.idle.dwLowDateTime, sys_time.idle.dwHighDateTime
420                 );
421                 let _ = writeln!(
422                     buf,
423                     "Systime User: low {} / high {}",
424                     sys_time.user.dwLowDateTime, sys_time.user.dwHighDateTime
425                 );
426                 let _ = writeln!(
427                     buf,
428                     "Systime kernel: low {} / high {}",
429                     sys_time.kernel.dwLowDateTime, sys_time.kernel.dwHighDateTime
430                 );
431                 // Output process cpu time.
432                 let _ = writeln!(
433                     buf,
434                     "Process Create: low {} / high {}",
435                     process_time.create.dwLowDateTime, process_time.create.dwHighDateTime
436                 );
437                 let _ = writeln!(
438                     buf,
439                     "Process Exit: low {} / high {}",
440                     process_time.exit.dwLowDateTime, process_time.exit.dwHighDateTime
441                 );
442                 let _ = writeln!(
443                     buf,
444                     "Process kernel: low {} / high {}",
445                     process_time.kernel.dwLowDateTime, process_time.kernel.dwHighDateTime
446                 );
447                 let _ = writeln!(
448                     buf,
449                     "Process user: low {} / high {}",
450                     process_time.user.dwLowDateTime, process_time.user.dwHighDateTime
451                 );
452             }
453         } else {
454             let _ = write!(buf, "Calculating cpu usage...");
455         }
456         buf
457     }
458 }
459 
compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG460 fn compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG {
461     // SAFETY:
462     // safe because we are initializing the struct to 0s.
463     unsafe {
464         let mut a: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
465         a.u_mut().LowPart = fta.dwLowDateTime;
466         a.u_mut().HighPart = fta.dwHighDateTime as i32;
467         let mut b: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
468         b.u_mut().LowPart = ftb.dwLowDateTime;
469         b.u_mut().HighPart = ftb.dwHighDateTime as i32;
470         a.QuadPart() - b.QuadPart()
471     }
472 }
473 
474 // Adds to a running total of memory metrics over the course of a collection period.
475 // Can divide these sums to calculate averages.
accumulate_process_memory( acc: ProcessMemoryAccumulated, mem: ProcessMemory, ) -> ProcessMemoryAccumulated476 fn accumulate_process_memory(
477     acc: ProcessMemoryAccumulated,
478     mem: ProcessMemory,
479 ) -> ProcessMemoryAccumulated {
480     ProcessMemoryAccumulated {
481         accumulated: ProcessMemory {
482             page_fault_count: mem.page_fault_count,
483             working_set_size: acc.accumulated.working_set_size + mem.working_set_size,
484             working_set_peak: mem.working_set_peak,
485             page_file_usage: acc.accumulated.page_file_usage + mem.page_file_usage,
486             page_file_peak: mem.page_file_peak,
487             physical: acc.accumulated.physical + mem.physical,
488         },
489         accumulated_count: acc.accumulated_count + 1,
490     }
491 }
492 
493 #[derive(Copy, Clone, Default)]
494 struct SystemCpuTime {
495     idle: FILETIME,
496     kernel: FILETIME,
497     user: FILETIME,
498 }
499 
500 #[derive(Copy, Clone, Default)]
501 struct ProcessCpuTime {
502     create: FILETIME,
503     exit: FILETIME,
504     kernel: FILETIME,
505     user: FILETIME,
506 }
507 
508 #[derive(Copy, Clone, Default)]
509 struct ProcessMemory {
510     page_fault_count: u32,
511     working_set_size: usize,
512     working_set_peak: usize,
513     page_file_usage: usize,
514     page_file_peak: usize,
515     physical: usize,
516 }
517 
518 #[derive(Copy, Clone)]
519 struct ProcessMemoryAccumulated {
520     accumulated: ProcessMemory,
521     accumulated_count: usize,
522 }
523 
524 #[derive(Copy, Clone, Default)]
525 struct ProcessIo {
526     read_bytes: u64,
527     write_bytes: u64,
528 }
529 
530 #[derive(Copy, Clone)]
531 struct ProcessIoRecord {
532     current: ProcessIo,
533     current_time: DateTime<Utc>,
534     last_upload: ProcessIo,
535     last_upload_time: DateTime<Utc>,
536 }
537 
538 #[derive(Copy, Clone)]
539 struct Measurement {
540     sys_time: SystemCpuTime,
541     process_time: ProcessCpuTime,
542 }
543 
544 struct Measurements {
545     current: Measurement,
546     previous: Measurement,
547     last_upload: Measurement,
548 }
549 
550 /// A managing struct for a job which defines regular logging of core Windows system metrics.
551 pub(crate) struct CoreWinMetrics {
552     metrics_string: Weak<Mutex<String>>,
553     exit_evt: Event,
554     worker_thread: Option<JoinHandle<()>>,
555 }
556 
557 impl CoreWinMetrics {
new() -> Result<Self>558     pub fn new() -> Result<Self> {
559         let exit_evt = match Event::new() {
560             Ok(evt) => evt,
561             Err(_e) => return Err(Error::CannotInstantiateEvent),
562         };
563 
564         let metrics_string = String::new();
565         let arc_metrics_memory = Arc::new(Mutex::new(metrics_string));
566         let weak_metrics_memory = Arc::downgrade(&arc_metrics_memory);
567 
568         let mut me = Self {
569             metrics_string: weak_metrics_memory,
570             exit_evt,
571             worker_thread: None,
572         };
573         let exit_evt_clone = match me.exit_evt.try_clone() {
574             Ok(evt) => evt,
575             Err(_) => return Err(Error::CannotCloneEvent),
576         };
577         me.worker_thread.replace(thread::spawn(|| {
578             Worker {
579                 exit_evt: exit_evt_clone,
580                 io: Arc::new(Mutex::new(None)),
581                 measurements: Arc::new(Mutex::new(None)),
582                 memory: Arc::new(Mutex::new(Default::default())),
583                 memory_acc: Arc::new(Mutex::new(None)),
584                 metrics_string: arc_metrics_memory,
585             }
586             .run();
587         }));
588         Ok(me)
589     }
590 
get_process_handle() -> SysResult<SafeDescriptor>591     fn get_process_handle() -> SysResult<SafeDescriptor> {
592         // SAFETY:
593         // Safe because we own the current process.
594         let process_handle = unsafe {
595             OpenProcess(
596                 PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_VM_READ | SYNCHRONIZE,
597                 0,
598                 std::process::id(),
599             )
600         };
601         if process_handle.is_null() {
602             return Err(SysError::last());
603         }
604         // SAFETY:
605         // Safe as the SafeDescriptor is the only thing with access to the handle after this.
606         Ok(unsafe { SafeDescriptor::from_raw_descriptor(process_handle) })
607     }
608 }
609 
610 impl Drop for CoreWinMetrics {
drop(&mut self)611     fn drop(&mut self) {
612         if let Some(join_handle) = self.worker_thread.take() {
613             let _ = self.exit_evt.signal();
614             join_handle
615                 .join()
616                 .expect("fail to join the worker thread of a win core metrics collector.");
617         }
618     }
619 }
620 
621 impl fmt::Display for CoreWinMetrics {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result622     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
623         match self.metrics_string.upgrade() {
624             Some(metrics_string) => write!(f, "{}", *metrics_string.lock().unwrap()),
625             None => write!(f, ""),
626         }
627     }
628 }
629