1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fmt;
6 use std::fmt::Write as _;
7 use std::mem;
8 use std::sync::Arc;
9 use std::sync::Mutex;
10 use std::sync::Weak;
11 use std::thread;
12 use std::thread::JoinHandle;
13 use std::time::Duration;
14 use std::time::Instant;
15
16 use base::error;
17 use base::AsRawDescriptor;
18 use base::Error as SysError;
19 use base::Event;
20 use base::EventToken;
21 use base::FromRawDescriptor;
22 use base::SafeDescriptor;
23 use base::WaitContext;
24 use chrono::DateTime;
25 use chrono::Utc;
26 use winapi::shared::minwindef::DWORD;
27 use winapi::shared::minwindef::FILETIME;
28 use winapi::um::processthreadsapi::GetProcessTimes;
29 use winapi::um::processthreadsapi::GetSystemTimes;
30 use winapi::um::processthreadsapi::OpenProcess;
31 use winapi::um::psapi::GetProcessMemoryInfo;
32 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS;
33 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS_EX;
34 use winapi::um::winbase::GetProcessIoCounters;
35 use winapi::um::winnt::IO_COUNTERS;
36 use winapi::um::winnt::LARGE_INTEGER;
37 use winapi::um::winnt::LONGLONG;
38 use winapi::um::winnt::PROCESS_QUERY_LIMITED_INFORMATION;
39 use winapi::um::winnt::PROCESS_VM_READ;
40 use winapi::um::winnt::SYNCHRONIZE;
41
42 use crate::log_metric;
43 use crate::sys::windows::Error;
44 use crate::sys::windows::Result;
45 use crate::sys::windows::METRICS_UPLOAD_INTERVAL;
46 use crate::MetricEventType;
47
48 const BYTES_PER_MB: usize = 1024 * 1024;
49 const WORKER_REPORT_INTERVAL: Duration = Duration::from_secs(1);
50
51 type SysResult<T> = std::result::Result<T, SysError>;
52
53 /// A worker job which periodically logs system metrics.
54 struct Worker {
55 exit_evt: Event,
56 io: Arc<Mutex<Option<ProcessIoRecord>>>,
57 measurements: Arc<Mutex<Option<Measurements>>>,
58 memory: Arc<Mutex<ProcessMemory>>,
59 memory_acc: Arc<Mutex<Option<ProcessMemoryAccumulated>>>,
60 metrics_string: Arc<Mutex<String>>,
61 }
62
63 impl Worker {
run(&mut self)64 fn run(&mut self) {
65 #[derive(EventToken)]
66 enum Token {
67 Exit,
68 }
69 let event_ctx: WaitContext<Token> =
70 match WaitContext::build_with(&[(&self.exit_evt, Token::Exit)]) {
71 Ok(event_ctx) => event_ctx,
72 Err(e) => {
73 error!("failed creating WaitContext: {}", e);
74 return;
75 }
76 };
77 let mut last_metric_upload_time = Instant::now();
78 'poll: loop {
79 let events = match event_ctx.wait_timeout(WORKER_REPORT_INTERVAL) {
80 Ok(events) => events,
81 Err(e) => {
82 error!("failed polling for events: {}", e);
83 return;
84 }
85 };
86 if events.is_empty() {
87 self.collect_metrics();
88 // Time budget for UI thread is very limited.
89 // We make the metric string for displaying in UI in the
90 // worker thread for best performance.
91 self.make_metrics_string();
92
93 self.upload_metrics(&mut last_metric_upload_time);
94 }
95
96 if events.into_iter().any(|e| e.is_readable) {
97 break 'poll;
98 }
99 }
100 }
101
make_metrics_string(&mut self)102 fn make_metrics_string(&mut self) {
103 let mut metrics_string = self.metrics_string.lock().unwrap();
104 *metrics_string = format!(
105 "{}\n{}",
106 self.cpu_metrics_string(),
107 self.mem_metrics_string()
108 );
109 }
110
upload_metrics(&self, last_metric_upload_time: &mut Instant)111 fn upload_metrics(&self, last_metric_upload_time: &mut Instant) {
112 if last_metric_upload_time.elapsed() >= METRICS_UPLOAD_INTERVAL {
113 let mut memory_acc = self.memory_acc.lock().unwrap();
114 if let Some(acc) = &*memory_acc {
115 let mem = acc.accumulated.physical / acc.accumulated_count / BYTES_PER_MB;
116 // The i64 cast will not cause overflow because the mem is at most 10TB for
117 // Windows 10.
118 log_metric(MetricEventType::MemoryUsage, mem as i64);
119 }
120 *memory_acc = None;
121
122 let mut cpu_measurements = self.measurements.lock().unwrap();
123 if let Some(measurements) = &*cpu_measurements {
124 let sys_time = measurements.current.sys_time;
125 let process_time = measurements.current.process_time;
126 let prev_sys_time = measurements.last_upload.sys_time;
127 let prev_process_time = measurements.last_upload.process_time;
128
129 let diff_systime_kernel =
130 compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
131 let diff_systime_user =
132 compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
133
134 let diff_processtime_kernel =
135 compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
136 let diff_processtime_user =
137 compute_filetime_subtraction(process_time.user, prev_process_time.user);
138
139 let total_systime = diff_systime_kernel + diff_systime_user;
140 let total_processtime = diff_processtime_kernel + diff_processtime_user;
141
142 if total_systime > 0 {
143 let cpu_usage = 100 * total_processtime / total_systime;
144 // The i64 cast will not cause overflow because the usage is at most 100.
145 log_metric(MetricEventType::CpuUsage, cpu_usage);
146 }
147 }
148 *cpu_measurements = None;
149
150 let mut io = self.io.lock().unwrap();
151 if let Some(io_record) = &*io {
152 let new_io_read_bytes =
153 io_record.current.read_bytes - io_record.last_upload.read_bytes;
154 let new_io_write_bytes =
155 io_record.current.write_bytes - io_record.last_upload.write_bytes;
156
157 let ms_elapsed =
158 (io_record.current_time - io_record.last_upload_time).num_milliseconds();
159 if ms_elapsed > 0 {
160 let io_read_bytes_per_sec =
161 (new_io_read_bytes as f32) / (ms_elapsed as f32) * 1000.0;
162 let io_write_bytes_per_sec =
163 (new_io_write_bytes as f32) / (ms_elapsed as f32) * 1000.0;
164 log_metric(MetricEventType::ReadIo, io_read_bytes_per_sec as i64);
165 log_metric(MetricEventType::WriteIo, io_write_bytes_per_sec as i64);
166 }
167 }
168 *io = None;
169 *last_metric_upload_time = Instant::now();
170 }
171 }
172
collect_metrics(&mut self)173 fn collect_metrics(&mut self) {
174 match self.get_cpu_metrics() {
175 Ok(new_measurement) => {
176 let mut measurements = self.measurements.lock().unwrap();
177 let next_measurements = match *measurements {
178 Some(Measurements {
179 current,
180 last_upload,
181 ..
182 }) => Measurements {
183 current: new_measurement,
184 previous: current,
185 last_upload,
186 },
187 None => Measurements {
188 current: new_measurement,
189 previous: new_measurement,
190 last_upload: new_measurement,
191 },
192 };
193 *measurements = Some(next_measurements);
194 }
195 Err(e) => {
196 // Do not panic because of cpu query related failures.
197 error!("Get cpu measurement failed: {}", e);
198 }
199 }
200
201 match self.get_mem_metrics() {
202 Ok(mem) => {
203 // Keep running sum and count to calculate averages.
204 let mut memory_acc = self.memory_acc.lock().unwrap();
205 let updated_memory_acc = match *memory_acc {
206 Some(acc) => accumulate_process_memory(acc, mem),
207 None => ProcessMemoryAccumulated {
208 accumulated: mem,
209 accumulated_count: 1,
210 },
211 };
212 *memory_acc = Some(updated_memory_acc);
213 *self.memory.lock().unwrap() = mem
214 }
215 Err(e) => {
216 // Do not panic because of memory query failures.
217 error!("Get cpu measurement failed: {}", e);
218 }
219 }
220
221 match self.get_io_metrics() {
222 Ok(new_io) => {
223 let mut io_record = self.io.lock().unwrap();
224 let updated_io = match *io_record {
225 Some(io) => ProcessIoRecord {
226 current: new_io,
227 current_time: Utc::now(),
228 last_upload: io.last_upload,
229 last_upload_time: io.last_upload_time,
230 },
231 None => ProcessIoRecord {
232 current: new_io,
233 current_time: Utc::now(),
234 last_upload: new_io,
235 last_upload_time: Utc::now(),
236 },
237 };
238 *io_record = Some(updated_io);
239 }
240 Err(e) => {
241 // Do not panic because of io query failures.
242 error!("Get io measurement failed: {}", e);
243 }
244 }
245 }
246
get_mem_metrics(&self) -> SysResult<ProcessMemory>247 fn get_mem_metrics(&self) -> SysResult<ProcessMemory> {
248 let process_handle = CoreWinMetrics::get_process_handle()?;
249
250 let mut counters = PROCESS_MEMORY_COUNTERS_EX::default();
251
252 // SAFETY:
253 // Safe because we own the process handle and all memory was allocated.
254 let result = unsafe {
255 GetProcessMemoryInfo(
256 process_handle.as_raw_descriptor(),
257 // Converting is necessary because the `winapi`' GetProcessMemoryInfo
258 // does NOT support `PROCESS_MEMORY_COUNTERS_EX`, but only
259 // 'PROCESS_MEMORY_COUNTERS'. The casting is safe because the underlining
260 // Windows api does `PROCESS_MEMORY_COUNTERS_EX`.
261 &mut counters as *mut PROCESS_MEMORY_COUNTERS_EX as *mut PROCESS_MEMORY_COUNTERS,
262 mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as DWORD,
263 )
264 };
265 if result == 0 {
266 return Err(SysError::last());
267 }
268
269 Ok(ProcessMemory {
270 page_fault_count: counters.PageFaultCount,
271 working_set_size: counters.WorkingSetSize,
272 working_set_peak: counters.PeakWorkingSetSize,
273 page_file_usage: counters.PagefileUsage,
274 page_file_peak: counters.PeakPagefileUsage,
275 physical: counters.PrivateUsage,
276 })
277 }
278
get_cpu_metrics(&self) -> SysResult<Measurement>279 fn get_cpu_metrics(&self) -> SysResult<Measurement> {
280 let mut sys_time: SystemCpuTime = Default::default();
281 let mut process_time: ProcessCpuTime = Default::default();
282 let sys_time_success: i32;
283
284 // SAFETY:
285 // Safe because memory is allocated for sys_time before the windows call.
286 // And the value were initilized to 0s.
287 unsafe {
288 // First get kernel cpu time.
289 sys_time_success =
290 GetSystemTimes(&mut sys_time.idle, &mut sys_time.kernel, &mut sys_time.user);
291 }
292 if sys_time_success == 0 {
293 error!("Systime collection failed.\n");
294 return Err(SysError::last());
295 } else {
296 // Query current process cpu time.
297 let process_handle = CoreWinMetrics::get_process_handle()?;
298 let process_time_success: i32;
299 // SAFETY:
300 // Safe because memory is allocated for process_time before the windows call.
301 // And the value were initilized to 0s.
302 unsafe {
303 process_time_success = GetProcessTimes(
304 process_handle.as_raw_descriptor(),
305 &mut process_time.create,
306 &mut process_time.exit,
307 &mut process_time.kernel,
308 &mut process_time.user,
309 );
310 }
311 if process_time_success == 0 {
312 error!("Systime collection failed.\n");
313 return Err(SysError::last());
314 }
315 }
316 Ok(Measurement {
317 sys_time: SystemCpuTime {
318 idle: sys_time.idle,
319 kernel: sys_time.kernel,
320 user: sys_time.user,
321 },
322 process_time: ProcessCpuTime {
323 create: process_time.create,
324 exit: process_time.exit,
325 kernel: process_time.kernel,
326 user: process_time.user,
327 },
328 })
329 }
330
get_io_metrics(&self) -> SysResult<ProcessIo>331 fn get_io_metrics(&self) -> SysResult<ProcessIo> {
332 let process_handle = CoreWinMetrics::get_process_handle()?;
333 let mut io_counters = IO_COUNTERS::default();
334 // SAFETY:
335 // Safe because we own the process handle and all memory was allocated.
336 let result = unsafe {
337 GetProcessIoCounters(
338 process_handle.as_raw_descriptor(),
339 &mut io_counters as *mut IO_COUNTERS,
340 )
341 };
342 if result == 0 {
343 return Err(SysError::last());
344 }
345 Ok(ProcessIo {
346 read_bytes: io_counters.ReadTransferCount,
347 write_bytes: io_counters.WriteTransferCount,
348 })
349 }
350
mem_metrics_string(&self) -> String351 fn mem_metrics_string(&self) -> String {
352 let guard = self.memory.lock().unwrap();
353 let memory: ProcessMemory = *guard;
354 let mut buf = format!(
355 "Physical memory used: {} mb.\n",
356 memory.physical / BYTES_PER_MB
357 );
358 let _ = writeln!(
359 buf,
360 "Total working memory: {} mb.",
361 memory.working_set_size / BYTES_PER_MB
362 );
363 let _ = writeln!(
364 buf,
365 "Peak working memory: {} mb.",
366 memory.working_set_peak / BYTES_PER_MB
367 );
368 let _ = writeln!(buf, "Page fault count: {}.", memory.page_fault_count);
369 let _ = writeln!(
370 buf,
371 "Page file used: {} mb.",
372 memory.page_file_usage / BYTES_PER_MB
373 );
374 let _ = writeln!(
375 buf,
376 "Peak page file used: {} mb.",
377 memory.page_file_peak / BYTES_PER_MB
378 );
379 buf
380 }
381
cpu_metrics_string(&self) -> String382 fn cpu_metrics_string(&self) -> String {
383 let guard = self.measurements.lock().unwrap();
384 let mut buf = String::new();
385
386 // Now we use current and last cpu measurment data to calculate cpu usage
387 // as a percentage.
388 if let Some(measurements) = &*guard {
389 let sys_time = measurements.current.sys_time;
390 let process_time = measurements.current.process_time;
391 let prev_sys_time = measurements.previous.sys_time;
392 let prev_process_time = measurements.previous.process_time;
393
394 let diff_systime_kernel =
395 compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
396 let diff_systime_user = compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
397
398 let diff_processtime_kernel =
399 compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
400 let diff_processtime_user =
401 compute_filetime_subtraction(process_time.user, prev_process_time.user);
402
403 let total_systime = diff_systime_kernel + diff_systime_user;
404 let total_processtime = diff_processtime_kernel + diff_processtime_user;
405
406 let mut process_cpu = String::from("still calculating...");
407 if total_systime > 0 {
408 process_cpu = format!("{}%", (100 * total_processtime / total_systime));
409 }
410 let _ = writeln!(buf, "Process cpu usage is: {}", process_cpu);
411
412 #[cfg(debug_assertions)]
413 {
414 // Show data supporting our cpu usage calculation.
415 // Output system cpu time.
416 let _ = writeln!(
417 buf,
418 "Systime Idle: low {} / high {}",
419 sys_time.idle.dwLowDateTime, sys_time.idle.dwHighDateTime
420 );
421 let _ = writeln!(
422 buf,
423 "Systime User: low {} / high {}",
424 sys_time.user.dwLowDateTime, sys_time.user.dwHighDateTime
425 );
426 let _ = writeln!(
427 buf,
428 "Systime kernel: low {} / high {}",
429 sys_time.kernel.dwLowDateTime, sys_time.kernel.dwHighDateTime
430 );
431 // Output process cpu time.
432 let _ = writeln!(
433 buf,
434 "Process Create: low {} / high {}",
435 process_time.create.dwLowDateTime, process_time.create.dwHighDateTime
436 );
437 let _ = writeln!(
438 buf,
439 "Process Exit: low {} / high {}",
440 process_time.exit.dwLowDateTime, process_time.exit.dwHighDateTime
441 );
442 let _ = writeln!(
443 buf,
444 "Process kernel: low {} / high {}",
445 process_time.kernel.dwLowDateTime, process_time.kernel.dwHighDateTime
446 );
447 let _ = writeln!(
448 buf,
449 "Process user: low {} / high {}",
450 process_time.user.dwLowDateTime, process_time.user.dwHighDateTime
451 );
452 }
453 } else {
454 let _ = write!(buf, "Calculating cpu usage...");
455 }
456 buf
457 }
458 }
459
compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG460 fn compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG {
461 // SAFETY:
462 // safe because we are initializing the struct to 0s.
463 unsafe {
464 let mut a: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
465 a.u_mut().LowPart = fta.dwLowDateTime;
466 a.u_mut().HighPart = fta.dwHighDateTime as i32;
467 let mut b: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
468 b.u_mut().LowPart = ftb.dwLowDateTime;
469 b.u_mut().HighPart = ftb.dwHighDateTime as i32;
470 a.QuadPart() - b.QuadPart()
471 }
472 }
473
474 // Adds to a running total of memory metrics over the course of a collection period.
475 // Can divide these sums to calculate averages.
accumulate_process_memory( acc: ProcessMemoryAccumulated, mem: ProcessMemory, ) -> ProcessMemoryAccumulated476 fn accumulate_process_memory(
477 acc: ProcessMemoryAccumulated,
478 mem: ProcessMemory,
479 ) -> ProcessMemoryAccumulated {
480 ProcessMemoryAccumulated {
481 accumulated: ProcessMemory {
482 page_fault_count: mem.page_fault_count,
483 working_set_size: acc.accumulated.working_set_size + mem.working_set_size,
484 working_set_peak: mem.working_set_peak,
485 page_file_usage: acc.accumulated.page_file_usage + mem.page_file_usage,
486 page_file_peak: mem.page_file_peak,
487 physical: acc.accumulated.physical + mem.physical,
488 },
489 accumulated_count: acc.accumulated_count + 1,
490 }
491 }
492
493 #[derive(Copy, Clone, Default)]
494 struct SystemCpuTime {
495 idle: FILETIME,
496 kernel: FILETIME,
497 user: FILETIME,
498 }
499
500 #[derive(Copy, Clone, Default)]
501 struct ProcessCpuTime {
502 create: FILETIME,
503 exit: FILETIME,
504 kernel: FILETIME,
505 user: FILETIME,
506 }
507
508 #[derive(Copy, Clone, Default)]
509 struct ProcessMemory {
510 page_fault_count: u32,
511 working_set_size: usize,
512 working_set_peak: usize,
513 page_file_usage: usize,
514 page_file_peak: usize,
515 physical: usize,
516 }
517
518 #[derive(Copy, Clone)]
519 struct ProcessMemoryAccumulated {
520 accumulated: ProcessMemory,
521 accumulated_count: usize,
522 }
523
524 #[derive(Copy, Clone, Default)]
525 struct ProcessIo {
526 read_bytes: u64,
527 write_bytes: u64,
528 }
529
530 #[derive(Copy, Clone)]
531 struct ProcessIoRecord {
532 current: ProcessIo,
533 current_time: DateTime<Utc>,
534 last_upload: ProcessIo,
535 last_upload_time: DateTime<Utc>,
536 }
537
538 #[derive(Copy, Clone)]
539 struct Measurement {
540 sys_time: SystemCpuTime,
541 process_time: ProcessCpuTime,
542 }
543
544 struct Measurements {
545 current: Measurement,
546 previous: Measurement,
547 last_upload: Measurement,
548 }
549
550 /// A managing struct for a job which defines regular logging of core Windows system metrics.
551 pub(crate) struct CoreWinMetrics {
552 metrics_string: Weak<Mutex<String>>,
553 exit_evt: Event,
554 worker_thread: Option<JoinHandle<()>>,
555 }
556
557 impl CoreWinMetrics {
new() -> Result<Self>558 pub fn new() -> Result<Self> {
559 let exit_evt = match Event::new() {
560 Ok(evt) => evt,
561 Err(_e) => return Err(Error::CannotInstantiateEvent),
562 };
563
564 let metrics_string = String::new();
565 let arc_metrics_memory = Arc::new(Mutex::new(metrics_string));
566 let weak_metrics_memory = Arc::downgrade(&arc_metrics_memory);
567
568 let mut me = Self {
569 metrics_string: weak_metrics_memory,
570 exit_evt,
571 worker_thread: None,
572 };
573 let exit_evt_clone = match me.exit_evt.try_clone() {
574 Ok(evt) => evt,
575 Err(_) => return Err(Error::CannotCloneEvent),
576 };
577 me.worker_thread.replace(thread::spawn(|| {
578 Worker {
579 exit_evt: exit_evt_clone,
580 io: Arc::new(Mutex::new(None)),
581 measurements: Arc::new(Mutex::new(None)),
582 memory: Arc::new(Mutex::new(Default::default())),
583 memory_acc: Arc::new(Mutex::new(None)),
584 metrics_string: arc_metrics_memory,
585 }
586 .run();
587 }));
588 Ok(me)
589 }
590
get_process_handle() -> SysResult<SafeDescriptor>591 fn get_process_handle() -> SysResult<SafeDescriptor> {
592 // SAFETY:
593 // Safe because we own the current process.
594 let process_handle = unsafe {
595 OpenProcess(
596 PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_VM_READ | SYNCHRONIZE,
597 0,
598 std::process::id(),
599 )
600 };
601 if process_handle.is_null() {
602 return Err(SysError::last());
603 }
604 // SAFETY:
605 // Safe as the SafeDescriptor is the only thing with access to the handle after this.
606 Ok(unsafe { SafeDescriptor::from_raw_descriptor(process_handle) })
607 }
608 }
609
610 impl Drop for CoreWinMetrics {
drop(&mut self)611 fn drop(&mut self) {
612 if let Some(join_handle) = self.worker_thread.take() {
613 let _ = self.exit_evt.signal();
614 join_handle
615 .join()
616 .expect("fail to join the worker thread of a win core metrics collector.");
617 }
618 }
619 }
620
621 impl fmt::Display for CoreWinMetrics {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result622 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
623 match self.metrics_string.upgrade() {
624 Some(metrics_string) => write!(f, "{}", *metrics_string.lock().unwrap()),
625 None => write!(f, ""),
626 }
627 }
628 }
629