1#!/usr/bin/env python3 2 3from __future__ import annotations 4 5import datetime 6import json 7import signal 8import time 9from typing import Any 10 11import psutil # type: ignore[import] 12 13 14def get_processes_running_python_tests() -> list[Any]: 15 python_processes = [] 16 for process in psutil.process_iter(): 17 try: 18 if "python" in process.name() and process.cmdline(): 19 python_processes.append(process) 20 except (psutil.NoSuchProcess, psutil.AccessDenied): 21 # access denied or the process died 22 pass 23 return python_processes 24 25 26def get_per_process_cpu_info() -> list[dict[str, Any]]: 27 processes = get_processes_running_python_tests() 28 per_process_info = [] 29 for p in processes: 30 info = { 31 "pid": p.pid, 32 "cmd": " ".join(p.cmdline()), 33 "cpu_percent": p.cpu_percent(), 34 "rss_memory": p.memory_info().rss, 35 } 36 37 # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info 38 # requires higher user privileges and could throw AccessDenied error, i.e. mac 39 try: 40 memory_full_info = p.memory_full_info() 41 42 info["uss_memory"] = memory_full_info.uss 43 if "pss" in memory_full_info: 44 # only availiable in linux 45 info["pss_memory"] = memory_full_info.pss 46 47 except psutil.AccessDenied as e: 48 # It's ok to skip this 49 pass 50 51 per_process_info.append(info) 52 return per_process_info 53 54 55def get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]: 56 processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) 57 per_process_info = [] 58 for p in processes: 59 info = {"pid": p.pid, "gpu_memory": p.usedGpuMemory} 60 per_process_info.append(info) 61 return per_process_info 62 63 64def rocm_get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]: 65 processes = amdsmi.amdsmi_get_gpu_process_list(handle) 66 per_process_info = [] 67 for p in processes: 68 try: 69 proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) 70 except AttributeError: 71 # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 72 # BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi 73 proc_info = p 74 info = { 75 "pid": proc_info["pid"], 76 "gpu_memory": proc_info["memory_usage"]["vram_mem"], 77 } 78 per_process_info.append(info) 79 return per_process_info 80 81 82if __name__ == "__main__": 83 handle = None 84 try: 85 import pynvml # type: ignore[import] 86 87 try: 88 pynvml.nvmlInit() 89 handle = pynvml.nvmlDeviceGetHandleByIndex(0) 90 except pynvml.NVMLError: 91 pass 92 except ModuleNotFoundError: 93 # no pynvml avaliable, probably because not cuda 94 pass 95 try: 96 import amdsmi # type: ignore[import] 97 98 try: 99 amdsmi.amdsmi_init() 100 amdsmi_handle = amdsmi.amdsmi_get_processor_handles()[0] 101 except amdsmi.AmdSmiException: 102 pass 103 except ModuleNotFoundError: 104 # no amdsmi is available 105 pass 106 107 kill_now = False 108 109 def exit_gracefully(*args: Any) -> None: 110 global kill_now 111 kill_now = True 112 113 signal.signal(signal.SIGTERM, exit_gracefully) 114 115 while not kill_now: 116 try: 117 stats = { 118 "time": datetime.datetime.utcnow().isoformat("T") + "Z", 119 "total_cpu_percent": psutil.cpu_percent(), 120 "per_process_cpu_info": get_per_process_cpu_info(), 121 } 122 if handle is not None: 123 stats["per_process_gpu_info"] = get_per_process_gpu_info(handle) 124 # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html 125 gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) 126 stats["total_gpu_utilization"] = gpu_utilization.gpu 127 stats["total_gpu_mem_utilization"] = gpu_utilization.memory 128 if amdsmi_handle is not None: 129 stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info( 130 amdsmi_handle 131 ) 132 stats["total_gpu_utilization"] = amdsmi.amdsmi_get_gpu_activity( 133 amdsmi_handle 134 )["gfx_activity"] 135 stats["total_gpu_mem_utilization"] = amdsmi.amdsmi_get_gpu_activity( 136 amdsmi_handle 137 )["umc_activity"] 138 except Exception as e: 139 stats = { 140 "time": datetime.datetime.utcnow().isoformat("T") + "Z", 141 "error": str(e), 142 } 143 finally: 144 print(json.dumps(stats)) 145 time.sleep(1) 146