xref: /aosp_15_r20/external/pytorch/tools/stats/monitor.py (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#!/usr/bin/env python3
"""Periodically sample CPU and GPU usage of running Python test processes.

Prints one JSON record per line to stdout, once per second, until it receives SIGTERM.
"""

from __future__ import annotations

import datetime
import json
import signal
import time
from typing import Any

import psutil  # type: ignore[import]


def get_processes_running_python_tests() -> list[Any]:
    """Return every process whose name contains "python" and that has a command line."""
    python_processes = []
    for process in psutil.process_iter():
        try:
            if "python" in process.name() and process.cmdline():
                python_processes.append(process)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # the process died or we are not allowed to inspect it
            pass
    return python_processes


def get_per_process_cpu_info() -> list[dict[str, Any]]:
    """Collect pid, command line, CPU utilization, and memory usage for each Python test process."""
    processes = get_processes_running_python_tests()
    per_process_info = []
    for p in processes:
        info = {
            "pid": p.pid,
            "cmd": " ".join(p.cmdline()),
            "cpu_percent": p.cpu_percent(),
            "rss_memory": p.memory_info().rss,
        }

        # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info
        # memory_full_info() requires higher user privileges and can raise AccessDenied,
        # e.g. on macOS
        try:
            memory_full_info = p.memory_full_info()

            info["uss_memory"] = memory_full_info.uss
            if hasattr(memory_full_info, "pss"):
                # only available on Linux
                info["pss_memory"] = memory_full_info.pss

        except psutil.AccessDenied:
            # it's ok to skip this
            pass

        per_process_info.append(info)
    return per_process_info


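# Illustrative sketch, not part of the original tooling: how the helpers above compose
# into the CPU-side portion of the record that the monitoring loop at the bottom of this
# file prints. The function name example_cpu_snapshot is hypothetical; a caller could,
# for example, print(json.dumps(example_cpu_snapshot())).
def example_cpu_snapshot() -> dict[str, Any]:
    """Take a single CPU-side sample in the same shape the main loop emits."""
    return {
        "time": datetime.datetime.utcnow().isoformat("T") + "Z",
        "total_cpu_percent": psutil.cpu_percent(),
        "per_process_cpu_info": get_per_process_cpu_info(),
    }

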
def get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """Return pid and GPU memory usage for each compute process on the given NVML device handle."""
    # pynvml is imported into module scope by the __main__ block below; this helper is
    # only called after a CUDA device handle was successfully obtained there.
    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    per_process_info = []
    for p in processes:
        info = {"pid": p.pid, "gpu_memory": p.usedGpuMemory}
        per_process_info.append(info)
    return per_process_info


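# Illustrative sketch, not part of the original tooling: how the NVML sample recorded by
# the __main__ block could be taken in one self-contained call. The function name
# example_nvml_gpu_sample is hypothetical; it only uses pynvml calls that already appear
# elsewhere in this file.
def example_nvml_gpu_sample() -> dict[str, Any] | None:
    """Return one GPU sample for device 0, or None when NVML is unavailable."""
    try:
        import pynvml  # type: ignore[import]

        pynvml.nvmlInit()
        device = pynvml.nvmlDeviceGetHandleByIndex(0)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(device)
        return {
            "per_process_gpu_info": [
                {"pid": p.pid, "gpu_memory": p.usedGpuMemory}
                for p in pynvml.nvmlDeviceGetComputeRunningProcesses(device)
            ],
            "total_gpu_utilization": utilization.gpu,
            "total_gpu_mem_utilization": utilization.memory,
        }
    except Exception:
        # no pynvml, no driver, or no device: nothing to report
        return None

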
def rocm_get_per_process_gpu_info(handle: Any) -> list[dict[str, Any]]:
    """Return pid and GPU memory usage for each process on the given amdsmi processor handle."""
    processes = amdsmi.amdsmi_get_gpu_process_list(handle)
    per_process_info = []
    for p in processes:
        try:
            proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)
        except AttributeError:
            # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
            # BC-breaking change that removed the amdsmi_get_gpu_process_info API from amdsmi;
            # newer versions return the process info directly from the list call above.
            proc_info = p
        info = {
            "pid": proc_info["pid"],
            "gpu_memory": proc_info["memory_usage"]["vram_mem"],
        }
        per_process_info.append(info)
    return per_process_info


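# Illustrative sketch, not part of the original tooling: how a consumer could fold the
# JSON lines this script prints back into a simple summary, assuming the output was
# redirected to a file. The function name example_summarize_usage_log is hypothetical.
def example_summarize_usage_log(path: str) -> dict[str, float]:
    """Return the peak total CPU and GPU utilization recorded in a saved usage log."""
    peak = {"total_cpu_percent": 0.0, "total_gpu_utilization": 0.0}
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            if "error" in record:
                # records produced by the except branch of the main loop carry no usage data
                continue
            for key in peak:
                peak[key] = max(peak[key], float(record.get(key, 0.0)))
    return peak

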
if __name__ == "__main__":
    # Grab a handle to the first CUDA device (if pynvml is usable) and to the first
    # ROCm device (if amdsmi is usable); either may stay None on CPU-only machines.
    handle = None
    amdsmi_handle = None
    try:
        import pynvml  # type: ignore[import]

        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        except pynvml.NVMLError:
            pass
    except ModuleNotFoundError:
        # no pynvml available, probably because this is not a CUDA machine
        pass
    try:
        import amdsmi  # type: ignore[import]

        try:
            amdsmi.amdsmi_init()
            amdsmi_handle = amdsmi.amdsmi_get_processor_handles()[0]
        except amdsmi.AmdSmiException:
            pass
    except ModuleNotFoundError:
        # no amdsmi available
        pass

    kill_now = False

    def exit_gracefully(*args: Any) -> None:
        global kill_now
        kill_now = True

    signal.signal(signal.SIGTERM, exit_gracefully)

    # Emit one JSON record per second until SIGTERM arrives.
    while not kill_now:
        try:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
                "total_cpu_percent": psutil.cpu_percent(),
                "per_process_cpu_info": get_per_process_cpu_info(),
            }
            if handle is not None:
                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
                # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
            if amdsmi_handle is not None:
                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info(
                    amdsmi_handle
                )
                stats["total_gpu_utilization"] = amdsmi.amdsmi_get_gpu_activity(
                    amdsmi_handle
                )["gfx_activity"]
                stats["total_gpu_mem_utilization"] = amdsmi.amdsmi_get_gpu_activity(
                    amdsmi_handle
                )["umc_activity"]
        except Exception as e:
            stats = {
                "time": datetime.datetime.utcnow().isoformat("T") + "Z",
                "error": str(e),
            }
        finally:
            print(json.dumps(stats))
            time.sleep(1)