//  Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/Tensor.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/mps/MPSAllocatorInterface.h>

#include <os/signpost.h>
#include <os/log.h>

#include <atomic>
#include <ctime>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>

namespace at::mps {

namespace Profiler {

struct BaseInfo {
  // profiling info types
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
      type(infoType), profileId(Id), handle(Handle) { }
  virtual ~BaseInfo() = default;

  // type of profiling info
  Type type;
  // unique profile ID for execution instances of operations or copies
  uint64_t profileId;
  // IDs generated by os_signpost
  // since it's possible to use event-based and interval-based signposts at the
  // same time, we need a separate ID for each.
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // accumulated GPU time in ms (obtained from the CompletionHandler's "GPUEndTime - GPUStartTime")
  std::atomic<double> totalGpuTime{0.0};
  // accumulated scheduling time in ms (obtained from the CompletionHandler's "KernelEndTime - KernelStartTime")
  std::atomic<double> totalSchedulingTime{0.0};
  // indicates if the operation or copy execution has completed
  std::atomic_bool completed{false};
  // handle used to identify the profile info's instance (usually the pointer)
  const uintptr_t handle;

  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
  // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
    if (tensor.defined()) {
      std::stringstream tensorStr;
      auto deviceType = tensor.device().type();
      tensorStr << c10::DeviceTypeName(deviceType);
      // see comments for INCLUDE_BUFFER_ID
      if (includeBufferId && deviceType == at::kMPS) {
        id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
        tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
                  << ":" << buffer.retainCount << ")";
      }
      tensorStr << ":"
                << tensor.scalar_type() << tensor.sizes();
      return tensorStr.str();
    } else {
      return "undefined";
    }
  }
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};
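
// Example (illustrative): for a float tensor of shape {128, 64} on MPS,
// buildTensorString() yields "MPS:Float[128, 64]"; with includeBufferId enabled
// (assuming, say, buffer #7 with a retain count of 1) it yields
// "MPS(buf#7:1):Float[128, 64]".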

struct OperationInfo : BaseInfo {
  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }

  uint64_t runCount = 0;
  std::string strKey;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  // builds a string for a kernel
  static std::string buildKernelString(const std::string& kernelName,
                                       const TensorList& tensors,
                                       bool includeBufferId = false) {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor: tensors) {
      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
};
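
// Example (illustrative, with hypothetical tensors "self" and "other"):
// buildKernelString("add_out", {self, other}) could produce
// "add_out:MPS:Float[128, 64]:MPS:Float[128, 64]", i.e., the kernel name
// followed by one ":"-separated tensor string per input.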

struct CpuFbInfo : BaseInfo {
  CpuFbInfo(uint64_t Id, const std::string& OpName) :
      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }

  uint64_t runCount = 0;
  // the current and total overhead of copies, in bytes, required to convert the
  // op's input tensors from MPS to CPU and its outputs from CPU back to MPS
  size_t currentCopyOverhead = 0;
  size_t totalCopyOverhead = 0;
  std::string opName;
  std::string strKey;
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  void updateCopyOverhead(const TensorList& tensors) {
    currentCopyOverhead = 0;
    for (const Tensor& tensor: tensors) {
      if (tensor.defined()) {
        currentCopyOverhead += tensor.nbytes();
      }
    }
    totalCopyOverhead += currentCopyOverhead;
  }
};

struct CopyInfo : BaseInfo {
  enum class Kind {
    MPS_TO_MPS,
    MPS_TO_CPU,
    CPU_TO_MPS,
  };

  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
           BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
           length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }

  Kind kind;
  size_t length;
  bool isNonBlocking;
  bool usesBlitter;
  std::string srcStrKey;
  std::string dstStrKey;
  // for copies that don't use blitters, we measure CPU time
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);

  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
    if (tensor.has_value()) {
      return tensor->device().type() == at::kMPS;
    }
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
    // getUnalignedBufferSize() returns -1 if the input buffer is not on the MPS device
    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
  }

  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
    if (isSrcOnMPS && !isDstOnMPS) {
      return Kind::MPS_TO_CPU;
    } else if (!isSrcOnMPS && isDstOnMPS) {
      return Kind::CPU_TO_MPS;
    }
    return Kind::MPS_TO_MPS;
  }
};
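
// Example (illustrative): a copy from a CPU tensor into an MPS tensor is
// classified by getCopyKind() as Kind::CPU_TO_MPS; if neither side resides
// on MPS, the debug assert in getCopyKind() fires.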

struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr) :
          CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
  // total number of copies
  size_t totalCount = 0;
  // number of scalar copies (i.e., smaller than sizeof(int64))
  size_t scalarsCount = 0;
  // number of blocking copies (i.e., those that require syncing to the GPU)
  size_t blockingCount = 0;
  // number of copies that used memcpy() instead of the Metal Blit Encoder
  size_t memcpyCount = 0;
  // accumulated GPU time in ms for the scalar copies
  std::atomic<double> scalarsGpuTime{0.0};
  // copy kind as a string
  std::string kindStr;
};

class MPSProfiler {
public:
  // lower 16 bits used for profiler options
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,
    // ALL_* means all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
    // (provided for convenience, to avoid OR-ing the bit flags manually)
    // trace all signpost types using events
    ALL_SIGNPOST_EVENTS    = (1 << 0),
    // trace all signpost types using intervals
    ALL_SIGNPOST_INTERVALS = (1 << 1),
    // always wait for the command buffer to finish executing after each commit
    WAIT_UNTIL_COMPLETED   = (1 << 2),
    // for interval-based signposts, include the scheduling portion of
    // Graph/Kernel/Copy executions as well.
    // if this flag is disabled, only the "GPU run time" is included in the
    // interval, not the scheduling time.
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),

    // use these if you need to trace signpost types individually (rarely required)
    // trace signposts using intervals
    USE_INTERVALS = (1 << 4),
    // trace signposts by emitting events
    USE_EVENTS    = (1 << 5),
    // used for sanity checks (change this when a new option is added)
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };
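
  // Example (illustrative): a mask such as
  //   (ALL_SIGNPOST_INTERVALS | INCLUDE_SCHEDULE_INTERVAL | WAIT_UNTIL_COMPLETED)
  // traces all signpost types as intervals that also cover the scheduling time,
  // and waits on each command buffer after every commit. In practice this
  // bitmask is parsed from an environment variable in MPSProfiler.mm.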

  // when adding new types, #define the type string in MPSProfiler.mm as well.
  // upper 16 bits used for event types
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace signposts for PyTorch operation executions
    RUN_OPERATION = (1 << 16),
    // trace signposts for blitter copies
    BLIT_COPY     = (1 << 17),
    // trace signposts for ops that fall back on the CPU
    CPU_FALLBACK  = (1 << 18),
    // used for sanity checks (change this when a new type is added)
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };
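
  // Example (illustrative): since options occupy the lower 16 bits and signpost
  // types the upper 16 bits, both fit in one 32-bit mask; e.g.,
  // (RUN_OPERATION | USE_EVENTS) traces only operation executions, emitting an
  // event per run rather than an interval.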

  enum LogOptions : uint32_t {
    LOG_NONE = 0,

    // Info logging options during execution
    // -------------------------------------
    // prints operation info (id/key/run_count) during execution
    OPERATION_INFO      = (1 << 0),
    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
    COPY_INFO           = (1 << 1),
    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
    CPU_FALLBACK_INFO   = (1 << 2),

    // Profiling statistics logging options when the process terminates
    // -----------------------------------------------------------------
    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before the process terminates
    // (provided for convenience, to avoid combining the following stats bit flags manually)
    ALL_STATS           = (1 << 3),
    // prints operation stats (GPU times, run count, etc.) before the process terminates
    OPERATION_STATS     = (1 << 4),
    // prints copy stats (GPU times, copy kinds, sizes, etc.) before the process terminates
    COPY_STATS          = (1 << 5),
    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
    // for tensors, etc.) before the process terminates
    CPU_FALLBACK_STATS  = (1 << 6),

    // Metadata format options when logging the info
    // ---------------------------------------------
    // if enabled, includes the GPU run time in the metadata (i.e., GPUEndTime-GPUStartTime
    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
    INCLUDE_GPU_TIME    = (1 << 7),
    // if enabled, also includes the GPU scheduling time in the metadata separately
    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
    INCLUDE_KERNEL_TIME = (1 << 8),
    // if enabled, includes the unique buffer ID in the metadata for the storage
    // of a tensor that was allocated on the MPSAllocator. This is useful (along with
    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify the buffers that are involved
    // in various operations.
    INCLUDE_BUFFER_ID   = (1 << 9),

    // used for sanity checks (change this when a new option is added)
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };
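
  // Example (illustrative): (OPERATION_INFO | INCLUDE_GPU_TIME) prints a line per
  // operation execution with metadata such as "[GPU=0.324 ms]", whereas ALL_STATS
  // defers the output to the aggregate tables printed when the process terminates.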

  explicit MPSProfiler();
  ~MPSProfiler();

  // the handle is either an "MPSGraph*" or an "id<MTLComputePipelineState>" for Metal kernels
  // the beginProfile*() functions return a profileId that is unique per graph/kernel/copy
  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
                            const OptionalTensorRef srcTensor,
                            const OptionalTensorRef dstTensor,
                            size_t length, bool isNonBlocking, bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);
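
  // Illustrative call pattern inside an MPS operation (a sketch; the names
  // "cplState", "add_out", "self", and "other" are hypothetical):
  //   auto& profiler = getMPSProfiler();
  //   if (profiler.isOperationProfilingEnabled()) {
  //     profiler.beginProfileKernel(cplState, "add_out", {self, other});
  //   }
  //   // ... encode and commit the kernel ...
  //   profiler.endProfileKernel(cplState);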

  // these are used to hook into the Python bindings for the torch.mps.profiler module.
  // this enables generating OS Signpost traces from the MPSProfiler on demand
  // during runtime (instead of via environment variables).
  // the "mode" can be "interval", "event", or both ("interval,event")
  // for interval-based and/or event-based signpost tracing.
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();
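
  // Example (illustrative): StartTrace("interval,event", /*waitUntilCompleted=*/false)
  // enables both interval-based and event-based signpost tracing until StopTrace()
  // is called; the torch.mps.profiler bindings drive these two entry points.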

  // Abstractions for GPU trace capturing
  bool isCaptureEnabled() const;
  bool isCapturing() const;
  void startCapture(const std::string& name, MPSStream* stream = nullptr);
  void stopCapture(MPSStream* stream = nullptr);
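
  // Note: with Metal's capture APIs (MTLCaptureManager), programmatic GPU capture
  // outside Xcode typically requires launching the process with the EV
  // "MTL_CAPTURE_ENABLED=1"; isCaptureEnabled() reports whether capturing is
  // possible in the current environment.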

  // convenience functions to indicate whether signpost tracing or
  // logging is enabled for the given SignpostTypes
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }
 private:
  // indicates which signpost types are enabled and traced by the MPS profiler.
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // technically, it's possible to trace both events and intervals at the same time,
  // so we use separate os_log categories for them
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // stats logging can run either from the destructor or from the signal handler,
  // so this is used to check whether logging has already started.
  std::atomic_bool hasLoggedStats{false};
  // indicates there are pending completionHandler callbacks that haven't been called yet.
  std::atomic_bool hasPendingCompletionHandlers{false};
  // used to capture the SIGINT signal to log profiling stats
  static struct sigaction currentSigint, previousSigint;

  // We use the following lists for two reasons:
  // 1) for interval-based signposts, the "begin" point won't be in the same function
  //    as the "end" point, so we need to be able to retrieve the signpost's info there
  // 2) the operations info may need to be logged when the process ends, via
  //    LogOptions::OPERATION_INFO.

  // the pointer key for this map is either an "MPSGraph*" or an "id<MTLComputePipelineState>" for Metal kernels
  // this list is retained and can be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
  // the string key for this map is the name of the op that we fall back to execute on the CPU
  // this list is retained and can be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
  // this list contains the info for copies; its key is the unique profileId
  // generated from m_copy_counter.
  // the copyInfo list is not retained.
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // a short list that contains the copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};

  mutable MTLCaptureManager *captureManager = nil;
  unsigned captureCount = 0;

  void initialize();
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
                           os_signpost_id_t interval_signpost_id,
                           double gpuTime, double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                         const std::string& msg) const;
  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                             const std::string& msg) const;
  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;

  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
  // returns true if logging the profiling info "during the execution" is enabled
  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
  // logs all the profiling stats that are enabled
  void logProfilingStats();
  // logs kernel profiling stats when the process ends.
  void logOperationsProfilingStats(std::FILE* f) const;
  // logs CPU Fallback profiling stats when the process ends.
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  // logs copy profiling stats when the process ends.
  void logCopyProfilingStats(std::FILE* f) const;

  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  static void handleIntSignal(int signal);
};

} // namespace Profiler

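// returns the shared MPSProfiler instance used by the MPS backend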
Profiler::MPSProfiler& getMPSProfiler();

} // namespace at::mps