// Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/Tensor.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/mps/MPSAllocatorInterface.h>

#include <os/signpost.h>
#include <os/log.h>

#include <atomic>
#include <ctime>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>

namespace at::mps {

namespace Profiler {

struct BaseInfo {
  // profiling info types
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
      type(infoType), profileId(Id), handle(Handle) { }
  virtual ~BaseInfo() = default;

  // type of profiling info
  Type type;
  // unique profile ID for execution instances of operations or copies
  uint64_t profileId;
  // IDs generated by os_signpost
  // since it's possible to use event and interval-based signposts at the
  // same time, we need separate IDs for each.
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
  std::atomic<double> totalGpuTime{0.0};
  // accumulated scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
  std::atomic<double> totalSchedulingTime{0.0};
  // indicates if the operation or copy execution has completed
  std::atomic_bool completed{false};
  // handle used to identify the profile info's instance (usually the pointer)
  const uintptr_t handle;

  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
  // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
    if (tensor.defined()) {
      std::stringstream tensorStr;
      auto deviceType = tensor.device().type();
      tensorStr << c10::DeviceTypeName(deviceType);
      // see comments for INCLUDE_BUFFER_ID
      if (includeBufferId && deviceType == at::kMPS) {
        id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
        tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
                  << ":" << buffer.retainCount << ")";
      }
      tensorStr << ":"
                << tensor.scalar_type() << tensor.sizes();
      return tensorStr.str();
    } else {
      return "undefined";
    }
  }
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};
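// A hedged example of BaseInfo::buildTensorString() output (illustrative only;
// the exact rendering depends on how DeviceType, ScalarType, and sizes() are
// streamed): a defined float MPS tensor of shape {2, 3} would print roughly as
//   "MPS:Float[2, 3]"
// and, with includeBufferId enabled, something like
//   "MPS(buf#42:1):Float[2, 3]"
// where 42 is the MPSAllocator buffer ID and 1 the Metal buffer's retain count.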
struct OperationInfo : BaseInfo {
  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }

  uint64_t runCount = 0;
  std::string strKey;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  // builds a string for a kernel
  static std::string buildKernelString(const std::string& kernelName,
                                       const TensorList& tensors,
                                       bool includeBufferId = false) {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor: tensors) {
      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
};

struct CpuFbInfo : BaseInfo {
  CpuFbInfo(uint64_t Id, const std::string& OpName) :
      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }

  uint64_t runCount = 0;
  // the current and total overhead of copies in bytes required to convert the Op's
  // input tensors from MPS to CPU and then its output from CPU back to MPS
  size_t currentCopyOverhead = 0;
  size_t totalCopyOverhead = 0;
  std::string opName;
  std::string strKey;
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  void updateCopyOverhead(const TensorList& tensors) {
    currentCopyOverhead = 0;
    for (const Tensor& tensor: tensors) {
      if (tensor.defined()) {
        currentCopyOverhead += tensor.nbytes();
      }
    }
    totalCopyOverhead += currentCopyOverhead;
  }
};

struct CopyInfo : BaseInfo {
  enum class Kind {
    MPS_TO_MPS,
    MPS_TO_CPU,
    CPU_TO_MPS,
  };

  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
      BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
      length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }

  Kind kind;
  size_t length;
  bool isNonBlocking;
  bool usesBlitter;
  std::string srcStrKey;
  std::string dstStrKey;
  // for copies that don't use blitters, we measure CPU time
  uint64_t startTime = 0;

  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;

  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);

  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
    if (tensor.has_value()) {
      return tensor->device().type() == at::kMPS;
    }
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
    // getUnalignedBufferSize() returns -1 if the input buffer is not on the MPS device
    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
  }

  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
    if (isSrcOnMPS && !isDstOnMPS) {
      return Kind::MPS_TO_CPU;
    } else if (!isSrcOnMPS && isDstOnMPS) {
      return Kind::CPU_TO_MPS;
    }
    return Kind::MPS_TO_MPS;
  }
};
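// A minimal sketch of classifying a copy with CopyInfo::getCopyKind()
// (illustrative only; `src` and `dst` are hypothetical tensors, with `src`
// on the MPS device and `dst` on the CPU):
//
//   CopyInfo::Kind kind = CopyInfo::getCopyKind(
//       src.storage().data(), dst.storage().data(),
//       OptionalTensorRef(src), OptionalTensorRef(dst));
//   // -> CopyInfo::Kind::MPS_TO_CPU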
struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr) :
      CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
  // total number of copies
  size_t totalCount = 0;
  // number of scalar copies (i.e., copies smaller than sizeof(int64_t))
  size_t scalarsCount = 0;
  // number of blocking copies (i.e., copies that require syncing to the GPU)
  size_t blockingCount = 0;
  // number of copies that used memcpy(), instead of the Metal Blit Encoder
  size_t memcpyCount = 0;
  // accumulated GPU time in ms for the scalar copies
  std::atomic<double> scalarsGpuTime{0.0};
  // copy kind as a string
  std::string kindStr;
};

class MPSProfiler {
 public:
  // lower 16 bits used for profiler options
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,
    // ALL_* covers all signpost types (RUN_OPERATION, BLIT_COPY, CPU_FALLBACK, etc.),
    // provided as a convenience so the bit flags don't have to be OR-ed manually
    // trace all signpost types using events
    ALL_SIGNPOST_EVENTS = (1 << 0),
    // trace all signpost types using intervals
    ALL_SIGNPOST_INTERVALS = (1 << 1),
    // always wait for the command buffer to finish executing after each commit
    WAIT_UNTIL_COMPLETED = (1 << 2),
    // for interval-based signposts, include the scheduling portion of
    // Graph/Kernel/Copy executions as well.
    // if this flag is disabled, only the "GPU run time" is included in the
    // interval, not the scheduling time.
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),

    // use these if you need to trace signpost types individually (rarely required)
    // trace signposts using intervals
    USE_INTERVALS = (1 << 4),
    // trace signposts by emitting events
    USE_EVENTS = (1 << 5),
    // used for sanity checks (update this when a new option is added)
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };

  // when adding new types, #define the type string in MPSProfiler.mm as well.
  // upper 16 bits used for event types
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace signposts for PyTorch operation executions
    RUN_OPERATION = (1 << 16),
    // trace signposts for blitter copies
    BLIT_COPY = (1 << 17),
    // trace signposts for ops that fall back on CPU
    CPU_FALLBACK = (1 << 18),
    // used for sanity checks (update this when a new type is added)
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };
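  // For example (illustrative, inferred from the flag comments above): tracing
  // operations and copies with interval-based signposts corresponds to a
  // signpost mask of
  //   RUN_OPERATION | BLIT_COPY
  // combined with profile options containing
  //   USE_INTERVALS (or ALL_SIGNPOST_INTERVALS to cover every signpost type).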
  enum LogOptions : uint32_t {
    LOG_NONE = 0,

    // Info logging options during execution
    // -------------------------------------
    // prints operation info (id/key/run_count) during execution
    OPERATION_INFO = (1 << 0),
    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
    COPY_INFO = (1 << 1),
    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
    CPU_FALLBACK_INFO = (1 << 2),

    // Profiling statistics logging options when the process terminates
    // -----------------------------------------------------------------
    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before the
    // process terminates (a convenience so the following stats bit flags don't have
    // to be combined manually)
    ALL_STATS = (1 << 3),
    // prints operation stats (GPU times, run count, etc.) before the process terminates
    OPERATION_STATS = (1 << 4),
    // prints copy stats (GPU times, copy kinds, sizes, etc.) before the process terminates
    COPY_STATS = (1 << 5),
    // prints CPU Fallback stats (CPU times, run counts, sizes of MPS<->CPU tensor
    // copies, etc.) before the process terminates
    CPU_FALLBACK_STATS = (1 << 6),

    // Metadata format options when logging the info
    // ---------------------------------------------
    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
    INCLUDE_GPU_TIME = (1 << 7),
    // if enabled, includes GPU scheduling time in metadata separately
    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
    INCLUDE_KERNEL_TIME = (1 << 8),
    // if enabled, includes the unique buffer ID in metadata for the storage
    // of a tensor that was allocated on MPSAllocator. This is useful (along with
    // the environment variable "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers
    // that are involved with various operations.
    INCLUDE_BUFFER_ID = (1 << 9),

    // used for sanity checks (update this when a new option is added)
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };

  explicit MPSProfiler();
  ~MPSProfiler();

  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
                            const OptionalTensorRef srcTensor,
                            const OptionalTensorRef dstTensor,
                            size_t length, bool isNonBlocking, bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);

  // these are used to hook into Python bindings for the torch.mps.profiler module.
  // this enables generating OS Signpost traces from MPSProfiler on demand at
  // runtime (instead of via environment variables).
  // "mode" can be "interval", "event", or both ("interval,event")
  // for interval-based and/or event-based signpost tracing.
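  // A minimal sketch of the Python-side usage that reaches these hooks
  // (assuming the torch.mps.profiler bindings mentioned above):
  //
  //   import torch
  //   torch.mps.profiler.start(mode="interval", wait_until_completed=False)
  //   ...  # run the MPS workload to be traced
  //   torch.mps.profiler.stop()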
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();

  // Abstractions for GPU trace capturing
  bool isCaptureEnabled() const;
  bool isCapturing() const;
  void startCapture(const std::string& name, MPSStream* stream = nullptr);
  void stopCapture(MPSStream* stream = nullptr);

  // convenience functions to indicate whether signpost tracing or
  // logging are enabled for the SignpostTypes
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }
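  // A minimal sketch of the typical call pattern from an MPS operation
  // (illustrative only; `pipelineState` and `input` are hypothetical):
  //
  //   auto& profiler = at::mps::getMPSProfiler();
  //   if (profiler.isOperationProfilingEnabled()) {
  //     profiler.beginProfileKernel(pipelineState, "my_kernel", {input});
  //   }
  //   // ... encode and commit the Metal kernel ...
  //   if (profiler.isOperationProfilingEnabled()) {
  //     profiler.endProfileKernel(pipelineState);
  //   }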
 private:
  // indicates which signpost types are enabled and traced by the MPS profiler.
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // technically, it's possible to trace both events and intervals at the same time,
  // so we use separate os_log categories for them
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // stats logging can run either from the destructor or from the signal handler,
  // so this is used to check whether logging has already started.
  std::atomic_bool hasLoggedStats{false};
  // indicates there are pending completionHandler callbacks that haven't been called yet.
  std::atomic_bool hasPendingCompletionHandlers{false};
  // used to capture the SIGINT signal to log profiling stats
  static struct sigaction currentSigint, previousSigint;

  // We use the following lists for two reasons:
  // 1- for interval-based signposts, the "begin" point won't be in the same function
  //    as the "end" point, where we need to be able to retrieve the signpost's info
  // 2- operation info may need to be logged when the process ends (via
  //    LogOptions::OPERATION_INFO).

  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
  // the string key for this map is the name of the op that we fall back to execute on CPU
  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
  // this list contains the info for copies, and its key is the unique profileId
  // which is generated from m_copy_counter.
  // The copyInfo list is not retained.
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // a short list that contains the copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};

  mutable MTLCaptureManager *captureManager = nil;
  unsigned captureCount = 0;

  void initialize();
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
                           os_signpost_id_t interval_signpost_id,
                           double gpuTime, double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                         const std::string& msg) const;
  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
                             const std::string& msg) const;
  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;

  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
  // returns true if logging the profiling info "during the execution" is enabled
  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
  // logs all the profiling stats that are enabled
  void logProfilingStats();
  // logs kernel profiling stats when the process ends.
  void logOperationsProfilingStats(std::FILE* f) const;
  // logs CPU Fallback profiling stats when the process ends.
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  // logs copy profiling stats when the process ends.
  void logCopyProfilingStats(std::FILE* f) const;

  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  static void handleIntSignal(int signal);
};

} // namespace Profiler

Profiler::MPSProfiler& getMPSProfiler();

} // namespace at::mps