#pragma once

#include <cstdint>
#include <memory>
#include <mutex>
#include <type_traits>
#include <utility>
#include <variant>

#include <ATen/Context.h>
#include <c10/core/Device.h>
#include <c10/core/TensorImpl.h>
#include <c10/macros/Macros.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/strong_type.h>
#include <torch/csrc/profiler/containers.h>
#include <torch/csrc/profiler/data_flow.h>
#include <torch/csrc/profiler/events.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/orchestration/python_tracer.h>
#include <torch/csrc/profiler/perf.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
#include <torch/csrc/utils/python_stub.h>

namespace torch::profiler::impl {

enum class EventType : uint8_t {
  TorchOp = 0,
  Backend,
  Vulkan,
  Allocation,
  OutOfMemory,
  PyCall,
  PyCCall,
  Kineto
};

// ============================================================================
// == Value (Tensor, Scalar) summary ==========================================
// ============================================================================
struct TORCH_API RawTensorMetadataBase {
  RawTensorMetadataBase() = default;
  explicit RawTensorMetadataBase(const at::Tensor& t);

  StorageImplData data_;
  c10::ScalarType dtype_{c10::ScalarType::Undefined};
  c10::Layout layout_{c10::Layout::Strided};
  uint32_t size_dim_{0};
};

// Collected during profiling.
struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
  RawTensorMetadata() = default;
  RawTensorMetadata(const RawTensorMetadata&) = default;
  RawTensorMetadata(RawTensorMetadata&&) noexcept = default;
  RawTensorMetadata& operator=(const RawTensorMetadata&) = default;
  RawTensorMetadata& operator=(RawTensorMetadata&&) noexcept = default;
  explicit RawTensorMetadata(const at::Tensor& t);

  // Wrap `weak_self_` in `std::optional` and split the device into its
  // components to keep the struct default constructible (which the std::array
  // initializer needs).
  std::optional<WeakTensor> weak_self_;
  c10::DeviceType device_type_{c10::DeviceType::CPU};
  c10::DeviceIndex device_index_{-1};
};

// Used during post processing.
struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
  TensorMetadata(
      const RawTensorMetadata& r,
      std::vector<int64_t> sizes,
      std::vector<int64_t> strides);

  TensorImplAddress impl() const {
    return weak_self_.get();
  }

  WeakTensor weak_self_;
  c10::Device device_;
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;

  // Set during `calculateUniqueTensorIDs`.
  std::optional<TensorID> id_;
  std::optional<AllocationID> allocation_id_;
};

// Used during post processing.
struct TORCH_API ProfilerStepInfo {
  int64_t start_time_ns; // start time of the profiler step
  int64_t end_time_ns; // end time of the profiler step
  uint64_t out_idx; // index of the profiler step in the profiler "out" var in
                    // getRecords

  ProfilerStepInfo(int64_t start, int64_t end, uint64_t out_idx)
      : start_time_ns(start), end_time_ns(end), out_idx(out_idx) {}
};

using op_input_t = std::variant<
    TensorMetadata,
    std::vector<TensorMetadata>,
    c10::IValue,
    std::nullopt_t>;
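// A minimal illustrative sketch (not part of the real interface): each
// recorded argument of an op maps onto one `op_input_t` alternative. For a
// hypothetical op taking (Tensor, TensorList, int), the recorded inputs might
// look like:
//
//   std::vector<op_input_t> inputs;
//   inputs.emplace_back(TensorMetadata{...});               // Tensor arg
//   inputs.emplace_back(std::vector<TensorMetadata>{...});  // TensorList arg
//   inputs.emplace_back(c10::IValue(42));                   // concrete scalar
//   inputs.emplace_back(std::nullopt);                      // unrecorded arg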
// ============================================================================
// == ExtraFields =============================================================
// ============================================================================
template <EventType>
struct ExtraFields;

struct TorchOpBasicFields {
  int64_t sequence_number_{0};
  uint64_t forward_tid_{0};
  at::RecordScope scope_{};
  bool is_async_{false};
  uint64_t record_function_id_{0};
  int64_t debug_handle_{0};
  std::string name_;

  // Set in the exit callback.
  uint64_t end_tid_{0};
};

using jit_stack_t = std::vector<std::string>;
using jit_modules_t = std::vector<std::string>;
using extra_args_t = std::unordered_map<std::string, c10::IValue>;
using extra_meta_t = std::unordered_map<std::string, std::string>;
using kwinputs_t = std::unordered_map<std::string, c10::IValue>;

struct FallbackPair {
  ProfilerVoidEventStub device_event_start_ = nullptr;
  ProfilerVoidEventStub device_event_end_ = nullptr;
};

template <>
struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
  ExtraFields(
      TorchOpBasicFields&& f,
      uint64_t correlation_id,
      c10::time_t end_time_ns,
      std::vector<op_input_t>&& inputs,
      std::vector<op_input_t>&& concrete_inputs,
      jit_stack_t&& jit_stack,
      jit_modules_t&& jit_modules,
      extra_args_t&& extra_args,
      extra_meta_t&& extra_meta,
      kwinputs_t&& kwinputs,
      FallbackPair&& device_fallback,
      bool allow_tf32_cublas,
      std::unique_ptr<perf_counters_t>&& perf_event_counters)
      : TorchOpBasicFields(std::move(f)),
        correlation_id_{correlation_id},
        end_time_ns_{end_time_ns},
        inputs_{std::move(inputs)},
        concrete_inputs_{std::move(concrete_inputs)},
        jit_stack_{std::move(jit_stack)},
        jit_modules_{std::move(jit_modules)},
        extra_args_{std::move(extra_args)},
        extra_meta_{std::move(extra_meta)},
        kwinputs_{std::move(kwinputs)},
        device_fallback_{std::move(device_fallback)},
        allow_tf32_cublas_{allow_tf32_cublas},
        perf_event_counters_{std::move(perf_event_counters)} {}

  uint64_t correlation_id_;
  c10::time_t end_time_ns_;
  std::vector<op_input_t> inputs_;
  std::vector<op_input_t> concrete_inputs_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
  extra_args_t extra_args_;
  extra_meta_t extra_meta_;
  kwinputs_t kwinputs_;
  FallbackPair device_fallback_;
  bool allow_tf32_cublas_;
  std::unique_ptr<perf_counters_t> perf_event_counters_;
};

template <>
struct ExtraFields<EventType::Backend> {
  int64_t start_time_us_;
  int64_t end_time_us_;
  int64_t debug_handle_;
  at::RecordScope scope_;
  std::string name_;
  std::string backend_;
  jit_stack_t jit_stack_;
  jit_modules_t jit_modules_;
};

template <>
struct ExtraFields<EventType::Vulkan> {
  using raw_event_t = std::pair<c10::approx_time_t, vulkan_id_t>;
  std::string name_;
  int64_t duration_ns_{0};
  // While building the event tree, we want to report a Vulkan event's duration
  // as 0 so that its end time doesn't exceed that of its parent CPU op.
  bool in_tree_building_{false};
};

struct RawAllocation {
  c10::approx_time_t start_time_;
  void* ptr_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation.");

template <>
struct ExtraFields<EventType::Allocation> : RawAllocation {
  ExtraFields(const RawAllocation& allocation) : RawAllocation(allocation) {}

  c10::Device device() const {
    return {device_type_, device_index_};
  }

  std::optional<TensorID> id_;
  std::optional<AllocationID> allocation_id_;
};

template <>
struct ExtraFields<EventType::OutOfMemory> {
  c10::approx_time_t start_time_;
  int64_t alloc_size_;
  size_t total_allocated_;
  size_t total_reserved_;
  c10::DeviceType device_type_;
  c10::DeviceIndex device_index_;
};

// For performance.
static_assert(
    c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>,
    "Non-POD member of ExtraFields<OutOfMemory>.");

struct PyFrameState {
  int line_no_;
  at::StringView filename_;
  at::StringView funcname_;
};

template <typename T, typename Tag>
using strong_t = strong::
    type<T, Tag, strong::regular, strong::convertible_to<T>, strong::hashable>;

using PyModuleSelf = strong_t<PyObject*, struct PyModuleSelf_>;
using PyModuleCls = strong_t<PyObject*, struct PyModuleCls_>;
using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
using PyOptimizerSelf = strong_t<PyObject*, struct PyOptimizerSelf_>;
using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizerCls_>;
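// Note (explanatory assumption, not from the original header): the strong
// typedefs above all wrap a raw `PyObject*` (or `void*` for `PyMethod`), but
// each is a distinct type, so handles cannot be mixed up accidentally:
//
//   PyObject* obj = ...;
//   PyModuleSelf self{obj}; // OK
//   PyModuleCls cls{obj};   // OK
//   // self == cls;         // does not compile: different strong types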
struct NNModuleInfo {
  struct ParameterInfo {
    std::string name_;
    TensorMetadata metadata_;
    std::optional<TensorMetadata> grad_metadata_;
  };

  PyModuleSelf self_;
  PyModuleCls cls_;
  at::StringView cls_name_;
  std::vector<ParameterInfo> parameters_;
  // Indicates that `self_` is the kth instance of `cls_` observed.
  size_t id_{std::numeric_limits<size_t>::max()};
};

struct OptimizerInfo {
  struct ParameterInfo {
    TensorMetadata metadata_;
    std::optional<TensorMetadata> grad_metadata_;
    std::vector<std::pair<std::string, TensorMetadata>> state_;
  };

  PyOptimizerSelf self_;
  PyOptimizerCls cls_;
  at::StringView cls_name_;
  std::vector<ParameterInfo> parameters_;
};

struct PyExtraFieldsBase {
  PyExtraFieldsBase(
      c10::time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller)
      : end_time_ns_{end_time_ns},
        python_tid_{python_tid},
        caller_{std::move(caller)} {}

  c10::time_t end_time_ns_;
  size_t python_tid_;
  PyFrameState caller_;

  // kth python event observed. (Used by TensorBoard)
  size_t id_{std::numeric_limits<size_t>::max()};
};

template <>
struct ExtraFields<EventType::PyCall> : public PyExtraFieldsBase {
  struct args_t {
    PyFrameState frame_state_;
    std::optional<NNModuleInfo> module_info_;
    std::optional<OptimizerInfo> optimizer_info_;
  };

  ExtraFields(
      c10::time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        callsite_{std::move(args.frame_state_)},
        module_{std::move(args.module_info_)},
        optimizer_{std::move(args.optimizer_info_)} {}

  PyFrameState callsite_;
  std::optional<NNModuleInfo> module_;
  std::optional<OptimizerInfo> optimizer_;
};

template <>
struct ExtraFields<EventType::PyCCall> : public PyExtraFieldsBase {
  using args_t = at::StringView;

  ExtraFields(
      c10::time_t end_time_ns,
      size_t python_tid,
      PyFrameState caller,
      args_t args)
      : PyExtraFieldsBase(end_time_ns, python_tid, std::move(caller)),
        function_name_{std::move(args)} {}

  at::StringView function_name_;
};

// Forward declaration so `linked_activity_` below can refer to `Result`.
struct Result;

template <>
struct ExtraFields<EventType::Kineto> {
  // Mirrors `libkineto::GenericTraceActivity::Flow`. This information is used
  // during post processing to properly embed Kineto events into the broader
  // profiler tree structure. End users are not generally expected to use these
  // fields directly, but they are available for debugging.
  struct Flow {
    uint32_t id{0};
    uint32_t type{0};
    uint32_t start{0};
  };

  std::string name_;
  int64_t duration_ns_{0};
  uint64_t correlation_id_{0};
  libkineto::ActivityType activity_type_;
  Flow flow;
  std::weak_ptr<Result> linked_activity_{};
};

struct TORCH_API Result : public std::enable_shared_from_this<Result> {
  template <typename... Args>
  [[nodiscard]] static std::shared_ptr<Result> create(Args... args) {
    return std::shared_ptr<Result>(new Result(std::forward<Args>(args)...));
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) {
    return std::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T>
  decltype(auto) visit(T&& visitor) const {
    return std::visit(std::forward<T>(visitor), extra_fields_);
  }

  template <typename T, typename Fn>
  void visit_if_base(Fn&& fn) const {
    visit([&](const auto& extra_fields) {
      using extra_fields_t = typename std::remove_cv_t<
          typename std::remove_reference_t<decltype(extra_fields)>>;

      if constexpr (std::is_base_of_v<T, extra_fields_t>) {
        fn(extra_fields);
      }
    });
  }

  EventType tag() const {
    return visit([](const auto& i) { return deduceTag(i); });
  }

  std::string name() const;
  libkineto::ActivityType kinetoType() const;
  uint64_t correlationID() const;
  int64_t endTimeNS() const;
  uint64_t endTID() const;
  c10::DeviceType deviceType() const;

  int64_t start_time_ns_;
  uint64_t start_tid_;
  kineto::DeviceAndResource kineto_info_;
  std::variant<
      ExtraFields<EventType::TorchOp>,
      ExtraFields<EventType::Backend>,
      ExtraFields<EventType::Vulkan>,
      ExtraFields<EventType::Allocation>,
      ExtraFields<EventType::OutOfMemory>,
      ExtraFields<EventType::PyCall>,
      ExtraFields<EventType::PyCCall>,
      ExtraFields<EventType::Kineto>>
      extra_fields_;
  std::weak_ptr<Result> parent_;
  std::vector<std::shared_ptr<Result>> children_;
  bool finished_{false};

  const torch::profiler::impl::kineto::activity_t* kineto_activity_{nullptr};

 private:
  template <EventType E>
  Result(
      int64_t start_time_ns,
      uint64_t start_tid,
      kineto::DeviceAndResource kineto_info,
      ExtraFields<E>&& extra_fields)
      : start_time_ns_{start_time_ns},
        start_tid_{start_tid},
        kineto_info_{kineto_info},
        extra_fields_{std::move(extra_fields)} {}

  template <EventType E>
  static EventType deduceTag(const ExtraFields<E>&) {
    return E;
  }
};
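// Usage sketch (illustrative only; `torch_op_end_ns` is a hypothetical
// helper): `visit` forwards a visitor to `std::visit` over `extra_fields_`,
// so callers can dispatch on the concrete event payload:
//
//   int64_t torch_op_end_ns(const Result& r) {
//     return r.visit([](const auto& fields) -> int64_t {
//       using T = std::decay_t<decltype(fields)>;
//       if constexpr (std::is_same_v<T, ExtraFields<EventType::TorchOp>>) {
//         return fields.end_time_ns_;
//       } else {
//         return -1; // not a TorchOp event
//       }
//     });
//   }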
struct KinetoObserverContext : public at::ObserverContext {
  struct Event {
    TorchOpBasicFields basic_fields_;
    c10::approx_time_t start_time_;

    // Set in the exit callback.
    c10::approx_time_t end_time_{
        std::numeric_limits<c10::approx_time_t>::min()};

    bool allow_tf32_cublas_;
    std::unique_ptr<perf_counters_t> counters_;
  };

  explicit KinetoObserverContext(Event* event) : event_{event} {}

  Event* event_;
  FallbackPair* fallback_{nullptr};
};

constexpr int IO_ENCODER_DEFAULT_BLOCK_SIZE = 1024;

constexpr int SCALAR_LIST_LENGTH_LIMIT = 30;

// InputOutputEncoder
// Stores each op event's shapes, dtypes, and concrete values in a contiguous
// AppendOnlyList so that we no longer create vectors for shapes and dtypes on
// every op. Those vectors can be created during post-processing.
// It splits the data into two categories: input shapes and concrete inputs.
class InputOutputEncoder final {
 public:
  void push(c10::ArrayRef<const c10::IValue> values);

  // Used during post-processing to unpack the encoded data.
  // Each method returns a "supplier" lambda which takes no arguments;
  // invoking the lambda once will return a list of args that represent
  // the inputs for one op.
  // The data is split into two streams: "input shapes" and "concrete inputs".
  // Note: "auto" only works because these are only used in collection.cpp,
  // where they are implemented.
  auto getInputShapeGenerator();
  auto getConcreteInputGenerator();

  bool isSupportedScalarList(const c10::IValue& list_candidate);

  void clear();

  enum class Tag {
    Tensor = 0,
    UndefinedTensor,
    TensorListBegin, // TODO: generalize to other lists.
    ScalarList,
    Scalar,
    Other,
    TERMINATOR
  };

  enum class IOType { Shapes, ConcreteInputs, None };

 private:
  void push(const at::Tensor& t);

  // Implementation detail for getInputShapeGenerator and
  // getConcreteInputGenerator
  auto getIValueGenerator(const IOType& io_type);

  AppendOnlyList<Tag, IO_ENCODER_DEFAULT_BLOCK_SIZE> tags_;
  AppendOnlyList<RawTensorMetadata, IO_ENCODER_DEFAULT_BLOCK_SIZE>
      tensor_metadata_;
  AppendOnlyList<int64_t, IO_ENCODER_DEFAULT_BLOCK_SIZE> tensor_sizes_strides_;
  AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
};
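// Post-processing sketch (an assumption for illustration; the generators are
// only implemented in collection.cpp): each generator is a callable that,
// invoked once per op, yields that op's recorded inputs in push order:
//
//   InputOutputEncoder encoder;
//   // ... encoder.push(...) is called once per observed op during collection
//   auto next_shapes = encoder.getInputShapeGenerator();
//   std::vector<op_input_t> first_op_args = next_shapes();
//   std::vector<op_input_t> second_op_args = next_shapes();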
using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;

class TORCH_API ThreadLocalSubqueue {
 public:
  ThreadLocalSubqueue(const uint64_t tid, ProfilerConfig config);

  std::unique_ptr<KinetoObserverContext> begin_op(const at::RecordFunction& fn);

  template <class... Args>
  void emplace_backend_event(Args&&... args) {
    backend_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_vulkan_event(Args&&... args) {
    vulkan_events_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_allocation_event(Args&&... args) {
    allocations_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_ooms_event(Args&&... args) {
    ooms_.emplace_back(std::forward<Args>(args)...);
  }

  template <class... Args>
  void emplace_py_call(Args&&... args) {
    py_calls_.emplace_back(std::forward<Args>(args)...);
  }

  uint64_t tid() const {
    return tid_;
  }

  const kineto::DeviceAndResource& kineto_info() const {
    return kineto_info_;
  }

  inline void disable_perf_profiler(perf_counters_t& counters) const {
    perf_profiler_->Disable(counters);
  }

 private:
  uint64_t tid_;
  ProfilerConfig config_;
  kineto::DeviceAndResource kineto_info_;
  std::unique_ptr<perf_profiler_t> perf_profiler_;

  friend class RecordQueue;
  // See `containers.h` for block size benchmarks.
  static constexpr size_t BlockSize = 512;

  struct TorchOpStorage {
    // NB: This is a destructive operation.
    void materialize(
        std::vector<std::shared_ptr<Result>>& out,
        std::vector<ProfilerStepInfo>& step_info,
        const std::function<c10::time_t(c10::approx_time_t)>& time_converter,
        const uint64_t tid,
        const kineto::DeviceAndResource& kineto_info);

    template <typename T, size_t ChunkSize>
    class EventBlock : public std::array<T, ChunkSize> {
     public:
      EventBlock();
      uint64_t correlation_id(const T* ptr) const;

     private:
      uint64_t id_start_;
    };

    using event_t = KinetoObserverContext::Event;
    class OpList : public AppendOnlyList<event_t, BlockSize, EventBlock> {
     public:
      template <class... Args>
      std::pair<event_t*, uint64_t> emplace_back(Args&&... args);
      static uint64_t correlationID(const OpList::Iterator& e);
    } op_events_;

    // report_input_shapes
    InputOutputEncoder inputs_outputs_;

    // with_stack (JIT)
    AppendOnlyList<jit_stack_t, BlockSize> jit_stack_;

    // with_modules
    AppendOnlyList<jit_modules_t, BlockSize> jit_modules_;

    // with_flops
    AppendOnlyList<extra_args_t, BlockSize> extra_args_;

    // report extra metadata, e.g. collective communication metadata
    AppendOnlyList<extra_meta_t, BlockSize> extra_meta_;

    // report kwinputs
    AppendOnlyList<kwinputs_t, BlockSize> kwinputs_;

    // ProfilerState::KINETO_GPU_FALLBACK or
    // ProfilerState::KINETO_PRIVATEUSE1_FALLBACK
    AppendOnlyList<FallbackPair, BlockSize> device_fallback_;
  } torch_ops_;

  // reportBackendEventToActiveKinetoProfiler
  AppendOnlyList<ExtraFields<EventType::Backend>, BlockSize> backend_events_;

  // _reportVulkanEventToProfiler
  AppendOnlyList<ExtraFields<EventType::Vulkan>::raw_event_t, BlockSize>
      vulkan_events_;

  // reportMemoryUsage
  AppendOnlyList<RawAllocation, BlockSize> allocations_;

  // reportOOMs
  AppendOnlyList<ExtraFields<EventType::OutOfMemory>, BlockSize> ooms_;

  // with_stack (Python)
  AppendOnlyList<
      std::pair<python_tracer::TraceKey, c10::approx_time_t>,
      BlockSize>
      py_calls_;
};

class TORCH_API RecordQueue {
 public:
  RecordQueue(ProfilerConfig config, std::set<ActivityType> activities);

  bool tracePython() const;
  ThreadLocalSubqueue* getSubqueue();
  void stop();
  void restart();

  // NB: This is a destructive operation.
  std::pair<
      std::vector<std::shared_ptr<Result>>,
      std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>>
  getRecords(
      std::function<c10::time_t(c10::approx_time_t)> time_converter,
      uint64_t start_time_ns,
      uint64_t end_time_ns);

 private:
  uint32_t id_;
  ProfilerConfig config_;
  std::set<ActivityType> activities_;
  ska::flat_hash_map<uint64_t, std::unique_ptr<ThreadLocalSubqueue>>
      sub_queues_;
  std::mutex sub_queue_mutex_;
  std::unique_ptr<python_tracer::PythonTracerBase> python_tracer_;
};
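// End-to-end collection sketch (an assumption, not a prescribed API flow):
// the profiler owns one RecordQueue per session, each instrumented thread
// appends events through its ThreadLocalSubqueue, and results are
// materialized once at the end:
//
//   RecordQueue queue(config, activities);
//   queue.getSubqueue()->emplace_allocation_event(
//       c10::getApproximateTime(), ptr, alloc_size,
//       total_allocated, total_reserved,
//       c10::DeviceType::CPU, c10::DeviceIndex(0));
//   queue.stop();
//   auto [results, trace] = queue.getRecords(
//       time_converter, start_time_ns, end_time_ns); // destructive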
TORCH_API bool get_record_concrete_inputs_enabled();
TORCH_API void set_record_concrete_inputs_enabled_fn(std::function<bool()>);
TORCH_API void set_record_concrete_inputs_enabled_val(bool);

TORCH_API bool get_fwd_bwd_enabled();
TORCH_API void set_fwd_bwd_enabled_fn(std::function<bool()>);
TORCH_API void set_fwd_bwd_enabled_val(bool);

TORCH_API bool get_cuda_sync_enabled();
TORCH_API void set_cuda_sync_enabled_fn(std::function<bool()>);
TORCH_API void set_cuda_sync_enabled_val(bool);

} // namespace torch::profiler::impl