xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/vulkan/api/QueryPool.cpp (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 #include <ATen/native/vulkan/api/QueryPool.h>
2 #include <ATen/native/vulkan/api/Utils.h>
3 #ifdef USE_KINETO
4 #include <torch/csrc/autograd/profiler_kineto.h>
5 #include <torch/csrc/profiler/orchestration/vulkan.h>
6 #endif // USE_KINETO
7 
8 #include <cmath>
9 #include <iomanip>
10 #include <iostream>
11 #include <utility>
12 
13 namespace at {
14 namespace native {
15 namespace vulkan {
16 namespace api {
17 
namespace {
// Fallback nanoseconds-per-tick used when the driver reports a zero
// timestamp period (see the constructor, which calls timestamp_period()).
// On Mali gpus timestamp_period seems to return 0.
// For some reason when 52.08 is used op runtimes seem to make more sense
// TODO: Figure out what is special about 52.08
constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);
} // namespace
24 
// Creates a Vulkan timestamp query pool sized to config.maxQueryCount and,
// when built with USE_KINETO, registers a profiler callback that maps a
// vulkan_id back to a (shader name, duration) pair via this pool.
QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
    : mutex_{},
      // NOTE(review): adapter_p is dereferenced here, before the
      // VK_CHECK_COND null check further down in the body — that check
      // fires too late to guard this call. TODO confirm callers never
      // pass nullptr.
      device_(adapter_p->device_handle()),
      config_(config),
      querypool_(VK_NULL_HANDLE),
      shader_logs_(1), // start with a single (empty) shader log
      in_use_(0), // queries written since the last reset
      previous_shader_count_(0u), // entries retired into earlier logs
      results_pending_(false) {
  const VkQueryPoolCreateInfo info{
      VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
      nullptr, // pNext
      0u, // flags
      VK_QUERY_TYPE_TIMESTAMP, // queryType
      config_.maxQueryCount, // queryCount
      0u, // pipelineStatistics
  };

  VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));

  shader_log().reserve(config_.initialReserveSize);

  VK_CHECK_COND(adapter_p, "Valid GPU device must be created for QueryPool");
  // Conversion factor from GPU timestamp ticks to nanoseconds. Fall back to
  // kDefaultNsPerTick when the driver reports a zero period (Mali GPUs).
  ns_per_tick_ = std::lround(adapter_p->timestamp_period());
  ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;

#ifdef USE_KINETO
  // The lambda captures `this`; this QueryPool must outlive the
  // registration. The destructor performs the matching deregistration.
  torch::profiler::impl::vulkan::registerGetShaderNameAndDurationNs(
      [this](int64_t vulkan_id) {
        return get_shader_name_and_execution_duration_ns(vulkan_id);
      });
#endif // USE_KINETO
}
58 
~QueryPool()59 QueryPool::~QueryPool() {
60   if (VK_NULL_HANDLE == querypool_) {
61     return;
62   }
63   vkDestroyQueryPool(device_, querypool_, nullptr);
64 
65 #ifdef USE_KINETO
66   torch::profiler::impl::vulkan::deregisterGetShaderNameAndDurationNs();
67 #endif // USE_KINETO
68 }
69 
reset(const CommandBuffer & cmd)70 void QueryPool::reset(const CommandBuffer& cmd) {
71   std::lock_guard<std::mutex> lock(mutex_);
72   cmd.reset_querypool(querypool_, 0u, in_use_);
73   previous_shader_count_ += shader_log().size();
74   in_use_ = 0u;
75   shader_logs_.emplace_back();
76   shader_log().reserve(config_.initialReserveSize);
77   results_pending_ = false;
78 }
79 
write_timestamp(const CommandBuffer & cmd)80 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
81   VK_CHECK_COND(
82       in_use_ < config_.maxQueryCount,
83       "Vulkan QueryPool: Exceeded the maximum number of queries "
84       "allowed by the queryPool (",
85       config_.maxQueryCount,
86       ")!");
87 
88   cmd.write_timestamp(querypool_, in_use_);
89 
90   return in_use_++;
91 }
92 
// Opens a profiling entry for one shader dispatch: writes the start
// timestamp, appends a log entry with timing fields left blank, and (with
// Kineto) reports a globally unique id for the dispatch. Returns the index
// of the new entry within the current shader log; pass it to
// shader_profile_end() to close the entry.
uint32_t QueryPool::shader_profile_begin(
    const CommandBuffer& cmd,
    const std::string& kernel_name,
    const VkExtent3D global_workgroup_size,
    const VkExtent3D local_workgroup_size) {
  std::lock_guard<std::mutex> lock(mutex_);

  uint32_t query_idx = write_timestamp(cmd);

  // Index of the entry about to be appended (current size == next slot).
  uint32_t log_idx = shader_log().size();
  ShaderDuration log_entry{
      log_idx,
      // Execution Properties
      kernel_name,
      global_workgroup_size,
      local_workgroup_size,
      // Query indexes
      query_idx, // start query idx
      UINT32_MAX, // end query idx (filled in by shader_profile_end)
      // Timings
      0u, // start time
      0u, // end time
      0u, // duration
  };

  shader_log().emplace_back(log_entry);

  results_pending_ = true;

#ifdef USE_KINETO
  // The id is global across resets: entries retired into earlier logs
  // (previous_shader_count_) plus the index within the current log. It is
  // later resolved by get_shader_name_and_execution_duration_ns().
  torch::profiler::impl::vulkan_id_t vulkan_id =
      torch::profiler::impl::vulkan_id_t(previous_shader_count_ + log_idx);

  torch::profiler::impl::_reportVulkanEventToProfiler(vulkan_id);
#endif // USE_KINETO

  return log_idx;
}
131 
shader_profile_end(const CommandBuffer & cmd,const uint32_t log_idx)132 void QueryPool::shader_profile_end(
133     const CommandBuffer& cmd,
134     const uint32_t log_idx) {
135   std::lock_guard<std::mutex> lock(mutex_);
136 
137   size_t query_idx = write_timestamp(cmd);
138 
139   shader_log()[log_idx].end_query_idx = query_idx;
140 }
141 
// Reads back all written timestamps from the GPU and fills in the
// start/end/duration fields (in nanoseconds) of every entry in the current
// shader log. No-op if nothing new was recorded since the last extraction.
void QueryPool::extract_results() {
  std::lock_guard<std::mutex> lock(mutex_);

  if (!results_pending_) {
    return;
  }

  // NOTE(review): no VK_QUERY_RESULT_WAIT_BIT — presumably callers ensure
  // the command buffer has completed before calling this, otherwise
  // vkGetQueryPoolResults may return VK_NOT_READY. TODO confirm.
  const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;

  std::vector<uint64_t> query_data;
  query_data.resize(in_use_);

  VK_CHECK(vkGetQueryPoolResults(
      device_,
      querypool_,
      0u, // firstQuery
      in_use_, // queryCount
      sizeof(uint64_t) * in_use_, // dataSize
      query_data.data(), // pData
      sizeof(uint64_t), // stride
      flags)); // flags

  // Convert raw ticks to nanoseconds. NOTE(review): if shader_profile_end()
  // was never called for an entry, end_query_idx is still UINT32_MAX and
  // the .at() below will throw std::out_of_range.
  for (ShaderDuration& entry : shader_log()) {
    entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
    entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
    entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
  }

  results_pending_ = false;
}
172 
stringize(const VkExtent3D & extents)173 static std::string stringize(const VkExtent3D& extents) {
174   std::stringstream ss;
175   ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
176      << "}";
177   return ss.str();
178 }
179 
generate_string_report()180 std::string QueryPool::generate_string_report() {
181   std::lock_guard<std::mutex> lock(mutex_);
182 
183   std::stringstream ss;
184 
185   int kernel_name_w = 40;
186   int global_size_w = 15;
187   int duration_w = 25;
188 
189   ss << std::left;
190   ss << std::setw(kernel_name_w) << "Kernel Name";
191   ss << std::setw(global_size_w) << "Workgroup Size";
192   ss << std::right << std::setw(duration_w) << "Duration (ns)";
193   ss << std::endl;
194 
195   ss << std::left;
196   ss << std::setw(kernel_name_w) << "===========";
197   ss << std::setw(global_size_w) << "==============";
198   ss << std::right << std::setw(duration_w) << "===========";
199   ss << std::endl;
200 
201   for (ShaderDuration& entry : shader_log()) {
202     std::chrono::duration<size_t, std::nano> exec_duration_ns(
203         entry.execution_duration_ns);
204 
205     ss << std::left;
206     ss << std::setw(kernel_name_w) << entry.kernel_name;
207     ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
208     ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
209     ss << std::endl;
210   }
211 
212   return ss.str();
213 }
214 
print_results()215 void QueryPool::print_results() {
216   std::cout << generate_string_report() << std::endl;
217 }
218 
get_total_op_ns(const std::string & op_name)219 uint64_t QueryPool::get_total_op_ns(const std::string& op_name) {
220   std::lock_guard<std::mutex> lock(mutex_);
221   uint64_t sum = 0;
222   for (ShaderDuration& entry : shader_log()) {
223     if (entry.kernel_name == op_name) {
224       sum += entry.execution_duration_ns;
225     }
226   }
227   return sum;
228 }
229 
shader_log_for_each(std::function<void (const ShaderDuration &)> fn)230 void QueryPool::shader_log_for_each(
231     std::function<void(const ShaderDuration&)> fn) {
232   std::lock_guard<std::mutex> lock(mutex_);
233   std::for_each(shader_log().begin(), shader_log().end(), std::move(fn));
234 }
235 
236 std::tuple<std::string, uint64_t> QueryPool::
get_shader_name_and_execution_duration_ns(size_t query_index)237     get_shader_name_and_execution_duration_ns(size_t query_index) {
238   extract_results();
239 
240   std::lock_guard<std::mutex> lock(mutex_);
241 
242   const size_t entry_count = shader_logs_entry_count_thread_unsafe();
243   VK_CHECK_COND(
244       (query_index >= 0 && query_index < entry_count),
245       "query_index of ",
246       query_index,
247       " is out of bounds (",
248       entry_count,
249       ") in QueryPool::get_shader_name_and_duration_ns");
250 
251   size_t log_idx = 0;
252   size_t entry_count_acc = 0;
253   while (entry_count_acc + shader_logs_[log_idx].size() <= query_index) {
254     entry_count_acc += shader_logs_[log_idx].size();
255     log_idx += 1;
256   }
257 
258   const ShaderDuration& entry =
259       shader_logs_[log_idx][query_index - entry_count_acc];
260 
261   return std::tuple<std::string, uint64_t>(
262       entry.kernel_name, entry.execution_duration_ns);
263 }
264 
shader_logs_entry_count_thread_unsafe()265 size_t QueryPool::shader_logs_entry_count_thread_unsafe() {
266   return previous_shader_count_ + shader_log().size();
267 }
268 
shader_logs_entry_count()269 size_t QueryPool::shader_logs_entry_count() {
270   std::lock_guard<std::mutex> lock(mutex_);
271   return shader_logs_entry_count_thread_unsafe();
272 }
273 
274 } // namespace api
275 } // namespace vulkan
276 } // namespace native
277 } // namespace at
278