xref: /aosp_15_r20/external/executorch/backends/vulkan/runtime/vk_api/QueryPool.cpp (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast
10 
11 #include <executorch/backends/vulkan/runtime/utils/VecUtils.h>
12 
13 #include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
14 
15 #include <cmath>
16 #include <iomanip>
17 #include <iostream>
18 #include <utility>
19 
20 namespace vkcompute {
21 namespace vkapi {
22 
23 namespace {
24 
25 // On Mali gpus timestamp_period seems to return 0.
26 // For some reason when 52.08 is used op runtimes seem to make more sense
27 // TODO: Figure out what is special about 52.08
28 constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);
29 
30 } // namespace
31 
32 #define EARLY_RETURN_IF_UNINITIALIZED() \
33   if (querypool_ == VK_NULL_HANDLE) {   \
34     return;                             \
35   }
36 
QueryPool(const QueryPoolConfig & config,const Adapter * adapter_p)37 QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
38     : config_(config),
39       ns_per_tick_(1u),
40       device_(VK_NULL_HANDLE),
41       querypool_(VK_NULL_HANDLE),
42       num_queries_(0u),
43       shader_durations_(0),
44       mutex_{} {
45   initialize(adapter_p);
46 }
47 
~QueryPool()48 QueryPool::~QueryPool() {
49   EARLY_RETURN_IF_UNINITIALIZED();
50   vkDestroyQueryPool(device_, querypool_, nullptr);
51 }
52 
initialize(const Adapter * adapter_p)53 void QueryPool::initialize(const Adapter* adapter_p) {
54   // No-op if adapter_p is nullptr or querypool is already created
55   if (!adapter_p || querypool_ != VK_NULL_HANDLE) {
56     return;
57   }
58 
59   device_ = adapter_p->device_handle();
60 
61   ns_per_tick_ = std::lround(adapter_p->timestamp_period());
62   ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;
63 
64   shader_durations_.reserve(config_.initial_reserve_size);
65 
66   const VkQueryPoolCreateInfo info{
67       VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
68       nullptr, // pNext
69       0u, // flags
70       VK_QUERY_TYPE_TIMESTAMP, // queryType
71       config_.max_query_count, // queryCount
72       0u, // pipelineStatistics
73   };
74 
75   VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
76 }
77 
write_timestamp(const CommandBuffer & cmd)78 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
79   VK_CHECK_COND(
80       num_queries_ < config_.max_query_count,
81       "Vulkan QueryPool: Exceeded the maximum number of queries "
82       "allowed by the queryPool (",
83       config_.max_query_count,
84       ")!");
85 
86   cmd.write_timestamp(querypool_, num_queries_++);
87   return num_queries_ - 1;
88 }
89 
reset_querypool(const CommandBuffer & cmd)90 void QueryPool::reset_querypool(const CommandBuffer& cmd) {
91   EARLY_RETURN_IF_UNINITIALIZED();
92   std::lock_guard<std::mutex> lock(mutex_);
93 
94   cmd.reset_querypool(querypool_, 0u, config_.max_query_count);
95   reset_state();
96 }
97 
reset_state()98 void QueryPool::reset_state() {
99   num_queries_ = 0u;
100   shader_durations_.clear();
101 }
102 
shader_profile_begin(const CommandBuffer & cmd,const uint32_t dispatch_id,const std::string & kernel_name,const VkExtent3D global_workgroup_size,const VkExtent3D local_workgroup_size)103 void QueryPool::shader_profile_begin(
104     const CommandBuffer& cmd,
105     const uint32_t dispatch_id,
106     const std::string& kernel_name,
107     const VkExtent3D global_workgroup_size,
108     const VkExtent3D local_workgroup_size) {
109   EARLY_RETURN_IF_UNINITIALIZED();
110   std::lock_guard<std::mutex> lock(mutex_);
111 
112   uint32_t query_idx = write_timestamp(cmd);
113 
114   ShaderDuration log_entry{
115       utils::safe_downcast<uint32_t>(shader_durations_.size()),
116       // Execution Properties
117       dispatch_id,
118       kernel_name,
119       global_workgroup_size,
120       local_workgroup_size,
121       // Query indexes
122       query_idx, // start query idx
123       UINT32_MAX, // end query idx
124       // Timings
125       0u, // start time
126       0u, // end time
127       0u, // duration
128   };
129 
130   shader_durations_.emplace_back(log_entry);
131 }
132 
shader_profile_end(const CommandBuffer & cmd)133 void QueryPool::shader_profile_end(const CommandBuffer& cmd) {
134   EARLY_RETURN_IF_UNINITIALIZED();
135   std::lock_guard<std::mutex> lock(mutex_);
136 
137   size_t query_idx = write_timestamp(cmd);
138   shader_durations_.back().end_query_idx = query_idx;
139 }
140 
extract_results()141 void QueryPool::extract_results() {
142   EARLY_RETURN_IF_UNINITIALIZED();
143   std::lock_guard<std::mutex> lock(mutex_);
144 
145   const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;
146 
147   std::vector<uint64_t> query_data;
148   query_data.resize(num_queries_);
149 
150   VK_CHECK(vkGetQueryPoolResults(
151       device_,
152       querypool_,
153       0u, // firstQuery
154       num_queries_, // queryCount
155       sizeof(uint64_t) * num_queries_, // dataSize
156       query_data.data(), // pData
157       sizeof(uint64_t), // stride
158       flags)); // flags
159 
160   for (ShaderDuration& entry : shader_durations_) {
161     entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
162     entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
163     entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
164   }
165 }
166 
operator <<(std::ostream & os,const VkExtent3D & extents)167 std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) {
168   os << "{" << extents.width << ", " << extents.height << ", " << extents.depth
169      << "}";
170   return os;
171 }
172 
stringize(const VkExtent3D & extents)173 std::string stringize(const VkExtent3D& extents) {
174   std::stringstream ss;
175   ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
176      << "}";
177   return ss.str();
178 }
179 std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
get_shader_timestamp_data()180 QueryPool::get_shader_timestamp_data() {
181   if (querypool_ == VK_NULL_HANDLE) {
182     return {};
183   }
184   std::lock_guard<std::mutex> lock(mutex_);
185   std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
186       shader_timestamp_data;
187   for (ShaderDuration& entry : shader_durations_) {
188     shader_timestamp_data.emplace_back(std::make_tuple(
189         entry.kernel_name,
190         entry.dispatch_id,
191         entry.start_time_ns,
192         entry.end_time_ns));
193   }
194   return shader_timestamp_data;
195 }
196 
generate_string_report()197 std::string QueryPool::generate_string_report() {
198   std::lock_guard<std::mutex> lock(mutex_);
199 
200   std::stringstream ss;
201 
202   int kernel_name_w = 40;
203   int global_size_w = 25;
204   int local_size_w = 25;
205   int duration_w = 25;
206 
207   ss << std::left;
208   ss << std::setw(kernel_name_w) << "Kernel Name";
209   ss << std::setw(global_size_w) << "Global Workgroup Size";
210   ss << std::setw(local_size_w) << "Local Workgroup Size";
211   ss << std::right << std::setw(duration_w) << "Duration (ns)";
212   ss << std::endl;
213 
214   ss << std::left;
215   ss << std::setw(kernel_name_w) << "===========";
216   ss << std::setw(global_size_w) << "=====================";
217   ss << std::setw(local_size_w) << "====================";
218   ss << std::right << std::setw(duration_w) << "=============";
219   ss << std::endl;
220 
221   for (ShaderDuration& entry : shader_durations_) {
222     std::chrono::duration<size_t, std::nano> exec_duration_ns(
223         entry.execution_duration_ns);
224 
225     ss << std::left;
226     ss << std::setw(kernel_name_w) << entry.kernel_name;
227     ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
228     ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
229     ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
230     ss << std::endl;
231   }
232 
233   return ss.str();
234 }
235 
print_results()236 void QueryPool::print_results() {
237   EARLY_RETURN_IF_UNINITIALIZED();
238   std::cout << generate_string_report() << std::endl;
239 }
240 
get_total_shader_ns(std::string kernel_name)241 unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) {
242   for (ShaderDuration& entry : shader_durations_) {
243     if (entry.kernel_name == kernel_name) {
244       std::chrono::duration<size_t, std::nano> exec_duration_ns(
245           entry.execution_duration_ns);
246       return exec_duration_ns.count();
247     }
248   }
249   return 0;
250 }
251 
get_mean_shader_ns(std::string kernel_name)252 unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) {
253   uint64_t total_ns = 0;
254   uint32_t count = 0;
255   for (ShaderDuration& entry : shader_durations_) {
256     if (entry.kernel_name == kernel_name) {
257       std::chrono::duration<size_t, std::nano> exec_duration_ns(
258           entry.execution_duration_ns);
259       total_ns += exec_duration_ns.count();
260       count++;
261     }
262   }
263   if (count == 0) {
264     return 0;
265   }
266   return total_ns / count;
267 }
268 } // namespace vkapi
269 } // namespace vkcompute
270