1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 // @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast
10
11 #include <executorch/backends/vulkan/runtime/utils/VecUtils.h>
12
13 #include <executorch/backends/vulkan/runtime/vk_api/QueryPool.h>
14
15 #include <cmath>
16 #include <iomanip>
17 #include <iostream>
18 #include <utility>
19
20 namespace vkcompute {
21 namespace vkapi {
22
23 namespace {
24
25 // On Mali gpus timestamp_period seems to return 0.
26 // For some reason when 52.08 is used op runtimes seem to make more sense
27 // TODO: Figure out what is special about 52.08
28 constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);
29
30 } // namespace
31
32 #define EARLY_RETURN_IF_UNINITIALIZED() \
33 if (querypool_ == VK_NULL_HANDLE) { \
34 return; \
35 }
36
QueryPool(const QueryPoolConfig & config,const Adapter * adapter_p)37 QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
38 : config_(config),
39 ns_per_tick_(1u),
40 device_(VK_NULL_HANDLE),
41 querypool_(VK_NULL_HANDLE),
42 num_queries_(0u),
43 shader_durations_(0),
44 mutex_{} {
45 initialize(adapter_p);
46 }
47
~QueryPool()48 QueryPool::~QueryPool() {
49 EARLY_RETURN_IF_UNINITIALIZED();
50 vkDestroyQueryPool(device_, querypool_, nullptr);
51 }
52
initialize(const Adapter * adapter_p)53 void QueryPool::initialize(const Adapter* adapter_p) {
54 // No-op if adapter_p is nullptr or querypool is already created
55 if (!adapter_p || querypool_ != VK_NULL_HANDLE) {
56 return;
57 }
58
59 device_ = adapter_p->device_handle();
60
61 ns_per_tick_ = std::lround(adapter_p->timestamp_period());
62 ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;
63
64 shader_durations_.reserve(config_.initial_reserve_size);
65
66 const VkQueryPoolCreateInfo info{
67 VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
68 nullptr, // pNext
69 0u, // flags
70 VK_QUERY_TYPE_TIMESTAMP, // queryType
71 config_.max_query_count, // queryCount
72 0u, // pipelineStatistics
73 };
74
75 VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
76 }
77
write_timestamp(const CommandBuffer & cmd)78 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
79 VK_CHECK_COND(
80 num_queries_ < config_.max_query_count,
81 "Vulkan QueryPool: Exceeded the maximum number of queries "
82 "allowed by the queryPool (",
83 config_.max_query_count,
84 ")!");
85
86 cmd.write_timestamp(querypool_, num_queries_++);
87 return num_queries_ - 1;
88 }
89
reset_querypool(const CommandBuffer & cmd)90 void QueryPool::reset_querypool(const CommandBuffer& cmd) {
91 EARLY_RETURN_IF_UNINITIALIZED();
92 std::lock_guard<std::mutex> lock(mutex_);
93
94 cmd.reset_querypool(querypool_, 0u, config_.max_query_count);
95 reset_state();
96 }
97
reset_state()98 void QueryPool::reset_state() {
99 num_queries_ = 0u;
100 shader_durations_.clear();
101 }
102
shader_profile_begin(const CommandBuffer & cmd,const uint32_t dispatch_id,const std::string & kernel_name,const VkExtent3D global_workgroup_size,const VkExtent3D local_workgroup_size)103 void QueryPool::shader_profile_begin(
104 const CommandBuffer& cmd,
105 const uint32_t dispatch_id,
106 const std::string& kernel_name,
107 const VkExtent3D global_workgroup_size,
108 const VkExtent3D local_workgroup_size) {
109 EARLY_RETURN_IF_UNINITIALIZED();
110 std::lock_guard<std::mutex> lock(mutex_);
111
112 uint32_t query_idx = write_timestamp(cmd);
113
114 ShaderDuration log_entry{
115 utils::safe_downcast<uint32_t>(shader_durations_.size()),
116 // Execution Properties
117 dispatch_id,
118 kernel_name,
119 global_workgroup_size,
120 local_workgroup_size,
121 // Query indexes
122 query_idx, // start query idx
123 UINT32_MAX, // end query idx
124 // Timings
125 0u, // start time
126 0u, // end time
127 0u, // duration
128 };
129
130 shader_durations_.emplace_back(log_entry);
131 }
132
shader_profile_end(const CommandBuffer & cmd)133 void QueryPool::shader_profile_end(const CommandBuffer& cmd) {
134 EARLY_RETURN_IF_UNINITIALIZED();
135 std::lock_guard<std::mutex> lock(mutex_);
136
137 size_t query_idx = write_timestamp(cmd);
138 shader_durations_.back().end_query_idx = query_idx;
139 }
140
extract_results()141 void QueryPool::extract_results() {
142 EARLY_RETURN_IF_UNINITIALIZED();
143 std::lock_guard<std::mutex> lock(mutex_);
144
145 const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;
146
147 std::vector<uint64_t> query_data;
148 query_data.resize(num_queries_);
149
150 VK_CHECK(vkGetQueryPoolResults(
151 device_,
152 querypool_,
153 0u, // firstQuery
154 num_queries_, // queryCount
155 sizeof(uint64_t) * num_queries_, // dataSize
156 query_data.data(), // pData
157 sizeof(uint64_t), // stride
158 flags)); // flags
159
160 for (ShaderDuration& entry : shader_durations_) {
161 entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
162 entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
163 entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
164 }
165 }
166
operator <<(std::ostream & os,const VkExtent3D & extents)167 std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) {
168 os << "{" << extents.width << ", " << extents.height << ", " << extents.depth
169 << "}";
170 return os;
171 }
172
stringize(const VkExtent3D & extents)173 std::string stringize(const VkExtent3D& extents) {
174 std::stringstream ss;
175 ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
176 << "}";
177 return ss.str();
178 }
179 std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
get_shader_timestamp_data()180 QueryPool::get_shader_timestamp_data() {
181 if (querypool_ == VK_NULL_HANDLE) {
182 return {};
183 }
184 std::lock_guard<std::mutex> lock(mutex_);
185 std::vector<std::tuple<std::string, uint32_t, uint64_t, uint64_t>>
186 shader_timestamp_data;
187 for (ShaderDuration& entry : shader_durations_) {
188 shader_timestamp_data.emplace_back(std::make_tuple(
189 entry.kernel_name,
190 entry.dispatch_id,
191 entry.start_time_ns,
192 entry.end_time_ns));
193 }
194 return shader_timestamp_data;
195 }
196
generate_string_report()197 std::string QueryPool::generate_string_report() {
198 std::lock_guard<std::mutex> lock(mutex_);
199
200 std::stringstream ss;
201
202 int kernel_name_w = 40;
203 int global_size_w = 25;
204 int local_size_w = 25;
205 int duration_w = 25;
206
207 ss << std::left;
208 ss << std::setw(kernel_name_w) << "Kernel Name";
209 ss << std::setw(global_size_w) << "Global Workgroup Size";
210 ss << std::setw(local_size_w) << "Local Workgroup Size";
211 ss << std::right << std::setw(duration_w) << "Duration (ns)";
212 ss << std::endl;
213
214 ss << std::left;
215 ss << std::setw(kernel_name_w) << "===========";
216 ss << std::setw(global_size_w) << "=====================";
217 ss << std::setw(local_size_w) << "====================";
218 ss << std::right << std::setw(duration_w) << "=============";
219 ss << std::endl;
220
221 for (ShaderDuration& entry : shader_durations_) {
222 std::chrono::duration<size_t, std::nano> exec_duration_ns(
223 entry.execution_duration_ns);
224
225 ss << std::left;
226 ss << std::setw(kernel_name_w) << entry.kernel_name;
227 ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
228 ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size);
229 ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
230 ss << std::endl;
231 }
232
233 return ss.str();
234 }
235
print_results()236 void QueryPool::print_results() {
237 EARLY_RETURN_IF_UNINITIALIZED();
238 std::cout << generate_string_report() << std::endl;
239 }
240
get_total_shader_ns(std::string kernel_name)241 unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) {
242 for (ShaderDuration& entry : shader_durations_) {
243 if (entry.kernel_name == kernel_name) {
244 std::chrono::duration<size_t, std::nano> exec_duration_ns(
245 entry.execution_duration_ns);
246 return exec_duration_ns.count();
247 }
248 }
249 return 0;
250 }
251
get_mean_shader_ns(std::string kernel_name)252 unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) {
253 uint64_t total_ns = 0;
254 uint32_t count = 0;
255 for (ShaderDuration& entry : shader_durations_) {
256 if (entry.kernel_name == kernel_name) {
257 std::chrono::duration<size_t, std::nano> exec_duration_ns(
258 entry.execution_duration_ns);
259 total_ns += exec_duration_ns.count();
260 count++;
261 }
262 }
263 if (count == 0) {
264 return 0;
265 }
266 return total_ns / count;
267 }
268 } // namespace vkapi
269 } // namespace vkcompute
270