1 #include <ATen/native/vulkan/api/QueryPool.h>
2 #include <ATen/native/vulkan/api/Utils.h>
3 #ifdef USE_KINETO
4 #include <torch/csrc/autograd/profiler_kineto.h>
5 #include <torch/csrc/profiler/orchestration/vulkan.h>
6 #endif // USE_KINETO
7
8 #include <cmath>
9 #include <iomanip>
10 #include <iostream>
11 #include <utility>
12
13 namespace at {
14 namespace native {
15 namespace vulkan {
16 namespace api {
17
18 namespace {
19 // On Mali gpus timestamp_period seems to return 0.
20 // For some reason when 52.08 is used op runtimes seem to make more sense
21 // TODO: Figure out what is special about 52.08
22 constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f);
23 } // namespace
24
QueryPool(const QueryPoolConfig & config,const Adapter * adapter_p)25 QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p)
26 : mutex_{},
27 device_(adapter_p->device_handle()),
28 config_(config),
29 querypool_(VK_NULL_HANDLE),
30 shader_logs_(1),
31 in_use_(0),
32 previous_shader_count_(0u),
33 results_pending_(false) {
34 const VkQueryPoolCreateInfo info{
35 VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType
36 nullptr, // pNext
37 0u, // flags
38 VK_QUERY_TYPE_TIMESTAMP, // queryType
39 config_.maxQueryCount, // queryCount
40 0u, // pipelineStatistics
41 };
42
43 VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_));
44
45 shader_log().reserve(config_.initialReserveSize);
46
47 VK_CHECK_COND(adapter_p, "Valid GPU device must be created for QueryPool");
48 ns_per_tick_ = std::lround(adapter_p->timestamp_period());
49 ns_per_tick_ = (ns_per_tick_ == 0) ? kDefaultNsPerTick : ns_per_tick_;
50
51 #ifdef USE_KINETO
52 torch::profiler::impl::vulkan::registerGetShaderNameAndDurationNs(
__anon07a436240202(int64_t vulkan_id) 53 [this](int64_t vulkan_id) {
54 return get_shader_name_and_execution_duration_ns(vulkan_id);
55 });
56 #endif // USE_KINETO
57 }
58
~QueryPool()59 QueryPool::~QueryPool() {
60 if (VK_NULL_HANDLE == querypool_) {
61 return;
62 }
63 vkDestroyQueryPool(device_, querypool_, nullptr);
64
65 #ifdef USE_KINETO
66 torch::profiler::impl::vulkan::deregisterGetShaderNameAndDurationNs();
67 #endif // USE_KINETO
68 }
69
reset(const CommandBuffer & cmd)70 void QueryPool::reset(const CommandBuffer& cmd) {
71 std::lock_guard<std::mutex> lock(mutex_);
72 cmd.reset_querypool(querypool_, 0u, in_use_);
73 previous_shader_count_ += shader_log().size();
74 in_use_ = 0u;
75 shader_logs_.emplace_back();
76 shader_log().reserve(config_.initialReserveSize);
77 results_pending_ = false;
78 }
79
write_timestamp(const CommandBuffer & cmd)80 size_t QueryPool::write_timestamp(const CommandBuffer& cmd) {
81 VK_CHECK_COND(
82 in_use_ < config_.maxQueryCount,
83 "Vulkan QueryPool: Exceeded the maximum number of queries "
84 "allowed by the queryPool (",
85 config_.maxQueryCount,
86 ")!");
87
88 cmd.write_timestamp(querypool_, in_use_);
89
90 return in_use_++;
91 }
92
shader_profile_begin(const CommandBuffer & cmd,const std::string & kernel_name,const VkExtent3D global_workgroup_size,const VkExtent3D local_workgroup_size)93 uint32_t QueryPool::shader_profile_begin(
94 const CommandBuffer& cmd,
95 const std::string& kernel_name,
96 const VkExtent3D global_workgroup_size,
97 const VkExtent3D local_workgroup_size) {
98 std::lock_guard<std::mutex> lock(mutex_);
99
100 uint32_t query_idx = write_timestamp(cmd);
101
102 uint32_t log_idx = shader_log().size();
103 ShaderDuration log_entry{
104 log_idx,
105 // Execution Properties
106 kernel_name,
107 global_workgroup_size,
108 local_workgroup_size,
109 // Query indexes
110 query_idx, // start query idx
111 UINT32_MAX, // end query idx
112 // Timings
113 0u, // start time
114 0u, // end time
115 0u, // duration
116 };
117
118 shader_log().emplace_back(log_entry);
119
120 results_pending_ = true;
121
122 #ifdef USE_KINETO
123 torch::profiler::impl::vulkan_id_t vulkan_id =
124 torch::profiler::impl::vulkan_id_t(previous_shader_count_ + log_idx);
125
126 torch::profiler::impl::_reportVulkanEventToProfiler(vulkan_id);
127 #endif // USE_KINETO
128
129 return log_idx;
130 }
131
shader_profile_end(const CommandBuffer & cmd,const uint32_t log_idx)132 void QueryPool::shader_profile_end(
133 const CommandBuffer& cmd,
134 const uint32_t log_idx) {
135 std::lock_guard<std::mutex> lock(mutex_);
136
137 size_t query_idx = write_timestamp(cmd);
138
139 shader_log()[log_idx].end_query_idx = query_idx;
140 }
141
extract_results()142 void QueryPool::extract_results() {
143 std::lock_guard<std::mutex> lock(mutex_);
144
145 if (!results_pending_) {
146 return;
147 }
148
149 const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT;
150
151 std::vector<uint64_t> query_data;
152 query_data.resize(in_use_);
153
154 VK_CHECK(vkGetQueryPoolResults(
155 device_,
156 querypool_,
157 0u, // firstQuery
158 in_use_, // queryCount
159 sizeof(uint64_t) * in_use_, // dataSize
160 query_data.data(), // pData
161 sizeof(uint64_t), // stride
162 flags)); // flags
163
164 for (ShaderDuration& entry : shader_log()) {
165 entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_;
166 entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_;
167 entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns;
168 }
169
170 results_pending_ = false;
171 }
172
stringize(const VkExtent3D & extents)173 static std::string stringize(const VkExtent3D& extents) {
174 std::stringstream ss;
175 ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
176 << "}";
177 return ss.str();
178 }
179
generate_string_report()180 std::string QueryPool::generate_string_report() {
181 std::lock_guard<std::mutex> lock(mutex_);
182
183 std::stringstream ss;
184
185 int kernel_name_w = 40;
186 int global_size_w = 15;
187 int duration_w = 25;
188
189 ss << std::left;
190 ss << std::setw(kernel_name_w) << "Kernel Name";
191 ss << std::setw(global_size_w) << "Workgroup Size";
192 ss << std::right << std::setw(duration_w) << "Duration (ns)";
193 ss << std::endl;
194
195 ss << std::left;
196 ss << std::setw(kernel_name_w) << "===========";
197 ss << std::setw(global_size_w) << "==============";
198 ss << std::right << std::setw(duration_w) << "===========";
199 ss << std::endl;
200
201 for (ShaderDuration& entry : shader_log()) {
202 std::chrono::duration<size_t, std::nano> exec_duration_ns(
203 entry.execution_duration_ns);
204
205 ss << std::left;
206 ss << std::setw(kernel_name_w) << entry.kernel_name;
207 ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size);
208 ss << std::right << std::setw(duration_w) << exec_duration_ns.count();
209 ss << std::endl;
210 }
211
212 return ss.str();
213 }
214
print_results()215 void QueryPool::print_results() {
216 std::cout << generate_string_report() << std::endl;
217 }
218
get_total_op_ns(const std::string & op_name)219 uint64_t QueryPool::get_total_op_ns(const std::string& op_name) {
220 std::lock_guard<std::mutex> lock(mutex_);
221 uint64_t sum = 0;
222 for (ShaderDuration& entry : shader_log()) {
223 if (entry.kernel_name == op_name) {
224 sum += entry.execution_duration_ns;
225 }
226 }
227 return sum;
228 }
229
shader_log_for_each(std::function<void (const ShaderDuration &)> fn)230 void QueryPool::shader_log_for_each(
231 std::function<void(const ShaderDuration&)> fn) {
232 std::lock_guard<std::mutex> lock(mutex_);
233 std::for_each(shader_log().begin(), shader_log().end(), std::move(fn));
234 }
235
236 std::tuple<std::string, uint64_t> QueryPool::
get_shader_name_and_execution_duration_ns(size_t query_index)237 get_shader_name_and_execution_duration_ns(size_t query_index) {
238 extract_results();
239
240 std::lock_guard<std::mutex> lock(mutex_);
241
242 const size_t entry_count = shader_logs_entry_count_thread_unsafe();
243 VK_CHECK_COND(
244 (query_index >= 0 && query_index < entry_count),
245 "query_index of ",
246 query_index,
247 " is out of bounds (",
248 entry_count,
249 ") in QueryPool::get_shader_name_and_duration_ns");
250
251 size_t log_idx = 0;
252 size_t entry_count_acc = 0;
253 while (entry_count_acc + shader_logs_[log_idx].size() <= query_index) {
254 entry_count_acc += shader_logs_[log_idx].size();
255 log_idx += 1;
256 }
257
258 const ShaderDuration& entry =
259 shader_logs_[log_idx][query_index - entry_count_acc];
260
261 return std::tuple<std::string, uint64_t>(
262 entry.kernel_name, entry.execution_duration_ns);
263 }
264
shader_logs_entry_count_thread_unsafe()265 size_t QueryPool::shader_logs_entry_count_thread_unsafe() {
266 return previous_shader_count_ + shader_log().size();
267 }
268
shader_logs_entry_count()269 size_t QueryPool::shader_logs_entry_count() {
270 std::lock_guard<std::mutex> lock(mutex_);
271 return shader_logs_entry_count_thread_unsafe();
272 }
273
274 } // namespace api
275 } // namespace vulkan
276 } // namespace native
277 } // namespace at
278