1 /*
2 * Copyright (c) 2017-2019, 2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #include "OpenCLTimer.h"
25
26 #include "../Framework.h"
27 #include "../Utils.h"
28
29 #include "arm_compute/graph/INode.h"
30 #include "arm_compute/runtime/CL/CLScheduler.h"
31
32 #ifndef ARM_COMPUTE_CL
33 #error "You can't use OpenCLTimer without OpenCL"
34 #endif /* ARM_COMPUTE_CL */
35
36 namespace arm_compute
37 {
38 namespace test
39 {
40 namespace framework
41 {
42 template <bool output_timestamps>
id() const43 std::string OpenCLClock<output_timestamps>::id() const
44 {
45 if(output_timestamps)
46 {
47 return "OpenCLTimestamps";
48 }
49 else
50 {
51 return "OpenCLTimer";
52 }
53 }
54
55 template <bool output_timestamps>
OpenCLClock(ScaleFactor scale_factor)56 OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
57 : _kernels(),
58 _real_function(nullptr),
59 #ifdef ARM_COMPUTE_GRAPH_ENABLED
60 _real_graph_function(nullptr),
61 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
62 _prefix(),
63 _timer_enabled(false)
64 {
65 auto q = CLScheduler::get().queue();
66 cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
67 if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
68 {
69 CLScheduler::get().set_queue(cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE));
70 }
71
72 switch(scale_factor)
73 {
74 case ScaleFactor::NONE:
75 _scale_factor = 1.f;
76 _unit = "ns";
77 break;
78 case ScaleFactor::TIME_US:
79 _scale_factor = 1000.f;
80 _unit = "us";
81 break;
82 case ScaleFactor::TIME_MS:
83 _scale_factor = 1000000.f;
84 _unit = "ms";
85 break;
86 case ScaleFactor::TIME_S:
87 _scale_factor = 1000000000.f;
88 _unit = "s";
89 break;
90 default:
91 ARM_COMPUTE_ERROR("Invalid scale");
92 }
93 }
94
95 template <bool output_timestamps>
test_start()96 void OpenCLClock<output_timestamps>::test_start()
97 {
98 // Start intercepting enqueues:
99 ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
100 _real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
101 auto interceptor = [this](
102 cl_command_queue command_queue,
103 cl_kernel kernel,
104 cl_uint work_dim,
105 const size_t *gwo,
106 const size_t *gws,
107 const size_t *lws,
108 cl_uint num_events_in_wait_list,
109 const cl_event * event_wait_list,
110 cl_event * event)
111 {
112 if(this->_timer_enabled)
113 {
114 kernel_info info;
115 cl::Kernel cpp_kernel(kernel, true);
116 std::stringstream ss;
117 ss << this->_prefix << cpp_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
118 if(gws != nullptr)
119 {
120 ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
121 }
122 if(lws != nullptr)
123 {
124 ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
125 }
126 info.name = ss.str();
127 cl_event tmp;
128 cl_int retval = this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
129 info.event = tmp;
130 this->_kernels.push_back(std::move(info));
131
132 if(event != nullptr)
133 {
134 //return cl_event from the intercepted call
135 clRetainEvent(tmp);
136 *event = tmp;
137 }
138 return retval;
139 }
140 else
141 {
142 return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
143 }
144 };
145 CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
146
147 #ifdef ARM_COMPUTE_GRAPH_ENABLED
148 ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
149 _real_graph_function = graph::TaskExecutor::get().execute_function;
150 // Start intercepting tasks:
151 auto task_interceptor = [this](graph::ExecutionTask & task)
152 {
153 if(task.node != nullptr && !task.node->name().empty())
154 {
155 this->_prefix = task.node->name() + "/";
156 }
157 else
158 {
159 this->_prefix = "";
160 }
161 this->_real_graph_function(task);
162 this->_prefix = "";
163 };
164 graph::TaskExecutor::get().execute_function = task_interceptor;
165 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
166 }
167
168 template <bool output_timestamps>
start()169 void OpenCLClock<output_timestamps>::start()
170 {
171 _kernels.clear();
172 _timer_enabled = true;
173 }
174 template <bool output_timestamps>
stop()175 void OpenCLClock<output_timestamps>::stop()
176 {
177 _timer_enabled = false;
178 }
179
180 template <bool output_timestamps>
test_stop()181 void OpenCLClock<output_timestamps>::test_stop()
182 {
183 // Restore real function
184 CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
185 _real_function = nullptr;
186 #ifdef ARM_COMPUTE_GRAPH_ENABLED
187 graph::TaskExecutor::get().execute_function = _real_graph_function;
188 _real_graph_function = nullptr;
189 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
190 }
191
192 template <bool output_timestamps>
measurements() const193 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::measurements() const
194 {
195 MeasurementsMap measurements;
196 unsigned int kernel_number = 0;
197 for(auto const &kernel : _kernels)
198 {
199 cl_ulong queued;
200 cl_ulong flushed;
201 cl_ulong start;
202 cl_ulong end;
203 kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &queued);
204 kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &flushed);
205 kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
206 kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
207 std::string name = kernel.name + " #" + support::cpp11::to_string(kernel_number++);
208
209 if(output_timestamps)
210 {
211 measurements.emplace("[start]" + name, Measurement(start / static_cast<cl_ulong>(_scale_factor), _unit));
212 measurements.emplace("[queued]" + name, Measurement(queued / static_cast<cl_ulong>(_scale_factor), _unit));
213 measurements.emplace("[flushed]" + name, Measurement(flushed / static_cast<cl_ulong>(_scale_factor), _unit));
214 measurements.emplace("[end]" + name, Measurement(end / static_cast<cl_ulong>(_scale_factor), _unit));
215 }
216 else
217 {
218 measurements.emplace(name, Measurement((end - start) / _scale_factor, _unit));
219 }
220 }
221
222 return measurements;
223 }
224
225 template <bool output_timestamps>
test_measurements() const226 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::test_measurements() const
227 {
228 MeasurementsMap measurements;
229
230 if(output_timestamps)
231 {
232 // The OpenCL clock and the wall clock are not in sync, so we use
233 // this trick to calculate the offset between the two clocks:
234 ::cl::Event event;
235 cl_ulong now_gpu;
236
237 // Enqueue retrieve current CPU clock and enqueue a dummy marker
238 std::chrono::high_resolution_clock::time_point now_cpu = std::chrono::high_resolution_clock::now();
239 CLScheduler::get().queue().enqueueMarker(&event);
240
241 CLScheduler::get().queue().finish();
242 //Access the time at which the marker was enqueued:
243 event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &now_gpu);
244
245 measurements.emplace("Now Wall clock", Measurement(now_cpu.time_since_epoch().count() / 1000, "us"));
246 measurements.emplace("Now OpenCL", Measurement(now_gpu / static_cast<cl_ulong>(_scale_factor), _unit));
247 }
248
249 return measurements;
250 }
251
252 } // namespace framework
253 } // namespace test
254 } // namespace arm_compute
255
256 template class arm_compute::test::framework::OpenCLClock<true>;
257 template class arm_compute::test::framework::OpenCLClock<false>;
258