xref: /aosp_15_r20/external/ComputeLibrary/tests/framework/instruments/OpenCLTimer.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2017-2019, 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "OpenCLTimer.h"
25 
26 #include "../Framework.h"
27 #include "../Utils.h"
28 
29 #include "arm_compute/graph/INode.h"
30 #include "arm_compute/runtime/CL/CLScheduler.h"
31 
32 #ifndef ARM_COMPUTE_CL
33 #error "You can't use OpenCLTimer without OpenCL"
34 #endif /* ARM_COMPUTE_CL */
35 
36 namespace arm_compute
37 {
38 namespace test
39 {
40 namespace framework
41 {
42 template <bool output_timestamps>
id() const43 std::string    OpenCLClock<output_timestamps>::id() const
44 {
45     if(output_timestamps)
46     {
47         return "OpenCLTimestamps";
48     }
49     else
50     {
51         return "OpenCLTimer";
52     }
53 }
54 
55 template <bool output_timestamps>
OpenCLClock(ScaleFactor scale_factor)56 OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
57     : _kernels(),
58       _real_function(nullptr),
59 #ifdef ARM_COMPUTE_GRAPH_ENABLED
60       _real_graph_function(nullptr),
61 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
62       _prefix(),
63       _timer_enabled(false)
64 {
65     auto                        q     = CLScheduler::get().queue();
66     cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
67     if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
68     {
69         CLScheduler::get().set_queue(cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE));
70     }
71 
72     switch(scale_factor)
73     {
74         case ScaleFactor::NONE:
75             _scale_factor = 1.f;
76             _unit         = "ns";
77             break;
78         case ScaleFactor::TIME_US:
79             _scale_factor = 1000.f;
80             _unit         = "us";
81             break;
82         case ScaleFactor::TIME_MS:
83             _scale_factor = 1000000.f;
84             _unit         = "ms";
85             break;
86         case ScaleFactor::TIME_S:
87             _scale_factor = 1000000000.f;
88             _unit         = "s";
89             break;
90         default:
91             ARM_COMPUTE_ERROR("Invalid scale");
92     }
93 }
94 
95 template <bool output_timestamps>
test_start()96 void           OpenCLClock<output_timestamps>::test_start()
97 {
98     // Start intercepting enqueues:
99     ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
100     _real_function   = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
101     auto interceptor = [this](
102                            cl_command_queue command_queue,
103                            cl_kernel        kernel,
104                            cl_uint          work_dim,
105                            const size_t    *gwo,
106                            const size_t    *gws,
107                            const size_t    *lws,
108                            cl_uint          num_events_in_wait_list,
109                            const cl_event * event_wait_list,
110                            cl_event *       event)
111     {
112         if(this->_timer_enabled)
113         {
114             kernel_info       info;
115             cl::Kernel        cpp_kernel(kernel, true);
116             std::stringstream ss;
117             ss << this->_prefix << cpp_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
118             if(gws != nullptr)
119             {
120                 ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
121             }
122             if(lws != nullptr)
123             {
124                 ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
125             }
126             info.name = ss.str();
127             cl_event tmp;
128             cl_int   retval = this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
129             info.event      = tmp;
130             this->_kernels.push_back(std::move(info));
131 
132             if(event != nullptr)
133             {
134                 //return cl_event from the intercepted call
135                 clRetainEvent(tmp);
136                 *event = tmp;
137             }
138             return retval;
139         }
140         else
141         {
142             return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
143         }
144     };
145     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
146 
147 #ifdef ARM_COMPUTE_GRAPH_ENABLED
148     ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
149     _real_graph_function = graph::TaskExecutor::get().execute_function;
150     // Start intercepting tasks:
151     auto task_interceptor = [this](graph::ExecutionTask & task)
152     {
153         if(task.node != nullptr && !task.node->name().empty())
154         {
155             this->_prefix = task.node->name() + "/";
156         }
157         else
158         {
159             this->_prefix = "";
160         }
161         this->_real_graph_function(task);
162         this->_prefix = "";
163     };
164     graph::TaskExecutor::get().execute_function = task_interceptor;
165 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
166 }
167 
168 template <bool output_timestamps>
start()169 void           OpenCLClock<output_timestamps>::start()
170 {
171     _kernels.clear();
172     _timer_enabled = true;
173 }
174 template <bool output_timestamps>
stop()175 void           OpenCLClock<output_timestamps>::stop()
176 {
177     _timer_enabled = false;
178 }
179 
180 template <bool output_timestamps>
test_stop()181 void           OpenCLClock<output_timestamps>::test_stop()
182 {
183     // Restore real function
184     CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
185     _real_function                              = nullptr;
186 #ifdef ARM_COMPUTE_GRAPH_ENABLED
187     graph::TaskExecutor::get().execute_function = _real_graph_function;
188     _real_graph_function                        = nullptr;
189 #endif /* ARM_COMPUTE_GRAPH_ENABLED */
190 }
191 
192 template <bool              output_timestamps>
measurements() const193 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::measurements() const
194 {
195     MeasurementsMap measurements;
196     unsigned int    kernel_number = 0;
197     for(auto const &kernel : _kernels)
198     {
199         cl_ulong queued;
200         cl_ulong flushed;
201         cl_ulong start;
202         cl_ulong end;
203         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &queued);
204         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &flushed);
205         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
206         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
207         std::string name = kernel.name + " #" + support::cpp11::to_string(kernel_number++);
208 
209         if(output_timestamps)
210         {
211             measurements.emplace("[start]" + name, Measurement(start / static_cast<cl_ulong>(_scale_factor), _unit));
212             measurements.emplace("[queued]" + name, Measurement(queued / static_cast<cl_ulong>(_scale_factor), _unit));
213             measurements.emplace("[flushed]" + name, Measurement(flushed / static_cast<cl_ulong>(_scale_factor), _unit));
214             measurements.emplace("[end]" + name, Measurement(end / static_cast<cl_ulong>(_scale_factor), _unit));
215         }
216         else
217         {
218             measurements.emplace(name, Measurement((end - start) / _scale_factor, _unit));
219         }
220     }
221 
222     return measurements;
223 }
224 
225 template <bool              output_timestamps>
test_measurements() const226 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::test_measurements() const
227 {
228     MeasurementsMap measurements;
229 
230     if(output_timestamps)
231     {
232         // The OpenCL clock and the wall clock are not in sync, so we use
233         // this trick to calculate the offset between the two clocks:
234         ::cl::Event event;
235         cl_ulong    now_gpu;
236 
237         // Enqueue retrieve current CPU clock and enqueue a dummy marker
238         std::chrono::high_resolution_clock::time_point now_cpu = std::chrono::high_resolution_clock::now();
239         CLScheduler::get().queue().enqueueMarker(&event);
240 
241         CLScheduler::get().queue().finish();
242         //Access the time at which the marker was enqueued:
243         event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &now_gpu);
244 
245         measurements.emplace("Now Wall clock", Measurement(now_cpu.time_since_epoch().count() / 1000, "us"));
246         measurements.emplace("Now OpenCL", Measurement(now_gpu / static_cast<cl_ulong>(_scale_factor), _unit));
247     }
248 
249     return measurements;
250 }
251 
252 } // namespace framework
253 } // namespace test
254 } // namespace arm_compute
255 
256 template class arm_compute::test::framework::OpenCLClock<true>;
257 template class arm_compute::test::framework::OpenCLClock<false>;
258