xref: /aosp_15_r20/external/gemmlowp/profiling/profiler.h (revision 5f39d1b313f0528e11bae88b3029b54b9e1033e7)
1*5f39d1b3SJooyung Han // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2*5f39d1b3SJooyung Han //
3*5f39d1b3SJooyung Han // Licensed under the Apache License, Version 2.0 (the "License");
4*5f39d1b3SJooyung Han // you may not use this file except in compliance with the License.
5*5f39d1b3SJooyung Han // You may obtain a copy of the License at
6*5f39d1b3SJooyung Han //
7*5f39d1b3SJooyung Han //     http://www.apache.org/licenses/LICENSE-2.0
8*5f39d1b3SJooyung Han //
9*5f39d1b3SJooyung Han // Unless required by applicable law or agreed to in writing, software
10*5f39d1b3SJooyung Han // distributed under the License is distributed on an "AS IS" BASIS,
11*5f39d1b3SJooyung Han // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*5f39d1b3SJooyung Han // See the License for the specific language governing permissions and
13*5f39d1b3SJooyung Han // limitations under the License.
14*5f39d1b3SJooyung Han 
15*5f39d1b3SJooyung Han // profiler.h: a simple sampling profiler that's always just one #include away!
16*5f39d1b3SJooyung Han //
17*5f39d1b3SJooyung Han // Overview
18*5f39d1b3SJooyung Han // ========
19*5f39d1b3SJooyung Han //
20*5f39d1b3SJooyung Han // This profiler only samples a pseudo-stack, not the actual call stack.
21*5f39d1b3SJooyung Han // The code to be profiled needs to be instrumented with
22*5f39d1b3SJooyung Han // pseudo-stack "labels", see ScopedProfilingLabel.
23*5f39d1b3SJooyung Han // Using pseudo-stacks allows this profiler to be very simple, low-overhead,
24*5f39d1b3SJooyung Han // portable, and independent of compilation details such as function inlining
25*5f39d1b3SJooyung Han // and frame pointers. The granularity of instrumentation can be freely chosen,
26*5f39d1b3SJooyung Han // and it is possible to get some annotate-like detail, i.e. detail within one
27*5f39d1b3SJooyung Han // function without splitting it into multiple functions.
28*5f39d1b3SJooyung Han //
29*5f39d1b3SJooyung Han // This profiler should remain small and simple; its key feature is to fit in
30*5f39d1b3SJooyung Han // a single header file so that there should never be a reason to refrain
// from profiling. More complex and feature-rich alternatives are
// readily available. For example, the Gecko profiler offers a strict superset
// of this profiler's functionality: https://github.com/bgirard/GeckoProfiler,
// including intertwining pseudostacks with real call stacks, more annotation
// options, and advanced visualization.
36*5f39d1b3SJooyung Han //
37*5f39d1b3SJooyung Han // Usage
38*5f39d1b3SJooyung Han // =====
39*5f39d1b3SJooyung Han //
40*5f39d1b3SJooyung Han // 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
41*5f39d1b3SJooyung Han //    not enabled, profiling instrumentation from instrumentation.h
42*5f39d1b3SJooyung Han //    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
43*5f39d1b3SJooyung Han //    is still defined but does nothing. On the other hand,
44*5f39d1b3SJooyung Han //    when profiling is not enabled, it is an error to #include the
45*5f39d1b3SJooyung Han //    present file.
46*5f39d1b3SJooyung Han //
47*5f39d1b3SJooyung Han // 1. Each thread can opt in to profiling by calling
48*5f39d1b3SJooyung Han //    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
49*5f39d1b3SJooyung Han //    This can be done at any time, before or during profiling.
50*5f39d1b3SJooyung Han //    No sample will be collected from a thread until
51*5f39d1b3SJooyung Han //    it has called RegisterCurrentThreadForProfiling().
52*5f39d1b3SJooyung Han //
53*5f39d1b3SJooyung Han // 2. Instrument your code to be profiled with ScopedProfilingLabel,
54*5f39d1b3SJooyung Han //    which is a RAII helper defined in instrumentation.h. The identifier
55*5f39d1b3SJooyung Han //    names (some_label, etc) do not matter; what will show up
56*5f39d1b3SJooyung Han //    in the profile is the string passed to the constructor, which
57*5f39d1b3SJooyung Han //    must be a literal string. See the full example below.
58*5f39d1b3SJooyung Han //
59*5f39d1b3SJooyung Han //    Note: the overhead of ScopedProfilingLabel is zero when not
60*5f39d1b3SJooyung Han //    enabling profiling (when not defining GEMMLOWP_PROFILING).
61*5f39d1b3SJooyung Han //
62*5f39d1b3SJooyung Han // 3. Use the profiler.h interface to control profiling. There are two
63*5f39d1b3SJooyung Han //    functions: StartProfiling() and FinishProfiling(). They must be
64*5f39d1b3SJooyung Han //    called on the same thread. FinishProfiling() prints the profile
65*5f39d1b3SJooyung Han //    on stdout.
66*5f39d1b3SJooyung Han //
67*5f39d1b3SJooyung Han // Full example
68*5f39d1b3SJooyung Han // ============
69*5f39d1b3SJooyung Han /*
70*5f39d1b3SJooyung Han     #define GEMMLOWP_PROFILING
71*5f39d1b3SJooyung Han     #include "profiling/instrumentation.h"
72*5f39d1b3SJooyung Han     using namespace gemmlowp;
73*5f39d1b3SJooyung Han 
74*5f39d1b3SJooyung Han     const int iters = 100000000;
75*5f39d1b3SJooyung Han     volatile int i;
76*5f39d1b3SJooyung Han 
77*5f39d1b3SJooyung Han     void Bar() {
78*5f39d1b3SJooyung Han       ScopedProfilingLabel label("Bar");
79*5f39d1b3SJooyung Han       for (i = 0; i < iters; i++) {}
80*5f39d1b3SJooyung Han     }
81*5f39d1b3SJooyung Han 
82*5f39d1b3SJooyung Han     void Foo() {
83*5f39d1b3SJooyung Han       ScopedProfilingLabel label("Foo");
84*5f39d1b3SJooyung Han       for (i = 0; i < iters; i++) {}
85*5f39d1b3SJooyung Han       Bar();
86*5f39d1b3SJooyung Han     }
87*5f39d1b3SJooyung Han 
88*5f39d1b3SJooyung Han     void Init() {
89*5f39d1b3SJooyung Han       RegisterCurrentThreadForProfiling();
90*5f39d1b3SJooyung Han     }
91*5f39d1b3SJooyung Han 
92*5f39d1b3SJooyung Han     #include "profiling/profiler.h"
93*5f39d1b3SJooyung Han 
94*5f39d1b3SJooyung Han     int main() {
95*5f39d1b3SJooyung Han       Init();
96*5f39d1b3SJooyung Han       StartProfiling();
97*5f39d1b3SJooyung Han       Foo();
98*5f39d1b3SJooyung Han       FinishProfiling();
99*5f39d1b3SJooyung Han     }
100*5f39d1b3SJooyung Han *
101*5f39d1b3SJooyung Han * Output:
102*5f39d1b3SJooyung Han *
103*5f39d1b3SJooyung Han     gemmlowp profile (1 threads, 304 samples)
104*5f39d1b3SJooyung Han     100.00% Foo
105*5f39d1b3SJooyung Han         51.32% other
106*5f39d1b3SJooyung Han         48.68% Bar
107*5f39d1b3SJooyung Han     0.00% other (outside of any label)
108*5f39d1b3SJooyung Han */
109*5f39d1b3SJooyung Han //
110*5f39d1b3SJooyung Han // Interpreting results
111*5f39d1b3SJooyung Han // ====================
112*5f39d1b3SJooyung Han //
113*5f39d1b3SJooyung Han //  Each node shows the absolute percentage, among all the samples,
114*5f39d1b3SJooyung Han //  of the number of samples that recorded the given pseudo-stack.
115*5f39d1b3SJooyung Han //  The percentages are *NOT* relative to the parent node. In addition
116*5f39d1b3SJooyung Han //  to your own labels, you will also see 'other' nodes that collect
117*5f39d1b3SJooyung Han //  the remainder of samples under the parent node that didn't fall into
118*5f39d1b3SJooyung Han //  any of the labelled child nodes. Example:
119*5f39d1b3SJooyung Han //
120*5f39d1b3SJooyung Han //  20% Foo
121*5f39d1b3SJooyung Han //      12% Bar
122*5f39d1b3SJooyung Han //      6% Xyz
123*5f39d1b3SJooyung Han //      2% other
124*5f39d1b3SJooyung Han //
//  This means that 20% of all samples were under Foo, of which 12%/20%==60%
126*5f39d1b3SJooyung Han //  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
127*5f39d1b3SJooyung Han //  under either Bar or Xyz.
128*5f39d1b3SJooyung Han //
129*5f39d1b3SJooyung Han //  Typically, one wants to keep adding ScopedProfilingLabel's until
130*5f39d1b3SJooyung Han //  the 'other' nodes show low percentages.
131*5f39d1b3SJooyung Han //
132*5f39d1b3SJooyung Han // Interpreting results with multiple threads
133*5f39d1b3SJooyung Han // ==========================================
134*5f39d1b3SJooyung Han //
135*5f39d1b3SJooyung Han // At each sample, each thread registered for profiling gets sampled once.
136*5f39d1b3SJooyung Han // So if there is one "main thread" spending its time in MainFunc() and
137*5f39d1b3SJooyung Han // 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
138*5f39d1b3SJooyung Han // samples will be in WorkerFunc, so the profile will look like this:
139*5f39d1b3SJooyung Han //
140*5f39d1b3SJooyung Han // 80% WorkerFunc
141*5f39d1b3SJooyung Han // 20% MainFunc
142*5f39d1b3SJooyung Han 
143*5f39d1b3SJooyung Han #ifndef GEMMLOWP_PROFILING_PROFILER_H_
144*5f39d1b3SJooyung Han #define GEMMLOWP_PROFILING_PROFILER_H_
145*5f39d1b3SJooyung Han 
146*5f39d1b3SJooyung Han #ifndef GEMMLOWP_PROFILING
147*5f39d1b3SJooyung Han #error Profiling is not enabled!
148*5f39d1b3SJooyung Han #endif
149*5f39d1b3SJooyung Han 
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

#include "instrumentation.h"
153*5f39d1b3SJooyung Han 
154*5f39d1b3SJooyung Han namespace gemmlowp {
155*5f39d1b3SJooyung Han 
156*5f39d1b3SJooyung Han // A tree view of a profile.
157*5f39d1b3SJooyung Han class ProfileTreeView {
158*5f39d1b3SJooyung Han   struct Node {
159*5f39d1b3SJooyung Han     std::vector<Node*> children;
160*5f39d1b3SJooyung Han     const char* label;
161*5f39d1b3SJooyung Han     std::size_t weight;
NodeNode162*5f39d1b3SJooyung Han     Node() : label(nullptr), weight(0) {}
~NodeNode163*5f39d1b3SJooyung Han     ~Node() {
164*5f39d1b3SJooyung Han       for (auto child : children) {
165*5f39d1b3SJooyung Han         delete child;
166*5f39d1b3SJooyung Han       }
167*5f39d1b3SJooyung Han     }
168*5f39d1b3SJooyung Han   };
169*5f39d1b3SJooyung Han 
CompareNodes(Node * n1,Node * n2)170*5f39d1b3SJooyung Han   static bool CompareNodes(Node* n1, Node* n2) {
171*5f39d1b3SJooyung Han     return n1->weight > n2->weight;
172*5f39d1b3SJooyung Han   }
173*5f39d1b3SJooyung Han 
174*5f39d1b3SJooyung Han   Node root_;
175*5f39d1b3SJooyung Han 
PrintNode(const Node * node,int level)176*5f39d1b3SJooyung Han   void PrintNode(const Node* node, int level) const {
177*5f39d1b3SJooyung Han     if (level) {
178*5f39d1b3SJooyung Han       for (int i = 1; i < level; i++) {
179*5f39d1b3SJooyung Han         printf("    ");
180*5f39d1b3SJooyung Han       }
181*5f39d1b3SJooyung Han       printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
182*5f39d1b3SJooyung Han     }
183*5f39d1b3SJooyung Han     for (auto child : node->children) {
184*5f39d1b3SJooyung Han       PrintNode(child, level + 1);
185*5f39d1b3SJooyung Han     }
186*5f39d1b3SJooyung Han   }
187*5f39d1b3SJooyung Han 
AddStackToNode(const ProfilingStack & stack,Node * node,std::size_t level)188*5f39d1b3SJooyung Han   static void AddStackToNode(const ProfilingStack& stack, Node* node,
189*5f39d1b3SJooyung Han                              std::size_t level) {
190*5f39d1b3SJooyung Han     node->weight++;
191*5f39d1b3SJooyung Han     if (stack.size == level) {
192*5f39d1b3SJooyung Han       return;
193*5f39d1b3SJooyung Han     }
194*5f39d1b3SJooyung Han     Node* child_to_add_to = nullptr;
195*5f39d1b3SJooyung Han     for (auto child : node->children) {
196*5f39d1b3SJooyung Han       if (child->label == stack.labels[level]) {
197*5f39d1b3SJooyung Han         child_to_add_to = child;
198*5f39d1b3SJooyung Han         break;
199*5f39d1b3SJooyung Han       }
200*5f39d1b3SJooyung Han     }
201*5f39d1b3SJooyung Han     if (!child_to_add_to) {
202*5f39d1b3SJooyung Han       child_to_add_to = new Node;
203*5f39d1b3SJooyung Han       child_to_add_to->label = stack.labels[level];
204*5f39d1b3SJooyung Han       node->children.push_back(child_to_add_to);
205*5f39d1b3SJooyung Han     }
206*5f39d1b3SJooyung Han     AddStackToNode(stack, child_to_add_to, level + 1);
207*5f39d1b3SJooyung Han     return;
208*5f39d1b3SJooyung Han   }
209*5f39d1b3SJooyung Han 
AddStack(const ProfilingStack & stack)210*5f39d1b3SJooyung Han   void AddStack(const ProfilingStack& stack) {
211*5f39d1b3SJooyung Han     AddStackToNode(stack, &root_, 0);
212*5f39d1b3SJooyung Han   }
213*5f39d1b3SJooyung Han 
AddOtherChildrenToNode(Node * node)214*5f39d1b3SJooyung Han   void AddOtherChildrenToNode(Node* node) {
215*5f39d1b3SJooyung Han     std::size_t top_level_children_weight = 0;
216*5f39d1b3SJooyung Han     for (auto c : node->children) {
217*5f39d1b3SJooyung Han       AddOtherChildrenToNode(c);
218*5f39d1b3SJooyung Han       top_level_children_weight += c->weight;
219*5f39d1b3SJooyung Han     }
220*5f39d1b3SJooyung Han     if (top_level_children_weight) {
221*5f39d1b3SJooyung Han       Node* other_child = new Node;
222*5f39d1b3SJooyung Han       other_child->label =
223*5f39d1b3SJooyung Han           node == &root_ ? "other (outside of any label)" : "other";
224*5f39d1b3SJooyung Han       other_child->weight = node->weight - top_level_children_weight;
225*5f39d1b3SJooyung Han       node->children.push_back(other_child);
226*5f39d1b3SJooyung Han     }
227*5f39d1b3SJooyung Han   }
228*5f39d1b3SJooyung Han 
AddOtherNodes()229*5f39d1b3SJooyung Han   void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
230*5f39d1b3SJooyung Han 
SortNode(Node * node)231*5f39d1b3SJooyung Han   void SortNode(Node* node) {
232*5f39d1b3SJooyung Han     std::sort(node->children.begin(), node->children.end(), CompareNodes);
233*5f39d1b3SJooyung Han     for (auto child : node->children) {
234*5f39d1b3SJooyung Han       SortNode(child);
235*5f39d1b3SJooyung Han     }
236*5f39d1b3SJooyung Han   }
237*5f39d1b3SJooyung Han 
Sort()238*5f39d1b3SJooyung Han   void Sort() { SortNode(&root_); }
239*5f39d1b3SJooyung Han 
240*5f39d1b3SJooyung Han  public:
ProfileTreeView(const std::vector<ProfilingStack> & stacks)241*5f39d1b3SJooyung Han   explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
242*5f39d1b3SJooyung Han     for (auto stack : stacks) {
243*5f39d1b3SJooyung Han       AddStack(stack);
244*5f39d1b3SJooyung Han     }
245*5f39d1b3SJooyung Han     AddOtherNodes();
246*5f39d1b3SJooyung Han     Sort();
247*5f39d1b3SJooyung Han   }
248*5f39d1b3SJooyung Han 
Print()249*5f39d1b3SJooyung Han   void Print() const {
250*5f39d1b3SJooyung Han     printf("\n");
251*5f39d1b3SJooyung Han     printf("gemmlowp profile (%d threads, %d samples)\n",
252*5f39d1b3SJooyung Han            static_cast<int>(ThreadsUnderProfiling().size()),
253*5f39d1b3SJooyung Han            static_cast<int>(root_.weight));
254*5f39d1b3SJooyung Han     PrintNode(&root_, 0);
255*5f39d1b3SJooyung Han     printf("\n");
256*5f39d1b3SJooyung Han   }
257*5f39d1b3SJooyung Han };
258*5f39d1b3SJooyung Han 
// Sleeps for one sampling period. The sampling frequency of the profiler is
// decided here and nowhere else.
inline void WaitOneProfilerTick() {
#if defined __arm__ || defined __aarch64__
  // Sample less often on ARM targets (typically mobile devices) to limit the
  // time and memory overhead of profiling there.
  static const int kTickMilliseconds = 10;
#else
  static const int kTickMilliseconds = 1;
#endif
  static const int kNanosecondsPerMillisecond = 1000000;

  timespec tick_duration;
  tick_duration.tv_nsec = kTickMilliseconds * kNanosecondsPerMillisecond;
  tick_duration.tv_sec = 0;
  // The return value is not checked: a sleep cut short just means the next
  // sample is taken a little early.
  nanosleep(&tick_duration, nullptr);
}
276*5f39d1b3SJooyung Han 
// Process-wide "profiling has been started" flag, used to guard against
// misuse of the StartProfiling() / FinishProfiling() API. Returned by
// reference so that callers can both read and update it. Initially false.
inline bool& IsProfiling() {
  static bool profiling_started = false;
  return profiling_started;
}
283*5f39d1b3SJooyung Han 
// Process-wide flag through which the profiler thread is asked to stop
// sampling and report. Returned by reference so that callers can both read
// and update it. Initially false.
inline bool& ProfilerThreadShouldFinish() {
  static bool finish_requested = false;
  return finish_requested;
}
289*5f39d1b3SJooyung Han 
// Handle of the profiler thread, see ProfilerThreadFunc. Returned by
// reference so that it can be handed to pthread_create() as the output
// location and read back later for pthread_join().
inline pthread_t& ProfilerThread() {
  static pthread_t profiler_thread_handle;
  return profiler_thread_handle;
}
295*5f39d1b3SJooyung Han 
296*5f39d1b3SJooyung Han // Records a stack from a running thread.
297*5f39d1b3SJooyung Han // The tricky part is that we're not interrupting the thread.
298*5f39d1b3SJooyung Han // This is OK because we're looking at a pseudo-stack of labels,
299*5f39d1b3SJooyung Han // not at the real thread stack, and if the pseudo-stack changes
300*5f39d1b3SJooyung Han // while we're recording it, we are OK with getting either the
301*5f39d1b3SJooyung Han // old or the new stack. Note that ProfilingStack::Pop
302*5f39d1b3SJooyung Han // only decrements the size, and doesn't null the popped label,
303*5f39d1b3SJooyung Han // so if we're concurrently recording it, it shouldn't change
304*5f39d1b3SJooyung Han // under our feet until another label is pushed, at which point
305*5f39d1b3SJooyung Han // we are OK with getting either this new label or the old one.
306*5f39d1b3SJooyung Han // In the end, the key atomicity property that we are relying on
307*5f39d1b3SJooyung Han // here is that pointers are changed atomically, and the labels
308*5f39d1b3SJooyung Han // are pointers (to literal strings).
RecordStack(ThreadInfo * thread,ProfilingStack * dst)309*5f39d1b3SJooyung Han inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) {
310*5f39d1b3SJooyung Han   ScopedLock sl(thread->stack.lock);
311*5f39d1b3SJooyung Han   assert(!dst->size);
312*5f39d1b3SJooyung Han   while (dst->size < thread->stack.size) {
313*5f39d1b3SJooyung Han     dst->labels[dst->size] = thread->stack.labels[dst->size];
314*5f39d1b3SJooyung Han     dst->size++;
315*5f39d1b3SJooyung Han   }
316*5f39d1b3SJooyung Han }
317*5f39d1b3SJooyung Han 
318*5f39d1b3SJooyung Han // The profiler thread's entry point.
319*5f39d1b3SJooyung Han // Note that a separate thread is to be started each time we call
320*5f39d1b3SJooyung Han // StartProfiling(), and finishes when we call FinishProfiling().
321*5f39d1b3SJooyung Han // So here we only need to handle the recording and reporting of
322*5f39d1b3SJooyung Han // a single profile.
ProfilerThreadFunc(void *)323*5f39d1b3SJooyung Han inline void* ProfilerThreadFunc(void*) {
324*5f39d1b3SJooyung Han   assert(ProfilerThread() == pthread_self());
325*5f39d1b3SJooyung Han 
326*5f39d1b3SJooyung Han   // Since we only handle one profile per profiler thread, the
327*5f39d1b3SJooyung Han   // profile data (the array of recorded stacks) can be a local variable here.
328*5f39d1b3SJooyung Han   std::vector<ProfilingStack> stacks;
329*5f39d1b3SJooyung Han 
330*5f39d1b3SJooyung Han   while (!ProfilerThreadShouldFinish()) {
331*5f39d1b3SJooyung Han     WaitOneProfilerTick();
332*5f39d1b3SJooyung Han     {
333*5f39d1b3SJooyung Han       ScopedLock sl(GlobalMutexes::Profiler());
334*5f39d1b3SJooyung Han       for (auto t : ThreadsUnderProfiling()) {
335*5f39d1b3SJooyung Han         ProfilingStack s;
336*5f39d1b3SJooyung Han         RecordStack(t, &s);
337*5f39d1b3SJooyung Han         stacks.push_back(s);
338*5f39d1b3SJooyung Han       }
339*5f39d1b3SJooyung Han     }
340*5f39d1b3SJooyung Han   }
341*5f39d1b3SJooyung Han 
342*5f39d1b3SJooyung Han   // Profiling is finished and we now report the results.
343*5f39d1b3SJooyung Han   ProfileTreeView(stacks).Print();
344*5f39d1b3SJooyung Han 
345*5f39d1b3SJooyung Han   return nullptr;
346*5f39d1b3SJooyung Han }
347*5f39d1b3SJooyung Han 
348*5f39d1b3SJooyung Han // Starts recording samples.
StartProfiling()349*5f39d1b3SJooyung Han inline void StartProfiling() {
350*5f39d1b3SJooyung Han   ScopedLock sl(GlobalMutexes::Profiler());
351*5f39d1b3SJooyung Han   ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
352*5f39d1b3SJooyung Han   IsProfiling() = true;
353*5f39d1b3SJooyung Han   ProfilerThreadShouldFinish() = false;
354*5f39d1b3SJooyung Han   pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
355*5f39d1b3SJooyung Han }
356*5f39d1b3SJooyung Han 
357*5f39d1b3SJooyung Han // Stops recording samples, and prints a profile tree-view on stdout.
FinishProfiling()358*5f39d1b3SJooyung Han inline void FinishProfiling() {
359*5f39d1b3SJooyung Han   {
360*5f39d1b3SJooyung Han     ScopedLock sl(GlobalMutexes::Profiler());
361*5f39d1b3SJooyung Han     ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
362*5f39d1b3SJooyung Han     // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
363*5f39d1b3SJooyung Han     // as the scary comments below should make clear.
364*5f39d1b3SJooyung Han     // Should we use a condition variable?
365*5f39d1b3SJooyung Han     ProfilerThreadShouldFinish() = true;
366*5f39d1b3SJooyung Han   }  // must release the lock here to avoid deadlock with profiler thread.
367*5f39d1b3SJooyung Han   pthread_join(ProfilerThread(), nullptr);
368*5f39d1b3SJooyung Han   IsProfiling() = false;  // yikes, this should be guarded by the lock!
369*5f39d1b3SJooyung Han }
370*5f39d1b3SJooyung Han 
371*5f39d1b3SJooyung Han }  // namespace gemmlowp
372*5f39d1b3SJooyung Han 
373*5f39d1b3SJooyung Han #endif  // GEMMLOWP_PROFILING_PROFILER_H_
374