// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_gemm.h: Multi-threaded GEMM entry point.
// Readers note: To understand this file, it is useful to first
// read and understand the much simpler single_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_

#include <atomic>  // NOLINT
#include <chrono>  // NOLINT
#include <thread>  // NOLINT
#include <vector>

#include "single_thread_gemm.h"

namespace gemmlowp {

// This value was empirically derived from an end-to-end application benchmark.
// That this number of cycles means that we may be busy-waiting substantially
// longer than a scheduler timeslice's duration is not necessarily surprising.
// The idea is to pick up new work quickly after having finished the previous
// workload. When the new work is within the same GEMM as the previous work,
// the time interval that we might be busy-waiting is very small, so for that
// purpose it would be more than enough to busy-wait for 1 million cycles.
// That is all we would observe on a GEMM benchmark. However, in a real
// application, after having finished a GEMM, we might do unrelated work for
// a little while, then start on a new GEMM. Think of a neural network
// application performing inference, where many but not all layers are
// implemented by a GEMM. In such cases, our worker threads might be idle for
// longer periods of time before having work again. If we let them passively
// wait, on a mobile device, the CPU scheduler might aggressively clock down
// or even turn off the CPU cores that they were running on. That would result
// in a long delay the next time these cores need to be turned back on for the
// next GEMM. So we need to strike a balance that reflects typical time
// intervals between consecutive GEMM invocations, not just intra-GEMM
// considerations. Of course, we need to balance keeping CPUs spinning longer
// to resume work faster, versus passively waiting to conserve power.
const int kMaxBusyWaitNOPs = 4 * 1000 * 1000;

// On X86 and ARM platforms we may use NOP instructions to count roughly how
// long we have been busy-waiting.

#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \
    (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86))

#define GEMMLOWP_NOP "nop\n"

#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)

inline int DoSomeNOPs() {
  asm volatile(GEMMLOWP_NOP64);
  return 64;
}

#undef GEMMLOWP_STRING_CONCAT_4
#undef GEMMLOWP_NOP64
#undef GEMMLOWP_NOP16
#undef GEMMLOWP_NOP4
#undef GEMMLOWP_NOP

#else  // May not use asm NOP.

// If we can't use NOPs, let's use a non-inline function call as a basic
// thing that has some vaguely known, nonzero cost.
GEMMLOWP_NOINLINE
inline int DoSomeNOPs() {
  // Pretend that calling an empty function takes as long as 16 NOPs...
  return 16;
}
#endif

// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that *var
// held this new value at some point during the execution of this
// function. There is no guarantee that this is still the value of
// *var when this function returns, since *var is not assumed to be
// guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting on the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
template <typename T>
T WaitForVariableChange(std::atomic<T>* var, T initial_value,
                        pthread_cond_t* cond, pthread_mutex_t* mutex) {
  // First, the trivial case where the variable already changed value.
  T new_value = var->load(std::memory_order_acquire);
  if (new_value != initial_value) {
    return new_value;
  }
  // Then try busy-waiting.
  int nops = 0;
  while (nops < kMaxBusyWaitNOPs) {
    nops += DoSomeNOPs();
    new_value = var->load(std::memory_order_acquire);
    if (new_value != initial_value) {
      return new_value;
    }
  }

  // Finally, do real passive waiting.
  pthread_mutex_lock(mutex);
  new_value = var->load(std::memory_order_acquire);
  while (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = var->load(std::memory_order_acquire);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}
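
// Illustrative sketch (not part of the library): the typical pairing is a
// waiter calling
//   State s = WaitForVariableChange(&state_, State::Ready, &state_cond_,
//                                   &state_mutex_);
// while the signaling thread, holding state_mutex_, stores the new value into
// state_ and calls pthread_cond_broadcast(&state_cond_). Worker::ChangeState
// below implements exactly that signaling side.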

// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
// The waiting is done using a naive spinlock waiting for the atomic
// count_ to hit the value 0. This is acceptable because in our usage
// pattern, BlockingCounter is used only to synchronize threads after
// short-lived tasks (performing parts of the same GEMM). It is not used
// for synchronizing longer waits (resuming work on the next GEMM).
class BlockingCounter {
 public:
  BlockingCounter() : count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  void Reset(std::size_t initial_count) {
    std::size_t old_count_value = count_.load(std::memory_order_relaxed);
    assert(old_count_value == 0);
    (void)old_count_value;
    count_.store(initial_count, std::memory_order_release);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the threads that were waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    std::size_t old_count_value =
        count_.fetch_sub(1, std::memory_order_acq_rel);
    assert(old_count_value > 0);
    std::size_t count_value = old_count_value - 1;
    return count_value == 0;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    // Busy-wait until the count value is 0.
    int nops = 0;
    while (count_.load(std::memory_order_acquire)) {
      nops += DoSomeNOPs();
      if (nops > kMaxBusyWaitNOPs) {
        nops = 0;
        // If we are unlucky, the blocking thread (that calls DecrementCount)
        // and the blocked thread (here, calling Wait) may be scheduled on
        // the same CPU, so the busy-waiting of the present thread may prevent
        // the blocking thread from resuming and unblocking.
        // If we are even unluckier, the priority of the present thread
        // might be higher than that of the blocking thread, so just yielding
        // wouldn't allow the blocking thread to resume. So we sleep for
        // a substantial amount of time in that case. Notice that we only
        // do so after having busy-waited for kMaxBusyWaitNOPs, which is
        // typically several milliseconds, so sleeping 1 more millisecond
        // isn't terrible at that point.
        //
        // How this is mitigated in practice:
        // In practice, it is well known that the application should be
        // conservative in choosing how many threads to tell gemmlowp to use,
        // as it's hard to know how many CPU cores it will get to run on,
        // on typical mobile devices.
        // It seems impossible for gemmlowp to make this choice automatically,
        // which is why gemmlowp's default is to use only 1 thread, and
        // applications may override that if they know that they can count on
        // using more than that.
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      }
    }
  }

 private:
  std::atomic<std::size_t> count_;
};
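
// Illustrative usage sketch (not part of the library): this is the pattern
// that WorkersPool and Worker below follow. The master resets the counter to
// the number of outstanding workers, each worker calls DecrementCount() when
// it becomes ready, and the master blocks in Wait() until all have checked in.
//
//   BlockingCounter counter;
//   counter.Reset(workers_count);
//   // ... hand one task to each of the workers_count workers ...
//   counter.Wait();  // returns once every worker has called DecrementCount()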

// A workload for a worker.
struct Task {
  Task() : local_allocator(nullptr) {}
  virtual ~Task() {}
  virtual void Run() = 0;
  Allocator* local_allocator;
};

// A worker thread.
class Worker {
 public:
  enum class State {
    ThreadStartup,  // The initial state before the thread main loop runs.
    Ready,          // Is not working, has not yet received new work to do.
    HasWork,        // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };

  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_cond_init(&state_cond_, nullptr);
    pthread_mutex_init(&state_mutex_, nullptr);
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
    pthread_cond_destroy(&state_cond_);
    pthread_mutex_destroy(&state_mutex_);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread; however, not all state transitions are legal,
  // which is enforced by assertions.
  //
  // The Task argument is to be used only with new_state==HasWork.
  // It specifies the Task being handed to this Worker.
  void ChangeState(State new_state, Task* task = nullptr) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    State old_state = state_.load(std::memory_order_relaxed);
    assert(old_state != new_state);
    switch (old_state) {
      case State::ThreadStartup:
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    switch (new_state) {
      case State::Ready:
        if (task_) {
          // Doing work is part of reverting to 'ready' state.
          task_->Run();
          task_ = nullptr;
        }
        break;
      case State::HasWork:
        assert(!task_);
        task->local_allocator = &local_allocator_;
        task_ = task;
        break;
      default:
        break;
    }
    state_.store(new_state, std::memory_order_relaxed);
    pthread_cond_broadcast(&state_cond_);
    pthread_mutex_unlock(&state_mutex_);
    if (new_state == State::Ready) {
      counter_to_decrement_when_ready_->DecrementCount();
    }
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");

    ChangeState(State::Ready);

    // Thread main loop
    while (true) {
      // Get a state to act on.
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to 'Ready' state.
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  void StartWork(Task* task) { ChangeState(State::HasWork, task); }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  // Its concurrent accesses by the worker and main threads are guarded by
  // state_mutex_, and can thus use memory_order_relaxed. This still needs
  // to be a std::atomic because we use WaitForVariableChange.
  std::atomic<State> state_;

  // Each thread has a local allocator so that threads can allocate temporary
  // buffers without blocking each other.
  Allocator local_allocator_;

  // Pointer to the master thread's BlockingCounter object, used to notify the
  // master thread when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};

// A very simple pool of workers that only allows the very
// specific parallelization pattern that we use here:
// a fixed number of workers can be given work, and one then
// waits for all of them to finish.
//
// See MultiThreadGemmContextBase for how other WorkersPool implementations can
// be used.
class WorkersPool {
 public:
  WorkersPool() {}

  ~WorkersPool() {
    for (auto w : workers_) {
      delete w;
    }
  }

  // Just executes the tasks. Does not destroy them. Similar to
  // ruy::ThreadPool::Execute.
  template <typename TaskType>
  void Execute(int tasks_count, TaskType* tasks) {
    assert(tasks_count >= 1);
    // One of the tasks will be run on the current thread.
    std::size_t workers_count = tasks_count - 1;
    CreateWorkers(workers_count);
    assert(workers_count <= workers_.size());
    counter_to_decrement_when_ready_.Reset(workers_count);
    for (std::size_t i = 0; i < workers_count; i++) {
      workers_[i]->StartWork(&tasks[i]);
    }
    // Execute the remaining workload immediately on the current thread.
    Task* task = &tasks[tasks_count - 1];
    task->local_allocator = &main_thread_task_allocator_;
    task->Run();
    // Wait for the workers submitted above to finish.
    counter_to_decrement_when_ready_.Wait();
  }

  // Legacy: executes the tasks and destroys them.
  void LegacyExecuteAndDestroyTasks(const std::vector<Task*>& tasks) {
    std::size_t tasks_count = tasks.size();
    assert(tasks_count >= 1);
    // One of the tasks will be run on the current thread.
    std::size_t workers_count = tasks_count - 1;
    CreateWorkers(workers_count);
    assert(workers_count <= workers_.size());
    counter_to_decrement_when_ready_.Reset(workers_count);
    for (std::size_t i = 0; i < workers_count; i++) {
      workers_[i]->StartWork(tasks[i]);
    }
    // Execute the remaining workload immediately on the current thread.
    Task* task = tasks[tasks_count - 1];
    task->local_allocator = &main_thread_task_allocator_;
    task->Run();
    // Wait for the workers submitted above to finish.
    counter_to_decrement_when_ready_.Wait();
    // Cleanup tasks (best to do this from the same thread that allocated
    // the memory).
    std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; });
  }

  // Legacy old name of LegacyExecuteAndDestroyTasks.
  void Execute(const std::vector<Task*>& tasks) {
    LegacyExecuteAndDestroyTasks(tasks);
  }

 private:
  // Ensures that the pool has at least the given count of workers.
  // If any new worker has to be created, this function waits for it to
  // be ready.
  void CreateWorkers(std::size_t workers_count) {
    if (workers_.size() >= workers_count) {
      return;
    }
    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
    while (workers_.size() < workers_count) {
      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
    }
    counter_to_decrement_when_ready_.Wait();
  }

  // Copy construction disallowed.
  WorkersPool(const WorkersPool&) = delete;

  // The workers in this pool. They are owned by the pool:
  // the pool creates workers and destroys them in its destructor.
  std::vector<Worker*> workers_;

  // The BlockingCounter used to wait for the workers.
  BlockingCounter counter_to_decrement_when_ready_;

  // For N-threaded operations, we will use only N-1 worker threads
  // while the last task will be run directly on the main thread.
  // It will then use this main_thread_task_allocator_; having a
  // dedicated allocator for that (separate from the base allocator_)
  // allows using the same code for all tasks regardless of which
  // thread they run on.
  Allocator main_thread_task_allocator_;
};
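
// Illustrative usage sketch (not part of the library): callers define a Task
// subclass whose Run() does one slice of the work, then hand an array of such
// tasks to Execute(). The names SliceTask and DoSlice below are hypothetical.
//
//   struct SliceTask : Task {
//     explicit SliceTask(int slice) : slice_(slice) {}
//     void Run() override { DoSlice(slice_); }  // may use local_allocator
//     int slice_;
//   };
//
//   WorkersPool pool;
//   std::vector<SliceTask> tasks;
//   for (int i = 0; i < 4; i++) tasks.emplace_back(i);
//   // Runs tasks[0..2] on worker threads and tasks[3] on the calling thread,
//   // then blocks until all four have finished.
//   pool.Execute(static_cast<int>(tasks.size()), tasks.data());
//
// MultiThreadGemm below uses the legacy std::vector<Task*> overload instead.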

// The task we use to implement a multi-threaded Gemm: a block of the
// RHS has been packed by the master thread; each worker thread
// then has to pack a block of the LHS and accumulate the Gemm of these
// packed LHS and RHS blocks.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const MatrixBlockBounds& _result_block,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const BlockParams& _block_params,
                        const OutputPipelineType& _output_pipeline)
      : context(_context),
        kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        result_block(_result_block),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        block_params(_block_params),
        output_pipeline(_output_pipeline) {}

  void Run() override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result_block.rows;
    const int cols = result_block.cols;
    const int depth = lhs.cols();

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
                depth);

        auto curr_result_block = MatrixBlockBounds(
            result_block.start_row + r, result_block.start_col + c, rs, cs);
        UnpackResult<KernelFormat>(
            &result, curr_result_block, packed_result, depth,
            packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
            lhs_offset.block(curr_result_block.start_row, rs),
            rhs_offset.block(curr_result_block.start_col, cs), output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const GemmContextType* context;
  const KernelBase& kernel;
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  const MatrixBlockBounds result_block;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const BlockParams& block_params;
  const OutputPipelineType& output_pipeline;
};

// This base class for multi-threading allows subclasses to implement their own
// workers_pool() method. See MultiThreadGemmContext below for an example;
// any other implementation of workers_pool() must return an object with the
// same public methods as WorkersPool.
class MultiThreadGemmContextBase : public SingleThreadGemmContext {
 public:
  void set_max_num_threads(int n) { max_num_threads_ = n; }

  int max_num_threads() const { return max_num_threads_; }

 protected:
  // The maximum number of worker threads to use (including
  // the master thread).
  // The default value 1 means single-threading. That is the default
  // because gemmlowp's primary target is mobile hardware, where thermal
  // constraints usually mean that it may not be realistic to use more
  // than 1 CPU core even if multiple cores are present.
  // The special value 0 means try to detect the number of hardware threads.
  // Note: this assumes that all CPU cores are equivalent. That assumption
  // is defeated on big.LITTLE ARM devices, where we have no API to query
  // the number of big cores (which is typically what we would want to use,
  // leaving aside above-mentioned thermal issues). That is the other reason
  // why the best compromise here is to let max_num_threads_ default to 1,
  // so users who want multi-threading have to make the decision of how many
  // threads to use by themselves.
  int max_num_threads_ = 1;
};

class MultiThreadGemmContext : public MultiThreadGemmContextBase {
 public:
  WorkersPool* workers_pool() { return &workers_pool_; }

 private:
  // The workers pool used by MultiThreadGemm. Making
  // this part of the context allows it to be persistent,
  // avoiding recreating threads on every Gemm.
  WorkersPool workers_pool_;
};
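
// Illustrative usage sketch (not part of the library): callers typically keep
// one context alive across many GEMMs so that the worker threads persist, and
// opt into multi-threading explicitly. The value 4 below is just an example.
//
//   gemmlowp::MultiThreadGemmContext context;
//   context.set_max_num_threads(4);  // 1 (default) = single-threaded,
//                                    // 0 = detect hardware concurrency
//   // ... pass &context to MultiThreadGemm() below for each GEMM invocation.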

// Determines how many threads should be used for a given Gemm
// operation.
template <int KernelRows>
inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) {
  // Early-exit in the default case where multi-threading is disabled.
  if (max_num_threads == 1) {
    return 1;
  }

  // Determine the maximum number of threads.
  int max_count = GetHardwareConcurrency(max_num_threads);

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin per-thread
  // workloads currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  static const int AbsoluteMinRowsPerThread = 16;
  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
                                          ? KernelRows
                                          : AbsoluteMinRowsPerThread;
  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));

  // At this point for small products we already have thread_count==1 so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // workers busy.
  if (thread_count > 1) {
    // Empirically determined value.
    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;

    // We can only multiply two out of three sizes without risking overflow
    const std::uint64_t cubic_size =
        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);

    thread_count =
        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_count);
  return thread_count;
}
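
// Worked example (illustrative only, with assumed numbers): with KernelRows=4,
// max_num_threads=4 and a 256x256x256 GEMM, and assuming
// GetHardwareConcurrency(4) returns 4, MinRowsPerThread is 16, so the row
// criterion allows min(4, CeilQuotient(256, 16)) = min(4, 16) = 4 threads.
// The cubic size is 256*256*256 = 16777216, and 16777216 / (64*1024) = 256,
// so the cubic-size criterion does not reduce the count: 4 threads are used.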

// The main multi-threaded Gemm function.
// To understand it, first read the code of SingleThreadGemm().
// The parallelization scheme used here is to have this master function
// pack a block of RHS and then start worker threads to pack a block of LHS
// each, and accumulate the corresponding products.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType, typename GemmContextType>
void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  // Zero sizes should have been caught earlier and early-returned.
  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  // The case of rows<cols should have been caught earlier and transposed.
  assert(rows >= cols);

  const int thread_count = HowManyThreads<KernelFormat::kRows>(
      context->max_num_threads(), rows, cols, depth);
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // Simple 1:1 mapping of tasks to physical cores, which is very important
  // to getting good multithreaded performance, especially for not-very-large
  // GEMMs, and especially on Android.
  const int task_count = thread_count;

  Allocator* allocator = context->allocator();
  auto* workers_pool = context->workers_pool();

  BlockParams block_params;
  block_params.Init<KernelFormat>(
      rows, cols, depth, task_count, context->l1_bytes_to_use(),
      context->l2_bytes_to_use(), context->l2_rhs_factor());

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
                                                         block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS.
    PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
    std::vector<Task*> tasks;
    int next_start_row = 0;
    for (int n = 0; n < task_count; ++n) {
      int start_row = next_start_row;
      next_start_row = std::min(
          rows, RoundUp<KernelFormat::kRows>(rows * (n + 1) / task_count));

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      typedef GemmWithPackedRhsTask<KernelFormat, InputScalar, OutputScalar,
                                    BitDepthParams, LhsOrder, RhsOrder,
                                    ResultOrder, LhsOffset, RhsOffset,
                                    OutputPipelineType, GemmContextType>
          TaskType;
      tasks.push_back(
          new TaskType(context, kernel, lhs_block, packed_rhs, result,
                       MatrixBlockBounds(start_row, c, block_rows, cs),
                       lhs_offset, rhs_offset, block_params, output_pipeline));
    }
    // Execute the work on the workers (and partially on this thread).
    workers_pool->Execute(tasks);
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_