gemmlowp/internal/multi_thread_gemm.h

*5f39d1b3SJooyung Han// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han// Licensed under the Apache License, Version 2.0 (the "License");
*5f39d1b3SJooyung Han// you may not use this file except in compliance with the License.
*5f39d1b3SJooyung Han// You may obtain a copy of the License at
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han//     http://www.apache.org/licenses/LICENSE-2.0
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han// Unless required by applicable law or agreed to in writing, software
*5f39d1b3SJooyung Han// distributed under the License is distributed on an "AS IS" BASIS,
*5f39d1b3SJooyung Han// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*5f39d1b3SJooyung Han// See the License for the specific language governing permissions and
*5f39d1b3SJooyung Han// limitations under the License.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// multi_thread_gemm.h: Multi-threaded GEMM entry point.
*5f39d1b3SJooyung Han// Readers note: To understand this file, it is useful to first
*5f39d1b3SJooyung Han// read and understand the much simpler single_thread_gemm.h.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
*5f39d1b3SJooyung Han#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#include <atomic>  // NOLINT
*5f39d1b3SJooyung Han#include <chrono>  // NOLINT
*5f39d1b3SJooyung Han#include <thread>  // NOLINT
*5f39d1b3SJooyung Han#include <vector>
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#include "single_thread_gemm.h"
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hannamespace gemmlowp {
*5f39d1b3SJooyung Han
// This value was empirically derived on an end-to-end application benchmark.
// That this number of cycles means that we may be sleeping substantially longer
// than a scheduler timeslice's duration is not necessarily surprising. The
// idea is to pick up quickly new work after having finished the previous
// workload. When it's new work within the same GEMM as the previous work, the
// time interval that we might be busy-waiting is very small, so for that
// purpose it would be more than enough to sleep for 1 million cycles.
// That is all that we would observe on a GEMM benchmark. However, in a real
// application, after having finished a GEMM, we might do unrelated work for
// a little while, then start on a new GEMM. Think of a neural network
// application performing inference, where many but not all layers are
// implemented by a GEMM. In such cases, our worker threads might be idle for
// longer periods of time before having work again. If we let them passively
// wait, on a mobile device, the CPU scheduler might aggressively clock down
// or even turn off the CPU cores that they were running on. That would result
// in a long delay the next time these need to be turned back on for the next
// GEMM. So we need to strike a balance that reflects typical time intervals
// between consecutive GEMM invocations, not just intra-GEMM considerations.
// Of course, we need to balance keeping CPUs spinning longer to resume work
// faster, versus passively waiting to conserve power.
// Upper bound (four million) on the NOP count accumulated from DoSomeNOPs()
// during one busy-wait phase before the waiter stops spinning.
const int kMaxBusyWaitNOPs = 4000000;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// On X86 and ARM platforms we may use NOP instructions to know how long we
*5f39d1b3SJooyung Han// are busy-waiting.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && !defined(GEMMLOWP_NO_BUSYWAIT) && \
*5f39d1b3SJooyung Han    (defined(GEMMLOWP_ARM) || defined(GEMMLOWP_X86))
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#define GEMMLOWP_NOP "nop\n"
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
*5f39d1b3SJooyung Han#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
*5f39d1b3SJooyung Han#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
*5f39d1b3SJooyung Han#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Haninline int DoSomeNOPs() {
*5f39d1b3SJooyung Han  asm volatile(GEMMLOWP_NOP64);
*5f39d1b3SJooyung Han  return 64;
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#undef GEMMLOWP_STRING_CONCAT_4
*5f39d1b3SJooyung Han#undef GEMMLOWP_NOP64
*5f39d1b3SJooyung Han#undef GEMMLOWP_NOP16
*5f39d1b3SJooyung Han#undef GEMMLOWP_NOP4
*5f39d1b3SJooyung Han#undef GEMMLOWP_NOP
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#else  // May not use asm NOP.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// If we can't use NOPs, let's use a non-inline function call as a basic
*5f39d1b3SJooyung Han// thing that has some vaguely known, nonzero cost.
*5f39d1b3SJooyung HanGEMMLOWP_NOINLINE
*5f39d1b3SJooyung Haninline int DoSomeNOPs() {
*5f39d1b3SJooyung Han  // Pretend that calling an empty function takes as long as 16 NOPs...
*5f39d1b3SJooyung Han  return 16;
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han#endif
*5f39d1b3SJooyung Han
// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that that
// new value has been taken by *var at some point during the
// execution of this function. There is no guarantee that this is
// still the value of *var when this function returns, since *var is
// not assumed to be guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting for the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
*5f39d1b3SJooyung Hantemplate <typename T>
*5f39d1b3SJooyung HanT WaitForVariableChange(std::atomic<T>* var, T initial_value,
*5f39d1b3SJooyung Han                        pthread_cond_t* cond, pthread_mutex_t* mutex) {
*5f39d1b3SJooyung Han  // First, trivial case where the variable already changed value.
*5f39d1b3SJooyung Han  T new_value = var->load(std::memory_order_acquire);
*5f39d1b3SJooyung Han  if (new_value != initial_value) {
*5f39d1b3SJooyung Han    return new_value;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han  // Then try busy-waiting.
*5f39d1b3SJooyung Han  int nops = 0;
*5f39d1b3SJooyung Han  while (nops < kMaxBusyWaitNOPs) {
*5f39d1b3SJooyung Han    nops += DoSomeNOPs();
*5f39d1b3SJooyung Han    new_value = var->load(std::memory_order_acquire);
*5f39d1b3SJooyung Han    if (new_value != initial_value) {
*5f39d1b3SJooyung Han      return new_value;
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Finally, do real passive waiting.
*5f39d1b3SJooyung Han  pthread_mutex_lock(mutex);
*5f39d1b3SJooyung Han  new_value = var->load(std::memory_order_acquire);
*5f39d1b3SJooyung Han  while (new_value == initial_value) {
*5f39d1b3SJooyung Han    pthread_cond_wait(cond, mutex);
*5f39d1b3SJooyung Han    new_value = var->load(std::memory_order_acquire);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han  pthread_mutex_unlock(mutex);
*5f39d1b3SJooyung Han  return new_value;
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
// The waiting is done using a naive spinlock waiting for the atomic
// count_ to hit the value 0. This is acceptable because in our usage
// pattern, BlockingCounter is used only to synchronize threads after
// short-lived tasks (performing parts of the same GEMM). It is not used
// for synchronizing longer waits (resuming work on the next GEMM).
*5f39d1b3SJooyung Hanclass BlockingCounter {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  BlockingCounter() : count_(0) {}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Sets/resets the counter; initial_count is the number of
*5f39d1b3SJooyung Han  // decrementing events that the Wait() call will be waiting for.
*5f39d1b3SJooyung Han  void Reset(std::size_t initial_count) {
*5f39d1b3SJooyung Han    std::size_t old_count_value = count_.load(std::memory_order_relaxed);
*5f39d1b3SJooyung Han    assert(old_count_value == 0);
*5f39d1b3SJooyung Han    (void)old_count_value;
*5f39d1b3SJooyung Han    count_.store(initial_count, std::memory_order_release);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Decrements the counter; if the counter hits zero, signals
*5f39d1b3SJooyung Han  // the threads that were waiting for that, and returns true.
*5f39d1b3SJooyung Han  // Otherwise (if the decremented count is still nonzero),
*5f39d1b3SJooyung Han  // returns false.
*5f39d1b3SJooyung Han  bool DecrementCount() {
*5f39d1b3SJooyung Han    std::size_t old_count_value =
*5f39d1b3SJooyung Han        count_.fetch_sub(1, std::memory_order_acq_rel);
*5f39d1b3SJooyung Han    assert(old_count_value > 0);
*5f39d1b3SJooyung Han    std::size_t count_value = old_count_value - 1;
*5f39d1b3SJooyung Han    return count_value == 0;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Waits for the N other threads (N having been set by Reset())
*5f39d1b3SJooyung Han  // to hit the BlockingCounter.
*5f39d1b3SJooyung Han  void Wait() {
*5f39d1b3SJooyung Han    ScopedProfilingLabel label("BlockingCounter::Wait");
*5f39d1b3SJooyung Han    // Busy-wait until the count value is 0.
*5f39d1b3SJooyung Han    int nops = 0;
*5f39d1b3SJooyung Han    while (count_.load(std::memory_order_acquire)) {
*5f39d1b3SJooyung Han      nops += DoSomeNOPs();
*5f39d1b3SJooyung Han      if (nops > kMaxBusyWaitNOPs) {
*5f39d1b3SJooyung Han        nops = 0;
*5f39d1b3SJooyung Han        // If we are unlucky, the blocking thread (that calls DecrementCount)
*5f39d1b3SJooyung Han        // and the blocked thread (here, calling Wait) may be scheduled on
*5f39d1b3SJooyung Han        // the same CPU, so the busy-waiting of the present thread may prevent
*5f39d1b3SJooyung Han        // the blocking thread from resuming and unblocking.
*5f39d1b3SJooyung Han        // If we are even unluckier, the priorities of the present thread
*5f39d1b3SJooyung Han        // might be higher than that of the blocking thread, so just yielding
*5f39d1b3SJooyung Han        // wouldn't allow the blocking thread to resume. So we sleep for
*5f39d1b3SJooyung Han        // a substantial amount of time in that case. Notice that we only
*5f39d1b3SJooyung Han        // do so after having busy-waited for kMaxBusyWaitNOPs, which is
*5f39d1b3SJooyung Han        // typically several milliseconds, so sleeping 1 more millisecond
*5f39d1b3SJooyung Han        // isn't terrible at that point.
*5f39d1b3SJooyung Han        //
*5f39d1b3SJooyung Han        // How this is mitigated in practice:
*5f39d1b3SJooyung Han        // In practice, it is well known that the application should be
*5f39d1b3SJooyung Han        // conservative in choosing how many threads to tell gemmlowp to use,
*5f39d1b3SJooyung Han        // as it's hard to know how many CPU cores it will get to run on,
*5f39d1b3SJooyung Han        // on typical mobile devices.
*5f39d1b3SJooyung Han        // It seems impossible for gemmlowp to make this choice automatically,
*5f39d1b3SJooyung Han        // which is why gemmlowp's default is to use only 1 thread, and
*5f39d1b3SJooyung Han        // applications may override that if they know that they can count on
*5f39d1b3SJooyung Han        // using more than that.
*5f39d1b3SJooyung Han        std::this_thread::sleep_for(std::chrono::milliseconds(1));
*5f39d1b3SJooyung Han      }
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han private:
*5f39d1b3SJooyung Han  std::atomic<std::size_t> count_;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// A workload for a worker.
*5f39d1b3SJooyung Hanstruct Task {
*5f39d1b3SJooyung Han  Task() : local_allocator(nullptr) {}
*5f39d1b3SJooyung Han  virtual ~Task() {}
*5f39d1b3SJooyung Han  virtual void Run() = 0;
*5f39d1b3SJooyung Han  Allocator* local_allocator;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// A worker thread.
*5f39d1b3SJooyung Hanclass Worker {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  enum class State {
*5f39d1b3SJooyung Han    ThreadStartup,  // The initial state before the thread main loop runs.
*5f39d1b3SJooyung Han    Ready,          // Is not working, has not yet received new work to do.
*5f39d1b3SJooyung Han    HasWork,        // Has work to do.
*5f39d1b3SJooyung Han    ExitAsSoonAsPossible  // Should exit at earliest convenience.
*5f39d1b3SJooyung Han  };
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
*5f39d1b3SJooyung Han      : task_(nullptr),
*5f39d1b3SJooyung Han        state_(State::ThreadStartup),
*5f39d1b3SJooyung Han        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
*5f39d1b3SJooyung Han    pthread_cond_init(&state_cond_, nullptr);
*5f39d1b3SJooyung Han    pthread_mutex_init(&state_mutex_, nullptr);
*5f39d1b3SJooyung Han    pthread_create(&thread_, nullptr, ThreadFunc, this);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  ~Worker() {
*5f39d1b3SJooyung Han    ChangeState(State::ExitAsSoonAsPossible);
*5f39d1b3SJooyung Han    pthread_join(thread_, nullptr);
*5f39d1b3SJooyung Han    pthread_cond_destroy(&state_cond_);
*5f39d1b3SJooyung Han    pthread_mutex_destroy(&state_mutex_);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Changes State; may be called from either the worker thread
*5f39d1b3SJooyung Han  // or the master thread; however, not all state transitions are legal,
*5f39d1b3SJooyung Han  // which is guarded by assertions.
*5f39d1b3SJooyung Han  //
*5f39d1b3SJooyung Han  // The Task argument is to be used only with new_state==HasWork.
*5f39d1b3SJooyung Han  // It specifies the Task being handed to this Worker.
*5f39d1b3SJooyung Han  void ChangeState(State new_state, Task* task = nullptr) {
*5f39d1b3SJooyung Han    ScopedProfilingLabel label("Worker::ChangeState");
*5f39d1b3SJooyung Han    pthread_mutex_lock(&state_mutex_);
*5f39d1b3SJooyung Han    State old_state = state_.load(std::memory_order_relaxed);
*5f39d1b3SJooyung Han    assert(old_state != new_state);
*5f39d1b3SJooyung Han    switch (old_state) {
*5f39d1b3SJooyung Han      case State::ThreadStartup:
*5f39d1b3SJooyung Han        assert(new_state == State::Ready);
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han      case State::Ready:
*5f39d1b3SJooyung Han        assert(new_state == State::HasWork ||
*5f39d1b3SJooyung Han               new_state == State::ExitAsSoonAsPossible);
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han      case State::HasWork:
*5f39d1b3SJooyung Han        assert(new_state == State::Ready ||
*5f39d1b3SJooyung Han               new_state == State::ExitAsSoonAsPossible);
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han      default:
*5f39d1b3SJooyung Han        abort();
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    switch (new_state) {
*5f39d1b3SJooyung Han      case State::Ready:
*5f39d1b3SJooyung Han        if (task_) {
*5f39d1b3SJooyung Han          // Doing work is part of reverting to 'ready' state.
*5f39d1b3SJooyung Han          task_->Run();
*5f39d1b3SJooyung Han          task_ = nullptr;
*5f39d1b3SJooyung Han        }
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han      case State::HasWork:
*5f39d1b3SJooyung Han        assert(!task_);
*5f39d1b3SJooyung Han        task->local_allocator = &local_allocator_;
*5f39d1b3SJooyung Han        task_ = task;
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han      default:
*5f39d1b3SJooyung Han        break;
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    state_.store(new_state, std::memory_order_relaxed);
*5f39d1b3SJooyung Han    pthread_cond_broadcast(&state_cond_);
*5f39d1b3SJooyung Han    pthread_mutex_unlock(&state_mutex_);
*5f39d1b3SJooyung Han    if (new_state == State::Ready) {
*5f39d1b3SJooyung Han      counter_to_decrement_when_ready_->DecrementCount();
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Thread entry point.
*5f39d1b3SJooyung Han  void ThreadFunc() {
*5f39d1b3SJooyung Han    ScopedProfilingLabel label("Worker::ThreadFunc");
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    ChangeState(State::Ready);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // Thread main loop
*5f39d1b3SJooyung Han    while (true) {
*5f39d1b3SJooyung Han      // Get a state to act on
*5f39d1b3SJooyung Han      // In the 'Ready' state, we have nothing to do but to wait until
*5f39d1b3SJooyung Han      // we switch to another state.
*5f39d1b3SJooyung Han      State state_to_act_upon = WaitForVariableChange(
*5f39d1b3SJooyung Han          &state_, State::Ready, &state_cond_, &state_mutex_);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      // We now have a state to act on, so act.
*5f39d1b3SJooyung Han      switch (state_to_act_upon) {
*5f39d1b3SJooyung Han        case State::HasWork:
*5f39d1b3SJooyung Han          // Got work to do! So do it, and then revert to 'Ready' state.
*5f39d1b3SJooyung Han          ChangeState(State::Ready);
*5f39d1b3SJooyung Han          break;
*5f39d1b3SJooyung Han        case State::ExitAsSoonAsPossible:
*5f39d1b3SJooyung Han          return;
*5f39d1b3SJooyung Han        default:
*5f39d1b3SJooyung Han          abort();
*5f39d1b3SJooyung Han      }
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  static void* ThreadFunc(void* arg) {
*5f39d1b3SJooyung Han    static_cast<Worker*>(arg)->ThreadFunc();
*5f39d1b3SJooyung Han    return nullptr;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Called by the master thead to give this worker work to do.
*5f39d1b3SJooyung Han  void StartWork(Task* task) { ChangeState(State::HasWork, task); }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han private:
*5f39d1b3SJooyung Han  // The underlying thread.
*5f39d1b3SJooyung Han  pthread_t thread_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The task to be worked on.
*5f39d1b3SJooyung Han  Task* task_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The condition variable and mutex guarding state changes.
*5f39d1b3SJooyung Han  pthread_cond_t state_cond_;
*5f39d1b3SJooyung Han  pthread_mutex_t state_mutex_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The state enum tells if we're currently working, waiting for work, etc.
*5f39d1b3SJooyung Han  // Its concurrent accesses by the worker and main threads are guarded by
*5f39d1b3SJooyung Han  // state_mutex_, and can thus use memory_order_relaxed. This still needs
*5f39d1b3SJooyung Han  // to be a std::atomic because we use WaitForVariableChange.
*5f39d1b3SJooyung Han  std::atomic<State> state_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Each thread had a local allocator so they can allocate temporary
*5f39d1b3SJooyung Han  // buffers without blocking each other.
*5f39d1b3SJooyung Han  Allocator local_allocator_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // pointer to the master's thread BlockingCounter object, to notify the
*5f39d1b3SJooyung Han  // master thread of when this worker switches to the 'Ready' state.
*5f39d1b3SJooyung Han  BlockingCounter* const counter_to_decrement_when_ready_;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// A very simple pool of workers, that only allows the very
*5f39d1b3SJooyung Han// specific parallelization pattern that we use here:
*5f39d1b3SJooyung Han// a fixed number of workers can be given work, and one then
*5f39d1b3SJooyung Han// waits for all of them to finish.
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han// See MultiThreadGemmContextBase for how other WorkersPool implementations can
*5f39d1b3SJooyung Han// be used.
*5f39d1b3SJooyung Hanclass WorkersPool {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  WorkersPool() {}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  ~WorkersPool() {
*5f39d1b3SJooyung Han    for (auto w : workers_) {
*5f39d1b3SJooyung Han      delete w;
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Just executes the tasks. Does not destroy them. Similar to
*5f39d1b3SJooyung Han  // ruy::ThreadPool::Execute.
*5f39d1b3SJooyung Han  template <typename TaskType>
*5f39d1b3SJooyung Han  void Execute(int tasks_count, TaskType* tasks) {
*5f39d1b3SJooyung Han    assert(tasks_count >= 1);
*5f39d1b3SJooyung Han    // One of the tasks will be run on the current thread.
*5f39d1b3SJooyung Han    std::size_t workers_count = tasks_count - 1;
*5f39d1b3SJooyung Han    CreateWorkers(workers_count);
*5f39d1b3SJooyung Han    assert(workers_count <= workers_.size());
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Reset(workers_count);
*5f39d1b3SJooyung Han    for (std::size_t i = 0; i < tasks_count - 1; i++) {
*5f39d1b3SJooyung Han      workers_[i]->StartWork(&tasks[i]);
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    // Execute the remaining workload immediately on the current thread.
*5f39d1b3SJooyung Han    Task* task = &tasks[tasks_count - 1];
*5f39d1b3SJooyung Han    task->local_allocator = &main_thread_task_allocator_;
*5f39d1b3SJooyung Han    task->Run();
*5f39d1b3SJooyung Han    // Wait for the workers submitted above to finish.
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Wait();
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Legacy: executes the tasks and destroys them
*5f39d1b3SJooyung Han  void LegacyExecuteAndDestroyTasks(const std::vector<Task*>& tasks) {
*5f39d1b3SJooyung Han    std::size_t tasks_count = tasks.size();
*5f39d1b3SJooyung Han    assert(tasks_count >= 1);
*5f39d1b3SJooyung Han    // One of the tasks will be run on the current thread.
*5f39d1b3SJooyung Han    std::size_t workers_count = tasks_count - 1;
*5f39d1b3SJooyung Han    CreateWorkers(workers_count);
*5f39d1b3SJooyung Han    assert(workers_count <= workers_.size());
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Reset(workers_count);
*5f39d1b3SJooyung Han    for (int i = 0; i < tasks_count - 1; i++) {
*5f39d1b3SJooyung Han      workers_[i]->StartWork(tasks[i]);
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    // Execute the remaining workload immediately on the current thread.
*5f39d1b3SJooyung Han    Task* task = tasks[tasks_count - 1];
*5f39d1b3SJooyung Han    task->local_allocator = &main_thread_task_allocator_;
*5f39d1b3SJooyung Han    task->Run();
*5f39d1b3SJooyung Han    // Wait for the workers submitted above to finish.
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Wait();
*5f39d1b3SJooyung Han    // Cleanup tasks (best to do this from the same thread that allocated
*5f39d1b3SJooyung Han    // the memory).
*5f39d1b3SJooyung Han    std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; });
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Legacy old name of LegacyExecuteAndDestroyTasks
*5f39d1b3SJooyung Han  void Execute(const std::vector<Task*>& tasks) {
*5f39d1b3SJooyung Han    LegacyExecuteAndDestroyTasks(tasks);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han private:
*5f39d1b3SJooyung Han  // Ensures that the pool has at least the given count of workers.
*5f39d1b3SJooyung Han  // If any new worker has to be created, this function waits for it to
*5f39d1b3SJooyung Han  // be ready.
*5f39d1b3SJooyung Han  void CreateWorkers(std::size_t workers_count) {
*5f39d1b3SJooyung Han    if (workers_.size() >= workers_count) {
*5f39d1b3SJooyung Han      return;
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
*5f39d1b3SJooyung Han    while (workers_.size() < workers_count) {
*5f39d1b3SJooyung Han      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    counter_to_decrement_when_ready_.Wait();
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // copy construction disallowed
*5f39d1b3SJooyung Han  WorkersPool(const WorkersPool&) = delete;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The workers in this pool. They are owned by the pool:
*5f39d1b3SJooyung Han  // the pool creates workers and destroys them in its destructor.
*5f39d1b3SJooyung Han  std::vector<Worker*> workers_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The BlockingCounter used to wait for the workers.
*5f39d1b3SJooyung Han  BlockingCounter counter_to_decrement_when_ready_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // For N-threaded operations, we will use only N-1 worker threads
*5f39d1b3SJooyung Han  // while the last task will be run directly on the main thread.
*5f39d1b3SJooyung Han  // It will then use this main_thread_task_allocator_; having a
*5f39d1b3SJooyung Han  // dedicated allocator for that (separate from the base allocator_)
*5f39d1b3SJooyung Han  // allows to use the same code for all tasks regardless of which
*5f39d1b3SJooyung Han  // thread they run on.
*5f39d1b3SJooyung Han  Allocator main_thread_task_allocator_;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// The task we use to implement a multi-threaded Gemm: a block of the
*5f39d1b3SJooyung Han// RHS has been packed by the master thread; each worker thread
*5f39d1b3SJooyung Han// then has to pack a block of the LHS and accumulate the Gemm of these
*5f39d1b3SJooyung Han// packed LHS and RHS blocks.
*5f39d1b3SJooyung Hantemplate <typename KernelFormat, typename InputScalar, typename OutputScalar,
*5f39d1b3SJooyung Han          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
*5f39d1b3SJooyung Han          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
*5f39d1b3SJooyung Han          typename OutputPipelineType, typename GemmContextType>
*5f39d1b3SJooyung Hanstruct GemmWithPackedRhsTask : Task {
*5f39d1b3SJooyung Han  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
*5f39d1b3SJooyung Han  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
*5f39d1b3SJooyung Han  GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel,
*5f39d1b3SJooyung Han                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
*5f39d1b3SJooyung Han                        const PackedRhs& _packed_rhs,
*5f39d1b3SJooyung Han                        MatrixMap<OutputScalar, ResultOrder>* _result,
*5f39d1b3SJooyung Han                        const MatrixBlockBounds& _result_block,
*5f39d1b3SJooyung Han                        const LhsOffset& _lhs_offset,
*5f39d1b3SJooyung Han                        const RhsOffset& _rhs_offset,
*5f39d1b3SJooyung Han                        const BlockParams& _block_params,
*5f39d1b3SJooyung Han                        const OutputPipelineType& _output_pipeline)
*5f39d1b3SJooyung Han      : context(_context),
*5f39d1b3SJooyung Han        kernel(_kernel),
*5f39d1b3SJooyung Han        lhs(_lhs),
*5f39d1b3SJooyung Han        packed_rhs(_packed_rhs),
*5f39d1b3SJooyung Han        result(*_result),
*5f39d1b3SJooyung Han        result_block(_result_block),
*5f39d1b3SJooyung Han        lhs_offset(_lhs_offset),
*5f39d1b3SJooyung Han        rhs_offset(_rhs_offset),
*5f39d1b3SJooyung Han        block_params(_block_params),
*5f39d1b3SJooyung Han        output_pipeline(_output_pipeline) {}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  void Run() override {
*5f39d1b3SJooyung Han    ScopedProfilingLabel label("GemmWithPackedRhsTask");
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    const int rows = result_block.rows;
*5f39d1b3SJooyung Han    const int cols = result_block.cols;
*5f39d1b3SJooyung Han    const int depth = lhs.cols();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    PackedResult packed_result(local_allocator, block_params);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    local_allocator->Commit();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    for (int c = 0; c < cols; c += block_params.l2_cols) {
*5f39d1b3SJooyung Han      int cs = std::min(block_params.l2_cols, cols - c);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      for (int r = 0; r < rows; r += block_params.l2_rows) {
*5f39d1b3SJooyung Han        int rs = std::min(block_params.l2_rows, rows - r);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
*5f39d1b3SJooyung Han                depth);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han        auto curr_result_block = MatrixBlockBounds(
*5f39d1b3SJooyung Han            result_block.start_row + r, result_block.start_col + c, rs, cs);
*5f39d1b3SJooyung Han        UnpackResult<KernelFormat>(
*5f39d1b3SJooyung Han            &result, curr_result_block, packed_result, depth,
*5f39d1b3SJooyung Han            packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
*5f39d1b3SJooyung Han            lhs_offset.block(curr_result_block.start_row, rs),
*5f39d1b3SJooyung Han            rhs_offset.block(curr_result_block.start_col, cs), output_pipeline);
*5f39d1b3SJooyung Han      }
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    local_allocator->Decommit();
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  const GemmContextType* context;
*5f39d1b3SJooyung Han  const KernelBase& kernel;
*5f39d1b3SJooyung Han  const MatrixMap<const InputScalar, LhsOrder> lhs;
*5f39d1b3SJooyung Han  const PackedRhs packed_rhs;
*5f39d1b3SJooyung Han  MatrixMap<OutputScalar, ResultOrder> result;
*5f39d1b3SJooyung Han  const MatrixBlockBounds result_block;
*5f39d1b3SJooyung Han  const LhsOffset& lhs_offset;
*5f39d1b3SJooyung Han  const RhsOffset& rhs_offset;
*5f39d1b3SJooyung Han  const BlockParams& block_params;
*5f39d1b3SJooyung Han  const OutputPipelineType& output_pipeline;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// This base class for multi-threading allows subclasses to implement their own
*5f39d1b3SJooyung Han// workers_pool() method.  See MultiThreadGemmContext below for an example;
*5f39d1b3SJooyung Han// any other implementation of workers_pool() must return an object with the
*5f39d1b3SJooyung Han// same public methods as WorkersPool.
*5f39d1b3SJooyung Hanclass MultiThreadGemmContextBase : public SingleThreadGemmContext {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  void set_max_num_threads(int n) { max_num_threads_ = n; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  int max_num_threads() const { return max_num_threads_; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han protected:
*5f39d1b3SJooyung Han  // The maximum number of worker threads to use (including
*5f39d1b3SJooyung Han  // the master thread).
*5f39d1b3SJooyung Han  // The default value 1 means single-threading. That is the default
*5f39d1b3SJooyung Han  // because gemmlowp's primary target is mobile hardware, where thermal
*5f39d1b3SJooyung Han  // constraints usually mean that it may not be realistic to use more
*5f39d1b3SJooyung Han  // than 1 CPU core even if multiple cores are present.
*5f39d1b3SJooyung Han  // The special value 0 means try to detect the number of hardware threads.
*5f39d1b3SJooyung Han  // Note: this assumes that all CPU cores are equivalent. That assumption
*5f39d1b3SJooyung Han  // is defeated on big.LITTLE ARM devices, where we have no API to query
*5f39d1b3SJooyung Han  // the number of big cores (which is typically what we would want to use,
*5f39d1b3SJooyung Han  // leaving aside above-mentioned thermal issues). That is the other reason
*5f39d1b3SJooyung Han  // why the best compromise here is to let max_num_threads_ default to 1,
*5f39d1b3SJooyung Han  // so users who want multi-threading have to make the decision of how many
*5f39d1b3SJooyung Han  // threads to use by themselves.
*5f39d1b3SJooyung Han  int max_num_threads_ = 1;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hanclass MultiThreadGemmContext : public MultiThreadGemmContextBase {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  WorkersPool* workers_pool() { return &workers_pool_; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han private:
*5f39d1b3SJooyung Han  // The workers pool used by MultiThreadGemm. Making
*5f39d1b3SJooyung Han  // this part of the context allows it to be persistent,
*5f39d1b3SJooyung Han  // avoiding recreating threads on every Gemm.
*5f39d1b3SJooyung Han  WorkersPool workers_pool_;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// Determines how many threads should be used for a given Gemm
*5f39d1b3SJooyung Han// operation.
*5f39d1b3SJooyung Hantemplate <int KernelRows>
*5f39d1b3SJooyung Haninline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) {
*5f39d1b3SJooyung Han  // Early-exit in the default case where multi-threading is disabled.
*5f39d1b3SJooyung Han  if (max_num_threads == 1) {
*5f39d1b3SJooyung Han    return 1;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Determine the maximum number of threads.
*5f39d1b3SJooyung Han  int max_count = GetHardwareConcurrency(max_num_threads);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Basic calculation: take into account max pool size, and
*5f39d1b3SJooyung Han  // how many rows we have to feed our kernel.
*5f39d1b3SJooyung Han  // The motivation for an absolute minimum number of rows per thread,
*5f39d1b3SJooyung Han  // potentially higher than KernelRows, is that very thin thread workload
*5f39d1b3SJooyung Han  // currently defeat assumptions of the AddMod generator, resulting
*5f39d1b3SJooyung Han  // in substantial bias in TestWithRealData on 24 threads.
*5f39d1b3SJooyung Han  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
*5f39d1b3SJooyung Han  // so as to be independent of the number of threads.
*5f39d1b3SJooyung Han  static const int AbsoluteMinRowsPerThread = 16;
*5f39d1b3SJooyung Han  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
*5f39d1b3SJooyung Han                                          ? KernelRows
*5f39d1b3SJooyung Han                                          : AbsoluteMinRowsPerThread;
*5f39d1b3SJooyung Han  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // At this point for small products we already have thread_count==1 so
*5f39d1b3SJooyung Han  // we can avoid doing more work; otherwise, we still want to check
*5f39d1b3SJooyung Han  // that the cubic size (rows*cols*depth) is big enough to keep
*5f39d1b3SJooyung Han  // workers_ busy.
*5f39d1b3SJooyung Han  if (thread_count > 1) {
*5f39d1b3SJooyung Han    // Empirically determined value.
*5f39d1b3SJooyung Han    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // We can only multiply two out of three sizes without risking overflow
*5f39d1b3SJooyung Han    const std::uint64_t cubic_size =
*5f39d1b3SJooyung Han        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    thread_count =
*5f39d1b3SJooyung Han        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    if (thread_count < 1) {
*5f39d1b3SJooyung Han      thread_count = 1;
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  assert(thread_count > 0 && thread_count <= max_count);
*5f39d1b3SJooyung Han  return thread_count;
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// The main multi-threaded Gemm function.
*5f39d1b3SJooyung Han// To understand it, first read the code of SingleThreadGemm().
*5f39d1b3SJooyung Han// The parallelization scheme used here is to have this master function
*5f39d1b3SJooyung Han// pack a block of RHS and then start worker threads to pack a block of LHS
*5f39d1b3SJooyung Han// each, and accumulate the corresponding products.
*5f39d1b3SJooyung Hantemplate <typename KernelFormat, typename InputScalar, typename OutputScalar,
*5f39d1b3SJooyung Han          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
*5f39d1b3SJooyung Han          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
*5f39d1b3SJooyung Han          typename OutputPipelineType, typename GemmContextType>
*5f39d1b3SJooyung Hanvoid MultiThreadGemm(GemmContextType* context, const KernelBase& kernel,
*5f39d1b3SJooyung Han                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
*5f39d1b3SJooyung Han                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
*5f39d1b3SJooyung Han                     MatrixMap<OutputScalar, ResultOrder>* result,
*5f39d1b3SJooyung Han                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
*5f39d1b3SJooyung Han                     const OutputPipelineType& output_pipeline) {
*5f39d1b3SJooyung Han  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  assert(lhs.cols() == rhs.rows());
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  int rows = result->rows();
*5f39d1b3SJooyung Han  int cols = result->cols();
*5f39d1b3SJooyung Han  int depth = lhs.cols();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // zero sizes should have been caught earlier and early-returned.
*5f39d1b3SJooyung Han  assert(rows > 0);
*5f39d1b3SJooyung Han  assert(cols > 0);
*5f39d1b3SJooyung Han  assert(depth > 0);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The case of rows<cols should have been caught earlier and transposed.
*5f39d1b3SJooyung Han  assert(rows >= cols);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  const int thread_count = HowManyThreads<KernelFormat::kRows>(
*5f39d1b3SJooyung Han      context->max_num_threads(), rows, cols, depth);
*5f39d1b3SJooyung Han  if (thread_count == 1) {
*5f39d1b3SJooyung Han    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
*5f39d1b3SJooyung Han                            BitDepthParams>(context, kernel, lhs, rhs, result,
*5f39d1b3SJooyung Han                                            lhs_offset, rhs_offset,
*5f39d1b3SJooyung Han                                            output_pipeline);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han  assert(thread_count > 1);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // Simple 1:1 mapping of tasks to physical cores, which is very important
*5f39d1b3SJooyung Han  // to getting good multithreaded performance, specially for not-very-large
*5f39d1b3SJooyung Han  // GEMMs, and especially on Android.
*5f39d1b3SJooyung Han  const int task_count = thread_count;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  Allocator* allocator = context->allocator();
*5f39d1b3SJooyung Han  auto* workers_pool = context->workers_pool();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  BlockParams block_params;
*5f39d1b3SJooyung Han  block_params.Init<KernelFormat>(
*5f39d1b3SJooyung Han      rows, cols, depth, task_count, context->l1_bytes_to_use(),
*5f39d1b3SJooyung Han      context->l2_bytes_to_use(), context->l2_rhs_factor());
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
*5f39d1b3SJooyung Han                                                         block_params);
*5f39d1b3SJooyung Han  allocator->Commit();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // We loop over large blocks of the RHS.
*5f39d1b3SJooyung Han  for (int c = 0; c < cols; c += block_params.l2_cols) {
*5f39d1b3SJooyung Han    int cs = std::min(block_params.l2_cols, cols - c);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // Pack a large block of the RHS.
*5f39d1b3SJooyung Han    PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // Give work to each worker.
*5f39d1b3SJooyung Han    std::vector<Task*> tasks;
*5f39d1b3SJooyung Han    int next_start_row = 0;
*5f39d1b3SJooyung Han    for (int n = 0; n < task_count; ++n) {
*5f39d1b3SJooyung Han      int start_row = next_start_row;
*5f39d1b3SJooyung Han      next_start_row = std::min(
*5f39d1b3SJooyung Han          rows, RoundUp<KernelFormat::kRows>(rows * (n + 1) / task_count));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      int block_rows = next_start_row - start_row;
*5f39d1b3SJooyung Han      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
*5f39d1b3SJooyung Han      typedef GemmWithPackedRhsTask<KernelFormat, InputScalar, OutputScalar,
*5f39d1b3SJooyung Han                                    BitDepthParams, LhsOrder, RhsOrder,
*5f39d1b3SJooyung Han                                    ResultOrder, LhsOffset, RhsOffset,
*5f39d1b3SJooyung Han                                    OutputPipelineType, GemmContextType>
*5f39d1b3SJooyung Han          TaskType;
*5f39d1b3SJooyung Han      tasks.push_back(
*5f39d1b3SJooyung Han          new TaskType(context, kernel, lhs_block, packed_rhs, result,
*5f39d1b3SJooyung Han                       MatrixBlockBounds(start_row, c, block_rows, cs),
*5f39d1b3SJooyung Han                       lhs_offset, rhs_offset, block_params, output_pipeline));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han    // Execute the work on the workers (and partially on this thread).
*5f39d1b3SJooyung Han    workers_pool->Execute(tasks);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  allocator->Decommit();
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han}  // namespace gemmlowp
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_