/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_

#include <algorithm>
#include <type_traits>
#include <vector>

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"

namespace tflite {
namespace optimized_ops {

// TODO(luwa): add multithread to per-channel depthwise_conv
// DepthwiseConv can run with multiple threads along the dimension specified by
// thread_dim. Each thread processes the output elements whose index along
// thread_dim lies in the range [thread_start, thread_end).
// For example, thread_start = 2, thread_end = 6, and thread_dim = 1 mean that
// the task computes DepthwiseConv for output_data[:, 2:6, :, :].
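// A hedged illustration (hypothetical values, not taken from this file): with
// output_height = 10, thread_count = 2, and thread_dim = 1, DepthwiseConv()
// below builds two tasks covering the ranges [0, 5) and [5, 10), which
// together produce output_data[:, 0:10, :, :].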
template <typename T, typename TS>
struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvWorkerTask(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data, const CpuFlags& cpu_flags,
                          int thread_start, int thread_end, int thread_dim)
      : params_(params),
        input_shape_(input_shape),
        input_data_(input_data),
        filter_shape_(filter_shape),
        filter_data_(filter_data),
        bias_shape_(bias_shape),
        bias_data_(bias_data),
        output_shape_(output_shape),
        output_data_(output_data),
        cpu_flags_(cpu_flags),
        thread_start_(thread_start),
        thread_end_(thread_end),
        thread_dim_(thread_dim) {}

  void Run() override {
    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
                      filter_data_, bias_shape_, bias_data_, output_shape_,
                      output_data_, cpu_flags_, thread_start_, thread_end_,
                      thread_dim_);
  }

 private:
  const DepthwiseParams& params_;
  const RuntimeShape& input_shape_;
  const T* input_data_;
  const RuntimeShape& filter_shape_;
  const T* filter_data_;
  const RuntimeShape& bias_shape_;
  const TS* bias_data_;
  const RuntimeShape& output_shape_;
  T* output_data_;
  const CpuFlags& cpu_flags_;
  int thread_start_;
  int thread_end_;
  int thread_dim_;
};

inline int HowManyConvThreads(const RuntimeShape& output_shape,
                              const RuntimeShape& filter_shape) {
  // How many scalar multiplications are needed to make it worth using one
  // more thread.
  static constexpr int kMinMulPerThread = 1 << 13;  // 8k
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
  // Try to avoid real runtime divisions if possible by dividing by a
  // compile-time constant.
  int thread_count = std::max(1, num_muls / kMinMulPerThread);
  return thread_count;
}
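
// Worked illustration (hypothetical shapes, for intuition only): an output of
// shape {1, 56, 56, 64} convolved with a 3x3 filter needs
// 1 * 56 * 56 * 64 * 3 * 3 = 1806336 scalar multiplications, so the heuristic
// above asks for 1806336 / 8192 = 220 threads (integer division);
// DepthwiseConv() below then clamps that to
// cpu_backend_context->max_num_threads().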

inline bool MultithreadAlongBatches(int thread_count, int batches) {
  TFLITE_DCHECK_GE(thread_count, 2);
  // If there are fewer batch entries than the number of threads we want to
  // use, it is better to do intra-batch-entry multithreading.
  if (batches < thread_count) {
    return false;
  }
  // If there are at least 2 batch entries to be handed to each thread, then
  // it's safe to proceed with batch-wise multithreading: each thread will have
  // an approximately equal number of batch entries to handle, so the load
  // balancing will be reasonable, and the amount by which the load is not
  // perfectly balanced will be offset by the inherent advantages of
  // batch-wise multithreading (each thread is more efficient thanks to working
  // on larger buffers with less boundary-handling overhead).
  if (batches >= 2 * thread_count) {
    return true;
  }
  // In the limit case where there is at least one but not much more than one
  // batch entry per thread, it may still be a good idea to do per-batch
  // multithreading if the number of batch entries is a multiple of the number
  // of threads, so that each thread handles the same number of batch entries.
  return ((batches % thread_count) == 0);
}
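
// Illustration (hypothetical values): with thread_count = 4, batches = 3
// returns false (fewer batches than threads), batches = 8 returns true (at
// least two batches per thread), batches = 6 returns false (6 % 4 != 0), and
// batches = 4 returns true (each thread handles exactly one batch).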

template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data,
                          CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConv");

  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  int thread_count = HowManyConvThreads(output_shape, filter_shape);
  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));
#ifndef TFLITE_WITH_RUY
  // Cap the number of threads to 2 for the float path to avoid a performance
  // regression (b/132294857).
  if (std::is_floating_point<T>::value) {
    thread_count = std::min(thread_count, 2);
  }
#endif

  const int output_batches = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);

  CpuFlags cpu_flags;
  GetCpuFlags(&cpu_flags);

  if (thread_count == 1) {
    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
                      filter_data, bias_shape, bias_data, output_shape,
                      output_data, cpu_flags, /*thread_start=*/0,
                      /*thread_end=*/output_height, /*thread_dim=*/1);
    return;
  }

  int thread_dim, thread_dim_size;
  if (MultithreadAlongBatches(thread_count, output_batches)) {
    thread_dim = 0;
    thread_dim_size = output_batches;
  } else {
    thread_dim = 1;
    thread_dim_size = output_height;
  }

  std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
  // TODO(b/131746020) don't create new heap allocations every time.
  // At least we make it a single heap allocation by using reserve().
  tasks.reserve(thread_count);
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i) {
    int thread_end =
        thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    tasks.emplace_back(params, input_shape, input_data, filter_shape,
                       filter_data, bias_shape, bias_data, output_shape,
                       output_data, cpu_flags, thread_start, thread_end,
                       thread_dim);
    thread_start = thread_end;
  }
  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                  cpu_backend_context);
}
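
// Caller sketch (a minimal, hypothetical float example; real kernels populate
// params and shapes from TfLiteTensor metadata, and padding handling is
// omitted here for brevity):
//
//   DepthwiseParams params;
//   params.stride_width = 1;
//   params.stride_height = 1;
//   params.depth_multiplier = 1;
//   params.float_activation_min = 0.0f;
//   params.float_activation_max = 6.0f;
//   RuntimeShape input_shape({1, 112, 112, 32});
//   RuntimeShape filter_shape({1, 3, 3, 32});
//   RuntimeShape bias_shape({32});
//   RuntimeShape output_shape({1, 110, 110, 32});
//   DepthwiseConv(params, input_shape, input_data, filter_shape, filter_data,
//                 bias_shape, bias_data, output_shape, output_data,
//                 cpu_backend_context);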

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_