/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_

#include <algorithm>
#include <type_traits>
#include <vector>

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"

namespace tflite {
namespace optimized_ops {

// TODO(luwa): add multithread to per-channel depthwise_conv
// DepthwiseConv can run with multiple threads along the dimension specified
// by thread_dim. Each thread processes output elements whose index along
// thread_dim lies in the range [thread_start, thread_end).
// For example, thread_start = 2, thread_end = 6, and thread_dim = 1 means
// that the task computes DepthwiseConv for output_data[:, 2:6, :, :].
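// thread_dim is either 0 (the batch dimension) or 1 (the output height
// dimension); DepthwiseConv below chooses between the two via
// MultithreadAlongBatches.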
template <typename T, typename TS>
struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvWorkerTask(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data, const CpuFlags& cpu_flags,
                          int thread_start, int thread_end, int thread_dim)
      : params_(params),
        input_shape_(input_shape),
        input_data_(input_data),
        filter_shape_(filter_shape),
        filter_data_(filter_data),
        bias_shape_(bias_shape),
        bias_data_(bias_data),
        output_shape_(output_shape),
        output_data_(output_data),
        cpu_flags_(cpu_flags),
        thread_start_(thread_start),
        thread_end_(thread_end),
        thread_dim_(thread_dim) {}

  void Run() override {
    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
                      filter_data_, bias_shape_, bias_data_, output_shape_,
                      output_data_, cpu_flags_, thread_start_, thread_end_,
                      thread_dim_);
  }

 private:
  const DepthwiseParams& params_;
  const RuntimeShape& input_shape_;
  const T* input_data_;
  const RuntimeShape& filter_shape_;
  const T* filter_data_;
  const RuntimeShape& bias_shape_;
  const TS* bias_data_;
  const RuntimeShape& output_shape_;
  T* output_data_;
  const CpuFlags& cpu_flags_;
  int thread_start_;
  int thread_end_;
  int thread_dim_;
};

inline int HowManyConvThreads(const RuntimeShape& output_shape,
                              const RuntimeShape& filter_shape) {
  // How many scalar multiplications are needed to make it worth using one
  // more thread.
  static constexpr int kMinMulPerThread = 1 << 13;  // 8k
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
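  // For example, an output shape of [1, 32, 32, 16] with a 3x3 filter gives
  // num_muls = 16384 * 9 = 147456, suggesting 147456 / 8192 = 18 threads
  // before the caller caps the count.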
  // Dividing by the compile-time constant kMinMulPerThread lets the compiler
  // avoid an actual runtime division here.
  int thread_count = std::max(1, num_muls / kMinMulPerThread);
  return thread_count;
}

inline bool MultithreadAlongBatches(int thread_count, int batches) {
  TFLITE_DCHECK_GE(thread_count, 2);
  // If there are fewer batch entries than the number of threads we want to
  // use, it is better to do intra-batch-entry multithreading.
  if (batches < thread_count) {
    return false;
  }
  // If there are at least 2 batch entries to be handed to each thread, then
  // it's safe to proceed with batch-wise multithreading: each thread will
  // handle an approximately equal number of batch entries, so the load
  // balancing will be reasonable, and the extent to which the load is not
  // perfectly balanced will be offset by the inherent advantages of
  // batch-wise multithreading (each thread is more efficient thanks to working
  // on larger buffers with less boundary-handling overhead).
  if (batches >= 2 * thread_count) {
    return true;
  }
  // In the limit case where there is at least 1 but not much more than 1
  // batch entry per thread, it may be a good idea to do per-batch
  // multithreading if the number of batch entries is a multiple of the number
  // of threads, so that each thread will have the same number of batch
  // entries to process.
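  // For example, with thread_count = 4: batches = 8 picks batch-wise
  // multithreading (8 >= 2 * 4), batches = 4 does too (4 % 4 == 0), but
  // batches = 5, 6, or 7 falls through to intra-batch multithreading.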
  return ((batches % thread_count) == 0);
}

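// A minimal float-path usage sketch. The shapes, data pointers, and context
// below are illustrative assumptions, not values defined in this file:
//
//   DepthwiseParams params;
//   params.padding_values.width = 1;
//   params.padding_values.height = 1;
//   params.stride_width = 1;
//   params.stride_height = 1;
//   params.dilation_width_factor = 1;
//   params.dilation_height_factor = 1;
//   params.depth_multiplier = 1;
//   params.float_activation_min = 0.0f;  // e.g. ReLU6
//   params.float_activation_max = 6.0f;
//   DepthwiseConv(params, RuntimeShape({1, 64, 64, 8}), input_data,
//                 RuntimeShape({1, 3, 3, 8}), filter_data, RuntimeShape({8}),
//                 bias_data, RuntimeShape({1, 64, 64, 8}), output_data,
//                 cpu_backend_context);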
template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data,
                          CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConv");

  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  int thread_count = HowManyConvThreads(output_shape, filter_shape);
  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));
#ifndef TFLITE_WITH_RUY
  // Cap the number of threads to 2 for the float path to avoid a performance
  // regression (b/132294857).
  if (std::is_floating_point<T>::value) {
    thread_count = std::min(thread_count, 2);
  }
#endif

  const int output_batches = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);

  CpuFlags cpu_flags;
  GetCpuFlags(&cpu_flags);

  if (thread_count == 1) {
    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
                      filter_data, bias_shape, bias_data, output_shape,
                      output_data, cpu_flags, /*thread_start=*/0,
                      /*thread_end=*/output_height, /*thread_dim=*/1);
    return;
  }

  int thread_dim, thread_dim_size;
  if (MultithreadAlongBatches(thread_count, output_batches)) {
    thread_dim = 0;
    thread_dim_size = output_batches;
  } else {
    thread_dim = 1;
    thread_dim_size = output_height;
  }

  std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
  // TODO(b/131746020) don't create new heap allocations every time.
  // At least we make it a single heap allocation by using reserve().
  tasks.reserve(thread_count);
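  // Split [0, thread_dim_size) into thread_count contiguous ranges whose
  // sizes differ by at most one. For example, thread_dim_size = 10 with
  // thread_count = 4 yields the ranges [0, 2), [2, 4), [4, 7), [7, 10).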
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i) {
    int thread_end =
        thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    tasks.emplace_back(params, input_shape, input_data, filter_shape,
                       filter_data, bias_shape, bias_data, output_shape,
                       output_data, cpu_flags, thread_start, thread_end,
                       thread_dim);
    thread_start = thread_end;
  }
  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                  cpu_backend_context);
}

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_