/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"

#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;

namespace arm_compute
{
NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;

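// Internal state of the optimized (assembly) path, kept behind a pimpl so the
// public header stays free of cpu:: types. The SRC_n/DST_n/INT_n tags mirror
// the TensorType slots used to assemble the ITensorPack in run().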
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
    ITensor       *src{ nullptr };     // SRC_0
    ITensor       *dst{ nullptr };     // DST_0
    const ITensor *weights{ nullptr }; // SRC_1
    const ITensor *biases{ nullptr };  // SRC_2
    Tensor         permuted_input{};   // INT_0
    Tensor         permuted_weights{}; // INT_1
    Tensor         permuted_output{};  // INT_2
    Tensor         workspace{};        // INT_3
    Tensor         packed_weights{};   // INT_4
    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
    bool                                     is_prepared{ false };
    bool                                     permute{ false };
};

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor             *input,
                                                                                          const ITensor       *weights,
                                                                                          const ITensor       *biases,
                                                                                          ITensor             *output,
                                                                                          const PadStrideInfo &conv_info,
                                                                                          unsigned int         depth_multiplier,
                                                                                          const ActivationLayerInfo &act_info,
                                                                                          const Size2D              &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

    // The assembly kernels operate on NHWC, so an NCHW input has to be permuted around them
    const bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
    _impl->src     = input;
    _impl->weights = weights;
    _impl->biases  = biases;
    _impl->dst     = output;
    _impl->permute = is_nchw;

    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
                         _impl->dst->info(), info);

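    // Only ReLU-style activations can be fused into the assembly kernel:
    // forward the activation through ConvolutionInfo when it is ReLU/ReLU6
    // (or disabled); anything else is left out of the dispatch configuration.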
    // Configure pipeline
    ActivationLayerInfo act_info_to_use            = ActivationLayerInfo();
    const bool          is_relu                    = arm_compute::utils::info_helpers::is_relu(act_info);
    const bool          is_relu6                   = arm_compute::utils::info_helpers::is_relu6(act_info);
    bool                is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);

    if(!is_activationlayer_enabled)
    {
        act_info_to_use = act_info;
    }
    info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };

    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();

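    // None of the operators created below are ever run: configuring them
    // mirrors what CpuDepthwiseConv2d sets up internally, so the permuted
    // tensor infos and the assembly workspace can be sized at function level.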
    if(is_nchw)
    {
        auto permute_input   = std::make_unique<cpu::CpuPermute>();
        auto permute_weights = std::make_unique<cpu::CpuPermute>();
        auto permute_output  = std::make_unique<cpu::CpuPermute>();

        _memory_group.manage(&_impl->permuted_input);
        _memory_group.manage(&_impl->permuted_weights);
        _memory_group.manage(&_impl->permuted_output);

        // Configure the function to transform the input tensor from NCHW -> NHWC
        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);

        // Configure the function to transform the weights tensor from IHW -> HWI
        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);

        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
        _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());

        // Configure optimized depthwise
        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);

        // Configure the function to transform the convolved output back to ACL's native ordering format NCHW
        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));

        _impl->permuted_input.allocator()->allocate();
        _impl->permuted_output.allocator()->allocate();
    }
    else
    {
        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
    }

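    // mem_req[0] describes the per-run scratch workspace, mem_req[1] the packed
    // (reordered) weights; each buffer is over-allocated by its alignment so the
    // base pointer can be aligned within it.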
    // Allocate memory based on the internal memory requirements
    experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
    _memory_group.manage(&_impl->workspace);
    _memory_group.manage(&_impl->packed_weights);
    _impl->workspace.allocator()->allocate();
    _impl->packed_weights.allocator()->allocate();
}

Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo         *input,
                                                                                           const ITensorInfo         *weights,
                                                                                           const ITensorInfo         *biases,
                                                                                           const ITensorInfo         *output,
                                                                                           const PadStrideInfo       &conv_info,
                                                                                           unsigned int               depth_multiplier,
                                                                                           const ActivationLayerInfo &act_info,
                                                                                           const Size2D              &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
    prepare();
    MemoryGroupResourceScope scope_mg(_memory_group);

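    // The operator does not hold tensors itself: sources, destination and all
    // intermediates are passed in through an ITensorPack on every invocation,
    // bound to the same slots the Impl members are tagged with.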
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
    pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
    pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);

    _impl->op->run(pack);
}

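// Allocates the permuted weights the first time around and frees the buffer
// again if the operator did not mark it as used; subsequent calls are no-ops.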
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
    if(!_impl->is_prepared)
    {
        // Permute weights
        if(_impl->permute)
        {
            _impl->permuted_weights.allocator()->allocate();
        }

        if(!_impl->permuted_weights.is_used())
        {
            _impl->permuted_weights.allocator()->free();
        }

        _impl->is_prepared = true;
    }
}

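// Internal state of the generic (portable) path: no assembly workspace or
// packed weights are needed here, only the optional NCHW <-> NHWC shuttle tensors.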
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
{
    Tensor         permuted_input{};
    Tensor         permuted_weights{};
    Tensor         permuted_output{};
    bool           is_prepared{ false };
    bool           is_nchw{ false };
    bool           is_activationlayer_enabled{ false };
    const ITensor *weights{ nullptr };
    const ITensor *biases{ nullptr };
    const ITensor *src{ nullptr };
    ITensor       *dst{ nullptr };
    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
    : _impl(std::make_unique<Impl>())
{
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));

    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);

    _impl->src         = input;
    _impl->dst         = output;
    _impl->weights     = weights;
    _impl->biases      = biases;
    _impl->is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
    _impl->is_prepared = !_impl->is_nchw;

    ITensor       *input_to_use   = input;
    const ITensor *weights_to_use = weights;
    ITensor       *output_to_use  = output;
    if(_impl->is_nchw)
    {
        auto permute_input   = std::make_unique<cpu::CpuPermute>();
        auto permute_weights = std::make_unique<cpu::CpuPermute>();

        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
        input_to_use = &_impl->permuted_input;

        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
        weights_to_use = &_impl->permuted_weights;

        _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
        output_to_use = &_impl->permuted_output;
    }

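    // The native NHWC kernel is configured here mainly so it auto-initializes
    // the (still empty) permuted output info; the actual execution goes
    // through the CpuDepthwiseConv2d operator configured above.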
    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);

    if(_impl->is_nchw)
    {
        auto permute_output = std::make_unique<cpu::CpuPermute>();
        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);

        _impl->permuted_input.allocator()->allocate();
        _impl->permuted_weights.allocator()->allocate();
        _impl->permuted_output.allocator()->allocate();
    }
}

Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
                                                                                 const PadStrideInfo &conv_info,
                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);

    _impl->op->run(pack);
}

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}

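// Top-level state: both variants are instantiated up front and configure()
// decides, via the operator, which one is actually used.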
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
    DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
    NEDepthwiseConvolutionLayerGeneric           func_generic{};
    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{ nullptr };
};
#endif // DOXYGEN_SKIP_THIS

void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                            const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);

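    // Query the operator for the implementation it would select for these
    // tensor shapes and data types, then configure only that variant.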
    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>();
    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
    }
}

Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::run()
{
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.run();
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.run();
            break;
        default:
            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
    }
}

void NEDepthwiseConvolutionLayer::prepare()
{
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.prepare();
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.prepare();
            break;
        default:
            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
    }
}
} // namespace arm_compute
369