/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"

#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;

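// This function dispatches depthwise convolution to one of two internal paths:
// - NEDepthwiseConvolutionLayerOptimizedInternal wraps the assembly-backed
//   cpu::CpuDepthwiseConv2dAssemblyDispatch, permuting NCHW tensors to NHWC first.
// - NEDepthwiseConvolutionLayerGeneric wraps the portable
//   cpu::kernels::CpuDepthwiseConv2dNativeKernel for configurations the
//   assembly path does not support.
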
namespace arm_compute
{
NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;

struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
    ITensor                                 *src{ nullptr };     // SRC_0
    ITensor                                 *dst{ nullptr };     // DST_0
    const ITensor                           *weights{ nullptr }; // SRC_1
    const ITensor                           *biases{ nullptr };  // SRC_2
    Tensor                                   permuted_input{};   // INT_0
    Tensor                                   permuted_weights{}; // INT_1
    Tensor                                   permuted_output{};  // INT_2
    Tensor                                   workspace{};        // INT_3
    Tensor                                   packed_weights{};   // INT_4
    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
    bool                                     is_prepared{ false };
    bool                                     permute{ false };
};

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor       *input,
                                                                                          const ITensor *weights,
                                                                                          const ITensor *biases,
                                                                                          ITensor *output, const PadStrideInfo &conv_info,
                                                                                          unsigned int               depth_multiplier,
                                                                                          const ActivationLayerInfo &act_info,
                                                                                          const Size2D              &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

    // The optimized path operates on NHWC data; NCHW tensors must be permuted first.
    const bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
    _impl->src     = input;
    _impl->weights = weights;
    _impl->biases  = biases;
    _impl->dst     = output;
    _impl->permute = is_nchw;

    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
                         _impl->dst->info(), info);

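    // ReLU and bounded ReLU (ReLU6) can be fused into the assembly kernel; any
    // other activation is left disabled here and is executed as a separate
    // activation layer by the CpuDepthwiseConv2d operator configured above.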
    // Configure pipeline
    ActivationLayerInfo act_info_to_use            = ActivationLayerInfo();
    const bool          is_relu                    = arm_compute::utils::info_helpers::is_relu(act_info);
    const bool          is_relu6                   = arm_compute::utils::info_helpers::is_relu6(act_info);
    bool                is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);

    if(!is_activationlayer_enabled)
    {
        act_info_to_use = act_info;
    }
    info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };

    auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();

    if(is_nchw)
    {
        auto permute_input   = std::make_unique<cpu::CpuPermute>();
        auto permute_weights = std::make_unique<cpu::CpuPermute>();
        auto permute_output  = std::make_unique<cpu::CpuPermute>();

        _memory_group.manage(&_impl->permuted_input);
        _memory_group.manage(&_impl->permuted_weights);
        _memory_group.manage(&_impl->permuted_output);

        // Configure the function to transform the input tensor from NCHW -> NHWC
        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);

        // Configure the function to transform the weights tensor from IHW -> HWI
        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);

        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
        _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());

        // Configure the optimized depthwise convolution on the permuted tensors
        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);

        // Configure the function to transform the convolved output back to ACL's native ordering format NCHW
        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));

        _impl->permuted_input.allocator()->allocate();
        _impl->permuted_output.allocator()->allocate();
    }
    else
    {
        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
    }

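    // workspace() reports the auxiliary buffers required by the assembly kernel:
    // entry 0 backs the scratch workspace (bound as ACL_INT_3 in run()) and
    // entry 1 backs the packed, reordered weights (bound as ACL_INT_4).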
    // Allocate memory based on the internal memory requirements
    experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
    _memory_group.manage(&_impl->workspace);
    _memory_group.manage(&_impl->packed_weights);
    _impl->workspace.allocator()->allocate();
    _impl->packed_weights.allocator()->allocate();
}

Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo         *input,
                                                                                           const ITensorInfo         *weights,
                                                                                           const ITensorInfo         *biases,
                                                                                           const ITensorInfo         *output,
                                                                                           const PadStrideInfo       &conv_info,
                                                                                           unsigned int               depth_multiplier,
                                                                                           const ActivationLayerInfo &act_info,
                                                                                           const Size2D              &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::run()
{
    prepare();
    MemoryGroupResourceScope scope_mg(_memory_group);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
    pack.add_tensor(TensorType::ACL_INT_3, &_impl->workspace);
    pack.add_tensor(TensorType::ACL_INT_4, &_impl->packed_weights);
    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);

    _impl->op->run(pack);
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
    if(!_impl->is_prepared)
    {
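        // Weight packing happens on the first run: the operator's prepare step
        // (invoked from run()) permutes the weights into permuted_weights and
        // packs them; afterwards the permuted buffer can be freed if unused.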
        // Permute weights
        if(_impl->permute)
        {
            _impl->permuted_weights.allocator()->allocate();
        }

        if(!_impl->permuted_weights.is_used())
        {
            _impl->permuted_weights.allocator()->free();
        }

        _impl->is_prepared = true;
    }
}

struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
{
    Tensor                                   permuted_input{};
    Tensor                                   permuted_weights{};
    Tensor                                   permuted_output{};
    bool                                     is_prepared{ false };
    bool                                     is_nchw{ false };
    bool                                     is_activationlayer_enabled{ false };
    const ITensor                           *weights{ nullptr };
    const ITensor                           *biases{ nullptr };
    const ITensor                           *src{ nullptr };
    ITensor                                 *dst{ nullptr };
    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
};

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
    : _impl(std::make_unique<Impl>())
{
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));

    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);

    _impl->src         = input;
    _impl->dst         = output;
    _impl->weights     = weights;
    _impl->biases      = biases;
    _impl->is_nchw     = input->info()->data_layout() == DataLayout::NCHW;
    _impl->is_prepared = !_impl->is_nchw;

    ITensor       *input_to_use   = input;
    const ITensor *weights_to_use = weights;
    ITensor       *output_to_use  = output;
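    // The native kernel works on NHWC tensors, so NCHW inputs and weights are
    // permuted into intermediate NHWC buffers and the result is permuted back.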
    if(_impl->is_nchw)
    {
        auto permute_input   = std::make_unique<cpu::CpuPermute>();
        auto permute_weights = std::make_unique<cpu::CpuPermute>();

        permute_input->configure(input->info(), _impl->permuted_input.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_input.info()->set_data_layout(DataLayout::NHWC);
        input_to_use = &_impl->permuted_input;

        permute_weights->configure(weights->info(), _impl->permuted_weights.info(), PermutationVector(2U, 0U, 1U));
        _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
        weights_to_use = &_impl->permuted_weights;

        _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
        output_to_use = &_impl->permuted_output;
    }

    auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);

    if(_impl->is_nchw)
    {
        auto permute_output = std::make_unique<cpu::CpuPermute>();
        permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
        _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);

        _impl->permuted_input.allocator()->allocate();
        _impl->permuted_weights.allocator()->allocate();
        _impl->permuted_output.allocator()->allocate();
    }
}

Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
                                                                                 const PadStrideInfo &conv_info,
                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::run()
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->weights);
    pack.add_tensor(TensorType::ACL_SRC_2, _impl->biases);
    pack.add_tensor(TensorType::ACL_INT_0, &_impl->permuted_input);
    pack.add_tensor(TensorType::ACL_INT_1, &_impl->permuted_weights);
    pack.add_tensor(TensorType::ACL_INT_2, &_impl->permuted_output);
    pack.add_tensor(TensorType::ACL_DST_0, _impl->dst);

    _impl->op->run(pack);
}

NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _impl(std::make_unique<Impl>())
{
}

#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
    DepthwiseConvolutionFunction                 depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
    NEDepthwiseConvolutionLayerGeneric           func_generic{};
    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{ nullptr };
};
#endif // DOXYGEN_SKIP_THIS

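// Query the operator for the best path: OPTIMIZED selects the assembly-backed
// implementation when the configuration supports it, otherwise GENERIC falls
// back to the portable native kernel.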
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                            const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);

    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>();
    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
                                                                          info);
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
    }
}

Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
    return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}

void NEDepthwiseConvolutionLayer::run()
{
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.run();
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.run();
            break;
        default:
            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
    }
}

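// A minimal usage sketch (illustrative only; the tensor setup, shapes and
// parameter values below are assumptions, not taken from this file). The
// caller configures the function once, allocates the tensors, then calls
// run() per inference:
//
//     Tensor src, weights, biases, dst; // initialised with matching TensorInfo
//     NEDepthwiseConvolutionLayer dwc;
//     dwc.configure(&src, &weights, &biases, &dst,
//                   PadStrideInfo(1, 1, 1, 1) /* stride and padding */,
//                   1 /* depth_multiplier */,
//                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
//     src.allocator()->allocate();
//     weights.allocator()->allocate();
//     biases.allocator()->allocate();
//     dst.allocator()->allocate();
//     // ... fill src/weights/biases ...
//     dwc.run(); // the first run also triggers prepare() on the optimized path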
void NEDepthwiseConvolutionLayer::prepare()
{
    switch(_impl->depth_conv_func)
    {
        case DepthwiseConvolutionFunction::OPTIMIZED:
            _impl->func_optimized.prepare();
            break;
        case DepthwiseConvolutionFunction::GENERIC:
            _impl->func_generic.prepare();
            break;
        default:
            ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
    }
}
} // namespace arm_compute