1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
25 #include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
26 
27 #include "arm_compute/core/CL/CLHelpers.h"
28 #include "arm_compute/core/GPUTarget.h"
29 #include "arm_compute/core/TensorInfo.h"
30 #include "arm_compute/core/TensorShape.h"
31 
32 namespace arm_compute
33 {
34 namespace cl_dwc
35 {
36 namespace
37 {
configure_f32(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier,bool is_g71)38 DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
39                                    unsigned int depth_multiplier, bool is_g71)
40 {
41     DWCComputeKernelInfo desc;
42 
43     if(src->data_layout() == DataLayout::NHWC)
44     {
45         const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
46         const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
47         const TensorShape wei_shape = wei->tensor_shape();
48         const size_t      kernel_c  = wei_shape[idx_c];
49         const size_t      kernel_w  = wei_shape[idx_w];
50 
51         desc.export_input_to_cl_image = false;
52 
53         if(is_g71)
54         {
55             desc.export_weights_to_cl_image = false;
56         }
57         else
58         {
59             desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
60         }
61 
62         if(depth_multiplier == 1)
63         {
64             desc.n0 = 4;
65         }
66         else
67         {
68             if((depth_multiplier % 4) == 0)
69             {
70                 desc.n0 = 4;
71             }
72             else if((depth_multiplier % 2) == 0)
73             {
74                 desc.n0 = 2;
75             }
76             else
77             {
78                 desc.n0 = 1;
79             }
80         }
81 
82         // Note: If we reduce n0, export to cl_image must be false
83         ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
84 
85         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
86 
87         // Set m0 only if stride_x == 1 and dilation_x == 1
88         if(conv_info.stride().first == 1 && dilation.x() == 1)
89         {
90             if((kernel_w >= 9) || (kernel_w == 1))
91             {
92                 desc.m0 = 1;
93             }
94             else
95             {
96                 desc.m0 = 2;
97             }
98         }
99         else
100         {
101             desc.m0 = 1;
102         }
103     }
104 
105     return desc;
106 }
107 
configure_f16(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier,bool is_g71)108 DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
109                                    unsigned int depth_multiplier, bool is_g71)
110 {
111     DWCComputeKernelInfo desc;
112 
113     if(src->data_layout() == DataLayout::NHWC)
114     {
115         // Src and weights have the same dimension indices
116         const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
117         const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
118         const TensorShape src_shape = src->tensor_shape();
119         const TensorShape wei_shape = wei->tensor_shape();
120         const size_t      src_w     = src_shape[idx_w];
121         const size_t      kernel_c  = wei_shape[idx_c];
122         const size_t      kernel_w  = wei_shape[idx_w];
123 
124         desc.export_input_to_cl_image = false;
125 
126         if(is_g71)
127         {
128             desc.export_weights_to_cl_image = false;
129         }
130         else
131         {
132             desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
133         }
134 
135         if(depth_multiplier == 1)
136         {
137             if(desc.export_weights_to_cl_image == false)
138             {
139                 desc.n0 = 8;
140             }
141             else
142             {
143                 desc.n0 = 4;
144             }
145         }
146         else
147         {
148             if((depth_multiplier % 4) == 0)
149             {
150                 desc.n0 = 4;
151             }
152             else if((depth_multiplier % 2) == 0)
153             {
154                 desc.n0 = 2;
155             }
156             else
157             {
158                 desc.n0 = 1;
159             }
160         }
161 
162         // Note: If we reduce n0, export to cl_image must be false
163         ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
164 
165         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
166 
167         // Set m0 only if stride_x == 1 and dilation_x == 1
168         if(conv_info.stride().first == 1 && dilation.x() == 1)
169         {
170             if((kernel_w >= 9) || (kernel_w == 1))
171             {
172                 desc.m0 = 1;
173             }
174             else
175             {
176                 if((src_w % 5) == 0)
177                 {
178                     desc.m0 = 5;
179                 }
180                 else
181                 {
182                     desc.m0 = 4;
183                 }
184             }
185         }
186         else
187         {
188             desc.m0 = 1;
189         }
190     }
191 
192     return desc;
193 }
194 } // namespace
195 
ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)196 ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
197     : IClDWCNativeKernelConfig(gpu)
198 {
199 }
200 
configure(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)201 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
202                                                                 unsigned int depth_multiplier)
203 {
204     using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
205                                                                    unsigned int depth_multiplier);
206 
207     ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
208                                                                          &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
209                                                                          &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
210 
211     ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
212                                                                          &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
213                                                                          &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
214 
215     ConfigurationFunctionExecutorPtr func = nullptr;
216     switch(_target)
217     {
218         case GPUTarget::G71:
219             func = configs_G71.get_function(src->data_type());
220             break;
221         default:
222             func = configs_G7x.get_function(src->data_type());
223             break;
224     }
225 
226     ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
227     return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
228 }
229 
configure_G71_f32(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)230 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
231                                                                         unsigned int depth_multiplier)
232 {
233     return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
234 }
235 
configure_G71_f16(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)236 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
237                                                                         unsigned int depth_multiplier)
238 {
239     return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
240 }
241 
configure_G7x_f32(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)242 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
243                                                                         unsigned int depth_multiplier)
244 {
245     return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
246 }
247 
configure_G7x_f16(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)248 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
249                                                                         unsigned int depth_multiplier)
250 {
251     return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
252 }
253 
configure_G7x_u8(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info,const Size2D & dilation,unsigned int depth_multiplier)254 DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
255                                                                         unsigned int depth_multiplier)
256 {
257     ARM_COMPUTE_UNUSED(wei);
258 
259     DWCComputeKernelInfo desc;
260 
261     if(src->data_layout() == DataLayout::NHWC)
262     {
263         desc.export_input_to_cl_image   = false;
264         desc.export_weights_to_cl_image = false;
265         desc.n0                         = (depth_multiplier == 1) ? 4 : 1;
266         if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
267         {
268             desc.m0 = 2;
269         }
270         else
271         {
272             desc.m0 = 1;
273         }
274     }
275 
276     return desc;
277 }
278 } // namespace cl_dwc
279 } // namespace arm_compute
280