/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateDirectConv2d.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId                      id,
                                               const ArgumentPack<ITensorInfo> &tensors,
                                               const Attributes                &attributes,
                                               const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _weight{},
      _bias{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
    {
        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
    }
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
}

std::string ClTemplateDirectConv2d::get_name() const
{
    return "direct_conv2d";
}

std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

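    // K0 is the vector width used along the input channels; it is clamped to the actual channel
    // count, and a scalar tail loop is emitted below when the channel count is not a multiple of K0.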
    const auto channel_idx   = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
    const auto k0            = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
    const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src)            {{src}}
// IN_1(wei)            {{weight}}
)_";
    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
// IN_2(bia)            {{bias}}
)_";
    }
    code += R"_(
// OUT(dst, accum)      {{dst}}

TILE(uint, M0, 1, g_dst_indirect_y);

{
#define _IWEI_WIDTH {{WEI_WIDTH}}
#define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _ISRC_WIDTH {{src}}_w
#define _ISRC_HEIGHT {{src}}_h
#define _ISRC_CHANNELS {{src}}_c
#define _IDST_WIDTH {{arg_dst}}_w
#define _IDST_HEIGHT {{arg_dst}}_h
#define _IDST_CHANNELS {{arg_dst}}_c
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

    TILE(int, M0, 1, xi);
    TILE(int, M0, 1, yi);

    // Convert the linear index to coordinate
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
        yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
        xi[0].s[i] -= {{PAD_LEFT}};
        yi[0].s[i] -= {{PAD_TOP}};
    })

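    // Initialise the destination/accumulator tile to zero before accumulating over the filter window.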
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        {{dst}}[i].v = 0;
    })

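    // Accumulate over the _IWEI_WIDTH x _IWEI_HEIGHT filter positions.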
    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

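        // Build the indirect row index for each of the M0 output points; out-of-bounds coordinates
        // are mapped to -1 so the indirect load below skips those rows and they keep their
        // ZERO_VALUE initialisation (implicit zero padding).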
        TILE(int, 1, M0, my);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            int x_s    = xi[0].s[i] + xk;
            int y_s    = yi[0].s[i] + yk;
            my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
            my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
            my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
            my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
        })

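        // Accumulate over the input channels in blocks of K0.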
        int ck = 0;
        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
        {
            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
            TILE({{WEI_DATA_TYPE}}, N0, K0, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);

            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
        }
)_";

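    // Emit a scalar tail loop only when the input channel count is not a multiple of K0. Note that
    // this path always reads the weights through the BUFFER accessor, while the main loop above uses
    // {{WEI_TENSOR_TYPE}}.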
    if(leftover_loop)
    {
        code += R"_(
        for(; ck < _ISRC_CHANNELS; ++ck)
        {
            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
            TILE({{WEI_DATA_TYPE}}, N0, 1, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);

            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
        }
    )_";
    }

    code += R"_(
#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER

    }
)_";

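    // Optional bias: broadcast-add one bias value per output channel (N0) to every row of the
    // accumulator tile.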
    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
        TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);

        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);

        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
    )_";
    }

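    // Compute the indirect output row for each of the M0 rows; indices past the last output pixel
    // are clamped to the final valid row so that a partial M0 block never stores out of bounds.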
    code += R"_(
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
        g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
    })
}
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";
    return code;
}

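// Register this component's tensors with the shared variable table so that each kernel argument is
// declared once for the fused kernel. The weights may be exported to an OpenCL image when the
// settings request it.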
void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        comp_group,
        _src,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "src");

    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
    vtable.declare_variable(
        comp_group,
        _weight,
        GpuKernelArgumentInfo(weight_type),
        "weight");

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        vtable.declare_variable(
            comp_group,
            _bias,
            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
            "bias");
    }
    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(common_tensor_type),
        "dst");
}

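// Build the look-up table that resolves the {{...}} placeholders used by the template returned from
// get_component_code().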
TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT lut{};
    // Arguments and global shared variables
    lut["src"]    = vtable.get_variable(_src);
    lut["weight"] = vtable.get_variable(_weight);

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        lut["bias"]          = vtable.get_variable(_bias);
        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
    }
    lut["dst"] = vtable.get_variable(_dst);

    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
    lut["arg_dst"]          = dst_argument.uniq_name;

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["ACC_DATA_TYPE"]  = _src->data_type();
    lut["SRC_DATA_TYPE"]  = _src->data_type();
    lut["WEI_DATA_TYPE"]  = _weight->data_type();

    lut["SRC_TENSOR_TYPE"] = "BUFFER";
    switch(vtable.get_variable(_weight).kernel_argument_info.type)
    {
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
        {
            lut["WEI_TENSOR_TYPE"] = "IMAGE";
            break;
        }
        default:
        {
            lut["WEI_TENSOR_TYPE"] = "BUFFER";
            break;
        }
    }
    const auto width_idx  = 1;
    const auto height_idx = 2;
    lut["WEI_WIDTH"]      = _weight->dimension(width_idx);
    lut["WEI_HEIGHT"]     = _weight->dimension(height_idx);

    lut["STRIDE_X"] = _attributes.stride().x();
    lut["STRIDE_Y"] = _attributes.stride().y();

    lut["PAD_LEFT"] = _attributes.pad().left;
    lut["PAD_TOP"]  = _attributes.pad().top;

    lut["ZERO_VALUE"] = 0;

    return lut;
}

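// N0 and M0 are taken from the root component's window steps so that every fused component agrees on
// the tile shape processed per work-item.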
CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
{
    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);

    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0               = root_window.x().step();
    const unsigned int m0               = root_window.y().step();
    const unsigned int k0               = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    CLBuildOptions build_opts{};
    if(_settings.fast_relaxed_math())
    {
        build_opts.add_option("-cl-fast-relaxed-math");
    }
    else
    {
        // -cl-fast-relaxed-math also enables -cl-finite-math-only and -cl-unsafe-math-optimizations.
        // To avoid -cl-finite-math-only, only -cl-unsafe-math-optimizations is added here.
        build_opts.add_option("-cl-unsafe-math-optimizations");
    }

    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplateDirectConv2d::get_config_id() const
{
    const DataType   data_type   = _src->data_type();
    const DataLayout data_layout = _src->data_layout();

    const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    const unsigned int kernel_size = _weight->dimension(width_idx);

    std::string config_id{};
    config_id += lower_string(string_from_data_type(data_type));
    config_id += "_";
    config_id += support::cpp11::to_string(kernel_size);
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().x());
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().y());
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(data_layout));
    return config_id;
}

std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

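// The execution window collapses the output width and height into DimY (processed M0 rows at a
// time), keeps N0 output channels per work-item along DimX, and maps the remaining (batch)
// dimensions to DimZ.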
Window ClTemplateDirectConv2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    const auto output_shape = _dst->tensor_shape();
    const auto desc         = _settings.direct_conv_descriptor();

    const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
    const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]);

    // Create and configure kernel window
    Window win = calculate_max_window(output_shape, Steps(n0, m0));

    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0);
    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));

    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute