/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateDirectConv2d.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId                      id,
                                               const ArgumentPack<ITensorInfo> &tensors,
                                               const Attributes                &attributes,
                                               const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _weight{},
      _bias{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
    {
        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
    }
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
}

std::string ClTemplateDirectConv2d::get_name() const
{
    return "direct_conv2d";
}

std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

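    // K0 is clamped to the actual number of input channels; when the channel count
    // is not an exact multiple of K0, a scalar leftover loop is generated below.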
    const auto channel_idx   = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
    const auto k0            = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
    const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// IN_1(wei) {{weight}}
)_";
    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
// IN_2(bia) {{bias}}
)_";
    }
    code += R"_(
// OUT(dst, accum) {{dst}}

TILE(uint, M0, 1, g_dst_indirect_y);

{
#define _IWEI_WIDTH {{WEI_WIDTH}}
#define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _ISRC_WIDTH {{src}}_w
#define _ISRC_HEIGHT {{src}}_h
#define _ISRC_CHANNELS {{src}}_c
#define _IDST_WIDTH {{arg_dst}}_w
#define _IDST_HEIGHT {{arg_dst}}_h
#define _IDST_CHANNELS {{arg_dst}}_c
#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

    TILE(int, M0, 1, xi);
    TILE(int, M0, 1, yi);

    // Convert the linear index to coordinate
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
        yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
        xi[0].s[i] -= {{PAD_LEFT}};
        yi[0].s[i] -= {{PAD_TOP}};
    })

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        {{dst}}[i].v = 0;
    })

    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
    {
        int xk = i % _IWEI_WIDTH;
        int yk = i / _IWEI_WIDTH;

        TILE(int, 1, M0, my);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            int x_s    = xi[0].s[i] + xk;
            int y_s    = yi[0].s[i] + yk;
            my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
            my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
            my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
            my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
            my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
        })

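        // Accumulate over the input channels, a vector of K0 elements at a time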
        int ck = 0;
        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
        {
            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
            TILE({{WEI_DATA_TYPE}}, N0, K0, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);

            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
        }
)_";

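    // When the channel count is not an exact multiple of K0, process the remaining
    // channels one element at a time.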
    if(leftover_loop)
    {
        code += R"_(
        for(; ck < _ISRC_CHANNELS; ++ck)
        {
            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
            TILE({{WEI_DATA_TYPE}}, N0, 1, b);

            LOOP_UNROLLING(int, i, 0, 1, M0,
            {
                a[i].v = {{ZERO_VALUE}};
            })

            LOOP_UNROLLING(int, i, 0, 1, N0,
            {
                b[i].v = {{ZERO_VALUE}};
            })

            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);

            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
        }
)_";
    }

    code += R"_(
#undef _IWEI_WIDTH
#undef _IWEI_HEIGHT
#undef _ISRC_WIDTH
#undef _ISRC_HEIGHT
#undef _ISRC_CHANNELS
#undef _IDST_WIDTH
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER

}
)_";

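    // The bias addition is generated only when an optional bias tensor was provided.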
    if(_bias && _bias->has_valid_id())
    {
        code += R"_(
        TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);

        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);

        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
)_";
    }

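    // Finally, compute the indirect Y offsets used when storing the destination tile.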
    code += R"_(
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
        g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
    })
}
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";
    return code;
}

void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        comp_group,
        _src,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "src");

    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
                                                                                   : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
    vtable.declare_variable(
        comp_group,
        _weight,
        GpuKernelArgumentInfo(weight_type),
        "weight");

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        vtable.declare_variable(
            comp_group,
            _bias,
            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
            "bias");
    }
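    // common_tensor_type (a plain 4-D buffer argument type) comes from the
    // IGpuTemplateComponentWriter interface this writer derives from.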
    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(common_tensor_type),
        "dst");
}

TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT lut{};
    // Arguments and global shared variables
    lut["src"]    = vtable.get_variable(_src);
    lut["weight"] = vtable.get_variable(_weight);

    if(_bias && _bias->has_valid_id()) // optional bias
    {
        lut["bias"]          = vtable.get_variable(_bias);
        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
    }
    lut["dst"] = vtable.get_variable(_dst);

    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
    lut["arg_dst"]          = dst_argument.uniq_name;

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["ACC_DATA_TYPE"]  = _src->data_type();
    lut["SRC_DATA_TYPE"]  = _src->data_type();
    lut["WEI_DATA_TYPE"]  = _weight->data_type();

    lut["SRC_TENSOR_TYPE"] = "BUFFER";
    switch(vtable.get_variable(_weight).kernel_argument_info.type)
    {
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
        {
            lut["WEI_TENSOR_TYPE"] = "IMAGE";
            break;
        }
        default:
        {
            lut["WEI_TENSOR_TYPE"] = "BUFFER";
            break;
        }
    }
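    // The weight tensor is assumed to be NHWC-ordered, i.e. [IFM, kernel_w, kernel_h, OFM],
    // so dimensions 1 and 2 hold the kernel width and height.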
    const auto width_idx  = 1;
    const auto height_idx = 2;
    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);

    lut["STRIDE_X"] = _attributes.stride().x();
    lut["STRIDE_Y"] = _attributes.stride().y();

    lut["PAD_LEFT"] = _attributes.pad().left;
    lut["PAD_TOP"]  = _attributes.pad().top;

    lut["ZERO_VALUE"] = 0;

    return lut;
}

CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
{
    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);

    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0          = root_window.x().step();
    const unsigned int m0          = root_window.y().step();
    const unsigned int k0          = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
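    // Leftover along N0; a non-zero value makes the kernel store macros emit a
    // partial store for the last block along the output channel dimension.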
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    CLBuildOptions build_opts{};
    if(_settings.fast_relaxed_math())
    {
        build_opts.add_option("-cl-fast-relaxed-math");
    }
    else
    {
        // -cl-fast-relaxed-math implies both -cl-finite-math-only and -cl-unsafe-math-optimizations.
        // To avoid enabling -cl-finite-math-only, we pass only -cl-unsafe-math-optimizations.
        build_opts.add_option("-cl-unsafe-math-optimizations");
    }

    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplateDirectConv2d::get_config_id() const
{
    const DataType   data_type   = _src->data_type();
    const DataLayout data_layout = _src->data_layout();

    const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    const unsigned int kernel_size = _weight->dimension(width_idx);

    std::string config_id{};
    config_id += lower_string(string_from_data_type(data_type));
    config_id += "_";
    config_id += support::cpp11::to_string(kernel_size);
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().x());
    config_id += "_";
    config_id += support::cpp11::to_string(_attributes.stride().y());
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(data_layout));
    return config_id;
}

std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

Window ClTemplateDirectConv2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    const auto output_shape = _dst->tensor_shape();
    const auto desc         = _settings.direct_conv_descriptor();

    const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
    const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]);

    // Create and configure kernel window
    Window win = calculate_max_window(output_shape, Steps(n0, m0));

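    // Collapse the destination width and height into a single Y dimension,
    // rounded up to a multiple of m0.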
    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0);
    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));

    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute