/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplateElementwiseBinary.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
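// Number of bytes processed per work-item along the X dimension; used in get_window() to derive the vector size N0.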
constexpr unsigned int vector_size_byte_opencl = 16;

ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId                      id,
                                                         const ArgumentPack<ITensorInfo> &tensors,
                                                         const Attributes                &attributes)
    : IGpuTemplateComponentWriter{ id, tensors },
      _lhs{},
      _rhs{},
      _dst{},
      _attributes{ attributes }
{
    _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
}

std::string ClTemplateElementwiseBinary::get_name() const
{
    return "elementwise_binary";
}

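// Assembles the component's code from template snippets. The emitted snippets depend on whether this component is
// the root of the fused kernel, and on whether the lhs/rhs tensors are kernel inputs (loaded here) or tiles produced
// by an earlier component in the group.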
std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const
{
    std::string code;
    const bool  is_root      = (comp_group.get_root_component()->id() == this->id());
    const bool  is_lhs_input = comp_group.is_input_tensor(_lhs);
    const bool  is_rhs_input = comp_group.is_input_tensor(_rhs);

    code =
R"_(
    //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";

    if(is_root)
    {
        code +=
R"_(
    TILE(uint, M0, 1, g_dst_indirect_y);
)_";
    }

    if(is_lhs_input)
    {
        code +=
R"_(
    TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}});
)_";
    }

    if(is_rhs_input)
    {
        code +=
R"_(
    TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}});
)_";
    }

    code +=
R"_(
    {
)_";

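    // Load the lhs/rhs tiles from global memory only when they are kernel inputs; otherwise the tiles were already
    // written by a previous component in the group.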
    if(is_lhs_input)
    {
        code +=
R"_(
        {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w;
        T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
)_";
    }

    if(is_rhs_input)
    {
        code +=
R"_(
        {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w;
        T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
)_";
    }

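    // Apply the binary operation on the tiles; {{BROADCAST_OP}} selects the broadcast variant of the T_ELTWISE
    // macro when one operand was loaded as a single row.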
    code +=
R"_(
        T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
)_";

    if(is_root)
    {
        // Calculate the destination indirect Y
        code +=
R"_(
        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
            g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
        })
)_";
    }

    code +=
R"_(
    }
    //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";

    return code;
}
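// Registers the lhs, rhs and dst tensors in the shared variable table, so get_tag_lut() can reference them by name.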
void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        comp_group,
        _lhs,
        GpuKernelArgumentInfo(common_tensor_type),
        "lhs");

    vtable.declare_variable(
        comp_group,
        _rhs,
        GpuKernelArgumentInfo(common_tensor_type),
        "rhs");

    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(common_tensor_type),
        "dst");
}

TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    TagLUT lut{};

    // Local build options
    lut["meta_kernel_id"] = id();
    lut["DATA_TYPE"]      = get_cl_type_from_data_type(_lhs->data_type());

    // Arguments and global shared variables
    lut["lhs"]     = vtable.get_variable(_lhs);
    lut["rhs"]     = vtable.get_variable(_rhs);
    lut["dst"]     = vtable.get_variable(_dst);
    lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());

    switch(_attributes.operation())
    {
        case Attributes::ElementwiseOp::Add:
            lut["ELTWISE_OP"] = "ADD";
            break;
        case Attributes::ElementwiseOp::Sub:
            lut["ELTWISE_OP"] = "SUB";
            break;
        case Attributes::ElementwiseOp::Mul:
            lut["ELTWISE_OP"] = "MUL";
            break;
        default:
            ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
    }

    ARM_COMPUTE_ERROR_ON(
        comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
    ARM_COMPUTE_ERROR_ON(
        comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));

    // Set broadcast parameters
    // PRE: All tensors are broadcast-compatible
    const auto &lhs_dims = _lhs->tensor_shape();
    const auto &rhs_dims = _rhs->tensor_shape();
    const auto &dst_dims = _dst->tensor_shape();

    const auto lhs_broadcast_x = dst_dims[0] != 1 && lhs_dims[0] == 1;
    const auto rhs_broadcast_x = dst_dims[0] != 1 && rhs_dims[0] == 1;
    const auto lhs_broadcast_y = dst_dims[1] != 1 && lhs_dims[1] == 1;
    const auto rhs_broadcast_y = dst_dims[1] != 1 && rhs_dims[1] == 1;
    const auto lhs_broadcast_z = dst_dims[2] != 1 && lhs_dims[2] == 1;
    const auto rhs_broadcast_z = dst_dims[2] != 1 && rhs_dims[2] == 1;

    const auto lhs_broadcast_yz = lhs_broadcast_y && lhs_broadcast_z;
    const auto rhs_broadcast_yz = rhs_broadcast_y && rhs_broadcast_z;

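    // A tensor broadcast along X is loaded as a single column (n0 = 1) anchored at x = 0, while a tensor broadcast
    // along both Y and Z is loaded as a single row (m0 = 1) anchored at y = 0.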
219 lut["lhs_n0"] = (lhs_broadcast_x) ? "1" : "N0";
220 lut["lhs_start_ind_0"] = (lhs_broadcast_x) ? "0" : "g_ind_0";
221 lut["rhs_n0"] = (rhs_broadcast_x) ? "1" : "N0";
222 lut["rhs_start_ind_0"] = (rhs_broadcast_x) ? "0" : "g_ind_0";
223
224 lut["lhs_m0"] = (lhs_broadcast_yz) ? "1" : "M0";
225 lut["lhs_start_ind_1"] = (lhs_broadcast_yz) ? "0" : "g_ind_1";
226 lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0";
227 lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
228
229 lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" :
230 (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" :
231 "";
232
233 return lut;
234 }

CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const
{
    CLBuildOptions build_opts{};
    /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0          = root_window.x().step();
    const unsigned int m0          = root_window.y().step();
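    // Leftover elements along X when the destination width is not a multiple of N0; the OpenCL side uses
    // PARTIAL_N0 to handle the partial (tail) stores.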
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;

    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type()));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}
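// Builds a configuration id from the destination data type, shape and layout, identifying this kernel
// configuration (e.g. for tuning).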
std::string ClTemplateElementwiseBinary::get_config_id() const
{
    std::string config_id{};
    config_id += lower_string(string_from_data_type(_dst->data_type()));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(0));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(1));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(_dst->data_layout()));

    return config_id;
}

std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
}

Window ClTemplateElementwiseBinary::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");

    TensorShape output_shape = _dst->tensor_shape();
    // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
    // This is in line with the collapsing convention used by operators like Conv2d
    output_shape.collapse(2U, 1U);
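    // Process up to vector_size_byte_opencl bytes per work-item along X, capping the vector size at the
    // destination width.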
    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));

    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
289