/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClTemplatePool2d.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/WindowHelpers.h"

#include "support/StringSupport.h"
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
namespace
{
// Shape indexes for the NHWC data layout
constexpr static int32_t batch_idx   = 3;
constexpr static int32_t height_idx  = 2;
constexpr static int32_t width_idx   = 1;
constexpr static int32_t channel_idx = 0;
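// Note (illustrative): with this mapping, dimension(0) of an NHWC tensor is the
// channel count, dimension(1) the width, dimension(2) the height and dimension(3)
// the batch size, so e.g. _src->dimension(width_idx) returns W.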
} // namespace
ClTemplatePool2d::ClTemplatePool2d(ComponentId                      id,
                                   const ArgumentPack<ITensorInfo> &tensors,
                                   const Attributes                &attributes,
                                   const Settings                  &settings)
    : IGpuTemplateComponentWriter{ id, tensors },
      _src{},
      _dst{},
      _attributes{ attributes },
      _settings{ settings }
{
    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}

std::string ClTemplatePool2d::get_name() const
{
    return "pool2d";
}

std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    // Use the optimized kernel when the pool size is exactly 2x2
    if(_attributes.pool_size() == Size2D(2, 2))
    {
        return get_2x2_kernel_code();
    }
    else
    {
        return get_MxN_kernel_code();
    }
}

std::string ClTemplatePool2d::get_MxN_kernel_code() const
{
    const auto pool_type          = _attributes.pool_type();
    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;

    // Define the POOL_OP macro: AVG accumulates with addition (the sum is divided by the filter size at the end), MAX keeps a running fmax
    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";

    // Kernel start
    // Note: if C is not a multiple of N0, the work-item with get_global_id(0) == 0 shifts back by PARTIAL_N0 elements to compute the leftover elements
    // Note: if C is less than N0, N0 is shrunk to the closest smaller valid N0. This adjustment is performed on the host side
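    // Example (illustrative): with C = 20 and N0 = 8, PARTIAL_N0 = 20 % 8 = 4, so the
    // work-item with get_global_id(0) == 0 handles a 4-element leftover instead of a
    // full 8-wide vector (see STORE_VECTOR_SELECT below and get_build_options())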
    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src)            {{src}}
// OUT(dst, accum)      {{dst}}

{
    const int idx_out_c = g_ind_0;
    const int idx_out_w = g_ind_1;
)_";

    // Add macro for POOL_OP
    code += "\n" + pool_op + "\n";

    code += R"_(
    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
)_";

    // Define common variables.
    code += R"_(
    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;

    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;

    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
    res0 = {{INITIAL_VALUE}};

    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};

    const int pool_x_s = max((int)0, -idx_in_w);
    const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
    const int pool_y_s = max((int)0, -idx_in_h);
    const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
)_";
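    // Worked example (illustrative): with POOL_SIZE_X = 3, PAD_X = 1, STRIDE_X = 1 and
    // SRC_WIDTH = 10, the first output column gives idx_in_w = -1, hence pool_x_s = 1 and
    // pool_x_e = 3; the loops below then only visit the in-bounds part of the window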

    // Determine the filter size depending on whether padding is excluded
    if(_attributes.exclude_padding())
    {
        code += R"_(
    const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
)_";
    }
    else
    {
        code += R"_(
    const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
)_";
    }
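    // Example (illustrative): for a 3x3 AVG pool at a corner where one row and one column
    // of the window fall into padding, exclude_padding divides by the 4 valid elements
    // instead of 9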

    // Generate the pooling loops; take the global-pooling path when the pool size covers the whole input plane
    if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
    {
        // Begin loop
        code += R"_(
    // Global pooling path
    for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
    {
    #pragma unroll 8
        for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
        {
            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
            data0;
)_";
    }
    else // local pooling size
    {
        code += R"_(
    for(int y = pool_y_s; y < pool_y_e; ++y)
    {
    #pragma unroll 8
        for(int x = pool_x_s; x < pool_x_e; ++x)
        {
            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
            data0;
)_";
    } // end else

    // Loop body: load the data, accumulate with POOL_OP and close the loops.
    // Use a 32-bit accumulator when mixed precision is enabled.
    if(fp_mixed_precision)
    {
        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE, so convert on load
        code += R"_(
            data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
            res0 = POOL_OP(res0, data0);
        }
    }
)_";
    }
    else // load data, compute result and end loop
    {
        code += R"_(
            data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
            res0 = POOL_OP(res0, data0);
        }
    }
)_";
    }

    // For AVG pooling only, divide the accumulated result by the filter size
    if(pool_type == PoolingType::AVG)
    {
        code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
)_";
    }

    // If mixed precision, convert back to DATA_TYPE before storing. Then end the kernel
    if(fp_mixed_precision)
    {
        code += R"_(
    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }
    else
    {
        // Store data
        code += R"_(
    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }

    code += R"_(
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
}
)_";

    return code;
}

std::string ClTemplatePool2d::get_2x2_kernel_code() const
{
    const auto  pool_type          = _attributes.pool_type();
    const bool  fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
    std::string pool_op            = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";

    std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src)            {{src}}
// OUT(dst, accum)      {{dst}}

#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)

{
    const int idx_out_c = g_ind_0;
    const int idx_out_w = g_ind_1;
)_";

    // Add pool op macro
    code += "\n" + pool_op + "\n";

    // If batch size != 1, the batch dimension is collapsed over the height dimension
    code += R"_(
    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
)_";
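    // Example (illustrative): with DST_HEIGHT = 7, g_ind_2 = 15 maps to idx_out_h = 1
    // and idx_out_n = 2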

    code += R"_(
    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};

    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
                                           {{dst}}_stride_w;
    const int pool_x_s = max((int)0, -idx_in_w);
    const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
    const int pool_y_s = max((int)0, -idx_in_h);
    const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);

    const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
    const int x0 = pool_x_s + idx_in_w;
    const int y0 = pool_y_s + idx_in_h;
    const int x1 = pool_x_e - 1 + idx_in_w;
    const int y1 = pool_y_e - 1 + idx_in_h;

    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
)_";
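    // Note (illustrative): data0..data3 hold the four corners of the 2x2 window at
    // (x0, y0), (x1, y0), (x0, y1) and (x1, y1) respectively; x0/x1 and y0/y1 were
    // clamped to the valid input range above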

    if(fp_mixed_precision)
    {
        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE, so convert on load
        code += R"_(
    data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
    data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
)_";
    }
    else
    {
        code += R"_(
    data0         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
    data1         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
    data2         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
    data3         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
)_";
    }

    if(pool_type != PoolingType::MAX)
    {
        // Invalidate the loaded values if the x or y coordinate was clamped (out-of-bounds)
        code += R"_(
    if(filter_size != 4)
    {
        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);

        data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
        data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
        data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
        data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
    }
)_";
    }
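    // Note (illustrative): for AVG pooling {{INITIAL_VALUE}} is 0 (see get_tag_lut()),
    // so clamped corners contribute nothing to the sum. MAX pooling skips this step
    // because a clamped corner merely duplicates an in-bounds value, which cannot
    // change the maximum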

    code += R"_(
    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
    res0 = data0;
    res0 = POOL_OP(res0, data1);
    res0 = POOL_OP(res0, data2);
    res0 = POOL_OP(res0, data3);
)_";

    if(pool_type == PoolingType::AVG)
    {
        // For AVG pooling, divide by the valid filter size, or by the full 2x2 window if padding is included
        if(_attributes.exclude_padding())
        {
            code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
)_";
        }
        else
        {
            code += R"_(
    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
)_";
        }
    }

    // Store the result, converting back to DATA_TYPE first when mixed precision is enabled
    if(fp_mixed_precision)
    {
        code += R"_(
    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }
    else
    {
        code += R"_(
    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
)_";
    }

    code += R"_(
    //------------------ END KERNEL {{meta_kernel_id}} ---------------------
}
#undef SELECT_TYPE
)_";

    return code;
}

void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    vtable.declare_variable(
        comp_group,
        _src,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "src");

    vtable.declare_variable(
        comp_group,
        _dst,
        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
        "dst");
}

TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
    ARM_COMPUTE_UNUSED(comp_group);

    TagLUT lut{};
    // Arguments and global shared variables
    lut["src"] = vtable.get_variable(_src);
    lut["dst"] = vtable.get_variable(_dst);

    // Local build options
    lut["meta_kernel_id"] = id();

    // Retrieve relevant data
    const auto padding                = _attributes.pad();
    const auto stride                 = _attributes.stride();
    const auto pool_size              = _attributes.pool_size();
    const auto data_type              = _src->data_type();
    const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;

    // Pool-specific parameters
    lut["STRIDE_X"]    = stride.x();
    lut["STRIDE_Y"]    = stride.y();
    lut["PAD_X"]       = padding.left;
    lut["PAD_Y"]       = padding.top;
    lut["POOL_SIZE_X"] = pool_size.width;
    lut["POOL_SIZE_Y"] = pool_size.height;

    // Data types and variables
    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
    lut["DATA_TYPE"]     = get_cl_type_from_data_type(data_type);
    lut["SRC_WIDTH"]     = _src->dimension(width_idx);
    lut["SRC_HEIGHT"]    = _src->dimension(height_idx);
    lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? float_to_string_with_full_precision(std::numeric_limits<float>::lowest()) : std::string("0");

    // Tensor-specific data
    lut["DST_HEIGHT"] = _dst->dimension(height_idx);

    return lut;
}

CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
{
    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
    const unsigned int n0               = root_window.x().step();
    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
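    // Note (illustrative): N0 is the root component's window step along x, and PARTIAL_N0
    // is the channel leftover handled by the first work-item (see the notes in
    // get_MxN_kernel_code())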

    CLBuildOptions build_opts{};
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}

std::string ClTemplatePool2d::get_config_id() const
{
    const DataType   data_type   = _src->data_type();
    const DataLayout data_layout = _src->data_layout();

    std::string config_id{};
    config_id += "pooling_layer_2d_";
    config_id += lower_string(string_from_data_type(data_type));
    config_id += "_";
    config_id += lower_string(string_from_data_layout(data_layout));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
    config_id += "_";
    config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
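    // Example (illustrative): an F16 NHWC destination of width 112, height 112 and
    // 64 channels yields "pooling_layer_2d_f16_nhwc_112_112_64"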

    return config_id;
}

std::set<std::string> ClTemplatePool2d::get_headers_list() const
{
    return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
}

Window ClTemplatePool2d::get_window() const
{
    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
    const auto         output_shape = _dst->tensor_shape();
    const unsigned int vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
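    // Note (illustrative): 2 elements are processed per work-item for F32 and 4 otherwise;
    // adjust_vec_size() shrinks this step when it exceeds the channel count, matching the
    // host-side N0 adjustment described in get_MxN_kernel_code()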

    // Create and configure kernel window
    auto win = calculate_max_window(output_shape, Steps(vec_size));
    win      = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
    return win;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute