xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/nhwc/pooling_layer.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2017-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "repeat.h"
26#include "tile_helpers.h"
27
28#if defined(POOL_AVG) || defined(POOL_L2)
29#define POOL_OP(x, y) ((x) + (y))
30#else /* defined(POOL_AVG) || defined(POOL_L2) */
31#define POOL_OP(x, y) (fmax((x), (y)))
32#endif /* defined(POOL_AVG) || defined(POOL_L2) */
33
34#if defined(POOL_L2)
35#define POW2_OP(x, vec_size) ((x) * (x))
36#else /* defined(POOL_L2) */
37#define POW2_OP(x, vec_size) (x)
38#endif /* defined(POOL_L2) */
39
40#define DIV_OP(x, y) (x * (1.f / y))
41#define SQRT_OP(x) sqrt((x))
42
43#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
44
45#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
46/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
47 * -# max, -DPOOL_MAX must be passed at compile time
48 * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
49 * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
50 *
51 * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
52 * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
53 * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
54 * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
55 * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
56 * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
57 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
58 * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
59 * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
60 * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
61 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
62 *
63 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F32/F16
64 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
65 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
66 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
67 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
68 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
69 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
70 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
71 * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
72 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
73 * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
74 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
75 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
76 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
77 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
78 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
79 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
80 * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
81 * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
82 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
83 */
84__kernel void pooling_layer_MxN_nhwc(
85    TENSOR4D_DECLARATION(input),
86    TENSOR4D_DECLARATION(output))
87{
88    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
89    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
90    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
91    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
92#if DST_BATCH_SIZE != 1
93    // If batch size != 1, the batch size dimension is collapsed over the height dimension
94    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
95    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
96#else  //DST_BATCH_SIZE != 1
97    int idx_out_h   = GET_SPATIAL_IDX(2, 1, 0);
98    int idx_out_n   = 0;
99#endif // DST_BATCH_SIZE != 1
100
101    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
102
103    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
104                                           output_stride_w;
105
106    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
107    res0 = INITIAL_VALUE;
108
109    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
110    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
111
112    int pool_x_s = max((int)0, -idx_in_w);
113    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
114    int pool_y_s = max((int)0, -idx_in_h);
115    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
116
117#if defined(EXCLUDE_PADDING)
118    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
119#else  // defined(EXCLUDE_PADDING)
120    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
121#endif // defined(EXCLUDE_PADDING)
122
123#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
124    // Global pooling path
125    for(int y = 0; y < POOL_SIZE_Y; ++y)
126    {
127#pragma unroll 8
128        for(int x = 0; x < POOL_SIZE_X; ++x)
129        {
130#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
131    for(int y = pool_y_s; y < pool_y_e; ++y)
132    {
133#pragma unroll 8
134        for(int x = pool_x_s; x < pool_x_e; ++x)
135        {
136#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
137            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
138            data0;
139#if defined(FP_MIXED_PRECISION)
140            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
141            data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
142#else  // defined(FP_MIXED_PRECISION)
143            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
144#endif // defined(FP_MIXED_PRECISION)
145
146#if defined(POOL_L2)
147            // Raise to power of 2 for L2 Pooling
148            data0 *= data0;
149#endif // defined(POOL_L2)
150            res0 = POOL_OP(res0, data0);
151        }
152    }
153
154#if defined(POOL_AVG) || defined(POOL_L2)
155    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
156#endif // defined(POOL_AVG) || defined(POOL_L2)
157
158#if defined(POOL_L2)
159    // Take square root of the result in L2 pooling
160    res0 = SQRT_OP(res0);
161#endif // defined(POOL_L2)
162
163    // Store result
164#if defined(FP_MIXED_PRECISION)
165    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
166    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
167    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
168#else  // defined(FP_MIXED_PRECISION)
169    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
170#endif // defined(FP_MIXED_PRECISION)
171}
172#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
173
174#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
175
176/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
177 * -# max, -DPOOL_MAX must be passed at compile time
178 * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
179 * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
180 * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
181 *
182 * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
183 * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
184 * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
185 * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
186 * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
187 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
188 * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
189 * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
190 * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
191 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
192 *
193 * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32/F16
194 * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
195 * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
196 * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
197 * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
198 * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
199 * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
200 * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
201 * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
202 * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
203 * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
204 * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
205 * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
206 * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
207 * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
208 * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
209 * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
210 * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
211 * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
212 * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
213 * @param[in]  indices_ptr                           (Optional) Pointer to the indices tensor. Supported data types: U32
214 * @param[in]  indices_stride_x                      (Optional) Stride of the indices tensor in X dimension (in bytes)
215 * @param[in]  indices_step_x                        (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
216 * @param[in]  indices_stride_y                      (Optional) Stride of the indices tensor in Y dimension (in bytes)
217 * @param[in]  indices_step_y                        (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
218 * @param[in]  indices_stride_z                      (Optional) Stride of the indices tensor in Z dimension (in bytes)
219 * @param[in]  indices_step_z                        (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
220 * @param[in]  indices_stride_w                      (Optional) Stride of the indices tensor in W dimension (in bytes)
221 * @param[in]  indices_step_w                        (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
222 * @param[in]  indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
223 */
224__kernel void pooling_layer_2x2_nhwc(
225    TENSOR4D_DECLARATION(input),
226    TENSOR4D_DECLARATION(output)
227#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
228    ,
229    TENSOR4D_DECLARATION(indices)
230#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
231)
232{
233    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
234    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
235    int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
236    int idx_out_w = get_global_id(1);
237#if DST_BATCH_SIZE != 1
238    // If batch size != 1, the batch size dimension is collapsed over the height dimension
239    int idx_out_h = get_global_id(2) % DST_HEIGHT;
240    int idx_out_n = get_global_id(2) / DST_HEIGHT;
241#else  //SRC_BATCH_SIZE != 1
242    int idx_out_h = get_global_id(2);
243    int idx_out_n = 0;
244#endif // SRC_BATCH_SIZE != 1
245
246    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
247    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
248
249    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
250
251    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
252                                           output_stride_w;
253
254    int pool_x_s = max((int)0, -idx_in_w);
255    int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
256    int pool_y_s = max((int)0, -idx_in_h);
257    int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
258
259    int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
260
261    int x0 = pool_x_s + idx_in_w;
262    int y0 = pool_y_s + idx_in_h;
263    int x1 = pool_x_e - 1 + idx_in_w;
264    int y1 = pool_y_e - 1 + idx_in_h;
265
266    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
267
268#if defined(FP_MIXED_PRECISION)
269    // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
270    data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
271    data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
272    data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
273    data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
274#else  // defined(FP_MIXED_PRECISION)
275    data0         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
276    data1         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
277    data2         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
278    data3         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
279#endif // defined(FP_MIXED_PRECISION)
280
281#if !defined(POOL_MAX)
282    if(filter_size != 4)
283    {
284        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
285        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
286        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
287        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
288
289        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
290        data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
291        data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
292        data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
293        data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
294    }
295#endif // !defined(POOL_MAX)
296
297#if defined(POOL_L2)
298    // Raise to power of 2 for L2 Pooling
299    data0 *= data0;
300    data1 *= data1;
301    data2 *= data2;
302    data3 *= data3;
303#endif /* defined(POOL_L2) */
304
305    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
306    res0 = data0;
307    res0 = POOL_OP(res0, data1);
308    res0 = POOL_OP(res0, data2);
309    res0 = POOL_OP(res0, data3);
310
311#if defined(POOL_AVG) || defined(POOL_L2)
312#if defined(EXCLUDE_PADDING)
313    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
314#else  // !defined(EXCLUDE_PADDING)
315    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
316#endif // defined(EXCLUDE_PADDING)
317#endif // defined(POOL_AVG) || defined(POOL_L2)
318
319#if defined(POOL_L2)
320    // Take square root of the result in L2 pooling
321    res0 = SQRT_OP(res0);
322#endif // defined(POOL_L2)
323
324    // Store result
325#if defined(FP_MIXED_PRECISION)
326    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
327    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
328    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
329#else  // defined(FP_MIXED_PRECISION)
330    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
331#endif // defined(FP_MIXED_PRECISION)
332
333#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
334
335    // This part is used to return the index of the maximum value
336    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
337
338    // note: Batch dimension does not contribute in the offset contribution
339    VEC_DATA_TYPE(uint, VEC_SIZE)
340    base_index = (uint)idx_out_c;
341
342    base_index += VEC_OFFS(uint, VEC_SIZE);
343
344    VEC_DATA_TYPE(uint, VEC_SIZE)
345    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
346    VEC_DATA_TYPE(uint, VEC_SIZE)
347    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
348    VEC_DATA_TYPE(uint, VEC_SIZE)
349    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
350    VEC_DATA_TYPE(uint, VEC_SIZE)
351    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
352
353    index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
354    index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
355    index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
356
357    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
358                                           indices_stride_w;
359
360    // Store result
361    STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
362#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
363}
364#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)