xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/common/reduction_operation.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(FLOAT_DATA_TYPE)
#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
#else // !FLOAT_DATA_TYPE
#if defined(WIDTH)
#define ISGREATER(x, y) (x > y) ? 1 : 0
#define ISLESS(x, y) (x < y) ? 1 : 0
#define ISGREATER_SCALAR ISGREATER
#define ISLESS_SCALAR ISLESS
#else // !defined(WIDTH)
#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
#endif // defined(WIDTH)
#endif // defined(FLOAT_DATA_TYPE)
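// The vector forms of ISGREATER/ISLESS evaluate to a per-lane mask of 0 or -1. In OpenCL C,
// select(a, b, c) with a vector predicate picks b in every lane whose mask value has its MSB set,
// so a -1 lane takes the new value and a 0 lane keeps the accumulator. For illustration:
//
//   res = select(res, in, ISGREATER(in, res)); // per-lane maximum of res and in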

#if defined(WIDTH)
#if defined(OPERATION)

#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))

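// Illustrative build options for the parallel x-axis reduction below (the values are examples
// only). OPERATION is substituted textually, so with -DOPERATION=sum the loop body
// OPERATION(res, vals, VEC_SIZE) expands to (res + SUM_REDUCE(vals, VEC_SIZE)):
//
//   -DDATA_TYPE=float -DOPERATION=sum -DWIDTH=128 -DVEC_SIZE=16
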
/** This kernel performs a parallel reduction with the given operation on the x-axis.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
 * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
 * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
 * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
 *
 * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
 * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                          Pointer to the destination tensor. Supported data types: same as @p input
 * @param[in] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_x(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;

#if defined(PROD)
    DATA_TYPE res = (DATA_TYPE)1;
#else  // defined(PROD)
    DATA_TYPE res = (DATA_TYPE)0;
#endif // defined(PROD)

    int x = 0;

    for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
    {
        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
        res  = OPERATION(res, vals, VEC_SIZE);
    }

#if(WIDTH % VEC_SIZE)
    _Pragma("unroll") for(; x < WIDTH; ++x)
    {
        DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
        res           = OPERATION(res, val, 1);
    }
#endif // (WIDTH % VEC_SIZE)

#if defined(MEAN)
    res /= WIDTH;
#endif // defined(MEAN)
    *((__global DATA_TYPE *)output_addr) = res;
}
#endif // defined(OPERATION)
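// Illustrative build options for the non-parallel x-axis reduction below (the values are
// examples only). OFFSET and SCALE are only defined for quantized inputs; in that case a PROD
// reduction is accumulated in float via DEQUANTIZE and re-quantized before the final store:
//
//   -DDATA_TYPE=uchar -DDATA_TYPE_PROMOTED=uchar -DWIDTH=64 -DPROD -DOFFSET=3 -DSCALE=0.5f
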
/** This kernel performs a reduction on the x-axis (non-parallel).
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
 * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
 * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
 *
 * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
 * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                          Pointer to the destination tensor holding the reduced (e.g. summed) values. Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_non_parallel_x(
    VECTOR_DECLARATION(input),
    VECTOR_DECLARATION(output))
{
    Vector input  = CONVERT_TO_VECTOR_STRUCT(input);
    Vector output = CONVERT_TO_VECTOR_STRUCT(output);

    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);

    // Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

    for(unsigned int x = 1; x < WIDTH; ++x)
    {
        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
#if defined(MIN)
        res = select(res, in, ISLESS_SCALAR(in, res));
#elif defined(MAX)
        res = select(res, in, ISGREATER_SCALAR(in, res));
#elif defined(PROD)
#if defined(OFFSET) && defined(SCALE)
        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
#else  // !(defined(OFFSET) && defined(SCALE))
        res *= in;
#endif //  defined(OFFSET) && defined(SCALE)
#else  // defined(SUM)
        res += in;
#endif // defined(MAX) || defined(MIN) || defined(PROD)
    }

    // Store result
#if defined(MEAN)
    res /= WIDTH;
#endif // defined(MEAN)

    // Subtract the offsets in case of quantized SUM
#if defined(SUM) && defined(OFFSET) && defined(SCALE)
    res -= (WIDTH - 1) * OFFSET;
#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)

    // Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
}
#endif // defined(WIDTH)

#if defined(HEIGHT)
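// Illustrative build options for the y-axis reduction below (the values are examples only).
// VEC_SIZE_LEFTOVER is typically the input width modulo VEC_SIZE and, together with
// STORE_VECTOR_SELECT, drives the partial store issued by the first work-item along x:
//
//   -DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float -DFLOAT_DATA_TYPE -DHEIGHT=64 -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0 -DMEAN
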
/** This kernel performs a reduction on the y-axis.
 *
 * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
 * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
 * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
 * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                          Pointer to the destination tensor holding the reduced (e.g. summed) values. Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_y(
    IMAGE_DECLARATION(input),
    IMAGE_DECLARATION(output))
{
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);

    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;

    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));

    // Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    VEC_DATA_TYPE(float, VEC_SIZE)
    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

#if defined(SUM_SQUARE)
    res *= res;
#endif // defined(SUM_SQUARE)

    for(unsigned int y = 1; y < HEIGHT; ++y)
    {
        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
#if defined(MIN)
        res = select(res, in, ISLESS(in, res));
#elif defined(MAX)
        res = select(res, in, ISGREATER(in, res));
#else // !(defined(MAX) || defined(MIN))
#if defined(SUM_SQUARE)
        in *= in;
#endif // defined(SUM_SQUARE)
#if defined(PROD)

#if defined(OFFSET) && defined(SCALE)
        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else  // !(defined(OFFSET) && defined(SCALE))
        res *= in;
#endif //  defined(OFFSET) && defined(SCALE)

#else  // !defined(PROD)
        res += in;
#endif // defined(PROD)
#endif // defined(MAX) || defined(MIN)
    }

#if defined(MEAN)
    res /= HEIGHT;
#endif // defined(MEAN)

    // Subtract the offsets in case of quantized SUM
#if defined(SUM) && defined(OFFSET) && defined(SCALE)
    res -= (HEIGHT - 1) * OFFSET;
#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)

    // Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

    // Store result
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(HEIGHT)

#if defined(DEPTH)
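// Illustrative build options for the z-axis reduction below (the values are examples only); the
// kernel mirrors the y-axis reduction but walks input_stride_z instead of input_stride_y:
//
//   -DDATA_TYPE=int -DDATA_TYPE_PROMOTED=int -DDEPTH=32 -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=5 -DSUM_SQUARE
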
/** This kernel performs a reduction on the z-axis.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
 * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                          Pointer to the destination tensor holding the reduced (e.g. summed) values. Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
 * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_z(
    TENSOR3D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;

    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));

    // Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    VEC_DATA_TYPE(float, VEC_SIZE)
    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

#if defined(SUM_SQUARE)
    res *= res;
#endif // defined(SUM_SQUARE)

    for(unsigned int z = 1; z < DEPTH; ++z)
    {
        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));

#if defined(MIN)
        res = select(res, in, ISLESS(in, res));
#elif defined(MAX)
        res = select(res, in, ISGREATER(in, res));
#else // !(defined(MAX) || defined(MIN))
#if defined(SUM_SQUARE)
        in *= in;
#endif // defined(SUM_SQUARE)
#if defined(PROD)

#if defined(OFFSET) && defined(SCALE)
        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else  // !(defined(OFFSET) && defined(SCALE))
        res *= in;
#endif //  defined(OFFSET) && defined(SCALE)

#else  // !defined(PROD)
        res += in;
#endif // defined(PROD)
#endif // defined(MAX) || defined(MIN)
    }

#if defined(MEAN)
    res /= DEPTH;
#endif // defined(MEAN)

    // Subtract the offsets in case of quantized SUM
#if defined(SUM) && defined(OFFSET) && defined(SCALE)
    res -= (DEPTH - 1) * OFFSET;
#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)

    // Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

    // Store result
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));

    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(DEPTH) */

#if defined(BATCH) && defined(DEPTH)
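// Illustrative build options for the w-axis (batch) reduction below (the values are examples
// only). DEPTH is needed as well because the collapsed z/w coordinate is split back into
// (z % DEPTH) and (z / DEPTH) when computing the input and output addresses:
//
//   -DDATA_TYPE=half -DDATA_TYPE_PROMOTED=half -DFLOAT_DATA_TYPE -DBATCH=8 -DDEPTH=16 -DVEC_SIZE=8 -DVEC_SIZE_LEFTOVER=0 -DMIN
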
/** This kernel performs a reduction on the w-axis.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The data type of the intermediate results must be passed at compile time using -DDATA_TYPE_PROMOTED: e.g. -DDATA_TYPE_PROMOTED=uint
 * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 * @note The vector size must be passed at compile time using -DVEC_SIZE e.g. -DVEC_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
 * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                          Pointer to the destination tensor holding the reduced (e.g. summed) values. Supported data types: same as @p input_ptr
 * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
 * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
 * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
 * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
 * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void reduction_operation_w(
    TENSOR4D_DECLARATION(input),
    TENSOR4D_DECLARATION(output))
{
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_w;

    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));

    // Convert input into F32 in order to perform quantized multiplication
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    VEC_DATA_TYPE(float, VEC_SIZE)
    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

#if defined(SUM_SQUARE)
    res *= res;
#endif // defined(SUM_SQUARE)

    for(unsigned int w = 1; w < BATCH; ++w)
    {
        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));

#if defined(MIN)
        res = select(res, in, ISLESS(in, res));
#elif defined(MAX)
        res = select(res, in, ISGREATER(in, res));
#else // !(defined(MAX) || defined(MIN))
#if defined(SUM_SQUARE)
        in *= in;
#endif // defined(SUM_SQUARE)
#if defined(PROD)

#if defined(OFFSET) && defined(SCALE)
        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#else  // !(defined(OFFSET) && defined(SCALE))
        res *= in;
#endif //  defined(OFFSET) && defined(SCALE)

#else  // !defined(PROD)
        res += in;
#endif // defined(PROD)
#endif // defined(MAX) || defined(MIN)
    }

#if defined(MEAN)
    res /= BATCH;
#endif // defined(MEAN)

    // Subtract the offsets in case of quantized SUM
#if defined(SUM) && defined(OFFSET) && defined(SCALE)
    res -= (BATCH - 1) * OFFSET;
#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)

    // Re-quantize
#if defined(PROD) && defined(OFFSET) && defined(SCALE)
    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)

    // Store result
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif /* defined(BATCH) && defined(DEPTH) */