/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)

#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
#define VEC_FLOAT VEC_DATA_TYPE(float, VECTOR_SIZE)

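// For illustration: with -DDATA_TYPE=uchar and -DVECTOR_SIZE=16 (an example configuration,
// not the only supported one), these helpers expand VEC_BASE to uchar16, VEC_INT to int16
// and VEC_FLOAT to float16.
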
/** Divides all the values of the input tensor by the sum computed by the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently being processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
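    // In effect, for each difference d = max - x stored by the previous kernel, this kernel
    // computes dst = convert_sat(exp(-d * beta * SCALE) * 256 / sum) (shifted down by 128 for
    // QASYMM8_SIGNED), i.e. the softmax probability requantized to the 8-bit output range.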
    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

#ifdef BETA
    // Initialize beta
    VEC_FLOAT beta       = (VEC_FLOAT)BETA;
    VEC_FLOAT scale_beta = -BETA * SCALE;
#else  /* BETA */
    VEC_FLOAT scale_beta = -SCALE;
#endif /* BETA */

    // Load the sum value of the 1D logits vector (row)
    float sum_val         = *((__global float *)offset(&sum, 0, get_global_id(1)));
    float sum_val_inverse = 256.f / sum_val;

    VEC_INT   data_diff   = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
    VEC_FLOAT data_diff_f = CONVERT(data_diff, VEC_FLOAT);

    data_diff_f *= scale_beta;
    data_diff_f = exp(data_diff_f);
    data_diff_f *= sum_val_inverse;

#ifdef QASYMM8_SIGNED
    data_diff_f -= 128.f;
#endif /* QASYMM8_SIGNED */
    VEC_INT  data  = CONVERT(data_diff_f, VEC_INT);
    VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
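    // STORE_VECTOR_SELECT writes either the full vector or only the first VECTOR_SIZE_LEFTOVER
    // elements, depending on whether this work-item handles the leftover block at the row start.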
    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}

#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)

VEC_INT mult_by_quantized_multiplier(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}
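// ASYMM_MULT is a saturating, rounding, doubling high multiply (as in gemmlowp), so the call
// above effectively scales data by beta ~= INPUT_BETA_MULTIPLIER * 2^(INPUT_BETA_LEFT_SHIFT - 31),
// i.e. beta is folded into the fixed-point representation of the shifted differences.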

/** Identifies the maximum value across the first dimension, shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently being processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);

#ifdef BETA
    // Initialize beta
    VEC_FLOAT beta       = (VEC_FLOAT)BETA;
    VEC_FLOAT scale_beta = -BETA * SCALE;
#else  /* BETA */
    VEC_FLOAT scale_beta = -SCALE;
#endif /* BETA */

    // Calculate max of row
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data        = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
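    // Build a per-lane mask that is true only for the first VECTOR_SIZE_LEFTOVER lanes, so the
    // out-of-range lanes keep MIN_VALUE and cannot influence the row maximum.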
    VEC_INT widx         = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
    max_val_vec          = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        max_val_vec   = max(data, max_val_vec);
    }

    // Perform max reduction
    DATA_TYPE max_local               = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)maxo.ptr) = max_local;

    // Second part

    // Use the max value of the 1D logits vector (row)
    int       max_val = convert_int(max_local);
    VEC_FLOAT sum1D_f = 0.f;
    // Start with the leftover items
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT   data_fp   = CONVERT(data, VEC_INT);
    VEC_INT   data_diff = max_val - data_fp;
    VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
    data_fp_f *= scale_beta;
    data_fp_f = exp(data_fp_f);
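    // Zero out the lanes beyond the leftover count so they do not contribute to the row sum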
    data_fp_f = select(0, data_fp_f, widx);
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data_diff, 0, (__global int *)dst_addr);
    sum1D_f += data_fp_f;
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
    // Do the rest and compute exp and sum
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data       = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        VEC_INT   data_fp   = CONVERT(data, VEC_INT);
        VEC_INT   data_diff = max_val - data_fp;
        VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
        data_fp_f *= scale_beta;
        data_fp_f = exp(data_fp_f);
        sum1D_f += data_fp_f;
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
    }
    // Perform sum reduction
    *((__global float *)sum.ptr) = SUM_REDUCE(sum1D_f, VECTOR_SIZE);
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently being processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    const uint lid    = get_local_id(0);
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Define one temporary vector per work-item.
    __local VEC_INT tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE max_val_vec = vec_min_val;

    // Number of iterations per work-item.
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
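    // Each work-item handles VECTOR_SIZE elements per iteration and strides by
    // GRID_SIZE * VECTOR_SIZE, so consecutive work-items read adjacent vectors (coalesced access).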
    // Calculate max of row
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT widx;
    if(lid == 0)
    {
        // Handle the leftover elements when SRC_WIDTH is not a multiple of VECTOR_SIZE
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        widx              = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
        max_val_vec       = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);

    barrier(CLK_LOCAL_MEM_FENCE);

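    // Tree reduction in local memory: each step folds the upper half of the active work-items
    // into the lower half, halving the active count until two partial vectors remain.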
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
        max_local   = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_INT sum1D   = 0;
    int     max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width; ++i)
    {
        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == 0)
    {
        // Handle the leftover elements at the start of the row when SRC_WIDTH is not a multiple of VECTOR_SIZE
        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
        data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
        data_fp = select(0, data_fp, widx);
        sum1D   = sum1D + data_fp;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

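    // Same tree reduction as above, this time accumulating the per-work-item partial sums.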
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        // Perform sum reduction
        *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
    }
}
#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
#endif /* defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) */
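
// For reference, an illustrative (not exhaustive) set of build options for a QASYMM8 row of
// width 33 processed with 16-wide vectors: -DDATA_TYPE=uchar -DMIN_VALUE=0 -DVECTOR_SIZE=16
// -DVECTOR_SIZE_LEFTOVER=1 -DSRC_WIDTH=33 -DLOG_VECTOR_SIZE=4 -DNON_MULTIPLE_OF_VECTOR_SIZE
// -DDIFF_MIN=<threshold> -DSCALE=<quantization scale>, with -DQASYMM8_SIGNED, -DBETA and the
// -DINPUT_BETA_* options added as required. The exact values are chosen by the host-side
// kernel configuration and are shown here only as an example.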