/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
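
/* Illustrative example (added note, not from the original source): per the
 * @note lines below, the host is expected to compile this file with build
 * options along the lines of
 *
 *   -DDATA_TYPE=float -DMIN_VALUE=0 -DVECTOR_SIZE=16 -DVECTOR_SIZE_LEFTOVER=3
 *
 * where VECTOR_SIZE_LEFTOVER is the remainder of the input's first dimension
 * divided by VECTOR_SIZE (e.g. a row width of 67 gives 67 % 16 = 3). The exact
 * values are chosen by the host-side kernel configuration.
 */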

/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_max_shift_exp_sum kernels.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum value computed for this row by the max_shift_exp_sum kernels
    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);

#if defined(LOG_SOFTMAX)
    // Log softmax: subtract log(sum) instead of dividing by the sum
    sum_val = log(sum_val);
    data0 -= sum_val;
#else  // defined(LOG_SOFTMAX)
    data0 /= sum_val;
#endif // defined(LOG_SOFTMAX)

    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}

#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
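
/* Background note (added for clarity): the kernels below implement the
 * numerically stable softmax
 *
 *   softmax(x_i) = exp(x_i - max_j x_j) / sum_j exp(x_j - max_j x_j)
 *
 * and, when LOG_SOFTMAX is defined, its logarithm
 *
 *   log_softmax(x_i) = (x_i - max_j x_j) - log(sum_j exp(x_j - max_j x_j))
 *
 * where max and sum run over the first dimension of each row. Shifting by the
 * row maximum keeps exp() from overflowing without changing the result.
 */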

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each shifted element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input's first dimension is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
 * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 */
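/* Illustrative example (added note, values assumed): for an F32 row of width
 * 67 processed serially with 16-wide vectors, the host might pass
 *
 *   -DDATA_TYPE=float -DMIN_VALUE=0 -DMINVAL=-FLT_MAX -DSRC_WIDTH=67
 *   -DVECTOR_SIZE=16 -DLOG_VECTOR_SIZE=4 -DVECTOR_SIZE_LEFTOVER=3
 *   -DNON_MULTIPLE_OF_VECTOR_SIZE
 *
 * LOG_VECTOR_SIZE appears to be log2(VECTOR_SIZE); the parallel kernel below
 * uses it to turn a division by VECTOR_SIZE into a right shift.
 */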
__kernel void softmax_layer_max_shift_exp_sum_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

#ifdef BETA
    // Initialize beta
    VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */

    // Initialize local maximum
    VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Leftover elements at the start of the row: lanes beyond VECTOR_SIZE_LEFTOVER
    // are masked to MINVAL so they cannot win the max
    VEC_TYPE data    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
    SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
    max_val_vec      = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        max_val_vec   = max(data, max_val_vec);
    }

    // Perform max reduction
    DATA_TYPE max_val                 = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)maxo.ptr) = max_val;

    /* Second section */

    // Set sum vector
    VEC_TYPE sum1D = 0;

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    data -= max_val;
#ifdef BETA
    data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data, 0, (__global DATA_TYPE *)dst_addr);
    data = exp(data);
    data = select(0, data, widx);
#else  /* LOG_SOFTMAX */
    data = exp(data);
    data = select(0, data, widx);
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data, 0, (__global DATA_TYPE *)dst_addr);
#endif /* LOG_SOFTMAX */
    sum1D += data;
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Shift values, exp and sum
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
        data = exp(data);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
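
    // Added note: at this point sum1D holds VECTOR_SIZE per-lane partial sums;
    // SUM_REDUCE below collapses them horizontally into a single scalar so that
    // one sum per row can be stored for the follow-up softmax_layer_norm kernel.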
    // Perform sum reduction
    *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
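
/* Added note: the serial kernel above uses one work-item per row. The parallel
 * variant below spreads each row across GRID_SIZE work-items of a work-group:
 * every work-item accumulates a private vector max/sum over a strided slice of
 * the row, the partial results meet in local memory, and a tree reduction (the
 * GRID_SIZE >= N ladders below) produces the final per-row value.
 */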

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponential of each shifted element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
 * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input's first dimension is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
 * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    const uint lid    = get_local_id(0);
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

#ifdef BETA
    // Initialize beta
    VEC_TYPE beta = (VEC_TYPE)BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_TYPE tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);

    // Number of iterations per work-item.
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
    // Calculate max of row
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
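
    // Added note: the access pattern is strided, not contiguous. Work-item lid
    // starts at element VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE (see x_offs)
    // and advances by GRID_SIZE * VECTOR_SIZE elements per iteration, so the
    // GRID_SIZE work-items jointly cover the row without overlap.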
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec       = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    SELECT_TYPE widx;
    if(lid == 0)
    {
        // Handle the leftover elements at the start of the row
        VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        widx              = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
        max_val_vec       = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
        max_local   = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_TYPE sum1D    = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width; ++i)
    {
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data = exp(data);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
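
    // Added note: the two leftover cases below mirror the max pass above. The
    // grid leftover runs one extra strided iteration on the first
    // boundary_workitems work-items; the vector leftover (lid == 0 only)
    // re-reads the first VECTOR_SIZE elements of the row and reuses the widx
    // mask so out-of-range lanes contribute 0 to sum1D.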
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        data = exp(data);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == 0)
    {
        // Handle the leftover elements at the start of the row
        VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        data -= max_val;
#ifdef BETA
        data *= beta;
#endif /* BETA */
#ifdef LOG_SOFTMAX
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        data = exp(data);
        data = select(0, data, widx);
#else  /* LOG_SOFTMAX */
        data = exp(data);
        data = select(0, data, widx);
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
#endif /* LOG_SOFTMAX */
        sum1D += data;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        // Perform sum reduction
        *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
    }
}

#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)