/*
 * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "gemm_helpers.h"
#include "helpers_asymm.h"
#include "repeat.h"
#include "tile_helpers.h"

#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#define ARM_DOT1(a, b, c) \
    ({ \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
    })
#define ARM_DOT2(a, b, c) \
    ({ \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
    })
#define ARM_DOT3(a, b, c) \
    ({ \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
    })
#define ARM_DOT4(a, b, c) \
    ({ \
        ARM_DOT(a, b, c); \
    })
#define ARM_DOT8(a, b, c) \
    ({ \
        ARM_DOT4((a.lo), (b.lo), c); \
        ARM_DOT4((a.hi), (b.hi), c); \
    })
#define ARM_DOT16(a, b, c) \
    ({ \
        ARM_DOT8((a.lo), (b.lo), c); \
        ARM_DOT8((a.hi), (b.hi), c); \
    })

#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
#define ARM_DOT1(a, b, c) \
    ({ \
        c += (ACC_DATA_TYPE)a * b; \
    })
#define ARM_DOT2(a, b, c) \
    ({ \
        c += (ACC_DATA_TYPE)a.s0 * b.s0; \
        c += (ACC_DATA_TYPE)a.s1 * b.s1; \
    })
#define ARM_DOT3(a, b, c) \
    ({ \
        ARM_DOT2(a, b, c); \
        c += (ACC_DATA_TYPE)a.s2 * b.s2; \
    })
#define ARM_DOT4(a, b, c) \
    ({ \
        ARM_DOT3(a, b, c); \
        c += (ACC_DATA_TYPE)a.s3 * b.s3; \
    })
#define ARM_DOT8(a, b, c) \
    ({ \
        ARM_DOT4((a.lo), (b.lo), c); \
        ARM_DOT4((a.hi), (b.hi), c); \
    })
#define ARM_DOT16(a, b, c) \
    ({ \
        ARM_DOT8((a.lo), (b.lo), c); \
        ARM_DOT8((a.hi), (b.hi), c); \
    })
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
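
// Illustrative note (not part of the original macros): on the fallback path above,
// with DATA_TYPE=uchar and ACC_DATA_TYPE=uint, ARM_DOT4(a, b, c) accumulates
//   c += (uint)a.s0 * b.s0; ... c += (uint)a.s3 * b.s3;
// whereas on the dot8 path the same call maps to a single arm_dot()/arm_dot_acc()
// instruction operating on 4-element vectors.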

/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
#define ARM_DOT_K0X1(k0, a, b, c) \
    ({ \
        ARM_DOT_K0(k0, (a), (b##0), (c)); \
    })
#define ARM_DOT_K0X2(k0, a, b, c) \
    ({ \
        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
    })
#define ARM_DOT_K0X3(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X2(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
    })
#define ARM_DOT_K0X4(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X3(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
    })
#define ARM_DOT_K0X8(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X4(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
    })
#define ARM_DOT_K0X16(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X8(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
    })

/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
    ({ \
        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X1(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X2(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X3(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X4(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X5(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X6(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X7(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
    })

#define ARM_DOT_K0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0) \
        ((a), (b), (c)); \
    })

#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT_K0X, n0) \
        (k0, (a), b, (c)); \
    })

#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MM_K0XN0X, m0) \
        (n0, k0, a, b, c); \
    })
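
// Illustrative expansion of the dispatchers above: ARM_MM_K0XN0XM0(2, 4, 4, a, b, c)
// resolves to ARM_MM_K0XN0X2(4, 4, a, b, c), which issues ARM_DOT_K0XN0(4, 4, a##r, b, c##r)
// for each LHS row r in [0,1]; that in turn performs ARM_DOT4(a##r, b##n, c##r.s##n)
// for each RHS vector n in [0,3], i.e. an M0 x N0 = 2 x 4 block of K0 = 4 dot products.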

/** Specialized macros to perform a broadcast multiplication operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
    })
#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
        c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
    })
#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
    })
#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
    })
#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
        c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
        c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
        c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
    })
#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
        c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
        c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
        c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
        c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
        c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
        c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
        c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
    })
/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
    })
#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MUL_N0X, k0) \
        (VECTOR_ACC_TYPE, (a), b, (c)); \
    })
#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
        (VECTOR_ACC_TYPE, k0, a, b, c); \
    })
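
// Illustrative expansion of the native dispatcher above:
// ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), 2, 4, a, b, c) resolves to
// ARM_MUL_N0XK0(..., 4, a##r, b, c##r) for each LHS row r in [0,1]; each call broadcasts
// one LHS scalar against an N0-wide RHS row:
//   c##r += CONVERT(b##k, VEC_DATA_TYPE(ACC_DATA_TYPE, N0)) * a##r.s##k, for k in [0,3]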

#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8/QASYMM8_SIGNED data type.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 block must NOT be transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 block must be transposed
 *
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
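// A minimal sketch of the build options this kernel expects (illustrative values
// only; in practice they are derived from the tensor shapes and reshape info):
//   -DGEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint
//   -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=2   (90 % 8 = 2)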
__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                                IMAGE_DECLARATION(rhs),
                                                IMAGE_DECLARATION(dst),
                                                uint k,
                                                uint lhs_stride_z,
                                                uint rhs_stride_z,
                                                uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                ,
                                                uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                               )
{
    // Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));

    // Compute RHS matrix address
    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    for(int i = 0; i < k; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);

        // Partial matrix multiplication M0,N0,K0
        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

        // Update address
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Compute the store-boundary conditions
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Convert and store output block
    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif // defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)

#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
#define FUSED_OUTPUT_STAGE_FIXED_POINT
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
 *  The LHS matrix is NOT reshaped
 *  The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - H0 >= 1
 *
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time.
 *
 * @param[in]  lhs_ptr                                          Pointer to the LHS matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_x                                     Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                                     Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes                The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                                          Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                                     Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                                     Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes                The offset of the first element in the RHS reshaped matrix
 * @param[out] dst_ptr                                          Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                                     Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                                     Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                                     Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                                     Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                              (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                              (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: S32
 * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
 * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: S32
 * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
 * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: S32
 * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
 * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
 * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
 * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
 * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
 * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
 * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
 */
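// A minimal sketch of the build options for the fused output stage variant
// (illustrative values only; -DK_OFFSET, -DA_OFFSET and -DB_OFFSET are added
// when the operand offsets are non-zero):
//   -DGEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT
//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DOUTPUT_DATA_TYPE=uchar
//   -DM=64 -DN=64 -DK=64 -DM0=2 -DN0=8 -DK0=4 -DH0=2
//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0
//   -DRESULT_OFFSET=10 -DRESULT_MULTIPLIER=1073741824 -DRESULT_SHIFT=1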
#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
__kernel void gemmlowp_mm_reshaped_only_rhs_t
#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
(IMAGE_DECLARATION(lhs),
 IMAGE_DECLARATION(rhs),
 IMAGE_DECLARATION(dst),
 uint lhs_stride_z,
 uint rhs_stride_z,
 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
 ,
 uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
 ,
 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
#if defined(A_OFFSET)
 ,
 IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
 ,
 IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
 ,
 VECTOR_DECLARATION(biases)
#endif // defined(ADD_BIAS)
#if defined(PER_CHANNEL_QUANTIZATION)
 ,
 VECTOR_DECLARATION(result_multipliers),
 VECTOR_DECLARATION(result_shifts)
#endif // defined(PER_CHANNEL_QUANTIZATION)
)
{
    // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time
#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X (K0 * H0)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0 * N0)
#define RHS_STEP_X (K0)
#endif // defined(RHS_INTERLEAVE)
#define RHS_STEP_LOOP (N0 * K0 * H0)

    uint x = GET_SPATIAL_IDX(0, 1, 1);
    uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    uint z = GET_SPATIAL_IDX(2, 1, 1);
    int  xo = (x * N0);

#if defined(DUMMY_WORK_ITEMS)
    if((xo >= N) || (y >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_y = y + z * FULL_LHS_HEIGHT;

    // Compute RHS matrix address
    uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
    uint rhs_offset_y = (x / H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else // defined(MATRIX_B_DEPTH)
    rhs_offset_y += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    TILE(ACC_DATA_TYPE, M0, N0, c);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        // Load values from LHS matrix
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        // Load values from RHS matrix
        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
        })

        // Partial matrix multiplication M0,N0,K0
        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

        rhs_offset_x += RHS_STEP_LOOP;
    }

#if((K % K0) != 0)

    // Left-over accumulations
    for(; i < K; ++i)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        // Load values from LHS matrix
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        // Load values from RHS matrix
        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

        rhs_offset_x += 1;
    }
#endif // ((K % K0) != 0)

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)

    TILE(int, M0, N0, c_int);
    TILE(int, M0, N0, offset_s32);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
    })

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })

#if defined(A_OFFSET)

#if defined(SUM_COL_HAS_BATCHES)
    int sum_col_y = z;
#else // defined(SUM_COL_HAS_BATCHES)
    int sum_col_y = 0;
#endif // defined(SUM_COL_HAS_BATCHES)
    TILE(int, 1, N0, a_offset_s32);

    T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);

    a_offset_s32[0].v *= A_OFFSET;

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    // Compute the offset contribution due to B_OFFSET
    // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
    // does not introduce paddings. For this reason, it is safe to access the tensor in this manner
    // without considering that the coordinate "y" could come from an input 3D tensor
    TILE(int, M0, N0, b_offset_s32);

    T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v += b_offset_s32[i].v * B_OFFSET;
    })

#endif // defined(B_OFFSET)

#if defined(ADD_BIAS)

    TILE(int, 1, N0, bias);

    T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
#endif // defined(ADD_BIAS)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v += offset_s32[i].v;
    })

    TILE(DATA_TYPE, M0, N0, c_lp);

    // Multiply by result_mult_int and shift
#if defined(PER_CHANNEL_QUANTIZATION)
    TILE(int, 1, N0, res_mul);
    TILE(int, 1, N0, res_shift);

    T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
    T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);

    T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
#else // defined(PER_CHANNEL_QUANTIZATION)
    T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
#endif // defined(PER_CHANNEL_QUANTIZATION)

#if defined(MIN_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
    })
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
    })
#endif // defined(MAX_BOUND)

#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
    TILE(int, M0, N0, c_lp);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })
#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)

    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
#if defined(REINTERPRET_OUTPUT_AS_3D)
        dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
        dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
        dst_indirect_y[i].v += z * FULL_DST_HEIGHT * DEPTH_GEMM3D;
#else // defined(REINTERPRET_OUTPUT_AS_3D)
        dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z * FULL_DST_HEIGHT;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
    })

    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
    T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)

#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)

#if defined(GEMMLOWP_MM_NATIVE)

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS matrix is NOT reshaped
 *
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
 * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                           Pointer to the LHS matrix. Supported data type: QASYMM8
 * @param[in]  lhs_stride_x                      Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                      Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                           Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                      Stride of the RHS matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                      Stride of the RHS matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                      Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the RHS matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
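// A minimal sketch of the build options for the native kernel (illustrative
// values; -DGPU_ARCH is assumed here to select the Midgard or non-Midgard
// code path used in the kernel body below):
//   -DGEMMLOWP_MM_NATIVE -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint
//   -DM=64 -DN=64 -DK=64 -DM0=4 -DN0=4 -DK0=4
//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DGPU_ARCH=GPU_ARCH_MIDGARD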
__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
                                 IMAGE_DECLARATION(rhs),
                                 IMAGE_DECLARATION(dst),
                                 uint lhs_stride_z,
                                 uint rhs_stride_z,
                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                 ,
                                 uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                 ,
                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                )
{
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS matrix address
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

    int i = 0;

    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

        // Partial matrix multiplication M0,N0,K0
#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        // Transpose the values from RHS matrix
        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        // Update the offset
        lhs_offset += K0;
        rhs_offset += K0 * rhs_stride_y;
    }

    // Left-over for loop
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

        // Partial matrix multiplication M0,N0,1
#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        // Transpose the values from RHS matrix
        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        // Update the offset
        lhs_offset += 1;
        rhs_offset += rhs_stride_y;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Convert and store output block
    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(GEMMLOWP_MM_NATIVE)

#if defined(GEMMLOWP_MATRIX_A_REDUCTION)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *  It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
 *
 * @note This stage is needed to handle the offset of the matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
    sum_row_32            = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
    ACC_DATA_TYPE sum_row = 0;

    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);

        sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += (ACC_DATA_TYPE)matrix_a[i];
    }

    sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;

#if defined(SCALAR)
    sum_row *= (int)SCALAR;
#endif // defined(SCALAR)
    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION)
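
// Worked example (illustrative): for a row a = [1, 2, ..., 20] with COLS_A = 20,
// the vectorized loop above consumes the first 16 elements, the leftover loop
// the remaining 4, giving sum_row = 210; with -DSCALAR=3 the stored value is 630.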

#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction.
 *  It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
 *
 * @note This stage is needed to handle the offset of the matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
                                               IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    ACC_DATA_TYPE sum_row = 0;

    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 32 accumulations per iteration
    for(; i <= ((int)COLS_A - 32); i += 32)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        a0 = vload16(0, matrix_a + i);

        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);

        a0 = vload16(1, matrix_a + i);

        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
    }
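
    // Note (illustrative): dotting a 4-element vector against (1, 1, 1, 1) simply
    // sums its lanes, so each DOT_PRODUCT4_INTEGER8 call above folds four row
    // elements into sum_row; the eight calls per iteration consume 32 columns.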

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += (ACC_DATA_TYPE)matrix_a[i];
    }

#if defined(SCALAR)
    sum_row *= (int)SCALAR;
#endif // defined(SCALAR)
    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)

#if defined(GEMMLOWP_MATRIX_B_REDUCTION)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *  It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
 *
 * @note This stage is needed to handle the offset of the matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
 * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
 * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const uint y      = get_global_id(1);

    __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
    __global uchar           *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;

    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
    sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;

    int i = 0;
    // This for loop performs 4 accumulations per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);

        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        b0 = VLOAD(VEC_SIZE)(0, matrix_b);

        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));

        matrix_b += src_stride_y;
    }

#if defined(SCALAR)
    sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
#endif // defined(SCALAR)
    VEC_DATA_TYPE(int, VEC_SIZE)
    res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE));

    STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
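
// Worked example (illustrative): with 90 columns and -DVEC_SIZE=16, pass
// -DVEC_SIZE_LEFTOVER=10 (90 % 16). Work item 0 then computes
// x_offs = max(0 - (16 - 10) % 16, 0) = 0 and stores a partial vector of 10
// results, while work items 1..5 start at x_offs = 10, 26, ..., 74 and store
// full 16-wide vectors, covering all 90 columns exactly once.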
}
#endif // defined(GEMMLOWP_MATRIX_B_REDUCTION)

#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)

#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)

/** Helper function used to calculate the offset contribution after matrix multiplication.
 *
 * This function takes a final int32 accumulator value (the output of matrix multiplication),
 * and calculates the offset contribution of matrix A and matrix B.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] x max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
 * @param[in] y get_global_id(1)
 * @param[in] z get_global_id(2)
 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 */
inline VEC_INT offset_contribution(
    int x,
    int y,
    int z
#if defined(A_OFFSET)
    ,
    IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
    ,
    IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif // defined(ADD_BIAS)
)
{
    VEC_INT a_offset_s32 = (VEC_INT)0;
    VEC_INT b_offset_s32 = (VEC_INT)0;

    int batch_id = z;
#if defined(DEPTH_INPUT3D)
    batch_id /= (int)DEPTH_INPUT3D;
#endif // defined(DEPTH_INPUT3D)

#if defined(A_OFFSET)
    // Compute the offset contribution due to A_OFFSET
    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);

#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
#else // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (VEC_INT)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    // Compute the offset contribution due to B_OFFSET
    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);

#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
    b_offset_s32 *= (VEC_INT)B_OFFSET;
#endif // defined(B_OFFSET)

#if defined(ADD_BIAS)
    // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    b_offset_s32 += (VEC_INT)biases_values;
#endif // defined(ADD_BIAS)

    return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
}
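// Worked example for the helper above (hypothetical values, not part of the kernel contract):
// for a GEMM with k = 100 columns of A, a_offset = 1 and b_offset = 6, the host would
// compile with -DK_OFFSET=600 (a_offset * b_offset * k), -DA_OFFSET=1 and -DB_OFFSET=6,
// and each lane returns 600 + sum_col[x] * 1 + sum_row[y] * 6 (plus the bias, if enabled).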
#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
/** OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place.
 *
 * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
                                           ,
                                           VECTOR_DECLARATION(biases)
#endif // defined(ADD_BIAS)
                                          )
{
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Compute offset contribution
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif // defined(ADD_BIAS)
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Add the offset terms to GEMM's result
    in_s32_0 += offset_term_s32;

    // Store the result with the offset contribution
    STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION)
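// Illustrative build options for the kernel above (hypothetical values): a QASYMM8 GEMM
// with k = 200, a_offset = -3 and b_offset = 2 could be compiled with:
//   -DGEMMLOWP_OFFSET_CONTRIBUTION -DK_OFFSET=-1200 -DA_OFFSET=-3 -DB_OFFSET=2
//   -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0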
#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize down to uint8/int8.
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8/int8 through the output stage.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 *
 * The result before the output stage is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
 *
 * -# Add offset terms to final result
 * -# Multiply each entry of result by result_mult_int
 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 * -# Clamp the resulting int32 values:
 *    - to the [0..255] range and cast to QASYMM8.
 *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
 * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
 * @param[in] result_multipliers_step_x (Optional) result_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
 * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
 * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
 * @param[in] result_shifts_step_x (Optional) result_shifts_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
 */
__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                                         ,
                                                         IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                                         ,
                                                         IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                                         ,
#if defined(ADD_BIAS)
                                                         VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                         TENSOR3D_DECLARATION(dst)
#if defined(PER_CHANNEL_QUANTIZATION)
                                                         ,
                                                         VECTOR_DECLARATION(result_multipliers),
                                                         VECTOR_DECLARATION(result_shifts)
#endif // defined(PER_CHANNEL_QUANTIZATION)
                                                        )
{
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    // Compute offset contribution
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif // defined(ADD_BIAS)
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // -------------- OUTPUT STAGE

    // Add the output offset
    in_s32 += (VEC_INT)RESULT_OFFSET;

    // Multiply by the result multiplier and shift
#if defined(PER_CHANNEL_QUANTIZATION)
    __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
    __global uchar *result_shifts_addr      = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
    VEC_INT result_multipliers_values       = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
    VEC_INT result_shifts_values            = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);

    in_s32 *= result_multipliers_values;
    in_s32 >>= result_shifts_values;
#else // defined(PER_CHANNEL_QUANTIZATION)
    in_s32 *= RESULT_MULTIPLIER;

    in_s32 >>= RESULT_SHIFT;
#endif // defined(PER_CHANNEL_QUANTIZATION)
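    // Illustrative arithmetic (hypothetical values): with RESULT_OFFSET = -100,
    // RESULT_MULTIPLIER = 2 and RESULT_SHIFT = 2, an accumulator entry of 130
    // becomes ((130 - 100) * 2) >> 2 = 15 before the saturating cast below.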
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)

#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
/** OpenCL kernel used to add the offset contribution after matrix multiplication and quantize down to uint8/int8.
 *
 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8/int8 through the output stage.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 *
 * The result before the output stage is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
 *
 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values:
 *    - to the [0..255] range and cast to QASYMM8.
 *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
 * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
 * @param[in] result_multipliers_step_x (Optional) result_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
 * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
 * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
 * @param[in] result_shifts_step_x (Optional) result_shifts_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
 */
__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                                                    ,
                                                                    IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                                                    ,
                                                                    IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                                                    ,
#if defined(ADD_BIAS)
                                                                    VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                                    TENSOR3D_DECLARATION(dst)
#if defined(PER_CHANNEL_QUANTIZATION)
                                                                    ,
                                                                    VECTOR_DECLARATION(result_multipliers),
                                                                    VECTOR_DECLARATION(result_shifts)
#endif // defined(PER_CHANNEL_QUANTIZATION)
                                                                   )
{
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Compute offset contribution
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif // defined(B_OFFSET)
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif // defined(ADD_BIAS)
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // -------------- OUTPUT STAGE

    // Multiply by result_fixedpoint_multiplier and shift
#if defined(PER_CHANNEL_QUANTIZATION)
    __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
    __global uchar *result_shifts_addr      = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
    VEC_INT result_multipliers_values       = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
    VEC_INT result_shifts_values            = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
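    // Explanatory note (added for clarity): per-channel shifts can be negative
    // (multiplier greater than one) or positive (multiplier less than one). Both
    // candidate results are computed below and select() keeps, per lane, the one
    // matching the sign of the corresponding shift.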
    VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
    VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
    in_s32                   = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
#else // defined(PER_CHANNEL_QUANTIZATION)

#if RESULT_SHIFT < 0
    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else // RESULT_SHIFT >= 0
    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT < 0

#endif // defined(PER_CHANNEL_QUANTIZATION)

    // Add the output offset
    in_s32 += (VEC_INT)RESULT_OFFSET;

    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)

#undef VEC_INT

#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
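/* Illustrative note on the fixed-point kernels above (values are hypothetical): a real
 * requantization scale of 0.2 can be encoded as RESULT_MULTIPLIER = (int)(0.8 * (1 << 31))
 * = 1717986918 with RESULT_SHIFT = 2, since 0.8 * 2^-2 = 0.2.
 */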
#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
 * The following computations will be performed by the kernel:
 *
 * -# Add offset terms to final result
 * -# Multiply each entry of result by result_mult_int
 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 * -# Clamp the resulting int32 values:
 *    - to the [0..255] range and cast to QASYMM8.
 *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    input_values += biases_values;
#endif // defined(ADD_BIAS)

    // Add the offset terms to GEMM's result
    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

#if RESULT_SHIFT < 0
    input_values >>= -RESULT_SHIFT;
#else // RESULT_SHIFT >= 0
    input_values >>= RESULT_SHIFT;
#endif // RESULT_SHIFT < 0
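    // Illustrative arithmetic (hypothetical values): a requantization scale of 1/8 can be
    // expressed as RESULT_MULT_INT = 1 and RESULT_SHIFT = 3, so an entry of 80 with
    // RESULT_OFFSET = 0 becomes (80 * 1) >> 3 = 10 before the saturating cast below.
    // Note that this plain arithmetic shift truncates, unlike the rounding performed
    // by the fixed-point kernel that follows.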
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
 *
 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
 * The following computations will be performed by the kernel:
 *
 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values:
 *    - to the [0..255] range and cast to QASYMM8.
 *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    input_values += biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
#if RESULT_SHIFT < 0
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else // RESULT_SHIFT >= 0
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT < 0

    // Add the output offset to the result
    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;

    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
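/* Explanatory note on the fixed-point output stage above: ASYMM_ROUNDING_DIVIDE_BY_POW2
 * rounds to nearest rather than truncating, e.g. dividing 7 by 2^1 yields (7 + 1) >> 1 = 4,
 * whereas the plain shift in gemmlowp_output_stage_quantize_down would give 7 >> 1 = 3.
 */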

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
 *
 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
 * The following computations will be performed by the kernel:
 *
 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
 *
 * @attention The scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QSYMM16
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                                     VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                                     TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    input_values += biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
#if RESULT_SHIFT < 0
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else // RESULT_SHIFT >= 0
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT < 0

    VEC_DATA_TYPE(short, VEC_SIZE)
    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
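/* Note on the QSYMM16 kernel above: QSYMM16 is a symmetric quantized type, so unlike the
 * QASYMM8 variants no output offset is added after the fixed-point multiplication; the
 * result is only rounded, optionally clamped and saturating-cast to 16 bits.
 */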

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
 *
 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
 * The following computations will be performed by the kernel:
 *
 * -# Multiply each entry of the input by the real-valued scale factor REAL_MULTIPLIER
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Requantize
 * -# Add offset to each result
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values:
 *    - to the [0..255] range and cast to QASYMM8.
 *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
 *
 * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                        VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
#if defined(DST_HEIGHT)
                                                        TENSOR4D_DECLARATION(dst))
#else // defined(DST_HEIGHT)
                                                        TENSOR3D_DECLARATION(dst))
#endif // defined(DST_HEIGHT)
{
    // Compute source and destination addresses
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Add bias
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
#endif // defined(ADD_BIAS)

    // Convert to float, scale and add the output offset
    VEC_DATA_TYPE(float, VEC_SIZE)
    input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
    input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
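/* Illustrative arithmetic for the float output stage above (hypothetical values): with
 * REAL_MULTIPLIER = 0.1f and OUTPUT_OFFSET = 10, an accumulator entry of 130 becomes
 * round(130 * 0.1f + 10) = 23 before the saturating cast to OUTPUT_DATA_TYPE.
 */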