1 /* 2 * Copyright (c) 2019-2021 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 #include "activation_float_helpers.h" 25 #include "helpers.h" 26 27 /** Utility macro to access a vector with the scalar positions 28 * 29 * Supported cases are: Offset can only be of the same size of the OpenCL vector (2,3,4,8,16) 30 * 31 * @param[in] offset The offset within the vector. Offset can only be of the same size of the OpenCL vector (2,3,4,8,16) 32 * @param[in] n0 The number of consecutive columns to access. 
n0 + offset must be <= 16 33 * @param[in] x Vector to access 34 * @{ 35 */ 36 #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) 37 #define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) 38 39 // offset == 0 40 #define scalar_access_0_1(x) ((x).s0) 41 #define scalar_access_0_2(x) ((x).s01) 42 #define scalar_access_0_3(x) ((x).s012) 43 #define scalar_access_0_4(x) ((x).s0123) 44 #define scalar_access_0_8(x) ((x).s01234567) 45 #define scalar_access_0_16(x) ((x).s0123456789ABCDEF) 46 47 // offset == 1 48 #define scalar_access_1_1(x) ((x).s1) 49 #define scalar_access_1_2(x) ((x).s12) 50 #define scalar_access_1_3(x) ((x).s123) 51 #define scalar_access_1_4(x) ((x).s1234) 52 #define scalar_access_1_8(x) ((x).s12345678) 53 54 // offset == 2 55 #define scalar_access_2_1(x) ((x).s2) 56 #define scalar_access_2_2(x) ((x).s23) 57 #define scalar_access_2_3(x) ((x).s234) 58 #define scalar_access_2_4(x) ((x).s2345) 59 #define scalar_access_2_8(x) ((x).s23456789) 60 61 // offset == 3 62 #define scalar_access_3_1(x) ((x).s3) 63 #define scalar_access_3_2(x) ((x).s34) 64 #define scalar_access_3_3(x) ((x).s345) 65 #define scalar_access_3_4(x) ((x).s3456) 66 #define scalar_access_3_8(x) ((x).s3456789A) 67 68 // offset == 4 69 #define scalar_access_4_1(x) ((x).s4) 70 #define scalar_access_4_2(x) ((x).s45) 71 #define scalar_access_4_3(x) ((x).s456) 72 #define scalar_access_4_4(x) ((x).s4567) 73 #define scalar_access_4_8(x) ((x).s456789AB) 74 75 // offset == 8 76 #define scalar_access_8_1(x) ((x).s8) 77 #define scalar_access_8_2(x) ((x).s89) 78 #define scalar_access_8_3(x) ((x).s89A) 79 #define scalar_access_8_4(x) ((x).s89AB) 80 #define scalar_access_8_8(x) ((x).s89ABCDEF) 81 82 // offset == 12 83 #define scalar_access_12_1(x) ((x).sC) 84 #define scalar_access_12_2(x) ((x).sCD) 85 #define scalar_access_12_3(x) ((x).sCDE) 86 #define scalar_access_12_4(x) ((x).sCDEF) 87 88 // offset == 16 89 #define scalar_access_16_1(x) ((x).sF) 90 91 /** Loads the 
rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
 * @name LOAD_TENSOR_ROW_n
 *
 * Row i is read with VLOAD(N0) from PTR + i * STRIDE_Y + Z<i> and assigned to the N0 components of
 * BASENAME<i> selected by SCALAR_ACCESS(COL_OFFSET, N0, BASENAME<i>). The destination variables are
 * only assigned, never declared, by these macros.
 * LOAD_TENSOR_ROW_n first expands LOAD_TENSOR_ROW_(n-1), so rows are loaded in ascending order.
 * Rows 10 to 15 use the suffixes A to F for both BASENAME and Z.
 * The address arithmetic is done on PTR before the cast to DATA_TYPE, i.e. in units of whatever
 * PTR points to (presumably bytes - confirm with the callers).
 *
 * @param[in] N0         The number of columns to load
 * @param[in] DATA_TYPE  The data type of variables
 * @param[in] BASENAME   The basename of the destination variables for the loaded rows
 * @param[in] PTR        The base pointer
 * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
 * @param[in] STRIDE_Y   The stride value in y-axis direction
 * @param[in] Z          The z-axis offset vector
 * @{
 */
// Zero rows: expands to an empty statement expression, nothing is loaded.
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
/** @}*/ // end of group LOAD_TENSOR_ROW_n

/** Load tensor (consecutive rows and columns) with Z offset.
 * @name LOAD_TENSOR
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
 *
 * @param[in] M0         The number of consecutive rows
 * @param[in] N0         The number of consecutive columns
 * @param[in] DATA_TYPE  The data type of the target
 * @param[in] BASENAME   The basename of the result variables
 * @param[in] PTR        The base pointer for the data
 * @param[in] COL_OFFSET The column vector offset.
COL_OFFSET + N0 must be <= 16 185 * @param[in] STRIDE_Y The stride in y-axis direction 186 * @param[in] Z The z-axis offset vector 187 * @{ 188 */ 189 #define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 190 #define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 191 /** @} */ // end of group LOAD_TENSOR 192 193 /** Load 2D tensor (consecutive rows and columns) with Z offset. 194 * @name LOAD_TENSOR_M0Xn 195 * 196 * @param[in] M0 The number of rows to load [0-16] 197 * @param[in] N0 The number of columns to load [0-16] 198 * @param[in] DATA_TYPE The data type of variables 199 * @param[in] BASENAME The basename of the destination variables for the loaded rows 200 * @param[in] PTR The base pointer 201 * @param[in] STRIDE_Y The stride value in y-axis direction 202 * @param[in] Z The z-axis offset vector 203 * @{ 204 */ 205 #define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 206 ({}) 207 208 #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 209 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 210 211 #define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 212 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 213 214 #define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 215 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 216 217 #define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 218 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 219 220 #define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 221 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 222 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, 
src_stride_y, zin); 223 224 #define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 225 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 226 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 227 228 #define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 229 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 230 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 231 232 #define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 233 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 234 235 #define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 236 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin); \ 237 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 238 239 #define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 240 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 241 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 242 243 #define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 244 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 245 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 246 247 #define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 248 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 249 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 250 251 #define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 252 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 253 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 254 LOAD_TENSOR(M0, 1, 
DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 255 256 #define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 257 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin); \ 258 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 259 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 260 261 #define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 262 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 263 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 264 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 265 266 #define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 267 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 268 /** @}*/ // end of group LOAD_TENSOR_M0Xn 269 270 /** Load 2D tensor (consecutive rows and columns) with Z offset. 271 * @name LOAD_TENSOR_M0XN0 272 * 273 * @param[in] M0 The number of consecutive rows [0-16] 274 * @param[in] N0 The number of consecutive columns [0-16] 275 * @param[in] DATA_TYPE The data type of the target 276 * @param[in] BASENAME The basename of the result variables 277 * @param[in] PTR The base pointer for the data 278 * @param[in] STRIDE_Y The stride in y-axis direction 279 * @param[in] Z The z-axis offset vector 280 * @{ 281 */ 282 #define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 283 #define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 284 285 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
Each destination variable is declared by the expansion with type VEC_DATA_TYPE(DATA_TYPE, N0).
 * @name LOAD_ROW_n
 *
 * Row i is initialised with VLOAD(N0) from PTR + OFFSET + i * STRIDE_Y + Z<i>.
 * LOAD_ROW_n first expands LOAD_ROW_(n-1), so rows are declared and loaded in ascending order.
 * Rows 10 to 15 use the suffixes A to F for both BASENAME and Z.
 * The address arithmetic is done on PTR before the cast to DATA_TYPE, i.e. in units of whatever
 * PTR points to (presumably bytes - confirm with the callers).
 *
 * @param[in] N0        The number of columns to load
 * @param[in] DATA_TYPE The data type of variables
 * @param[in] BASENAME  The basename of the destination variables for the loaded rows
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/** @}*/ // end of group LOAD_ROW_n

/** Load Blocks (consecutive rows and columns) with Z offset.
 * @name LOAD_BLOCK
 *
 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
The row variables are declared by the expansion (see LOAD_ROW_n).
 *
 * @param[in] M0        The number of consecutive rows
 * @param[in] N0        The number of consecutive columns
 * @param[in] DATA_TYPE The data type of the target
 * @param[in] BASENAME  The basename of the result variables
 * @param[in] PTR       The base pointer for the data
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride in y-axis direction
 * @param[in] Z         The z-axis offset vector
 * @{
 */
// Two-step expansion: LOAD_BLOCK expands its arguments first, so that LOAD_BLOCK_STR pastes the
// value of M0 (and not the spelling of the argument) into the name LOAD_ROW_<M0>.
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK

/** Partially load the 0 to (n-1)th rows of the given variables
 * @name LOAD_ROW_PARTIAL_n
 * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
 *
 * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] LOAD_N0   The **lower** size of the vectors to load.
Supported: [1-16] and <= @p N0
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 *
 * Row i passes the variable BASENAME<i>, the element offset 0 and the address
 * PTR + OFFSET + i * STRIDE_Y + Z<i> to the helper selected by VLOAD_PARTIAL(N0, LOAD_N0).
 * These macros do not declare BASENAME<i>; presumably the variables must already exist at the
 * point of expansion - confirm against VLOAD_PARTIAL.
 * LOAD_ROW_PARTIAL_n first expands LOAD_ROW_PARTIAL_(n-1), so rows are loaded in ascending order.
 * Rows 10 to 15 use the suffixes A to F for both BASENAME and Z.
 * @{
 */
#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
/** @} */ // end of group LOAD_ROW_PARTIAL_n

/** Partially load a block of the given size LOAD_M0xLOAD_N0
 * @name LOAD_BLOCK_PARTIAL
 *
 * @note The vector width @p N0 is also required for correct partial loading behaviour.
 * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] LOAD_M0   The number of rows to load. Supported: 1-16
 * @param[in] LOAD_N0   The lower number of elements of vectors to load. Supported: 1-16 and <= @p N0
 * @param[in] N0        The size of each vector.
Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE The data type of the vectors
 * @param[in] BASENAME  The basename of the variables
 * @param[in] PTR       The base pointer
 * @param[in] OFFSET    The offset within a row
 * @param[in] STRIDE_Y  The stride value in y-axis direction
 * @param[in] Z         The offset in z-axis direction
 * @{
 */
// Two-step expansion: LOAD_BLOCK_PARTIAL expands its arguments first, so that
// LOAD_BLOCK_PARTIAL_STR pastes the value of LOAD_M0 into the name LOAD_ROW_PARTIAL_<LOAD_M0>.
// Note the argument order: the block macros take (LOAD_M0, LOAD_N0, N0, ...) while the row macros
// take (N0, LOAD_N0, ...).
#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** Load a block that can be partial in both x and y dimensions
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks.
Supported range: [1, @p N0) 540 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0. 541 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 542 */ 543 #define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 544 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 545 { \ 546 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 547 } \ 548 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 549 { \ 550 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 551 } \ 552 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 553 { \ 554 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 555 } \ 556 else \ 557 { \ 558 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 559 } 560 /** Load a block that can only be partial in x but not y. 561 * 562 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. 563 * 564 * The data to load is expected to have consecutive names for each row. 565 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 566 * The Z offset is expected to have consecutive names. 567 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 568 * 569 * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16 570 * @param[in] N0 The size of each vector, for non-partial blocks. 
Supported: 1, 2, 3, 4, 8, 16 571 * @param[in] DATA_TYPE The data type of the vectors 572 * @param[in] BASENAME The basename of the variables 573 * @param[in] PTR The base pointer 574 * @param[in] OFFSET The offset within a row 575 * @param[in] STRIDE_Y The stride value in y-axis direction 576 * @param[in] Z The offset in z-axis direction 577 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0) 578 * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0. 579 */ 580 #define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 581 if(!(PARTIAL_COND_X)) \ 582 { \ 583 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 584 } \ 585 else \ 586 { \ 587 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 588 } 589 /** Load a block that can only be partial in y but not x. 590 * 591 * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty. 592 * 593 * The data to store is expected to have consecutive names for each row. 594 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. 595 * The Z offset is expected to have consecutive names. 596 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 597 * 598 * @param[in] M0 The number of rows to store, for non-partial blocks. Supported: 1-16 599 * @param[in] N0 The size of each vector, for non-partial blocks. 
Supported: 1, 2, 3, 4, 8, 16 600 * @param[in] DATA_TYPE The data type of the vectors 601 * @param[in] BASENAME The basename of the variables 602 * @param[in] PTR The base pointer 603 * @param[in] OFFSET The offset within a row 604 * @param[in] STRIDE_Y The stride value in y-axis direction 605 * @param[in] Z The offset in z-axis direction 606 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0) 607 * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0. 608 */ 609 #define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 610 if(!(PARTIAL_COND_Y)) \ 611 { \ 612 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 613 } \ 614 else \ 615 { \ 616 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 617 } 618 /** @} */ // end of group LOAD_BLOCK_PARTIAL 619 /** Boundary-aware GeMM block load 620 * @name LOAD_BLOCK_BOUNDARY_AWARE 621 * This macro assumes the following schemes to achieve boundary-awareness: 622 * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim. 623 * - Non-Overlapping(normal) load from rhs tensor. This imples rhs can have paddings. 624 * - Overlapping load in Y axis from bias tensor. This implies rhs has no padding along y dim. 625 * The macro then ensures that the src tensor can be loaded without any paddings in both x and y dim. 626 * 627 * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial 628 * blocks **at the end**. 
* Say, the src tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
 * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
 *
 *  *--x-->                         x == 0                        x == 1
 *  |                  |<------------------------------N-------------------------->|
 *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
 *  |     -------------#############################################################
 *  *     |          | |...............................|...........................|
 * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
 *        |          | |...............................|...........................|
 *        M          --#############################################################
 *        |          | |                               |...........................|
 * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
 *        |          | |                               |...........................|
 *        |------------#############################################################
 *
 * Then @p PARTIAL_STORE_M0 = M % M0      and     @p PARTIAL_STORE_N0 = N % N0
 *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
 *
 * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either X and Y dimension,
 * and selects the corresponding load method such that the boundary detection logic is only added when needed.
 * The selection is made by the preprocessor from the compile-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0
 * (an identifier that is not defined evaluates to 0 inside @c \#if, which selects Case1).
 *
 * The data to load is expected to have consecutive names for each row.
 * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
 * The Z offset is expected to have consecutive names.
 * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
 *
 * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
 * type @p DATA_TYPE containing values partially loaded from the specified
 * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
 * filled with zeros.
 *
 * @param[in] M0               The number of rows to load, for non-partial blocks. Supported: 1-16
 * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
 * @param[in] DATA_TYPE        The data type of the vectors
 * @param[in] BASENAME         The basename of the variables
 * @param[in] PTR              The base pointer
 * @param[in] OFFSET           The offset within a row
 * @param[in] STRIDE_Y         The stride value in y-axis direction
 * @param[in] Z                The offset in z-axis direction
 * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
 * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
 * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
 * @{
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
// The rows are first declared and set to 0, then (partially) overwritten by the load.
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else // none of the cases above, i.e. neither PARTIAL_STORE_M0 nor PARTIAL_STORE_N0 is 0
// Case4: Partial blocks in both x and y
#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0 / PARTIAL_STORE_N0 case selection
/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE

/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
702 * @name LOAD_TEXTURE2D_ROW_n 703 * 704 * @param[in] N0 The number of pixels to read 705 * @param[in] DATA_TYPE The data type of variables 706 * @param[in] BASENAME The basename of the destination variables for the loaded rows 707 * @param[in] IMG The 2D OpenCL image object 708 * @param[in] X_COORD The x coordinate for the top-left pixel 709 * @param[in] Y_COORD The y coordinate for the top-left pixel 710 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels) 711 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) 712 * @{ 713 */ 714 #define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 715 BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) 716 717 #define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 718 LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 719 BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) 720 721 #define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 722 LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 723 BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) 724 725 #define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 726 LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 727 BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) 728 729 #define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 730 LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 731 BASENAME##4 = 
READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) 732 733 #define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 734 LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 735 BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW)) 736 737 #define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 738 LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 739 BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW)) 740 741 #define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 742 LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 743 BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW)) 744 745 #define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 746 LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 747 BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW)) 748 749 #define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 750 LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 751 BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW)) 752 753 #define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 754 LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 755 BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW)) 756 
757 #define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 758 LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 759 BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW)) 760 761 #define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 762 LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 763 BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW)) 764 765 #define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 766 LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 767 BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW)) 768 769 #define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 770 LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 771 BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW)) 772 773 #define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 774 LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 775 BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW)) 776 /** @} */ // end of group LOAD_TEXTURE2D_ROW_n 777 778 /** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values 779 * @name LOAD_TEXTURE2D 780 * 781 * Supported cases are M0=1,2,3,...,16 and N0=1 782 * The data to load is expected to have consecutive names for each row. 783 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. 
784 * 785 * @param[in] M0 The number of consecutive rows 786 * @param[in] N0 The number of consecutive pixels. Only 1, 2 and 4 are supported 787 * @param[in] DATA_TYPE The data type of the target 788 * @param[in] BASENAME The basename of the result variables 789 * @param[in] IMG The 2D OpenCL image object 790 * @param[in] X_COORD The x coordinate for the top-left pixel 791 * @param[in] Y_COORD The y coordinate for the top-left pixel 792 * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels) 793 * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) 794 * @{ 795 */ 796 #define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 797 #define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 798 /** @} */ // end of group LOAD_TEXTURE2D 799 800 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded. 801 * @name LOAD_ROW_INDIRECT_n 802 * 803 * @param[in] N0 The number of columns to load 804 * @param[in] DATA_TYPE The data type of variables 805 * @param[in] BASENAME The basename of the destination variables for the loaded rows 806 * @param[in] PTR The base pointer 807 * @param[in] OFFSET The offset within a row 808 * @param[in] STRIDE_Y The stride value in y-axis direction 809 * @param[in] Y The y-axis offset vector 810 * @param[in] Y_MASK The y-axis mask vector. 
If 0, forces BASENAMEn to 0 811 * @{ 812 */ 813 #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 814 VEC_DATA_TYPE(DATA_TYPE, N0) \ 815 BASENAME##0; \ 816 if(Y_MASK##0 != 0) \ 817 BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ 818 else \ 819 BASENAME##0 = 0; 820 821 #define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 822 LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 823 VEC_DATA_TYPE(DATA_TYPE, N0) \ 824 BASENAME##1; \ 825 if(Y_MASK##1 != 0) \ 826 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ 827 else \ 828 BASENAME##1 = 0; 829 830 #define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 831 LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 832 VEC_DATA_TYPE(DATA_TYPE, N0) \ 833 BASENAME##2; \ 834 if(Y_MASK##2 != 0) \ 835 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ 836 else \ 837 BASENAME##2 = 0; 838 839 #define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 840 LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 841 VEC_DATA_TYPE(DATA_TYPE, N0) \ 842 BASENAME##3; \ 843 if(Y_MASK##3 != 0) \ 844 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ 845 else \ 846 BASENAME##3 = 0; 847 848 #define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 849 LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 850 VEC_DATA_TYPE(DATA_TYPE, N0) \ 851 BASENAME##4; \ 852 if(Y_MASK##4 != 0) \ 853 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ 854 else \ 855 BASENAME##4 = 0; 856 857 #define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 858 LOAD_ROW_INDIRECT_5(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 859 VEC_DATA_TYPE(DATA_TYPE, N0) \ 860 BASENAME##5; \ 861 if(Y_MASK##5 != 0) \ 862 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ 863 else \ 864 BASENAME##5 = 0; 865 866 #define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 867 LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 868 VEC_DATA_TYPE(DATA_TYPE, N0) \ 869 BASENAME##6; \ 870 if(Y_MASK##6 != 0) \ 871 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ 872 else \ 873 BASENAME##6 = 0; 874 875 #define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 876 LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 877 VEC_DATA_TYPE(DATA_TYPE, N0) \ 878 BASENAME##7; \ 879 if(Y_MASK##7 != 0) \ 880 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ 881 else \ 882 BASENAME##7 = 0; 883 884 #define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 885 LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 886 VEC_DATA_TYPE(DATA_TYPE, N0) \ 887 BASENAME##8; \ 888 if(Y_MASK##8 != 0) \ 889 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ 890 else \ 891 BASENAME##8 = 0; 892 893 #define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 894 LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 895 VEC_DATA_TYPE(DATA_TYPE, N0) \ 896 BASENAME##9; \ 897 if(Y_MASK##9 != 0) \ 898 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ 899 else \ 900 BASENAME##9 = 0; 901 902 #define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 903 LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 904 
VEC_DATA_TYPE(DATA_TYPE, N0) \ 905 BASENAME##A; \ 906 if(Y_MASK##A != 0) \ 907 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ 908 else \ 909 BASENAME##A = 0; 910 911 #define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 912 LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 913 VEC_DATA_TYPE(DATA_TYPE, N0) \ 914 BASENAME##B; \ 915 if(Y_MASK##B != 0) \ 916 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ 917 else \ 918 BASENAME##B = 0; 919 920 #define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 921 LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 922 VEC_DATA_TYPE(DATA_TYPE, N0) \ 923 BASENAME##C; \ 924 if(Y_MASK##C != 0) \ 925 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ 926 else \ 927 BASENAME##C = 0; 928 929 #define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 930 LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 931 VEC_DATA_TYPE(DATA_TYPE, N0) \ 932 BASENAME##D; \ 933 if(Y_MASK##D != 0) \ 934 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ 935 else \ 936 BASENAME##D = 0; 937 938 #define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 939 LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 940 VEC_DATA_TYPE(DATA_TYPE, N0) \ 941 BASENAME##E; \ 942 if(Y_MASK##E != 0) \ 943 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ 944 else \ 945 BASENAME##E = 0; 946 947 #define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 948 LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 949 VEC_DATA_TYPE(DATA_TYPE, N0) \ 950 BASENAME##F; \ 951 if(Y_MASK##F != 0) \ 
952 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ 953 else \ 954 BASENAME##F = 0; 955 956 /** Load blocks (consecutive rows and columns) with Y offset. 957 * @name LOAD_BLOCK_INDIRECT 958 * 959 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 960 * The data to load is expected to have consecutive names for each row. 961 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. 962 * The Z offset is expected to have consecutive names. 963 * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2. 964 * 965 * @param[in] M0 The number of consecutive rows 966 * @param[in] N0 The number of consecutive columns 967 * @param[in] DATA_TYPE The data type of the target 968 * @param[in] BASENAME The basename of the result variables 969 * @param[in] PTR The base pointer for the data 970 * @param[in] OFFSET The offset within a row 971 * @param[in] STRIDE_Y The stride in y-axis direction 972 * @param[in] Y The y-axis offset vector 973 * @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0 974 * @{ 975 */ 976 #define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 977 #define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 978 979 /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
980 * @name LOAD_ELEMENT_n 981 * 982 * @param[in] N0 The number of rows to load 983 * @param[in] DATA_TYPE The data type of variables 984 * @param[in] BASENAME The basename of the destination variables for the loaded rows 985 * @param[in] PTR The base pointer 986 * @param[in] OFFSET The offset within a row 987 * @param[in] STRIDE_Y The stride value in y-axis direction 988 * @{ 989 */ 990 #define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 991 VEC_DATA_TYPE(DATA_TYPE, N0) \ 992 BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); 993 994 #define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 995 LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 996 VEC_DATA_TYPE(DATA_TYPE, N0) \ 997 BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); 998 999 #define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1000 LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1001 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1002 BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); 1003 1004 #define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1005 LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1006 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1007 BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 1008 1009 #define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1010 LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1011 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1012 BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 1013 1014 #define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1015 LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1016 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1017 BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 1018 1019 #define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1020 LOAD_ELEMENT_6(N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1021 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1022 BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 1023 1024 #define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1025 LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1026 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1027 BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 1028 1029 #define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1030 LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1031 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1032 BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 1033 1034 #define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1035 LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1036 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1037 BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 1038 1039 #define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1040 LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1041 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1042 BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 1043 1044 #define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1045 LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1046 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1047 BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 1048 1049 #define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1050 LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1051 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1052 BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 1053 1054 #define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1055 LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1056 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1057 BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET 
+ 13 * STRIDE_Y)); 1058 1059 #define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1060 LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1061 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1062 BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 1063 1064 #define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1065 LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 1066 VEC_DATA_TYPE(DATA_TYPE, N0) \ 1067 BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 1068 1069 /** @}*/ // end of group LOAD_ELEMENT_n 1070 1071 /** Load Scalar as Vector (consecutive elements). 1072 * @name LOAD_SCALAR_AS_VECTOR 1073 * 1074 * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 1075 * The data to load is expected to have consecutive names for each row. 1076 * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. 1077 * 1078 * @param[in] M0 The number of consecutive rows 1079 * @param[in] N0 The number of consecutive columns 1080 * @param[in] DATA_TYPE The data type of the target 1081 * @param[in] BASENAME The basename of the result variables 1082 * @param[in] PTR The base pointer for the data 1083 * @param[in] OFFSET The offset within a row 1084 * @param[in] STRIDE_Y The stride in y-axis direction 1085 * @{ 1086 */ 1087 #define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 1088 #define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 1089 /** @} */ // end of group LOAD_SCALAR_AS_VECTOR 1090 1091 /** Basic macros to calculate Z offset values from Z0 to Zn-1 1092 * @name CALCULATE_Z_OFFSET_n 1093 * 1094 * @param[in] M0 The number of offset values to calculate 1095 * @param[in] DATA_TYPE The data type of the results 1096 * @param[in] Z The basename of the result variables 1097 
* @param[in] Y The work-itme ID of y-axis 1098 * @param[in] HEIGHT_GEMM3D The height of GEMM3D 1099 * @param[in] DEPTH_GEMM3D The depth of GEMM3D 1100 * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension 1101 * @param[in] STRIDE_Y The stride value in y-axis direction 1102 * 1103 * @{ 1104 */ 1105 #define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1106 Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1107 Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 1108 Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 1109 1110 #define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1111 CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1112 Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1113 Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 1114 Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 1115 1116 #define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1117 CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1118 Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1119 Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 1120 Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 1121 1122 #define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1123 CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1124 Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1125 Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 1126 Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 1127 1128 #define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1129 CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1130 Z##4 = (4 
+ (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1131 Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 1132 Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 1133 1134 #define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1135 CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1136 Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1137 Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 1138 Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 1139 1140 #define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1141 CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1142 Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1143 Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 1144 Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 1145 1146 #define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1147 CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 1148 Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 1149 Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 1150 Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 1151 1152 /** @} */ // end of group CALCULATE_Z_OFFSET_n 1153 1154 /** Calculate Z offset values from Z0 to Zn-1 1155 * @name CALCULATE_Z_OFFSET 1156 * 1157 * The Z offsets are expected to have consecutive names. 1158 * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3. 1159 * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account 1160 * the possible cross plane paddings in case of the plance changes across the z-dimension. 
1161 * 1162 * <!-- 1163 * | | 1164 * | plane0 | 1165 * | | 1166 * |__________________| 1167 * |******************| 1168 * | cross_plane_pad | 1169 * |******************| 1170 * | | 1171 * | plane1 | 1172 * | | 1173 * |__________________| 1174 * --> 1175 * 1176 * @param[in] M0 The number of offset values to calculate 1177 * @param[in] DATA_TYPE The data type of the results 1178 * @param[in] Z The basename of the result variables 1179 * @param[in] Y The work-itme ID of y-axis 1180 * @param[in] HEIGHT_GEMM3D The height of GEMM3D 1181 * @param[in] DEPTH_GEMM3D The depth of GEMM3D 1182 * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension 1183 * @param[in] STRIDE_Y The stride value in y-axis direction 1184 * @{ 1185 */ 1186 #define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 1187 #define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 1188 /** @} */ // end of group CALCULATE_Z_OFFSET 1189 1190 /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) 1191 * @name SCALE_ROW_n 1192 * 1193 * @param[in] DATA_TYPE The data type of the variables 1194 * @param[in] BASENAME The basename of the variables 1195 * @param[in] SCALE The scale factor 1196 * @{ 1197 */ 1198 #define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 1199 BASENAME##0 *= (DATA_TYPE)SCALE; 1200 1201 #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 1202 SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 1203 BASENAME##1 *= (DATA_TYPE)SCALE; 1204 1205 #define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 1206 SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 1207 BASENAME##2 *= (DATA_TYPE)SCALE; 1208 1209 #define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 1210 SCALE_ROW_3(DATA_TYPE, 
BASENAME, SCALE) \ 1211 BASENAME##3 *= (DATA_TYPE)SCALE; 1212 1213 #define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 1214 SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 1215 BASENAME##4 *= (DATA_TYPE)SCALE; 1216 1217 #define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 1218 SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 1219 BASENAME##5 *= (DATA_TYPE)SCALE; 1220 1221 #define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 1222 SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 1223 BASENAME##6 *= (DATA_TYPE)SCALE; 1224 1225 #define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 1226 SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 1227 BASENAME##7 *= (DATA_TYPE)SCALE; 1228 1229 #define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 1230 SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 1231 BASENAME##8 *= (DATA_TYPE)SCALE; 1232 1233 #define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 1234 SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 1235 BASENAME##9 *= (DATA_TYPE)SCALE; 1236 1237 #define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 1238 SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 1239 BASENAME##A *= (DATA_TYPE)SCALE; 1240 1241 #define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 1242 SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 1243 BASENAME##B *= (DATA_TYPE)SCALE; 1244 1245 #define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 1246 SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 1247 BASENAME##C *= (DATA_TYPE)SCALE; 1248 1249 #define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 1250 SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 1251 BASENAME##D *= (DATA_TYPE)SCALE; 1252 1253 #define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 1254 SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 1255 BASENAME##E *= (DATA_TYPE)SCALE; 1256 1257 #define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 1258 SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 1259 BASENAME##F *= (DATA_TYPE)SCALE; 1260 /** @} */ // end of group SCALE_ROW_n 1261 1262 /** Scale elements stored in a block (BASENAME) 1263 * @name SCALE_BLOCK 1264 * 1265 * Supported cases are N=1,2,3,...,16 1266 * 1267 * 
@param[in] N The number of rows in the block 1268 * @param[in] DATA_TYPE The data type of the block 1269 * @param[in] BASENAME The basename of the block 1270 * @param[in] SCALE The scale factor 1271 * @{ 1272 */ 1273 #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 1274 #define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 1275 /** @} */ // end of group SCALE_BLOCK 1276 1277 /** Create a new vector containing the values at the given index for a set of given vectors 1278 * @name COLUMN_VECTORn 1279 * 1280 * @param[in] IDX_COL The index value 1281 * @param[in] BASENAME The basename of the destination vectors 1282 * @param[in] X The basename of the source vectors 1283 * @param[in] TYPE The data type of the destination vectors 1284 * @{ 1285 */ 1286 #define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 1287 TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 1288 #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 1289 VEC_DATA_TYPE(TYPE, 2) \ 1290 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 1291 #define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 1292 VEC_DATA_TYPE(TYPE, 3) \ 1293 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 1294 #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 1295 VEC_DATA_TYPE(TYPE, 4) \ 1296 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 1297 #define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 1298 VEC_DATA_TYPE(TYPE, 8) \ 1299 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 1300 #define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 1301 VEC_DATA_TYPE(TYPE, 16) \ 1302 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, 
(X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 1303 /** @} */ // end of group COLUMN_VECTORn 1304 1305 /** Create a new vector containing the values at the given index. Utility macros for transposing a colum-vector 1306 * @name COLUMN_VECTOR_SCALARn 1307 * 1308 * @param[in] IDX_COL The index value 1309 * @param[in] BASENAME The basename of the destination vectors 1310 * @param[in] X The basename of the source vectors 1311 * @param[in] TYPE The data type of the destination vectors 1312 * @{ 1313 */ 1314 #define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 1315 TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 1316 #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 1317 VEC_DATA_TYPE(TYPE, 2) \ 1318 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 1319 #define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 1320 VEC_DATA_TYPE(TYPE, 3) \ 1321 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 1322 #define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 1323 VEC_DATA_TYPE(TYPE, 4) \ 1324 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 1325 #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 1326 VEC_DATA_TYPE(TYPE, 8) \ 1327 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 1328 #define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 1329 VEC_DATA_TYPE(TYPE, 16) \ 1330 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 1331 /** @} */ // end of group COLUMN_VECTORn 1332 1333 /** Create transposed vectors of the given vectors 1334 * @name TRANSPOSE_K0Xn 1335 * 1336 * 
@param[in] K0 The size of the source vectors 1337 * @param[in] BASENAME The basename of transposed vectors 1338 * @param[in] BS The basename of source vectors for transposition 1339 * @param[in] TYPE The data type of the transposed vectors 1340 * @{ 1341 */ 1342 #define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 1343 COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 1344 #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 1345 COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 1346 COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 1347 #define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 1348 TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 1349 COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 1350 #define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 1351 TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 1352 COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 1353 #define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 1354 TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 1355 COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 1356 COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 1357 COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \ 1358 COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 1359 #define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \ 1360 TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \ 1361 COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \ 1362 COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \ 1363 COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \ 1364 COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \ 1365 COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \ 1366 COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \ 1367 COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \ 1368 COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE); 1369 1370 /** @} */ // end of group TRANSPOSE_K0Xn 1371 1372 /** Create column vectors to contain the values at the given index for a set of given vectors 1373 * 1374 * @param[in] K0 The number of source vectors 1375 * @param[in] IDX_COL The index value 1376 * @param[in] BASENAME The basename of the destination vectors 1377 * @param[in] BS The basename of the source vectors 1378 * @param[in] TYPE 
The data type of the destination vectors 1379 */ 1380 #define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \ 1381 CONCAT(COLUMN_VECTOR, K0) \ 1382 (IDX_COL, BASENAME, BS, TYPE); 1383 1384 /** Create column vectors to contain the values at the given index. Utility macro for transposing a column-vector 1385 * 1386 * @param[in] K0 The number of source vectors 1387 * @param[in] IDX_COL The index value 1388 * @param[in] BASENAME The basename of the destination vectors 1389 * @param[in] BS The basename of the source vectors 1390 * @param[in] TYPE The data type of the destination vectors 1391 */ 1392 #define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \ 1393 CONCAT(COLUMN_VECTOR_SCALAR, K0) \ 1394 (IDX_COL, BASENAME, BS, TYPE); 1395 1396 /** Create transposed vectors form the given source vectors 1397 * 1398 * @param[in] K0 The size of source vectors 1399 * @param[in] N0 The number of source vectors 1400 * @param[in] BASENAME The basename of transposed vectors 1401 * @param[in] BS The basename of source vectors for transposition 1402 * @param[in] TYPE The data type of the transposed vectors 1403 * 1404 */ 1405 #define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \ 1406 CONCAT(TRANSPOSE_K0X, N0) \ 1407 (K0, BASENAME, BS, TYPE); 1408 1409 /** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1) 1410 * @name ADD_ROW_n 1411 * 1412 * @param[in] BASENAME The basename of the destination variables 1413 * @param[in] BIAS The basename of the added variables 1414 * @{ 1415 */ 1416 #define ADD_ROW_1(BASENAME, BIAS) \ 1417 BASENAME##0 += BIAS##0; 1418 1419 #define ADD_ROW_2(BASENAME, BIAS) \ 1420 ADD_ROW_1(BASENAME, BIAS) \ 1421 BASENAME##1 += BIAS##1; 1422 1423 #define ADD_ROW_3(BASENAME, BIAS) \ 1424 ADD_ROW_2(BASENAME, BIAS) \ 1425 BASENAME##2 += BIAS##2; 1426 1427 #define ADD_ROW_4(BASENAME, BIAS) \ 1428 ADD_ROW_3(BASENAME, BIAS) \ 1429 BASENAME##3 += BIAS##3; 1430 1431 #define ADD_ROW_5(BASENAME, BIAS) \ 1432 ADD_ROW_4(BASENAME, BIAS) \ 1433 
BASENAME##4 += BIAS##4; 1434 1435 #define ADD_ROW_6(BASENAME, BIAS) \ 1436 ADD_ROW_5(BASENAME, BIAS) \ 1437 BASENAME##5 += BIAS##5; 1438 1439 #define ADD_ROW_7(BASENAME, BIAS) \ 1440 ADD_ROW_6(BASENAME, BIAS) \ 1441 BASENAME##6 += BIAS##6; 1442 1443 #define ADD_ROW_8(BASENAME, BIAS) \ 1444 ADD_ROW_7(BASENAME, BIAS) \ 1445 BASENAME##7 += BIAS##7; 1446 1447 #define ADD_ROW_9(BASENAME, BIAS) \ 1448 ADD_ROW_8(BASENAME, BIAS) \ 1449 BASENAME##8 += BIAS##8; 1450 1451 #define ADD_ROW_10(BASENAME, BIAS) \ 1452 ADD_ROW_9(BASENAME, BIAS) \ 1453 BASENAME##9 += BIAS##9; 1454 1455 #define ADD_ROW_11(BASENAME, BIAS) \ 1456 ADD_ROW_10(BASENAME, BIAS) \ 1457 BASENAME##A += BIAS##A; 1458 1459 #define ADD_ROW_12(BASENAME, BIAS) \ 1460 ADD_ROW_11(BASENAME, BIAS) \ 1461 BASENAME##B += BIAS##B; 1462 1463 #define ADD_ROW_13(BASENAME, BIAS) \ 1464 ADD_ROW_12(BASENAME, BIAS) \ 1465 BASENAME##C += BIAS##C; 1466 1467 #define ADD_ROW_14(BASENAME, BIAS) \ 1468 ADD_ROW_13(BASENAME, BIAS) \ 1469 BASENAME##D += BIAS##D; 1470 1471 #define ADD_ROW_15(BASENAME, BIAS) \ 1472 ADD_ROW_14(BASENAME, BIAS) \ 1473 BASENAME##E += BIAS##E; 1474 1475 #define ADD_ROW_16(BASENAME, BIAS) \ 1476 ADD_ROW_15(BASENAME, BIAS) \ 1477 BASENAME##F += BIAS##F; 1478 1479 /** @} */ // end of group ADD_ROW_n 1480 1481 /** Add the block (BIAS) to another block (BASENAME) 1482 * @name ADD_BLOCK 1483 * 1484 * Supported cases are N=1,2,3,...,16 1485 * 1486 * @param[in] N The number of vectors in the block 1487 * @param[in] BASENAME The basename of the destination variables 1488 * @param[in] BIAS The basename of the added variables 1489 * @{ 1490 */ 1491 #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) 1492 #define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) 1493 /** @} */ // end of group ADD_BLOCK 1494 1495 /** Broadcast (add single value) to the each element of the destination variables 1496 * @name ADD_ROW_BROADCAST_n 1497 * 1498 * @param[in] BASENAME The basename of the destination 
variables 1499 * @param[in] BIAS The variable containing the value to add 1500 * @{ 1501 */ 1502 #define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 1503 BASENAME##0 += BIAS; 1504 1505 #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 1506 ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 1507 BASENAME##1 += BIAS; 1508 1509 #define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 1510 ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 1511 BASENAME##2 += BIAS; 1512 1513 #define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 1514 ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 1515 BASENAME##3 += BIAS; 1516 1517 #define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 1518 ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 1519 BASENAME##4 += BIAS; 1520 1521 #define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 1522 ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 1523 BASENAME##5 += BIAS; 1524 1525 #define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 1526 ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 1527 BASENAME##6 += BIAS; 1528 1529 #define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 1530 ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 1531 BASENAME##7 += BIAS; 1532 1533 #define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 1534 ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 1535 BASENAME##8 += BIAS; 1536 1537 #define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 1538 ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 1539 BASENAME##9 += BIAS; 1540 1541 #define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 1542 ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 1543 BASENAME##A += BIAS; 1544 1545 #define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 1546 ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 1547 BASENAME##B += BIAS; 1548 1549 #define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 1550 ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 1551 BASENAME##C += BIAS; 1552 1553 #define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 1554 ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 1555 BASENAME##D += BIAS; 1556 1557 #define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 1558 ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 1559 BASENAME##E += BIAS; 1560 1561 #define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 1562 
ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 1563 BASENAME##F += BIAS; 1564 1565 /** Broadcast (add a value) to the each element of the destination block (BASENAME) 1566 * @name ADD_BLOCK_BROADCAST 1567 * 1568 * Supported cases are N=1,2,3,...,16. 1569 * 1570 * @param[in] N The number of vectors in the block 1571 * @param[in] BASENAME The basename of the destination variables 1572 * @param[in] BIAS The variable containing the value to add 1573 * @{ 1574 */ 1575 #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) 1576 #define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) 1577 /** @} */ // end of group ADD_BLOCK_BROADCAST 1578 1579 /** Apply activation to the given variables 1580 * @name ACTIVATION_ROW_n 1581 * 1582 * @param[in] ACTIVATION_TYPE The type of the activation 1583 * @param[in] DATA_TYPE The data type of the vectors 1584 * @param[in] BASENAME The basename of the variables 1585 * @param[in] A_VAL Additional value required by the activation 1586 * @param[in] B_VAL Additional value required by the activation 1587 * @{ 1588 */ 1589 #define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1590 BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL); 1591 1592 #define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1593 ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1594 BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL); 1595 1596 #define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1597 ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1598 BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL); 1599 1600 #define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1601 
ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1602 BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL); 1603 1604 #define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1605 ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1606 BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL); 1607 1608 #define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1609 ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1610 BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL); 1611 1612 #define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1613 ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1614 BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL); 1615 1616 #define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1617 ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1618 BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL); 1619 1620 #define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1621 ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1622 BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL); 1623 1624 #define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1625 ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1626 BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL); 1627 1628 #define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1629 
ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1630 BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL); 1631 1632 #define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1633 ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1634 BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL); 1635 1636 #define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1637 ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1638 BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL); 1639 1640 #define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1641 ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1642 BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL); 1643 1644 #define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1645 ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1646 BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL); 1647 1648 #define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1649 ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 1650 BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL); 1651 /** @} */ // end of group ACTIVATION_ROW_n 1652 1653 /** Apply activation to a block (BASENAME) 1654 * @name ACTIVATION_BLOCK 1655 * 1656 * Supported cases are N=1,2,3,...,16. 
1657 * 1658 * @param[in] N The number of vectors in the block 1659 * @param[in] ACTIVATION_TYPE The type of the activation 1660 * @param[in] DATA_TYPE The data type of the vectors 1661 * @param[in] BASENAME The basename of the variables 1662 * @param[in] A_VAL Additional value required by the activation 1663 * @param[in] B_VAL Additional value required by the activation 1664 * @{ 1665 */ 1666 #define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 1667 #define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 1668 /** @} */ // end of group ACTIVATION_BLOCK 1669 1670 /** Apply convert_<data_type> to the given variables 1671 * @name CONVERT_ROW_n 1672 * 1673 * @param[in] N The size of the vectors 1674 * @param[in] DATA_TYPE The data type of the vectors 1675 * @param[in] BASENAME_SRC The basename of the source variables 1676 * @param[in] BASENAME_DST The basename of the destination variables 1677 */ 1678 #define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1679 VEC_DATA_TYPE(DATA_TYPE, N) \ 1680 BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); 1681 1682 #define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1683 CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1684 VEC_DATA_TYPE(DATA_TYPE, N) \ 1685 BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); 1686 1687 #define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1688 CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1689 VEC_DATA_TYPE(DATA_TYPE, N) \ 1690 BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); 1691 1692 #define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1693 CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1694 
VEC_DATA_TYPE(DATA_TYPE, N) \ 1695 BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); 1696 1697 #define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1698 CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1699 VEC_DATA_TYPE(DATA_TYPE, N) \ 1700 BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); 1701 1702 #define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1703 CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1704 VEC_DATA_TYPE(DATA_TYPE, N) \ 1705 BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); 1706 1707 #define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1708 CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1709 VEC_DATA_TYPE(DATA_TYPE, N) \ 1710 BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); 1711 1712 #define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1713 CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1714 VEC_DATA_TYPE(DATA_TYPE, N) \ 1715 BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); 1716 1717 #define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1718 CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1719 VEC_DATA_TYPE(DATA_TYPE, N) \ 1720 BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); 1721 1722 #define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1723 CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1724 VEC_DATA_TYPE(DATA_TYPE, N) \ 1725 BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); 1726 1727 #define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1728 CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1729 VEC_DATA_TYPE(DATA_TYPE, N) \ 1730 BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); 1731 1732 #define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1733 CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, 
BASENAME_DST) \ 1734 VEC_DATA_TYPE(DATA_TYPE, N) \ 1735 BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); 1736 1737 #define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1738 CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1739 VEC_DATA_TYPE(DATA_TYPE, N) \ 1740 BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); 1741 1742 #define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1743 CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1744 VEC_DATA_TYPE(DATA_TYPE, N) \ 1745 BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); 1746 1747 #define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1748 CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1749 VEC_DATA_TYPE(DATA_TYPE, N) \ 1750 BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); 1751 1752 #define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1753 CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 1754 VEC_DATA_TYPE(DATA_TYPE, N) \ 1755 BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); 1756 /** @} */ // end of group CONVERT_ROW_n 1757 1758 /** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST) 1759 * @name CONVERT_BLOCK 1760 * 1761 * Supported cases N=1,2,3,...,16. 1762 * 1763 * @param[in] M The number of vectors to convert 1764 * @param[in] N The size of the vectors 1765 * @param[in] DATA_TYPE The data type of the vectors 1766 * @param[in] BASENAME_SRC The basename of the source variables 1767 * @param[in] BASENAME_DST The basename of the destination variables 1768 */ 1769 #define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 1770 #define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 1771 /** @} */ // end of group CONVERT_BLOCK