1/* 2 * Copyright (c) 2017-2021 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X 25#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y 26 27#include "helpers.h" 28#include "repeat.h" 29 30#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) 31 32#if VEC_SIZE_X == 1 33#if VEC_SIZE_Y == 1 34#define TRANSPOSED_U(val) \ 35 { \ 36 u0 \ 37 } 38#elif VEC_SIZE_Y == 2 39#define TRANSPOSED_U(val) \ 40 { \ 41 u0, u1 \ 42 } 43#elif VEC_SIZE_Y == 3 44#define TRANSPOSED_U(val) \ 45 { \ 46 u0, u1, u2 \ 47 } 48#elif VEC_SIZE_Y == 4 49#define TRANSPOSED_U(val) \ 50 { \ 51 u0, u1, u2, u3 \ 52 } 53#elif VEC_SIZE_Y == 8 54#define TRANSPOSED_U(val) \ 55 { \ 56 u0, u1, u2, u3, u4, u5, u6, u7 \ 57 } 58#elif VEC_SIZE_Y == 16 59#define TRANSPOSED_U(val) \ 60 { \ 61 u0, u1, u2, u3, u4, u5, u6, u7, \ 62 u8, u9, u10, u11, u12, u13, u14, u15 \ 63 } 64#endif /* switch VEC_SIZE_Y */ 65#else // VEC_SIZE_X == 1 66#if VEC_SIZE_Y == 1 67#define TRANSPOSED_U(val) \ 68 { \ 69 u0.val \ 70 } 71#elif VEC_SIZE_Y == 2 72#define TRANSPOSED_U(val) \ 73 { \ 74 u0.val, u1.val \ 75 } 76#elif VEC_SIZE_Y == 3 77#define TRANSPOSED_U(val) \ 78 { \ 79 u0.val, u1.val, u2.val \ 80 } 81#elif VEC_SIZE_Y == 4 82#define TRANSPOSED_U(val) \ 83 { \ 84 u0.val, u1.val, u2.val, u3.val \ 85 } 86#elif VEC_SIZE_Y == 8 87#define TRANSPOSED_U(val) \ 88 { \ 89 u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \ 90 } 91#elif VEC_SIZE_Y == 16 92#define TRANSPOSED_U(val) \ 93 { \ 94 u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \ 95 u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \ 96 } 97#endif /* switch VEC_SIZE_Y */ 98#endif // VEC_SIZE_X == 1 99 100#if DATA_TYPE_IN_BYTES == 4 101#define DATA_TYPE uint 102#elif DATA_TYPE_IN_BYTES == 2 103#define DATA_TYPE ushort 104#elif DATA_TYPE_IN_BYTES == 1 105#define DATA_TYPE uchar 106#else /* switch DATA_TYPE_IN_BYTES */ 107#error DATA_TYPE_IN_BYTES not supported for transpose 108#endif /* switch DATA_TYPE_IN_BYTES */ 109 110/** This OpenCL kernel computes the matrix transposition of input matrix 111 * 112 * @note The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be: 113 * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices 114 * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices 115 * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices 116 * -# -DVEC_SIZE_X is the number of elements processed in X dimension 117 * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X 118 * -# -DVEC_SIZE_Y is the number of elements processed in Y dimension 119 * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y 120 * 121 * 122 * @param[in] src_ptr Pointer to the source matrix. Supported data types: All 123 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) 124 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 125 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) 126 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 127 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix 128 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr 129 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) 130 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) 131 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) 132 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) 133 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix 134 */ 135__kernel void transpose(IMAGE_DECLARATION(src), 136 IMAGE_DECLARATION(dst)) 137{ 138 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); 139 uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0); 140 141 // Compute addresses 142 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y; 143 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y; 144 145 // Load the NxM block at (x, y) 146 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 147 u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr); 148#if VEC_SIZE_Y > 1 149 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 150 u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y)); 151#endif /* VEC_SIZE_Y > 1 */ 152#if VEC_SIZE_Y > 2 153 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 154 u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); 155#endif /* VEC_SIZE_Y > 2 */ 156#if VEC_SIZE_Y > 3 157 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 158 u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); 159#endif /* VEC_SIZE_Y > 3 */ 160#if VEC_SIZE_Y > 4 161 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 162 u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); 163 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 164 u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); 165 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 166 u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y)); 167 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 168 u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y)); 169#endif /* VEC_SIZE_Y > 4 */ 170#if VEC_SIZE_Y > 8 171 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 172 u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y)); 173 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 174 u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y)); 175 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 176 u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y)); 177 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 178 u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y)); 179 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 180 u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y)); 181 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 182 u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y)); 183 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 184 u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y)); 185 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 186 u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y)); 187#endif /* VEC_SIZE_Y > 8 */ 188 189 //Create transposed vectors 190 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 191 t0 = TRANSPOSED_U(s0); 192#if VEC_SIZE_X > 1 193 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 194 t1 = TRANSPOSED_U(s1); 195#endif /* VEC_SIZE_X > 1 */ 196#if VEC_SIZE_X > 2 197 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 198 t2 = TRANSPOSED_U(s2); 199#endif /* VEC_SIZE_X > 2 */ 200#if VEC_SIZE_X > 3 201 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 202 t3 = TRANSPOSED_U(s3); 203#endif /* VEC_SIZE_X > 3 */ 204#if VEC_SIZE_X > 4 205 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 206 t4 = TRANSPOSED_U(s4); 207 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 208 t5 = TRANSPOSED_U(s5); 209 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 210 t6 = TRANSPOSED_U(s6); 211 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 212 t7 = TRANSPOSED_U(s7); 213#endif /* VEC_SIZE_X > 4 */ 214#if VEC_SIZE_X > 8 215 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 216 t8 = TRANSPOSED_U(s8); 217 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 218 t9 = TRANSPOSED_U(s9); 219 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 220 tA = TRANSPOSED_U(sA); 221 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 222 tB = TRANSPOSED_U(sB); 223 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 224 tC = TRANSPOSED_U(sC); 225 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 226 tD = TRANSPOSED_U(sD); 227 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 228 tE = TRANSPOSED_U(sE); 229 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 230 tF = TRANSPOSED_U(sF); 231#endif /* VEC_SIZE_X > 8 */ 232 233 // Store the block at (y, x) 234 REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; 235 STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0 236 && get_global_id(0) == 0, 237 VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); 238} 239 240#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)