/*
 * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
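
// A minimal illustration of the build options the kernels below expect. The concrete
// values here are hypothetical; in practice the host code derives them from the tensor
// shape, data type, and data layout:
//
//   -DDATA_TYPE=float -DINTERNAL_DATA_TYPE=float -DVEC_SIZE=16
//   -DDIM_X=6 -DDIM_Y=2 -DDIM_Z=7
//   -DGAMMA=1.3 -DBETA=2.4 -DEPSILON=0.001f   (instance_normalization only)
//   -DNHWC                                    (only for the NHWC data layout)
//   -DIN_PLACE                                (only when the output aliases the input)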

#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
 *
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
 * @attention The data type used for the accumulation should be passed using the -DINTERNAL_DATA_TYPE compile flag, e.g. -DINTERNAL_DATA_TYPE=float
 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[out] output_ptr                           Pointer to the destination tensor holding the per-plane mean and variance. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void compute_mean_var(
    TENSOR4D_DECLARATION(input),
    TENSOR3D_DECLARATION(output))
{
    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);

#if defined(NHWC)
    const int          ch             = get_global_id(0); // Current channel
    const int          batch          = get_global_id(1); // Current batch
    const int          elements_plane = DIM_Y * DIM_Z;
    INTERNAL_DATA_TYPE part_sum       = 0.f;
    INTERNAL_DATA_TYPE part_sum_sq    = 0.f;
    const int          in_offset      = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);

    // Accumulate the sum and the sum of squares over the W (DIM_Y) x H (DIM_Z) plane of this channel/batch
    for(int i_w = 0; i_w < DIM_Y; ++i_w)
    {
        for(int i_h = 0; i_h < DIM_Z; ++i_h)
        {
            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)));
            part_sum += data;
            part_sum_sq += data * data;
        }
    }

    INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
    INTERNAL_DATA_TYPE var  = (part_sum_sq / elements_plane) - (mean * mean);
    // Store the mean at y == 0 and the variance at y == 1 of the output plane
    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
    *output_address0 = mean;
    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
    *output_address1 = var;
#else  // !defined(NHWC)
    const int ch             = get_global_id(2) % DIM_Z; // Current channel
    const int batch          = get_global_id(2) / DIM_Z; // Current batch
    const int elements_plane = DIM_X * DIM_Y;

    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
    part_sum = 0.f;
    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
    part_sum_sq = 0.f;
    // Calculate partial sum
    for(int y = 0; y < DIM_Y; ++y)
    {
        int x = 0;
        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
        {
            // Load data
            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
            data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
            part_sum += data;
            part_sum_sq += data * data;
        }
        // Left-overs loop
        for(; x < DIM_X; ++x)
        {
            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
            part_sum.s0 += data;
            part_sum_sq.s0 += data * data;
        }
    }
    // Perform reduction: fold the upper half of each vector onto the lower half
    // until both partial sums end up in lane 0
#if VEC_SIZE > 8
    part_sum.s01234567 += part_sum.s89abcdef;
    part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
#endif // VEC_SIZE > 8
#if VEC_SIZE > 4
    part_sum.s0123 += part_sum.s4567;
    part_sum_sq.s0123 += part_sum_sq.s4567;
#endif // VEC_SIZE > 4
#if VEC_SIZE > 2
    part_sum.s01 += part_sum.s23;
    part_sum_sq.s01 += part_sum_sq.s23;
#endif // VEC_SIZE > 2
    part_sum.s0 += part_sum.s1;
    part_sum_sq.s0 += part_sum_sq.s1;

    INTERNAL_DATA_TYPE sum    = (INTERNAL_DATA_TYPE)part_sum.s0;
    INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;

    const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
    const INTERNAL_DATA_TYPE var  = (sum_sq / elements_plane) - (mean * mean);

    // Store the mean at y == 0 and the variance at y == 1 of the output plane
    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
    *output_address0 = mean;
    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
    *output_address1 = var;

#endif // defined(NHWC)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
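
// Note on compute_mean_var: it uses the single-pass "sum of squares" formulation.
// For the N elements x_i of a plane it accumulates sum = SUM(x_i) and
// sum_sq = SUM(x_i * x_i), then derives
//
//   mean = sum / N
//   var  = sum_sq / N - mean * mean   (i.e. Var[x] = E[x^2] - E[x]^2)
//
// which avoids a second pass over the data, at the cost of some numerical
// precision when INTERNAL_DATA_TYPE is a narrow type.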

#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
/** This function normalizes each plane of the input tensor with respect to the precomputed mean and variance of that plane.
 *
 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
 * @attention The data type used for the computation should be passed using the -DINTERNAL_DATA_TYPE compile flag, e.g. -DINTERNAL_DATA_TYPE=float
 * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
 * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
 * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
 *
 * @param[in]  input_ptr                              Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  input_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  input_step_x                           input_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  input_step_y                           input_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  input_step_z                           input_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  input_stride_w                         Stride of the source tensor in W dimension (in bytes)
 * @param[in]  input_step_w                           input_stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in]  input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  mean_var_ptr                           Pointer to the tensor holding the per-plane mean and variance computed by compute_mean_var. Data type: INTERNAL_DATA_TYPE
 * @param[in]  mean_var_stride_x                      Stride of the mean/variance tensor in X dimension (in bytes)
 * @param[in]  mean_var_step_x                        mean_var_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  mean_var_stride_y                      Stride of the mean/variance tensor in Y dimension (in bytes)
 * @param[in]  mean_var_step_y                        mean_var_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  mean_var_stride_z                      Stride of the mean/variance tensor in Z dimension (in bytes)
 * @param[in]  mean_var_step_z                        mean_var_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  mean_var_offset_first_element_in_bytes The offset of the first element in the mean/variance tensor
 * @param[out] output_ptr                             (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
 * @param[in]  output_stride_x                        (Optional) Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  output_step_x                          (Optional) output_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  output_stride_y                        (Optional) Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  output_step_y                          (Optional) output_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  output_stride_z                        (Optional) Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  output_step_z                          (Optional) output_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  output_stride_w                        (Optional) Stride of the destination tensor in W dimension (in bytes)
 * @param[in]  output_step_w                          (Optional) output_stride_w * number of elements along W processed per workitem (in bytes)
 * @param[in]  output_offset_first_element_in_bytes   (Optional) The offset of the first element in the destination tensor
 */
__kernel void instance_normalization(
    TENSOR4D_DECLARATION(input),
    TENSOR3D_DECLARATION(mean_var)
#ifndef IN_PLACE
    ,
    TENSOR4D_DECLARATION(output)
#endif /* IN_PLACE */
)
{
    Tensor4D in       = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
    Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
#ifndef IN_PLACE
    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
#endif /* IN_PLACE */

#if defined(NHWC)
    const int ch    = get_global_id(0); // Current channel
    const int batch = get_global_id(2); // Current batch
#else  /* defined(NHWC) */
    const int ch    = get_global_id(2) % DIM_Z; // Current channel
    const int batch = get_global_id(2) / DIM_Z; // Current batch
#endif /* defined(NHWC) */

    // Read the mean (y == 0) and variance (y == 1) computed for this channel/batch plane
    const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
    const __global INTERNAL_DATA_TYPE *var_ptr  = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
    const INTERNAL_DATA_TYPE mean   = *mean_ptr;
    const INTERNAL_DATA_TYPE var    = *var_ptr;
    const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
    const INTERNAL_DATA_TYPE beta   = (INTERNAL_DATA_TYPE)BETA;

#if defined(NHWC)
    const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
#ifndef IN_PLACE
    const int out_offset = output_offset_first_element_in_bytes + batch * output_stride_w + ch * sizeof(DATA_TYPE);
#endif /* IN_PLACE */

    for(int i_w = 0; i_w < DIM_Y; ++i_w)
    {
        for(int i_h = 0; i_h < DIM_Z; ++i_h)
        {
            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
#ifdef IN_PLACE
            __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
#endif /* IN_PLACE */
            *(output_address) = (*(input_address) - mean) * multip + beta;
        }
    }
#else // !defined(NHWC)
    for(int y = 0; y < DIM_Y; ++y)
    {
        int x = 0;
        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
        {
            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
            __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */

            // Load, normalize, and store VEC_SIZE elements at once
            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
            data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));

            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
            res = (data - mean) * multip + beta;
            VSTORE(VEC_SIZE)
            (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
        }
        // Left-overs loop
        for(; x < DIM_X; ++x)
        {
            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
            __global DATA_TYPE *output_address = input_address;
#else  /* !IN_PLACE */
            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */
            *(output_address) = (*(input_address) - mean) * multip + beta;
        }
    }
#endif // defined(NHWC)
}
#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
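
// Taken together, the two kernels implement instance normalization in two passes:
// compute_mean_var writes, for every (channel, batch) plane, the mean at y == 0 and
// the variance at y == 1 of its output tensor, and instance_normalization then applies
//
//   out = (in - mean) * GAMMA / sqrt(var + EPSILON) + BETA
//
// to every element of the corresponding plane, either in place (-DIN_PLACE) or into a
// separate destination tensor.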