1/* 2 * Copyright (c) 2018-2021 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "helpers.h" 25 26// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling. 27// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number). 28#define EPS_GRID 0.00001f 29 30#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants 31 32/** Performs a roi align on a single output pixel. 33 * 34 * @param[in] input Pointer to input Tensor3D struct. 35 * @param[in] region_start_x Start x index projected onto the input tensor. 36 * @param[in] region_end_x End x index projected onto the input tensor. 37 * @param[in] region_start_y Start y index projected onto the input tensor. 38 * @param[in] region_end_y End y index projected onto the input tensor. 39 * @param[in] pz z index of the input tensor. 40 * 41 * @return An average pooled value from the region specified in the input tensor. 42 */ 43inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x, 44 float bin_size_x, 45 float grid_size_x, 46 float region_end_x, 47 float region_start_y, 48 float bin_size_y, 49 float grid_size_y, 50 float region_end_y, 51 int pz) 52{ 53 // Iterate through the pooling region 54 float sum = 0; 55 for(int iy = 0; iy < grid_size_y; ++iy) 56 { 57 for(int ix = 0; ix < grid_size_x; ++ix) 58 { 59 // Align the window in the middle of every bin 60 const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y; 61 const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x; 62 63 // Interpolation in the unit square 64 const int y_low = (int)y; 65 const int x_low = (int)x; 66 const int y_high = y_low + 1; 67 const int x_high = x_low + 1; 68 69 const float ly = y - y_low; 70 const float lx = x - x_low; 71 const float hy = 1.f - ly; 72 const float hx = 1.f - lx; 73 74 const float w1 = hy * hx; 75 const float w2 = hy * lx; 76 const float w3 = ly * hx; 77 const float w4 = ly * lx; 78#if defined(NHWC) 79 const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low); 80 const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low); 81 const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high); 82 const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high); 83#else // !defined(NHWC) 84 const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz); 85 const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz); 86 const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz); 87 const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz); 88#endif // defined(NHWC) 89 sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; 90 } 91 } 92 93 return (DATA_TYPE)(sum / (grid_size_x * grid_size_y)); 94} 95 96/** Performs a roi align function. 97 * 98 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32; 99 * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32; 100 * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z; 101 * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y; 102 * @note Spatial scale must be passed using -DSPATIAL_SCALE; 103 * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi 104 * will have a default sampling ratio of roi_dims/pooling_dims 105 * 106 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32 107 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) 108 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) 109 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) 110 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) 111 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) 112 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) 113 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specifed by ROI 114 * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr 115 * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes) 116 * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes) 117 * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes) 118 * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes) 119 * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor 120 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr 121 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) 122 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) 123 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) 124 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) 125 * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) 126 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) 127 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor 128 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) 129 * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) 130 */ 131__kernel void roi_align_layer( 132 TENSOR3D_DECLARATION(input), 133 IMAGE_DECLARATION(rois), 134 TENSOR3D_DECLARATION(output), 135 unsigned int input_stride_w, unsigned int output_stride_w) 136{ 137 // Get pixels pointer 138 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); 139 Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois); 140 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); 141 142#if defined(NHWC) 143 const int px = get_global_id(1); 144 const int py = get_global_id(2); 145 const int pw = get_global_id(0); 146#else // !defined(NHWC) 147 const int px = get_global_id(0); 148 const int py = get_global_id(1); 149 const int pw = get_global_id(2); 150#endif // defined(NHWC) 151 152 // Load roi parameters 153 // roi is laid out as follows { batch_index, x1, y1, x2, y2 } 154 const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw)); 155 const VEC_DATA_TYPE(DATA_TYPE, 4) 156 roi = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw)); 157 const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE); 158 const float2 roi_dims = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f); 159 160 // Calculate pooled region start and end 161 const float2 spatial_indx = (float2)(px, py); 162 const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y); 163 const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y); 164 165 const float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y)); 166 float2 region_start = spatial_indx * bin_size + roi_anchor; 167 float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor; 168 169 region_start = clamp(region_start, 0, max_spatial_dims); 170 region_end = clamp(region_end, 0, max_spatial_dims); 171 172#if defined(SAMPLING_RATIO) 173 const float2 roi_bin_grid = SAMPLING_RATIO; 174#else // !defined(SAMPLING_RATIO) 175 // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2. 176 const float2 roi_bin_grid = ceil(bin_size - EPS_GRID); 177#endif // defined(SAMPLING_RATIO) 178 179 // Move input and output pointer across the fourth dimension 180 input.ptr += roi_batch * input_stride_w; 181 output.ptr += pw * output_stride_w; 182 for(int pz = 0; pz < MAX_DIM_Z; ++pz) 183 { 184#if defined(NHWC) 185 __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py); 186#else // !defined(NHWC) 187 __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz); 188#endif // defined(NHWC) 189 *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input, 190 region_start.x, 191 bin_size.x, 192 roi_bin_grid.x, 193 region_end.x, 194 region_start.y, 195 bin_size.y, 196 roi_bin_grid.y, 197 region_end.y, pz); 198 } 199} 200#endif // Check for compile time constants 201