1 /* 2 * Copyright (c) 2021-2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #pragma once 26 27 #include "arm_compute/core/Error.h" 28 #include "depthfirst_driver.hpp" 29 #include "utils.hpp" 30 #if !defined(_WIN64) && !defined(__OpenBSD__) 31 #include <alloca.h> 32 #endif /* !defined(_WIN64) && !defined(__OpenBSD__) */ 33 34 namespace arm_conv { 35 namespace pooling { 36 37 template <typename TInput, typename TOutput, typename OutputStage = Nothing> 38 class IGenericDepthfirstStrategy; 39 40 template <typename TInput, typename TOutput> 41 class IGenericDepthfirstStrategy<TInput, TOutput, Nothing> 42 { 43 public: 44 virtual ~IGenericDepthfirstStrategy() = default; 45 46 typedef void (*KernelType)( 47 uint64_t window_cells, 48 uint64_t n_valid_cells, 49 uint64_t n_channels, 50 const TInput *const *, 51 TOutput * 52 ); 53 54 virtual KernelType get_kernel(void) const = 0; 55 }; 56 57 template <typename TInput, typename TOutput> 58 class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32> 59 { 60 public: 61 virtual ~IGenericDepthfirstStrategy() = default; 62 63 typedef void (*KernelType)( 64 uint64_t window_cells, 65 uint64_t n_valid_cells, 66 uint64_t n_channels, 67 const TInput *const *, 68 TOutput *, 69 const Requantize32 & 70 ); 71 72 virtual KernelType get_kernel(void) const = 0; 73 }; 74 75 template <typename TInput, typename TOutput, typename OutputStage> 76 struct Invoker; 77 78 template <typename TInput, typename TOutput> 79 struct Invoker<TInput, TOutput, Nothing> 80 { invokearm_conv::pooling::Invoker81 static inline void invoke( 82 const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern, 83 uint64_t window_cells, 84 uint64_t n_valid_cells, 85 uint64_t n_channels, 86 const TInput *const *inptrs, 87 TOutput *outptr, 88 const Nothing & 89 ) 90 { 91 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr); 92 } 93 }; 94 95 template <typename TInput, typename TOutput> 96 struct Invoker<TInput, TOutput, Requantize32> 97 { invokearm_conv::pooling::Invoker98 static inline void invoke( 99 const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern, 100 uint64_t window_cells, 101 uint64_t n_valid_cells, 102 uint64_t n_channels, 103 const TInput *const *inptrs, 104 TOutput *outptr, 105 const Requantize32 &qp 106 ) 107 { 108 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp); 109 } 110 }; 111 112 template <typename TInput, typename TOutput, typename OutputStage> 113 class GenericDepthfirstWrapper : public IDepthfirstStrategy 114 { 115 using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>; 116 117 std::unique_ptr<const StratType> m_strat; 118 const unsigned int window_rows, window_cols; 119 120 public: GenericDepthfirstWrapper(const StratType * strat,const PoolingArgs & args)121 GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args) 122 : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols) 123 { 124 } 125 get_input_rows(void) const126 unsigned int get_input_rows(void) const override { return window_rows; } get_input_cols(void) const127 unsigned int get_input_cols(void) const override { return window_cols; } get_output_rows(void) const128 unsigned int get_output_rows(void) const override { return 1; } get_output_cols(void) const129 unsigned int get_output_cols(void) const override { return 1; } 130 get_kernel(void) const131 typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); } 132 }; 133 134 template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing> 135 class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput> 136 { 137 const OutputStage m_os; 138 139 protected: get_working_size_per_thread(unsigned int) const140 size_t get_working_size_per_thread(unsigned int) const override { return 0; } initialise_working_space(void *,unsigned int) const141 void initialise_working_space(void *, unsigned int) const override { /* Nothing */ } 142 143 /* Compute a portion of the output tensor with padding. */ compute_tile_padded(unsigned int output_i,unsigned int output_j,unsigned int channel_start,unsigned int channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,void *) const144 void compute_tile_padded( 145 unsigned int output_i, unsigned int output_j, 146 unsigned int channel_start, unsigned int channel_end, 147 const TensorSpec<const TInput *> &input, 148 const TensorSpec<TOutput *> &output, 149 void * 150 ) const override 151 { 152 // Determine start position and padding 153 const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top; 154 const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i); 155 const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0); 156 const int end_i = start_i + this->m_args.pool_window.rows; 157 const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows); 158 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom); 159 160 const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left; 161 const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j); 162 const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0); 163 const int end_j = start_j + this->m_args.pool_window.cols; 164 const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols); 165 const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right); 166 167 // Determine the number of valid cells and prepare the pointers 168 const auto n_valid_cells = valid_rows * valid_cols; 169 auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *))); 170 { 171 auto my_ptr = inptrs; 172 auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start; 173 for (auto i = valid_rows; i; i--) 174 { 175 auto ptr = row_ptr; 176 row_ptr += input.ld_row; 177 178 for (auto j = valid_cols; j; j--) 179 { 180 *(my_ptr++) = ptr; 181 ptr += input.ld_col; 182 } 183 } 184 } 185 186 auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start; 187 188 // Some padding variants include (or exclude) the padding values; we handle 189 // this by computing the extent of the padded input tensor and hence 190 // computing the total number of cells captured in the pooling window. 191 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom; 192 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i; 193 const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right; 194 const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j; 195 const auto captured_cells = captured_rows * captured_cols; 196 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells; 197 198 // Execute the kernel 199 Invoker<TInput, TOutput, OutputStage>::invoke( 200 reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(), 201 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os 202 ); 203 } 204 205 // Compute a portion of the work with only top/bottom padding. compute_row_padded_tile_row(const unsigned int output_i,unsigned int output_j,unsigned int n_tile_cols,const unsigned int channel_start,const unsigned int channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,void * working_space) const206 void compute_row_padded_tile_row( 207 const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, 208 const unsigned int channel_start, const unsigned int channel_end, 209 const TensorSpec<const TInput *> &input, 210 const TensorSpec<TOutput *> &output, 211 void *working_space 212 ) const override 213 { 214 ARM_COMPUTE_UNUSED(working_space); 215 // Determine start position and padding 216 const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top; 217 const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i); 218 const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0); 219 const int end_i = start_i + this->m_args.pool_window.rows; 220 const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows); 221 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom); 222 223 const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left; 224 const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j); 225 const auto valid_cols = this->m_args.pool_window.cols; 226 227 // Determine the number of valid cells and prepare the pointers 228 const auto n_valid_cells = valid_rows * valid_cols; 229 auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *))); 230 { 231 auto my_ptr = inptrs; 232 auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start; 233 for (auto i = valid_rows; i; i--) 234 { 235 auto ptr = row_ptr; 236 row_ptr += input.ld_row; 237 238 for (auto j = valid_cols; j; j--) 239 { 240 *(my_ptr++) = ptr; 241 ptr += input.ld_col; 242 } 243 } 244 } 245 246 auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start; 247 248 // Some padding variants include (or exclude) the padding values; we handle 249 // this by computing the extent of the padded input tensor and hence 250 // computing the total number of cells captured in the pooling window. 251 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom; 252 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i; 253 const auto captured_cells = captured_rows * valid_cols; 254 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells; 255 256 for (; n_tile_cols; n_tile_cols--) 257 { 258 // Execute the kernel 259 Invoker<TInput, TOutput, OutputStage>::invoke( 260 reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(), 261 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os 262 ); 263 264 // Update the pointers; the output strides by a column and the inputs 265 // stride by a number of columns. 266 outptr += output.ld_col; 267 for (auto n = 0u; n < n_valid_cells; n++) 268 { 269 inptrs[n] += this->m_args.pool_stride.cols * input.ld_col; 270 } 271 } 272 } 273 274 public: PoolingDepthfirstGeneric(const IGenericDepthfirstStrategy<TInput,TOutput,OutputStage> * strat,const PoolingArgs & args,const OutputStage & os={} )275 PoolingDepthfirstGeneric( 276 const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat, 277 const PoolingArgs &args, 278 const OutputStage &os = {} 279 ) 280 : DepthfirstDriver<TInput, TOutput>( 281 new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args), 282 args 283 ), 284 m_os(os) 285 { 286 } 287 }; 288 289 } // namespace pooling 290 } // namespace arm_conv 291