/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "pool_common.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <stack>
#include <vector>

namespace arm_conv {
namespace pooling {

template <class strategy>
class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
{
  using TInput = typename strategy::operand_type;
  using TOutput = typename strategy::return_type;

  const PoolingArgs m_args;  // Copy of arguments

  // Number of input rows/columns consumed by one output tile: each of the
  // out_rows() outputs advances the pooling window by stride_rows(), and the
  // final window extends pool_rows() beyond its origin.
  constexpr static unsigned int input_rows(void)
  {
    return (strategy::out_rows() - 1) * strategy::stride_rows() + strategy::pool_rows();
  }

  constexpr static unsigned int input_cols(void)
  {
    return (strategy::out_cols() - 1) * strategy::stride_cols() + strategy::pool_cols();
  }

  size_t sizeof_input_buffer(void) const
  {
    return sizeof(TInput) * m_args.n_channels;
  }

  size_t sizeof_output_buffer(void) const
  {
    return sizeof(TOutput) * m_args.n_channels;
  }

  public:
  PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
  {
  }

  PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
  PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;

  size_t get_working_size(void) const override
  {
    // We require a channel-length vector into which to dump surplus output
    // and a channel-length vector of padding values; the arrays of input and
    // output pointers are fixed-size and are kept on the stack in execute().
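    // For illustration only: with 64 channels of float data this comes to
    // 64 * sizeof(float) = 256 bytes for the surplus-output buffer plus the
    // same again for the padding buffer, 512 bytes in total.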
    return sizeof_input_buffer() + sizeof_output_buffer();
  }

  void execute(
    const void *const input,
    void *const output,
    void *const working_space
  ) const override
  {
    // Compute dense NHWC strides from the problem size
    const size_t ld_input_col = m_args.n_channels;
    const size_t ld_input_row = ld_input_col * m_args.input_cols;
    const size_t ld_input_batch = ld_input_row * m_args.input_rows;
    const size_t ld_output_col = ld_input_col;
    const size_t ld_output_row = ld_output_col * m_args.output_cols;
    const size_t ld_output_batch = ld_output_row * m_args.output_rows;

    execute(
      input, ld_input_col, ld_input_row, ld_input_batch,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    const void *const input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    void *const output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const working_space
  ) const override
  {
    execute(
      m_args.n_batches, m_args.input_rows, m_args.input_cols,
      m_args.n_channels,
      input, ld_input_col, ld_input_row, ld_input_batch,
      m_args.padding,
      m_args.output_rows, m_args.output_cols,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    unsigned int batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int channels,
    const void *const _input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const PaddingValues &padding,
    unsigned int output_height,
    unsigned int output_width,
    void *const _output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const _working_space
  ) const override
  {
    strategy strat(m_args.cpu_info);
#ifdef CYCLE_PROFILING
    arm_gemm::profiler prof;
#endif // CYCLE_PROFILING

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // Carve the working space into the surplus-output buffer followed by the
    // padding buffer
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
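    // The padding buffer is read wherever the pooling window hangs over an
    // edge of the input tensor, so its fill value (below) must act as an
    // identity: zero adds nothing to an average, while -infinity (or the
    // lowest finite value for types without an infinity) can never win a max.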
    // Fill the input buffer with the padding value
    const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
      ? static_cast<TInput>(0)
      : (std::numeric_limits<TInput>::has_infinity
          ? -std::numeric_limits<TInput>::infinity()
          : std::numeric_limits<TInput>::lowest());
    for (unsigned int i = 0; i < channels; i++)
    {
      input_buffer[i] = pad_value;
    }

    // Keep subdividing the output plane across its longest dimension until we
    // reach the size of a single tile, pushing items onto a stack for later
    // processing. Note: the maximum depth of the stack could be determined a
    // priori from the input tensor size; this would allow the items to be
    // allocated within the working space and would improve performance.
    struct WorkItem
    {
      unsigned int output_i, output_j;
      unsigned int output_height, output_width;

      WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
        : output_i(i), output_j(j), output_height(height), output_width(width) {}
    };
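    // Illustrative example (values assumed): with strategy::out_rows() == 2
    // and an output height of 14, the height spans ceil(14/2) = 7 tiles; the
    // first partition takes 7 - 7/2 = 4 tiles (8 rows) and the second the
    // remaining 6 rows. Splitting on tile boundaries like this keeps every
    // tile full-sized except possibly the last in each dimension.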
    auto execute = [&] (const WorkItem &item) {
      // Create an array for the output pointers
      TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
      TOutput **const outptr_array = _outptr_array;

      // Construct the output pointer array
      {
        const auto output_pad_right = strategy::out_cols() - item.output_width;
        auto outptr_element = outptr_array;
        auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;

        // Fill the array with pointers to the surplus-output buffer
        for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
        {
          outptr_array[i] = output_buffer;
        }

        // Fill in the valid portion of the array
        for (unsigned int i = 0; i < item.output_height; i++)
        {
          auto outptr_col = outptr_row;
          for (unsigned int j = 0; j < item.output_width; j++)
          {
            *(outptr_element++) = outptr_col;
            outptr_col += ld_output_col;
          }
          outptr_element += output_pad_right;
          outptr_row += ld_output_row;
        }
      }

      // Compute the extent of the input covered by this tile, and how far it
      // overhangs each edge of the input tensor
      const int start_i = item.output_i * strategy::stride_rows() - padding.top;
      const int end_i = start_i + input_rows();
      const unsigned int pad_top = std::max(0, 0 - start_i);
      const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));

      const int start_j = item.output_j * strategy::stride_cols() - padding.left;
      const int end_j = start_j + input_cols();
      const unsigned int pad_left = std::max(0, 0 - start_j);
      const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));

      // Create an array for the input pointers
      const TInput * _inptr_array[input_rows() * input_cols()];
      const TInput **const inptr_array = _inptr_array;
      {
        const unsigned int row_padding = pad_top + pad_bottom;
        const unsigned int valid_rows = input_rows() - row_padding;

        const unsigned int col_padding = pad_left + pad_right;
        const unsigned int valid_cols = input_cols() - col_padding;

        // Fill the array with pointers to the padding buffer
        for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
        {
          inptr_array[i] = input_buffer;
        }

        // Compute the first valid input pointer
        auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;

        // Fill in the valid portion of the input array
        auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
        for (unsigned int i = 0; i < valid_rows; i++)
        {
          auto inptr_col = inptr_row;
          for (unsigned int j = 0; j < valid_cols; j++)
          {
            *(inptr_element++) = inptr_col;
            inptr_col += ld_input_col;
          }

          inptr_row += ld_input_row;
          inptr_element += col_padding;  // Skip the padding elements at the end of this row and the start of the next
        }
      }

      // Call the kernel
#ifdef CYCLE_PROFILING
      // TODO Work number
      auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
#endif // CYCLE_PROFILING
      strat.kernel(channels, inptr_array, outptr_array,
                   pad_left, pad_top, pad_right, pad_bottom);
    };

    // Add the initial work item to the stack of work.
    std::stack<WorkItem, std::vector<WorkItem>> stack;
    stack.push(WorkItem(0, 0, output_height, output_width));
    while (!stack.empty())
    {
      // Pop an item from the stack; either execute it directly (if it is no
      // larger than a single tile) or bisect its largest dimension and push
      // the two halves back onto the stack.
      const WorkItem item(stack.top());
      stack.pop();

      if (item.output_height <= strategy::out_rows() &&
          item.output_width <= strategy::out_cols())
      {
        execute(item);
      }
      else
      {
        // Split the largest dimension such that the first partition contains
        // an exact number of tiles.
        if (item.output_height >= item.output_width)
        {
          const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
          const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;

          const unsigned int height_first = tiles_first * strategy::out_rows();
          const unsigned int height_second = item.output_height - height_first;

          stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
          stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
        }
        else
        {
          const unsigned int width_in_tiles = (item.output_width + strategy::out_cols() - 1) / strategy::out_cols();
          const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;

          const unsigned int width_first = tiles_first * strategy::out_cols();
          const unsigned int width_second = item.output_width - width_first;

          stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
          stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
        }
      }
    }
  }
};

} // namespace pooling
} // namespace arm_conv
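// Usage sketch (illustrative only: the strategy type, the PoolingArgs value
// and the buffer names below are assumptions, not part of this header):
//
//   PoolingDepthfirstCacheOblivious<some_strategy> pool(args);
//   std::vector<uint8_t> working_space(pool.get_working_size());
//   pool.execute(input, output, working_space.data());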