1 /* 2 * Copyright (c) 2021-2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #pragma once 26 27 #include "depthwise_depthfirst.hpp" 28 #include "interleaves/generic_quantized_dot_product.hpp" 29 30 #ifdef CYCLE_PROFILING 31 #include "profiler.hpp" 32 #endif 33 34 #include <limits> 35 36 namespace arm_conv { 37 namespace depthwise { 38 39 template <typename TInput, typename TWeight, typename TOutput, typename TAccum> 40 class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing> 41 { 42 using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>; 43 44 protected: get_packing_args(const DepthwiseArgs & args) const45 virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const 46 { 47 return interleaves::PackingArguments( 48 args.kernel_rows, args.kernel_cols, sizeof(TWeight), 49 true, sizeof(TAccum), 50 this->get_vl_type(), 51 sizeof(TAccum), 1, 52 [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool 53 { 54 if (pos < args.kernel_rows * args.kernel_cols) 55 { 56 y = pos % args.kernel_cols; 57 x = pos / args.kernel_cols; 58 return true; 59 } 60 return false; 61 } 62 ); 63 } 64 65 public: 66 using Parent::Parent; 67 get_storage_size(const DepthwiseArgs & args) const68 size_t get_storage_size(const DepthwiseArgs &args) const override 69 { 70 return interleaves::get_storage_size_generic(this->get_packing_args(args), args); 71 } 72 pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const Nothing &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const73 void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override 74 { 75 interleaves::pack_parameters_generic( 76 this->get_packing_args(args), args, 77 buffer, biases, weights, ld_weight_col, ld_weight_row 78 ); 79 } 80 81 using KernelType = std::function<void( 82 const TInput *const *, // Input pointers 83 TOutput *const *, // Output pointers 84 const void *, // Ravelled bias, weights, and quantization parameters 85 unsigned int, // # output channels 86 TAccum, TAccum // Min and max activation clamps 87 )>; 88 virtual KernelType get_kernel(void) const = 0; 89 }; 90 91 92 template <typename TInput, typename TWeight, typename TOutput> 93 class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32> 94 { 95 using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>; 96 97 public: 98 using Parent::Parent; 99 get_storage_size(const DepthwiseArgs & args) const100 size_t get_storage_size(const DepthwiseArgs &args) const override 101 { 102 return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl()); 103 } 104 pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const arm_gemm::Requantize32 & qp,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const105 void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override 106 { 107 interleaves::quantized::pack_parameters<TWeight>( 108 buffer, reinterpret_cast<const int32_t *>(biases), 109 reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row, 110 args, qp, this->get_vl_type(), this->get_accumulator_depth_vl() 111 ); 112 } 113 114 using KernelType = std::function<void( 115 const TInput *const *, // Input pointers 116 TOutput *const *, // Output pointers 117 const void *, // Ravelled bias, weights, and quantization parameters 118 unsigned int, // # output channels 119 const arm_gemm::Requantize32 & 120 )>; 121 virtual KernelType get_kernel(void) const = 0; 122 }; 123 124 125 template <typename TInput, typename TWeight, typename TOutput, typename TAccum> 126 class GenericDepthfirstMultiplierKernelStrategy 127 { 128 const arm_gemm::VLType m_vl_type; 129 const unsigned int m_output_rows, m_output_cols; 130 131 public: GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows,unsigned int output_cols,arm_gemm::VLType vl_type)132 GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type) 133 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols) 134 { 135 } 136 137 virtual ~GenericDepthfirstMultiplierKernelStrategy() = default; 138 get_vl_type(void) const139 arm_gemm::VLType get_vl_type(void) const { return m_vl_type; } get_output_rows(void) const140 unsigned int get_output_rows(void) const { return m_output_rows; } get_output_cols(void) const141 unsigned int get_output_cols(void) const { return m_output_cols; } 142 143 using KernelType = std::function<void( 144 const TInput *const *, // Input pointers 145 TOutput *const *, // Output pointers 146 const TWeight *, // Ravelled weight parameters 147 const TAccum *, // Bias, 148 unsigned int, unsigned int, // Number of kernel points, number of output channels 149 TAccum, TAccum // Activation minimum and maximum 150 )>; 151 virtual KernelType get_kernel(void) const = 0; 152 }; 153 154 template <typename TInput, typename TWeight, typename TOutput> 155 class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t> 156 { 157 const arm_gemm::VLType m_vl_type; 158 const unsigned int m_output_rows, m_output_cols; 159 160 public: GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows,unsigned int output_cols,arm_gemm::VLType vl_type)161 GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type) 162 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols) 163 { 164 } 165 166 virtual ~GenericDepthfirstMultiplierKernelStrategy() = default; 167 get_vl_type(void) const168 arm_gemm::VLType get_vl_type(void) const { return m_vl_type; } get_output_rows(void) const169 unsigned int get_output_rows(void) const { return m_output_rows; } get_output_cols(void) const170 unsigned int get_output_cols(void) const { return m_output_cols; } 171 172 using KernelType = std::function<void( 173 const TInput *const *, // Input pointers 174 TOutput *const *, // Output pointers 175 const TWeight *, // Ravelled weight parameters 176 const int32_t *, // Bias, 177 unsigned int, unsigned int, // Number of kernel points, number of output channels 178 const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel) 179 const arm_gemm::Requantize32 & 180 )>; 181 virtual KernelType get_kernel(void) const = 0; 182 }; 183 184 template <typename TInput, 185 typename TWeight=TInput, 186 typename TOutput=TInput, 187 typename TAccum=typename DefaultTAccum<TInput>::Type, 188 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 189 class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage> 190 { 191 using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>; 192 std::unique_ptr<KernelStrategyType> m_kern; 193 194 protected: get_packing_args(const DepthwiseArgs & args) const195 virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const 196 { 197 return interleaves::PackingArguments( 198 args.kernel_rows, args.kernel_cols, sizeof(TWeight), 199 false, sizeof(TAccum), 200 this->get_vl_type(), 201 sizeof(TAccum), 1, 202 [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool 203 { 204 if (pos < args.kernel_rows * args.kernel_cols) 205 { 206 y = pos % args.kernel_cols; 207 x = pos / args.kernel_cols; 208 return true; 209 } 210 return false; 211 } 212 ); 213 } 214 215 public: GenericDepthfirstMultiplierStrategy(KernelStrategyType * kern,const DepthwiseArgs & args)216 GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args) 217 : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>( 218 kern->get_output_rows(), kern->get_output_cols(), 219 args.kernel_rows, args.kernel_cols, 220 args.stride_rows, args.stride_cols 221 ), 222 m_kern(kern) 223 { 224 }; 225 get_vl_type(void) const226 arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); } get_kernel(void) const227 const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); } 228 get_storage_size(const DepthwiseArgs & args) const229 size_t get_storage_size(const DepthwiseArgs &args) const override 230 { 231 return interleaves::get_storage_size_generic(this->get_packing_args(args), args); 232 } 233 pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const234 void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override 235 { 236 interleaves::pack_parameters_generic( 237 this->get_packing_args(args), args, 238 buffer, biases, weights, ld_weight_col, ld_weight_row 239 ); 240 } 241 }; 242 243 // Specialise elements of the wrapper based on the type of kernel. 244 namespace depthfirst_multiplier { 245 246 /* Working space element which contains a pointer for each row of input, a row 247 * of padding, and a space which can be used to construct an NCHW-ordered patch 248 * of input. 249 */ 250 template <typename T, bool IsGeneric=false, typename OutputStage=Nothing> 251 class InputPatchElement 252 { 253 public: 254 struct Workspace 255 { 256 constexpr static bool InputPatchIsGeneric = IsGeneric; 257 const T **input_rows; 258 T *input_padding; 259 T *input_patch; 260 }; 261 get_element_size(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)262 static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 263 { 264 return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args); 265 } 266 267 template <class WorkspaceType> initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)268 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 269 { 270 auto buffer_bytes = reinterpret_cast<char *>(buffer); 271 272 ws->input_rows = reinterpret_cast<const T **>(buffer_bytes); 273 buffer_bytes += sizeof_input_rows(args); 274 275 ws->input_padding = reinterpret_cast<T*>(buffer_bytes); 276 buffer_bytes += sizeof_input_padding(args); 277 278 ws->input_patch = reinterpret_cast<T*>(buffer_bytes); 279 buffer_bytes += sizeof_input_patch(args); 280 281 // Initialise the padding 282 memset(ws->input_padding, 283 get_input_buffer_fill_value(args.output_stage), 284 sizeof_input_padding(args)); 285 286 return buffer_bytes; 287 } 288 289 protected: sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)290 static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 291 { 292 if (IsGeneric) 293 { 294 return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; 295 } 296 else 297 { 298 return sizeof(T *) * args.strategy->get_input_rows(); 299 } 300 } 301 sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)302 static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 303 { 304 // Round-up the number of columns to be a whole number of QUADS 305 auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T)); 306 return sizeof(T) * input_cols; 307 } 308 sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)309 static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 310 { 311 if (IsGeneric) 312 { 313 // Round-up the number of columns to be a whole number of QUADS 314 auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T)); 315 const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; 316 return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols; 317 } 318 else 319 { 320 // Round-up the number of columns to be a whole number of QUADS 321 auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T)); 322 return sizeof(T) * args.strategy->get_input_rows() * input_cols; 323 } 324 } 325 }; 326 327 template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage> 328 struct StrategyType 329 { 330 using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>; 331 332 template <typename WorkspaceType> executearm_conv::depthwise::depthfirst_multiplier::StrategyType333 static void execute( 334 const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, 335 const OutputStage &, const unsigned int, 336 const void *parameters, const void * 337 ) 338 { 339 strat->get_kernel()( 340 ws->input_rows, 341 ws->outptr_array, 342 parameters, args.channel_multiplier, 343 ws->activation_min, ws->activation_max 344 ); 345 } 346 }; 347 348 template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage> 349 struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage> 350 { 351 using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>; 352 353 template <typename WorkspaceType> executearm_conv::depthwise::depthfirst_multiplier::StrategyType354 static void execute( 355 const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, 356 const OutputStage &, const unsigned int start_output_channel, 357 const void *parameters, const void *bias 358 ) 359 { 360 strat->get_kernel()( 361 ws->input_rows, ws->outptr_array, 362 reinterpret_cast<const TWeight *>(parameters), 363 bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel, 364 strat->get_kernel_rows() * strat->get_kernel_cols(), 365 args.channel_multiplier, 366 ws->activation_min, ws->activation_max 367 ); 368 } 369 }; 370 371 template <typename TInput, typename TWeight, typename TOutput> 372 struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32> 373 { 374 using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>; 375 376 template <typename WorkspaceType> executearm_conv::depthwise::depthfirst_multiplier::StrategyType377 static void execute( 378 const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, 379 const arm_gemm::Requantize32 &qp, const unsigned int, 380 const void *parameters, const void * 381 ) 382 { 383 strat->get_kernel()( 384 ws->input_rows, 385 ws->outptr_array, 386 parameters, args.channel_multiplier, 387 qp 388 ); 389 } 390 }; 391 392 template <typename TInput, typename TWeight, typename TOutput> 393 struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32> 394 { 395 using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>; 396 397 template <typename WorkspaceType> executearm_conv::depthwise::depthfirst_multiplier::StrategyType398 static void execute( 399 const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, 400 const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel, 401 const void *parameters, const void * 402 ) 403 { 404 auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t * 405 { 406 return ptr == nullptr ? nullptr : ptr + start_output_channel; 407 }; 408 409 strat->get_kernel()( 410 ws->input_rows, ws->outptr_array, 411 reinterpret_cast<const TWeight *>(parameters), 412 get_ptr(qp.bias), 413 strat->get_kernel_rows() * strat->get_kernel_cols(), 414 args.channel_multiplier, 415 get_ptr(qp.per_channel_left_shifts), 416 get_ptr(qp.per_channel_muls), 417 get_ptr(qp.per_channel_right_shifts), 418 qp 419 ); 420 } 421 }; 422 423 template <bool IsGeneric> struct PrepareInputSample; 424 425 template <> struct PrepareInputSample<false> 426 { 427 template <typename WorkspaceType, typename StrategyType, typename T> executearm_conv::depthwise::depthfirst_multiplier::PrepareInputSample428 static void execute( 429 const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat, 430 T *base_ptr, size_t ld_row, size_t ld_col, 431 const unsigned int input_pad_top, const unsigned int valid_rows, 432 const unsigned int input_pad_left, const unsigned int valid_cols 433 ) 434 { 435 fill_nchw_patch_array( 436 ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(), 437 base_ptr, ld_row, ld_col, 438 ws->input_padding, 439 input_pad_top, valid_rows, 440 input_pad_left, valid_cols 441 ); 442 } 443 }; 444 445 template <> struct PrepareInputSample<true> 446 { 447 template <typename WorkspaceType, typename StrategyType, typename T> executearm_conv::depthwise::depthfirst_multiplier::PrepareInputSample448 static void execute( 449 const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat, 450 T *base_ptr, size_t ld_row, size_t ld_col, 451 const unsigned int input_pad_top, const unsigned int valid_rows, 452 const unsigned int input_pad_left, const unsigned int valid_cols 453 ) 454 { 455 fill_patch_array_generic_kernel( 456 ws->input_rows, ws->input_patch, 457 strat->get_output_rows(), strat->get_output_cols(), 458 args.kernel_rows, args.kernel_cols, 459 args.stride_rows, args.stride_cols, 460 base_ptr, ld_row, ld_col, 461 ws->input_padding, 462 input_pad_top, valid_rows, 463 input_pad_left, valid_cols 464 ); 465 } 466 }; 467 468 } // namespace depthfirst_multiplier 469 470 template <typename TInput, 471 typename TWeight=TInput, 472 typename TOutput=TInput, 473 typename TAccum=typename DefaultTAccum<TInput>::Type, 474 bool is_generic=false, 475 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 476 class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput> 477 { 478 protected: 479 using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type; 480 using WorkspaceManager = Workspace< 481 OutputArrayElement<TOutput>, 482 depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>, 483 ActivationsElement<TOutput, OutputStage> 484 >; 485 using WorkingSpace = typename WorkspaceManager::WorkspaceType; 486 487 OutputStage m_os; // Copy of the output parameters 488 const void *m_bias = nullptr; // Copy of the bias (should we need it) 489 490 public: DepthwiseDepthfirstMultiplier(StratType * const strat,const DepthwiseArgs & args,const OutputStage & os={})491 DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {}) 492 : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os) 493 { 494 } 495 496 DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete; 497 DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete; 498 get_storage_size(void) const499 size_t get_storage_size(void) const override 500 { 501 return reinterpret_cast<const StratType *>(this->m_strat.get()) 502 ->get_storage_size(this->m_args); 503 } 504 pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)505 void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override 506 { 507 reinterpret_cast<const StratType *>(this->m_strat.get()) 508 ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row); 509 m_bias = biases; 510 depthwise_depthfirst::stash_bias(m_os, biases); 511 } 512 get_working_size_per_thread(const unsigned int n_input_channels) const513 size_t get_working_size_per_thread(const unsigned int n_input_channels) const override 514 { 515 DepthwiseArgs args(this->m_args); 516 args.input_channels = n_input_channels; 517 return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os)); 518 } 519 initialise_working_space(void * buffer,unsigned int n_input_channels) const520 void initialise_working_space(void *buffer, unsigned int n_input_channels) const override 521 { 522 DepthwiseArgs args(this->m_args); 523 args.input_channels = n_input_channels; 524 return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os)); 525 } 526 compute_tile_padded(unsigned int output_i,unsigned int output_j,unsigned int output_channel_start,unsigned int output_channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,const void * parameters,void * working_space_raw) const527 void compute_tile_padded( 528 unsigned int output_i, unsigned int output_j, 529 unsigned int output_channel_start, unsigned int output_channel_end, 530 const TensorSpec<const TInput *> &input, 531 const TensorSpec<TOutput *> &output, 532 const void *parameters, 533 void *working_space_raw 534 ) const override 535 { 536 // Get the working space 537 auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw); 538 539 const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top; 540 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0); 541 const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii); 542 543 const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left; 544 const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0); 545 const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij); 546 547 // Compute the output pointer array. We'll update this array after every 548 // invocation of the kernel. 549 fill_pointer_array( 550 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), 551 output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, 552 output.ld_row, output.ld_col, 553 ws->output_buffer, 554 0, this->m_args.output_rows - output_i, // Top padding, # valid rows 555 0, this->m_args.output_cols - output_j // Left padding, # valid columns 556 ); 557 558 // Compute the parameter stride 559 DepthwiseArgs single_iter(this->m_args); 560 single_iter.input_channels = 1; 561 const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get()) 562 ->get_storage_size(single_iter); 563 564 for (; output_channel_start < output_channel_end; 565 output_channel_start += this->m_args.channel_multiplier) 566 { 567 // Compute the input pointer array 568 const auto input_channel = output_channel_start / this->m_args.channel_multiplier; 569 570 // Construct the input patch 571 depthfirst_multiplier::PrepareInputSample<is_generic>::execute( 572 this->m_args, ws, this->m_strat.get(), 573 input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col, 574 input_pad_top, this->m_args.input_rows - input_i, 575 input_pad_left, this->m_args.input_cols - input_j 576 ); 577 578 // Execute the kernel 579 depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute( 580 this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start, 581 parameters, m_bias 582 ); 583 584 // Update the output pointers 585 for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++) 586 { 587 ws->outptr_array[n] += this->m_args.channel_multiplier; 588 } 589 590 // Progress the parameters 591 parameters = reinterpret_cast<const char *>(parameters) + parameter_stride; 592 } 593 } 594 }; 595 596 } // namespace depthwise 597 } // namespace arm_conv 598