/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
23 */ 24 25 #pragma once 26 27 #include "depthwise_depthfirst.hpp" 28 29 namespace arm_conv { 30 namespace depthwise { 31 32 template <typename TInput, typename TOutput, typename TAccum> 33 struct GenericDepthfirstKernelStrategyFunctionType 34 { 35 using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const void *, const unsigned int, const unsigned int, const TAccum, const TAccum)>; 36 }; 37 38 template <typename TInput, typename TOutput> 39 struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t> 40 { 41 using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const arm_gemm::Requantize32 &, unsigned int, unsigned int)>; 42 }; 43 44 template <typename TInput, typename TWeight, typename TOutput, typename TAccum> 45 class GenericDepthfirstKernelStrategy 46 { 47 unsigned int m_n_output_points; 48 arm_gemm::VLType m_vl_type; 49 unsigned int m_accumulator_depth_vl; 50 51 public: GenericDepthfirstKernelStrategy(unsigned int n_output_points,arm_gemm::VLType vl_type,unsigned int accumulator_depth_vl=1)52 GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1) 53 : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl) 54 { 55 } 56 57 virtual ~GenericDepthfirstKernelStrategy() = default; 58 get_vl_type() const59 virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; } get_accumulator_depth_vl() const60 virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; } get_n_output_points() const61 virtual unsigned int get_n_output_points() const { return m_n_output_points; } 62 63 using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType; 64 virtual KernelType get_kernel(void) const = 0; 65 }; 66 67 template <typename TInput, 68 typename TWeight=TInput, 69 typename 
TOutput=TInput, 70 typename TAccum=typename DefaultTAccum<TInput>::Type, 71 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 72 class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage> 73 { 74 protected: 75 using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>; 76 std::unique_ptr<KernelStrategyType> m_strategy; 77 78 public: GenericDepthfirstStrategy(KernelStrategyType * strat,unsigned int n_output_rows,unsigned int n_output_cols,const DepthwiseArgs & args)79 GenericDepthfirstStrategy( 80 KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols, 81 const DepthwiseArgs &args 82 ) 83 : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>( 84 n_output_rows, n_output_cols, 85 args.kernel_rows, args.kernel_cols, 86 args.stride_rows, args.stride_cols 87 ), 88 m_strategy(strat) 89 { 90 } 91 92 GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete; 93 GenericDepthfirstStrategy operator=(GenericDepthfirstStrategy &) = delete; 94 get_vl_type(void) const95 arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); } get_accumulator_depth_vl(void) const96 unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); } 97 get_storage_size(const DepthwiseArgs & args) const98 size_t get_storage_size(const DepthwiseArgs &args) const override 99 { 100 interleaves::PackingArguments packing_args( 101 this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), 102 false, sizeof(TAccum), // Don't pack the bias 103 this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), 104 [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool 105 { return this->get_kernel_packing_point(idx, x, y); } 106 ); 107 return interleaves::get_storage_size_generic(packing_args, args); 108 } 109 pack_parameters(const 
DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const110 void pack_parameters( 111 const DepthwiseArgs &args, void *buffer, 112 const void *biases, const OutputStage &, 113 const void *weights, size_t ld_weight_col, size_t ld_weight_row 114 ) const override 115 { 116 interleaves::PackingArguments packing_args( 117 this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), 118 false, sizeof(TAccum), // Don't pack the bias 119 this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), 120 [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool 121 { return this->get_kernel_packing_point(idx, x, y); } 122 ); 123 interleaves::pack_parameters_generic( 124 packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row); 125 } 126 get_kernel() const127 const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); } 128 }; 129 130 // Use a templated function to marshal arguments when executing the kernel. 
131 template <typename OutputStage> struct DepthwiseDepthfirstGenericKernelCall; 132 133 template <> 134 struct DepthwiseDepthfirstGenericKernelCall<Nothing> 135 { 136 template <typename StratType, typename WorkspaceType, typename TAccum> executearm_conv::depthwise::DepthwiseDepthfirstGenericKernelCall137 static void execute( 138 const StratType *strat, const WorkspaceType *ws, const Nothing &, 139 const TAccum *bias, const void *params, 140 const unsigned int n_kernel_points, const unsigned int n_output_channels 141 ) 142 { 143 strat->get_kernel()( 144 ws->inptr_array, 145 ws->outptr_array, 146 params, bias, 147 n_kernel_points, n_output_channels, 148 ws->activation_min, ws->activation_max 149 ); 150 } 151 }; 152 153 template <> 154 struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32> 155 { 156 template <typename StratType, typename WorkspaceType> executearm_conv::depthwise::DepthwiseDepthfirstGenericKernelCall157 static void execute( 158 const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp, 159 const int32_t *, const void *params, 160 const unsigned int n_kernel_points, const unsigned int n_output_channels 161 ) 162 { 163 strat->get_kernel()( 164 ws->inptr_array, 165 ws->outptr_array, 166 params, qp, 167 n_kernel_points, n_output_channels 168 ); 169 } 170 }; 171 172 173 /* Workspace Element for an array of input pointers as consumed by the 174 * "Generic" depthwise kernels. 
175 */ 176 template <typename T> 177 class GenericInputArrayElement 178 { 179 public: 180 struct Workspace 181 { 182 const T **inptr_array; 183 }; 184 185 template <class OutputStage> get_element_size(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)186 static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 187 { 188 const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; 189 return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols() * kernel_points; 190 } 191 192 template <class WorkspaceType, class OutputStage> initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)193 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) 194 { 195 ws->inptr_array = reinterpret_cast<const T**>(buffer); 196 return reinterpret_cast<char *>(buffer) + get_element_size(args); 197 } 198 }; 199 200 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput, 201 typename TAccum=typename DefaultTAccum<TInput>::Type, 202 typename OutputStage=typename DefaultOutputStage<TOutput>::Type> 203 class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage> 204 { 205 using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>; 206 using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>; 207 using WorkspaceManager = Workspace< 208 OutputArrayElement<TOutput>, 209 GenericInputArrayElement<TInput>, 210 InputBufferElement<TInput>, 211 ActivationsElement<TAccum, OutputStage> 212 >; 213 using WorkingSpace = typename WorkspaceManager::WorkspaceType; 214 const TAccum *m_bias = nullptr; 215 216 public: DepthwiseDepthfirstGeneric(StratType * const strat,const DepthwiseArgs & args,const OutputStage & os={})217 DepthwiseDepthfirstGeneric(StratType *const 
strat, const DepthwiseArgs &args, const OutputStage &os={}) 218 : Parent(strat, args, os) 219 { 220 } 221 222 DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete; 223 DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete; 224 pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)225 void pack_parameters( 226 void *buffer, const void *biases, 227 const void *weights, size_t ld_weight_col, size_t ld_weight_row 228 ) override 229 { 230 Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row); 231 m_bias = reinterpret_cast<const TAccum *>(biases); // Get a copy of the biases 232 depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias); 233 } 234 get_working_size_per_thread(const unsigned int n_input_channels) const235 size_t get_working_size_per_thread(const unsigned int n_input_channels) const override 236 { 237 DepthwiseArgs args(this->m_args); 238 args.input_channels = n_input_channels; 239 return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())); 240 } 241 initialise_working_space(void * buffer,unsigned int n_input_channels) const242 void initialise_working_space(void *buffer, unsigned int n_input_channels) const override 243 { 244 DepthwiseArgs args(this->m_args); 245 args.input_channels = n_input_channels; 246 return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())); 247 } 248 249 protected: compute_tile_padded(unsigned int output_i,unsigned int output_j,unsigned int channel_start,unsigned int channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,const void * parameters,void * working_space_raw) const250 void compute_tile_padded( 251 unsigned int output_i, unsigned int output_j, 252 unsigned int channel_start, unsigned int 
channel_end, 253 const TensorSpec<const TInput *> &input, 254 const TensorSpec<TOutput *> &output, 255 const void *parameters, 256 void *working_space_raw 257 ) const override 258 { 259 // Get the working space 260 WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw); 261 262 const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top; 263 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0); 264 const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii); 265 266 const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left; 267 const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0); 268 const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij); 269 270 fill_pointer_array_generic_kernel<const TInput>( 271 ws->inptr_array, 272 this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), 273 this->m_args.kernel_rows, this->m_args.kernel_cols, 274 this->m_args.stride_rows, this->m_args.stride_cols, 275 input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, 276 input.ld_row, input.ld_col, 277 ws->input_buffer, 278 input_pad_top, this->m_args.input_rows - input_i, 279 input_pad_left, this->m_args.input_cols - input_j 280 ); 281 282 // Compute the output pointer array 283 fill_pointer_array<TOutput>( 284 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), 285 output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start, 286 output.ld_row, output.ld_col, 287 ws->output_buffer, 288 0, this->m_args.output_rows - output_i, // Top padding, # valid rows 289 0, this->m_args.output_cols - output_j // Left padding, # valid columns 290 ); 291 292 // Execute the kernel 293 DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute( 294 reinterpret_cast<const StratType *>(this->m_strat.get()), ws, 295 this->get_output_stage(), m_bias, parameters, 296 this->m_args.kernel_rows 
* this->m_args.kernel_cols, 297 channel_end - channel_start 298 ); 299 } 300 }; 301 302 } // namespace depthwise 303 } // namespace arm_conv 304