1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "depthwise_depthfirst.hpp"
28 #include "interleaves/generic_quantized_dot_product.hpp"
29 
30 #ifdef CYCLE_PROFILING
31 #include "profiler.hpp"
32 #endif
33 
34 #include <limits>
35 
36 namespace arm_conv {
37 namespace depthwise {
38 
39 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
40 class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
41 {
42   using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;
43 
44   protected:
get_packing_args(const DepthwiseArgs & args) const45   virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
46   {
47     return interleaves::PackingArguments(
48       args.kernel_rows, args.kernel_cols, sizeof(TWeight),
49       true, sizeof(TAccum),
50       this->get_vl_type(),
51       sizeof(TAccum), 1,
52       [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
53       {
54         if (pos < args.kernel_rows * args.kernel_cols)
55         {
56           y = pos % args.kernel_cols;
57           x = pos / args.kernel_cols;
58           return true;
59         }
60         return false;
61       }
62     );
63   }
64 
65   public:
66   using Parent::Parent;
67 
get_storage_size(const DepthwiseArgs & args) const68   size_t get_storage_size(const DepthwiseArgs &args) const override
69   {
70     return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
71   }
72 
pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const Nothing &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const73   void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
74   {
75     interleaves::pack_parameters_generic(
76       this->get_packing_args(args), args,
77       buffer, biases, weights, ld_weight_col, ld_weight_row
78     );
79   }
80 
81   using KernelType = std::function<void(
82     const TInput *const *,  // Input pointers
83     TOutput *const *,  // Output pointers
84     const void *,  // Ravelled bias, weights, and quantization parameters
85     unsigned int,  // # output channels
86     TAccum, TAccum  // Min and max activation clamps
87   )>;
88   virtual KernelType get_kernel(void) const = 0;
89 };
90 
91 
92 template <typename TInput, typename TWeight, typename TOutput>
93 class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
94 {
95   using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
96 
97   public:
98   using Parent::Parent;
99 
get_storage_size(const DepthwiseArgs & args) const100   size_t get_storage_size(const DepthwiseArgs &args) const override
101   {
102     return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
103   }
104 
pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const arm_gemm::Requantize32 & qp,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const105   void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
106   {
107     interleaves::quantized::pack_parameters<TWeight>(
108       buffer, reinterpret_cast<const int32_t *>(biases),
109       reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
110       args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
111     );
112   }
113 
114   using KernelType = std::function<void(
115     const TInput *const *,  // Input pointers
116     TOutput *const *,  // Output pointers
117     const void *,  // Ravelled bias, weights, and quantization parameters
118     unsigned int,  // # output channels
119     const arm_gemm::Requantize32 &
120   )>;
121   virtual KernelType get_kernel(void) const = 0;
122 };
123 
124 
125 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
126 class GenericDepthfirstMultiplierKernelStrategy
127 {
128   const arm_gemm::VLType m_vl_type;
129   const unsigned int m_output_rows, m_output_cols;
130 
131   public:
GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows,unsigned int output_cols,arm_gemm::VLType vl_type)132   GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
133   : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
134   {
135   }
136 
137   virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
138 
get_vl_type(void) const139   arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
get_output_rows(void) const140   unsigned int get_output_rows(void) const { return m_output_rows; }
get_output_cols(void) const141   unsigned int get_output_cols(void) const { return m_output_cols; }
142 
143   using KernelType = std::function<void(
144     const TInput *const *,  // Input pointers
145     TOutput *const *,  // Output pointers
146     const TWeight *,  // Ravelled weight parameters
147     const TAccum *,  // Bias,
148     unsigned int, unsigned int,  // Number of kernel points, number of output channels
149     TAccum, TAccum  // Activation minimum and maximum
150   )>;
151   virtual KernelType get_kernel(void) const = 0;
152 };
153 
154 template <typename TInput, typename TWeight, typename TOutput>
155 class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
156 {
157   const arm_gemm::VLType m_vl_type;
158   const unsigned int m_output_rows, m_output_cols;
159 
160   public:
GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows,unsigned int output_cols,arm_gemm::VLType vl_type)161   GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
162   : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
163   {
164   }
165 
166   virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
167 
get_vl_type(void) const168   arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
get_output_rows(void) const169   unsigned int get_output_rows(void) const { return m_output_rows; }
get_output_cols(void) const170   unsigned int get_output_cols(void) const { return m_output_cols; }
171 
172   using KernelType = std::function<void(
173     const TInput *const *,  // Input pointers
174     TOutput *const *,  // Output pointers
175     const TWeight *,  // Ravelled weight parameters
176     const int32_t *,  // Bias,
177     unsigned int, unsigned int,  // Number of kernel points, number of output channels
178     const int32_t *, const int32_t *, const int32_t *,  // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
179     const arm_gemm::Requantize32 &
180   )>;
181   virtual KernelType get_kernel(void) const = 0;
182 };
183 
184 template <typename TInput,
185           typename TWeight=TInput,
186           typename TOutput=TInput,
187           typename TAccum=typename DefaultTAccum<TInput>::Type,
188           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
189 class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
190 {
191   using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
192   std::unique_ptr<KernelStrategyType> m_kern;
193 
194   protected:
get_packing_args(const DepthwiseArgs & args) const195   virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
196   {
197     return interleaves::PackingArguments(
198       args.kernel_rows, args.kernel_cols, sizeof(TWeight),
199       false, sizeof(TAccum),
200       this->get_vl_type(),
201       sizeof(TAccum), 1,
202       [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
203       {
204         if (pos < args.kernel_rows * args.kernel_cols)
205         {
206           y = pos % args.kernel_cols;
207           x = pos / args.kernel_cols;
208           return true;
209         }
210         return false;
211       }
212     );
213   }
214 
215   public:
GenericDepthfirstMultiplierStrategy(KernelStrategyType * kern,const DepthwiseArgs & args)216   GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
217   : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
218       kern->get_output_rows(), kern->get_output_cols(),
219       args.kernel_rows, args.kernel_cols,
220       args.stride_rows, args.stride_cols
221     ),
222     m_kern(kern)
223   {
224   };
225 
get_vl_type(void) const226   arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
get_kernel(void) const227   const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
228 
get_storage_size(const DepthwiseArgs & args) const229   size_t get_storage_size(const DepthwiseArgs &args) const override
230   {
231     return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
232   }
233 
pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const234   void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
235   {
236     interleaves::pack_parameters_generic(
237       this->get_packing_args(args), args,
238       buffer, biases, weights, ld_weight_col, ld_weight_row
239     );
240   }
241 };
242 
243 // Specialise elements of the wrapper based on the type of kernel.
244 namespace depthfirst_multiplier {
245 
246 /* Working space element which contains a pointer for each row of input, a row
247  * of padding, and a space which can be used to construct an NCHW-ordered patch
248  * of input.
249  */
250 template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
251 class InputPatchElement
252 {
253   public:
254   struct Workspace
255   {
256     constexpr static bool InputPatchIsGeneric = IsGeneric;
257     const T **input_rows;
258     T *input_padding;
259     T *input_patch;
260   };
261 
get_element_size(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)262   static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
263   {
264     return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
265   }
266 
267   template <class WorkspaceType>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)268   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
269   {
270     auto buffer_bytes = reinterpret_cast<char *>(buffer);
271 
272     ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
273     buffer_bytes += sizeof_input_rows(args);
274 
275     ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
276     buffer_bytes += sizeof_input_padding(args);
277 
278     ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
279     buffer_bytes += sizeof_input_patch(args);
280 
281     // Initialise the padding
282     memset(ws->input_padding,
283            get_input_buffer_fill_value(args.output_stage),
284            sizeof_input_padding(args));
285 
286     return buffer_bytes;
287   }
288 
289   protected:
sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)290   static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
291   {
292     if (IsGeneric)
293     {
294       return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
295     }
296     else
297     {
298       return sizeof(T *) * args.strategy->get_input_rows();
299     }
300   }
301 
sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)302   static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
303   {
304     // Round-up the number of columns to be a whole number of QUADS
305     auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
306     return sizeof(T) * input_cols;
307   }
308 
sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)309   static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
310   {
311     if (IsGeneric)
312     {
313       // Round-up the number of columns to be a whole number of QUADS
314       auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
315       const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
316       return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
317     }
318     else
319     {
320       // Round-up the number of columns to be a whole number of QUADS
321       auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
322       return sizeof(T) * args.strategy->get_input_rows() * input_cols;
323     }
324   }
325 };
326 
327 template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
328 struct StrategyType
329 {
330   using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;
331 
332   template <typename WorkspaceType>
executearm_conv::depthwise::depthfirst_multiplier::StrategyType333   static void execute(
334     const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
335     const OutputStage &, const unsigned int,
336     const void *parameters, const void *
337   )
338   {
339     strat->get_kernel()(
340       ws->input_rows,
341       ws->outptr_array,
342       parameters, args.channel_multiplier,
343       ws->activation_min, ws->activation_max
344     );
345   }
346 };
347 
348 template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
349 struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
350 {
351   using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
352 
353   template <typename WorkspaceType>
executearm_conv::depthwise::depthfirst_multiplier::StrategyType354   static void execute(
355     const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
356     const OutputStage &, const unsigned int start_output_channel,
357     const void *parameters, const void *bias
358   )
359   {
360     strat->get_kernel()(
361       ws->input_rows, ws->outptr_array,
362       reinterpret_cast<const TWeight *>(parameters),
363       bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
364       strat->get_kernel_rows() * strat->get_kernel_cols(),
365       args.channel_multiplier,
366       ws->activation_min, ws->activation_max
367     );
368   }
369 };
370 
371 template <typename TInput, typename TWeight, typename TOutput>
372 struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
373 {
374   using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;
375 
376   template <typename WorkspaceType>
executearm_conv::depthwise::depthfirst_multiplier::StrategyType377   static void execute(
378     const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
379     const arm_gemm::Requantize32 &qp, const unsigned int,
380     const void *parameters, const void *
381   )
382   {
383     strat->get_kernel()(
384       ws->input_rows,
385       ws->outptr_array,
386       parameters, args.channel_multiplier,
387       qp
388     );
389   }
390 };
391 
392 template <typename TInput, typename TWeight, typename TOutput>
393 struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
394 {
395   using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
396 
397   template <typename WorkspaceType>
executearm_conv::depthwise::depthfirst_multiplier::StrategyType398   static void execute(
399     const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
400     const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
401     const void *parameters, const void *
402   )
403   {
404     auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
405     {
406       return ptr == nullptr ? nullptr : ptr + start_output_channel;
407     };
408 
409     strat->get_kernel()(
410       ws->input_rows, ws->outptr_array,
411       reinterpret_cast<const TWeight *>(parameters),
412       get_ptr(qp.bias),
413       strat->get_kernel_rows() * strat->get_kernel_cols(),
414       args.channel_multiplier,
415       get_ptr(qp.per_channel_left_shifts),
416       get_ptr(qp.per_channel_muls),
417       get_ptr(qp.per_channel_right_shifts),
418       qp
419     );
420   }
421 };
422 
423 template <bool IsGeneric> struct PrepareInputSample;
424 
425 template <> struct PrepareInputSample<false>
426 {
427   template <typename WorkspaceType, typename StrategyType, typename T>
executearm_conv::depthwise::depthfirst_multiplier::PrepareInputSample428   static void execute(
429     const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
430     T *base_ptr, size_t ld_row, size_t ld_col,
431     const unsigned int input_pad_top, const unsigned int valid_rows,
432     const unsigned int input_pad_left, const unsigned int valid_cols
433   )
434   {
435     fill_nchw_patch_array(
436       ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
437       base_ptr, ld_row, ld_col,
438       ws->input_padding,
439       input_pad_top, valid_rows,
440       input_pad_left, valid_cols
441     );
442   }
443 };
444 
445 template <> struct PrepareInputSample<true>
446 {
447   template <typename WorkspaceType, typename StrategyType, typename T>
executearm_conv::depthwise::depthfirst_multiplier::PrepareInputSample448   static void execute(
449     const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
450     T *base_ptr, size_t ld_row, size_t ld_col,
451     const unsigned int input_pad_top, const unsigned int valid_rows,
452     const unsigned int input_pad_left, const unsigned int valid_cols
453   )
454   {
455     fill_patch_array_generic_kernel(
456       ws->input_rows, ws->input_patch,
457       strat->get_output_rows(), strat->get_output_cols(),
458       args.kernel_rows, args.kernel_cols,
459       args.stride_rows, args.stride_cols,
460       base_ptr, ld_row, ld_col,
461       ws->input_padding,
462       input_pad_top, valid_rows,
463       input_pad_left, valid_cols
464     );
465   }
466 };
467 
468 }  // namespace depthfirst_multiplier
469 
470 template <typename TInput,
471           typename TWeight=TInput,
472           typename TOutput=TInput,
473           typename TAccum=typename DefaultTAccum<TInput>::Type,
474           bool is_generic=false,
475           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
476 class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
477 {
478   protected:
479   using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
480   using WorkspaceManager = Workspace<
481     OutputArrayElement<TOutput>,
482     depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
483     ActivationsElement<TOutput, OutputStage>
484   >;
485   using WorkingSpace = typename WorkspaceManager::WorkspaceType;
486 
487   OutputStage m_os;  // Copy of the output parameters
488   const void *m_bias = nullptr;  // Copy of the bias (should we need it)
489 
490   public:
DepthwiseDepthfirstMultiplier(StratType * const strat,const DepthwiseArgs & args,const OutputStage & os={})491   DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
492   : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
493   {
494   }
495 
496   DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
497   DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;
498 
get_storage_size(void) const499   size_t get_storage_size(void) const override
500   {
501     return reinterpret_cast<const StratType *>(this->m_strat.get())
502       ->get_storage_size(this->m_args);
503   }
504 
pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)505   void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
506   {
507     reinterpret_cast<const StratType *>(this->m_strat.get())
508       ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
509     m_bias = biases;
510     depthwise_depthfirst::stash_bias(m_os, biases);
511   }
512 
get_working_size_per_thread(const unsigned int n_input_channels) const513   size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
514   {
515     DepthwiseArgs args(this->m_args);
516     args.input_channels = n_input_channels;
517     return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
518   }
519 
initialise_working_space(void * buffer,unsigned int n_input_channels) const520   void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
521   {
522     DepthwiseArgs args(this->m_args);
523     args.input_channels = n_input_channels;
524     return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
525   }
526 
compute_tile_padded(unsigned int output_i,unsigned int output_j,unsigned int output_channel_start,unsigned int output_channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,const void * parameters,void * working_space_raw) const527   void compute_tile_padded(
528     unsigned int output_i, unsigned int output_j,
529     unsigned int output_channel_start, unsigned int output_channel_end,
530     const TensorSpec<const TInput *> &input,
531     const TensorSpec<TOutput *> &output,
532     const void *parameters,
533     void *working_space_raw
534   ) const override
535   {
536     // Get the working space
537     auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
538 
539     const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
540     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
541     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
542 
543     const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
544     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
545     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
546 
547     // Compute the output pointer array. We'll update this array after every
548     // invocation of the kernel.
549     fill_pointer_array(
550       ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
551       output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
552       output.ld_row, output.ld_col,
553       ws->output_buffer,
554       0, this->m_args.output_rows - output_i, // Top padding, # valid rows
555       0, this->m_args.output_cols - output_j  // Left padding, # valid columns
556     );
557 
558     // Compute the parameter stride
559     DepthwiseArgs single_iter(this->m_args);
560     single_iter.input_channels = 1;
561     const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
562       ->get_storage_size(single_iter);
563 
564     for (; output_channel_start < output_channel_end;
565          output_channel_start += this->m_args.channel_multiplier)
566     {
567       // Compute the input pointer array
568       const auto input_channel = output_channel_start / this->m_args.channel_multiplier;
569 
570       // Construct the input patch
571       depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
572         this->m_args, ws, this->m_strat.get(),
573         input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
574         input_pad_top, this->m_args.input_rows - input_i,
575         input_pad_left, this->m_args.input_cols - input_j
576       );
577 
578       // Execute the kernel
579       depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
580         this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
581         parameters, m_bias
582       );
583 
584       // Update the output pointers
585       for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
586       {
587         ws->outptr_array[n] += this->m_args.channel_multiplier;
588       }
589 
590       // Progress the parameters
591       parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
592     }
593   }
594 };
595 
596 }  // namespace depthwise
597 }  // namespace arm_conv
598