1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "depthwise_depthfirst.hpp"
28 
29 namespace arm_conv {
30 namespace depthwise {
31 
/* Maps the (input, output, accumulator) element types onto the callable type
 * used to hold a generic depth-first kernel.
 *
 * This is the primary template. Its kernel takes an array of input pointers,
 * an array of output pointers, two opaque pointers (packed parameters and
 * bias), two counts, and a pair of accumulator-typed values used as the
 * activation bounds.
 */
template <typename TInput, typename TOutput, typename TAccum>
struct GenericDepthfirstKernelStrategyFunctionType
{
  // Top-level `const` on by-value parameters is not part of a function type,
  // so it is omitted here; the resulting std::function type is unaffected.
  using KernelType = std::function<void(
    const TInput *const *inptrs,
    TOutput *const *outptrs,
    const void *params,
    const void *bias,
    unsigned int n_kernel_points,
    unsigned int n_output_channels,
    TAccum activation_min,
    TAccum activation_max
  )>;
};
37 
38 template <typename TInput, typename TOutput>
39 struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t>
40 {
41   using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const arm_gemm::Requantize32 &, unsigned int, unsigned int)>;
42 };
43 
44 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
45 class GenericDepthfirstKernelStrategy
46 {
47   unsigned int m_n_output_points;
48   arm_gemm::VLType m_vl_type;
49   unsigned int m_accumulator_depth_vl;
50 
51   public:
GenericDepthfirstKernelStrategy(unsigned int n_output_points,arm_gemm::VLType vl_type,unsigned int accumulator_depth_vl=1)52   GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1)
53   : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
54   {
55   }
56 
57   virtual ~GenericDepthfirstKernelStrategy() = default;
58 
get_vl_type() const59   virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; }
get_accumulator_depth_vl() const60   virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; }
get_n_output_points() const61   virtual unsigned int get_n_output_points() const { return m_n_output_points; }
62 
63   using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType;
64   virtual KernelType get_kernel(void) const = 0;
65 };
66 
67 template <typename TInput,
68           typename TWeight=TInput,
69           typename TOutput=TInput,
70           typename TAccum=typename DefaultTAccum<TInput>::Type,
71           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
72 class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
73 {
74   protected:
75   using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>;
76   std::unique_ptr<KernelStrategyType> m_strategy;
77 
78   public:
GenericDepthfirstStrategy(KernelStrategyType * strat,unsigned int n_output_rows,unsigned int n_output_cols,const DepthwiseArgs & args)79   GenericDepthfirstStrategy(
80     KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols,
81     const DepthwiseArgs &args
82   )
83   : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
84       n_output_rows, n_output_cols,
85       args.kernel_rows, args.kernel_cols,
86       args.stride_rows, args.stride_cols
87     ),
88     m_strategy(strat)
89   {
90   }
91 
92   GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete;
93   GenericDepthfirstStrategy operator=(GenericDepthfirstStrategy &) = delete;
94 
get_vl_type(void) const95   arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); }
get_accumulator_depth_vl(void) const96   unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); }
97 
get_storage_size(const DepthwiseArgs & args) const98   size_t get_storage_size(const DepthwiseArgs &args) const override
99   {
100     interleaves::PackingArguments packing_args(
101       this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
102       false, sizeof(TAccum),  // Don't pack the bias
103       this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
104       [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
105       { return this->get_kernel_packing_point(idx, x, y); }
106     );
107     return interleaves::get_storage_size_generic(packing_args, args);
108   }
109 
pack_parameters(const DepthwiseArgs & args,void * buffer,const void * biases,const OutputStage &,const void * weights,size_t ld_weight_col,size_t ld_weight_row) const110   void pack_parameters(
111     const DepthwiseArgs &args, void *buffer,
112     const void *biases, const OutputStage &,
113     const void *weights, size_t ld_weight_col, size_t ld_weight_row
114   ) const override
115   {
116     interleaves::PackingArguments packing_args(
117       this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
118       false, sizeof(TAccum),  // Don't pack the bias
119       this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
120       [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
121       { return this->get_kernel_packing_point(idx, x, y); }
122     );
123     interleaves::pack_parameters_generic(
124       packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
125   }
126 
get_kernel() const127   const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); }
128 };
129 
// Marshals the arguments for a generic kernel invocation according to the
// output stage. Only declared here: each supported OutputStage provides a
// specialisation exposing a static `execute` function.
template <typename OutputStage>
struct DepthwiseDepthfirstGenericKernelCall;
132 
133 template <>
134 struct DepthwiseDepthfirstGenericKernelCall<Nothing>
135 {
136   template <typename StratType, typename WorkspaceType, typename TAccum>
executearm_conv::depthwise::DepthwiseDepthfirstGenericKernelCall137   static void execute(
138     const StratType *strat, const WorkspaceType *ws, const Nothing &,
139     const TAccum *bias, const void *params,
140     const unsigned int n_kernel_points, const unsigned int n_output_channels
141   )
142   {
143     strat->get_kernel()(
144       ws->inptr_array,
145       ws->outptr_array,
146       params, bias,
147       n_kernel_points, n_output_channels,
148       ws->activation_min, ws->activation_max
149     );
150   }
151 };
152 
153 template <>
154 struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32>
155 {
156   template <typename StratType, typename WorkspaceType>
executearm_conv::depthwise::DepthwiseDepthfirstGenericKernelCall157   static void execute(
158     const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp,
159     const int32_t *, const void *params,
160     const unsigned int n_kernel_points, const unsigned int n_output_channels
161   )
162   {
163     strat->get_kernel()(
164       ws->inptr_array,
165       ws->outptr_array,
166       params, qp,
167       n_kernel_points, n_output_channels
168     );
169   }
170 };
171 
172 
173 /* Workspace Element for an array of input pointers as consumed by the
174  * "Generic" depthwise kernels.
175  */
176 template <typename T>
177 class GenericInputArrayElement
178 {
179   public:
180   struct Workspace
181   {
182     const T **inptr_array;
183   };
184 
185   template <class OutputStage>
get_element_size(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)186   static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
187   {
188     const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
189     return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols() * kernel_points;
190   }
191 
192   template <class WorkspaceType, class OutputStage>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)193   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
194   {
195     ws->inptr_array = reinterpret_cast<const T**>(buffer);
196     return reinterpret_cast<char *>(buffer) + get_element_size(args);
197   }
198 };
199 
200 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
201           typename TAccum=typename DefaultTAccum<TInput>::Type,
202           typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
203 class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
204 {
205   using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
206   using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
207   using WorkspaceManager = Workspace<
208     OutputArrayElement<TOutput>,
209     GenericInputArrayElement<TInput>,
210     InputBufferElement<TInput>,
211     ActivationsElement<TAccum, OutputStage>
212   >;
213   using WorkingSpace = typename WorkspaceManager::WorkspaceType;
214   const TAccum *m_bias = nullptr;
215 
216   public:
DepthwiseDepthfirstGeneric(StratType * const strat,const DepthwiseArgs & args,const OutputStage & os={})217   DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
218   : Parent(strat, args, os)
219   {
220   }
221 
222   DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
223   DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
224 
pack_parameters(void * buffer,const void * biases,const void * weights,size_t ld_weight_col,size_t ld_weight_row)225   void pack_parameters(
226     void *buffer, const void *biases,
227     const void *weights, size_t ld_weight_col, size_t ld_weight_row
228   ) override
229   {
230     Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
231     m_bias = reinterpret_cast<const TAccum *>(biases);  // Get a copy of the biases
232     depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
233   }
234 
get_working_size_per_thread(const unsigned int n_input_channels) const235   size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
236   {
237     DepthwiseArgs args(this->m_args);
238     args.input_channels = n_input_channels;
239     return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
240   }
241 
initialise_working_space(void * buffer,unsigned int n_input_channels) const242   void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
243   {
244     DepthwiseArgs args(this->m_args);
245     args.input_channels = n_input_channels;
246     return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
247   }
248 
249   protected:
compute_tile_padded(unsigned int output_i,unsigned int output_j,unsigned int channel_start,unsigned int channel_end,const TensorSpec<const TInput * > & input,const TensorSpec<TOutput * > & output,const void * parameters,void * working_space_raw) const250   void compute_tile_padded(
251     unsigned int output_i, unsigned int output_j,
252     unsigned int channel_start, unsigned int channel_end,
253     const TensorSpec<const TInput *> &input,
254     const TensorSpec<TOutput *> &output,
255     const void *parameters,
256     void *working_space_raw
257   ) const override
258   {
259     // Get the working space
260     WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
261 
262     const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
263     const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
264     const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
265 
266     const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
267     const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
268     const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
269 
270     fill_pointer_array_generic_kernel<const TInput>(
271       ws->inptr_array,
272       this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
273       this->m_args.kernel_rows, this->m_args.kernel_cols,
274       this->m_args.stride_rows, this->m_args.stride_cols,
275       input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
276       input.ld_row, input.ld_col,
277       ws->input_buffer,
278       input_pad_top, this->m_args.input_rows - input_i,
279       input_pad_left, this->m_args.input_cols - input_j
280     );
281 
282     // Compute the output pointer array
283     fill_pointer_array<TOutput>(
284       ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
285       output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
286       output.ld_row, output.ld_col,
287       ws->output_buffer,
288       0, this->m_args.output_rows - output_i, // Top padding, # valid rows
289       0, this->m_args.output_cols - output_j  // Left padding, # valid columns
290     );
291 
292     // Execute the kernel
293     DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
294       reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
295       this->get_output_stage(), m_bias, parameters,
296       this->m_args.kernel_rows * this->m_args.kernel_cols,
297       channel_end - channel_start
298     );
299   }
300 };
301 
302 }  // namespace depthwise
303 }  // namespace arm_conv
304