/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "arm_compute/core/Error.h"
#include "depthfirst_driver.hpp"
#include "utils.hpp"

#include <algorithm>
#include <cstdint>
#include <memory>

#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */

namespace arm_conv {
namespace pooling {

template <typename TInput, typename TOutput, typename OutputStage = Nothing>
class IGenericDepthfirstStrategy;

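// The two specialisations below expose the pooling kernel as a raw function
// pointer (KernelType) which the driver calls directly; they differ only in
// that the Requantize32 variant also passes the quantisation parameters
// through to the kernel.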
template <typename TInput, typename TOutput>
class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
{
  public:
  virtual ~IGenericDepthfirstStrategy() = default;

  typedef void (*KernelType)(
    uint64_t window_cells,
    uint64_t n_valid_cells,
    uint64_t n_channels,
    const TInput *const *,
    TOutput *
  );

  virtual KernelType get_kernel(void) const = 0;
};

template <typename TInput, typename TOutput>
class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
{
  public:
  virtual ~IGenericDepthfirstStrategy() = default;

  typedef void (*KernelType)(
    uint64_t window_cells,
    uint64_t n_valid_cells,
    uint64_t n_channels,
    const TInput *const *,
    TOutput *,
    const Requantize32 &
  );

  virtual KernelType get_kernel(void) const = 0;
};

template <typename TInput, typename TOutput, typename OutputStage>
struct Invoker;

template <typename TInput, typename TOutput>
struct Invoker<TInput, TOutput, Nothing>
{
  static inline void invoke(
    const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
    uint64_t window_cells,
    uint64_t n_valid_cells,
    uint64_t n_channels,
    const TInput *const *inptrs,
    TOutput *outptr,
    const Nothing &
  )
  {
    kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
  }
};

template <typename TInput, typename TOutput>
struct Invoker<TInput, TOutput, Requantize32>
{
  static inline void invoke(
    const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
    uint64_t window_cells,
    uint64_t n_valid_cells,
    uint64_t n_channels,
    const TInput *const *inptrs,
    TOutput *outptr,
    const Requantize32 &qp
  )
  {
    kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
  }
};
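
// The Invoker trait gives the driver a single call site for both kernel
// signatures: the Nothing specialisation simply drops the output-stage
// argument, while the Requantize32 specialisation forwards the quantisation
// parameters to the kernel.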

template <typename TInput, typename TOutput, typename OutputStage>
class GenericDepthfirstWrapper : public IDepthfirstStrategy
{
  using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;

  std::unique_ptr<const StratType> m_strat;
  const unsigned int window_rows, window_cols;

  public:
  GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
  : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
  {
  }

  unsigned int get_input_rows(void) const override { return window_rows; }
  unsigned int get_input_cols(void) const override { return window_cols; }
  unsigned int get_output_rows(void) const override { return 1; }
  unsigned int get_output_cols(void) const override { return 1; }

  typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
};

template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
{
  const OutputStage m_os;

  protected:
  size_t get_working_size_per_thread(unsigned int) const override { return 0; }
  void initialise_working_space(void *, unsigned int) const override { /* Nothing */ }

  /* Compute a portion of the output tensor with padding. */
  void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int channel_start, unsigned int channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *
  ) const override
  {
    // Determine start position and padding
    const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
    const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
    const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
    const int end_i = start_i + this->m_args.pool_window.rows;
    const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
    const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);

    const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
    const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
    const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
    const int end_j = start_j + this->m_args.pool_window.cols;
    const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
    const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
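
    // Worked example (hypothetical values, for illustration only): with a 3x3
    // window, stride 1 and padding.top == 1, the first output row
    // (output_i == 0) gives start_i == -1, so input_i == 0, pad_top == 1 and
    // end_i == 2; for input_rows >= 3 this means pad_bottom == 0 and
    // valid_rows == 3 - (1 + 0) == 2. The column computation follows the same
    // pattern.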

    // Determine the number of valid cells and prepare the pointers
    const auto n_valid_cells = valid_rows * valid_cols;
    auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
    {
      auto my_ptr = inptrs;
      auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
      for (auto i = valid_rows; i; i--)
      {
        auto ptr = row_ptr;
        row_ptr += input.ld_row;

        for (auto j = valid_cols; j; j--)
        {
          *(my_ptr++) = ptr;
          ptr += input.ld_col;
        }
      }
    }
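    // inptrs now lists, in row-major order, one pointer per in-bounds window
    // cell (n_valid_cells of them), each already offset to channel_start; for
    // example (illustrative), valid_rows == 2 and valid_cols == 3 yields six
    // pointers: the three columns of the first valid row, then the three of
    // the second.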

    auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;

    // Some padding variants include (or exclude) the padding values; we handle
    // this by computing the extent of the padded input tensor and hence
    // computing the total number of cells captured in the pooling window.
    const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
    const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
    const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
    const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
    const auto captured_cells = captured_rows * captured_cols;
    const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
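
    // Illustrative example (hypothetical numbers): if only a 2x2 corner of a
    // 3x3 window overlaps the unpadded input, n_valid_cells == 4; with
    // exclude_padding the kernel is told window_cells == 4, otherwise it is
    // told the captured extent of the padded window (up to 9). An
    // average-pooling kernel would typically use window_cells as its divisor,
    // matching frameworks that either ignore or count padding cells.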

    // Execute the kernel
    Invoker<TInput, TOutput, OutputStage>::invoke(
      reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
      window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
    );
  }

  // Compute a portion of the work with only top/bottom padding.
  void compute_row_padded_tile_row(
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int channel_start, const unsigned int channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    void *working_space
  ) const override
  {
    ARM_COMPUTE_UNUSED(working_space);
    // Determine start position and padding
    const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
    const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
    const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
    const int end_i = start_i + this->m_args.pool_window.rows;
    const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
    const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);

    const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
    const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
    const auto valid_cols = this->m_args.pool_window.cols;

    // Determine the number of valid cells and prepare the pointers
    const auto n_valid_cells = valid_rows * valid_cols;
    auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
    {
      auto my_ptr = inptrs;
      auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
      for (auto i = valid_rows; i; i--)
      {
        auto ptr = row_ptr;
        row_ptr += input.ld_row;

        for (auto j = valid_cols; j; j--)
        {
          *(my_ptr++) = ptr;
          ptr += input.ld_col;
        }
      }
    }

    auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;

    // Some padding variants include (or exclude) the padding values; we handle
    // this by computing the extent of the padded input tensor and hence
    // computing the total number of cells captured in the pooling window.
    const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
    const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
    const auto captured_cells = captured_rows * valid_cols;
    const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;

    for (; n_tile_cols; n_tile_cols--)
    {
      // Execute the kernel
      Invoker<TInput, TOutput, OutputStage>::invoke(
        reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
        window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
      );

      // Update the pointers; the output strides by a column and the inputs
      // stride by a number of columns.
      outptr += output.ld_col;
      for (auto n = 0u; n < n_valid_cells; n++)
      {
        inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
      }
    }
  }

  public:
  PoolingDepthfirstGeneric(
    const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
    const PoolingArgs &args,
    const OutputStage &os = {}
  )
  : DepthfirstDriver<TInput, TOutput>(
      new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
      args
    ),
    m_os(os)
  {
  }
};
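
// Usage sketch (illustrative only, not part of this header): given a concrete
// strategy "strat" implementing IGenericDepthfirstStrategy<float, float> and a
// populated PoolingArgs "args", the driver would be constructed roughly as
//
//   PoolingDepthfirstGeneric<float> pool(strat, args);
//
// and then executed through the interface it inherits from DepthfirstDriver.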

}  // namespace pooling
}  // namespace arm_conv