/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include "pool_common.hpp"

#include <algorithm>
#include <cstdint>
#include <limits>
#include <stack>
#include <vector>

namespace arm_conv {
namespace pooling {

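/* Depth-first pooling which subdivides the output plane in a cache-oblivious
 * fashion: the plane is recursively bisected along its longest dimension
 * (using an explicit stack of work items) until each piece fits within the
 * output tile produced by a single call to the strategy's kernel. Visiting
 * the tiles in this depth-first order tends to reuse recently touched input
 * data without needing to know the cache sizes of the target CPU.
 */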
template <class strategy>
class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
{
  using TInput = typename strategy::operand_type;
  using TOutput = typename strategy::return_type;

  const PoolingArgs m_args;  // Copy of arguments

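  // Dimensions of the input patch required to compute a single output tile:
  // (tile size - 1) strides plus one pooling window. For example, a 3x3 pool
  // with stride 1 and a 4x4 output tile reads a 6x6 patch of input.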
  constexpr static unsigned int input_rows(void)
  {
    return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
  }

  constexpr static unsigned int input_cols(void)
  {
    return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
  }

  size_t sizeof_input_buffer(void) const
  {
    return sizeof(TInput) * m_args.n_channels;
  }

  size_t sizeof_output_buffer(void) const
  {
    return sizeof(TOutput) * m_args.n_channels;
  }

  public:
  PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
  {
  }

  PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
  PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;

  size_t get_working_size(void) const override
  {
    // We require a channel-length vector in which to dump surplus output and
    // a channel-length vector of padding values; the input and output pointer
    // arrays are allocated on the stack by execute().
    return sizeof_input_buffer() + sizeof_output_buffer();
  }

  void execute(
    const void *const input,
    void *const output,
    void *const working_space
  ) const override
  {
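    // No strides were supplied, so assume a fully packed NHWC layout:
    // channels are contiguous, then columns, rows and batches.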
    const size_t ld_input_col = m_args.n_channels;
    const size_t ld_input_row = ld_input_col * m_args.input_cols;
    const size_t ld_input_batch = ld_input_row * m_args.input_rows;
    const size_t ld_output_col = ld_input_col;
    const size_t ld_output_row = ld_output_col * m_args.output_cols;
    const size_t ld_output_batch = ld_output_row * m_args.output_rows;

    execute(
      input, ld_input_col, ld_input_row, ld_input_batch,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    const void *const input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    void *const output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const working_space
  ) const override
  {
    execute(
      m_args.n_batches, m_args.input_rows, m_args.input_cols,
      m_args.n_channels,
      input, ld_input_col, ld_input_row, ld_input_batch,
      m_args.padding,
      m_args.output_rows, m_args.output_cols,
      output, ld_output_col, ld_output_row, ld_output_batch,
      working_space
    );
  }

  void execute(
    unsigned int batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int channels,
    const void *const _input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const PaddingValues &padding,
    unsigned int output_height,
    unsigned int output_width,
    void *const _output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *const _working_space
  ) const override
  {
    strategy strat(m_args.cpu_info);
#ifdef CYCLE_PROFILING
    arm_gemm::profiler prof;
#endif // CYCLE_PROFILING

    // Cast input and output pointers into the right types
    const TInput *const inptr = static_cast<const TInput *>(_input);
    TOutput *const outptr = static_cast<TOutput *>(_output);

    // Allocate portions of the working space
    uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
    TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
    TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());

    // Fill the input buffer
    const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
                           ? static_cast<TInput>(0)
                           : (std::numeric_limits<TInput>::has_infinity
                              ? -std::numeric_limits<TInput>::infinity()
                              : std::numeric_limits<TInput>::lowest());
    for (unsigned int i = 0; i < channels; i++)
    {
      input_buffer[i] = pad_value;
    }

    // Keep subdividing the output plane along its longest dimension until we
    // reach the size of a single output tile, queueing items for later
    // processing. Note: the largest size of the queue could be determined a
    // priori from the tensor dimensions, which would allow the queue to be
    // allocated within the working space and improve performance.
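    // For example, an 8x8 output plane with a kernel producing 2x2 tiles is
    // first split into two 4x8 halves; each half is then split along its
    // (now longer) width, and so on, until only pieces of at most 2x2 remain,
    // each of which is handed to the kernel.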
    struct WorkItem
    {
      unsigned int output_i, output_j;
      unsigned int output_height, output_width;

      WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
        : output_i(i), output_j(j), output_height(height), output_width(width) {}
    };

    auto execute = [&] (const WorkItem &item) {
      // Create an array for the output pointers
      TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
      TOutput **const outptr_array = _outptr_array;

      // Construct the output pointer array
      {
        // Number of array entries to skip at the end of each row of pointers
        // (the pointer array covers a full out_rows() x out_cols() tile).
        const auto output_pad_right = strategy::out_cols() - item.output_width;
        auto outptr_element = outptr_array;
        auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;

        // Fill the array with pointers to the output buffer
        for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
        {
          outptr_array[i] = output_buffer;
        }

        // Fill in the valid portion of the array
        for (unsigned int i = 0; i < item.output_height; i++)
        {
          auto outptr_col = outptr_row;
          for (unsigned int j = 0; j < item.output_width; j++)
          {
            *(outptr_element++) = outptr_col;
            outptr_col += ld_output_col;
          }
          outptr_element += output_pad_right;
          outptr_row += ld_output_row;
        }
      }

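      // Map this output tile back to input coordinates and work out how much
      // of the required input patch falls outside the tensor; any such
      // elements are implicit padding and are read from the padded buffer.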
      const int start_i = item.output_i * strategy::stride_rows() - padding.top;
      const int end_i = start_i + input_rows();
      const unsigned int pad_top = std::max(0, 0 - start_i);
      const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));

      const int start_j = item.output_j * strategy::stride_cols() - padding.left;
      const int end_j = start_j + input_cols();
      const unsigned int pad_left = std::max(0, 0 - start_j);
      const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));

      // Create an array for the input pointers
      const TInput * _inptr_array[input_rows() * input_cols()];
      const TInput **const inptr_array = _inptr_array;
      {
        const unsigned int row_padding = pad_top + pad_bottom;
        const unsigned int valid_rows = input_rows() - row_padding;

        const unsigned int col_padding = pad_left + pad_right;
        const unsigned int valid_cols = input_cols() - col_padding;

        // Fill the array with pointers to the input buffer
        for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
        {
          inptr_array[i] = input_buffer;
        }

        // Compute valid initial pointer
        auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;

        // Fill in the valid portion of the input array
        auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
        for (unsigned int i = 0; i < valid_rows; i++)
        {
          auto inptr_col = inptr_row;
          for (unsigned int j = 0; j < valid_cols; j++)
          {
            *(inptr_element++) = inptr_col;
            inptr_col += ld_input_col;
          }

          inptr_row += ld_input_row;
          inptr_element += col_padding;  // Skip the padding elements
        }
      }

      // Call the kernel
#ifdef CYCLE_PROFILING
      // TODO Work number
      auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
#endif // CYCLE_PROFILING
      strat.kernel(channels, inptr_array, outptr_array,
                   pad_left, pad_top, pad_right, pad_bottom);
    };

    // Add the initial work item to the stack of work.
    std::stack<WorkItem, std::vector<WorkItem>> stack;
    stack.push(WorkItem(0, 0, output_height, output_width));
    while (!stack.empty())
    {
      // Pop an item from the stack; if it fits within a single output tile
      // then execute it directly, otherwise bisect its longest dimension and
      // push the two halves back onto the stack.
      const WorkItem item(stack.top());
      stack.pop();

      if (item.output_height <= strategy::out_rows() &&
          item.output_width <= strategy::out_cols())
      {
        execute(item);
      }
      else
      {
        // Split the largest dimension, such that we get an exact number of
        // tiles in the first partition.
        if (item.output_height >= item.output_width)
        {
          const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
          const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;

          const unsigned int height_first = tiles_first * strategy::out_rows();
          const unsigned int height_second = item.output_height - height_first;

          stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
          stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
        }
        else
        {
          // Use ceiling division (as in the vertical split above) so that the
          // first partition is always a whole number of tiles.
          const unsigned int width_in_tiles = (item.output_width + strategy::out_cols() - 1) / strategy::out_cols();
          const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;

          const unsigned int width_first = tiles_first * strategy::out_cols();
          const unsigned int width_second = item.output_width - width_first;

          stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
          stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
        }
      }
    }
  }
};
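
// A minimal usage sketch (illustrative only: "ExampleStrategy" and the
// construction of `args` are assumptions rather than part of this header):
//
//   PoolingDepthfirstCacheOblivious<ExampleStrategy> pool(args);
//   std::vector<uint8_t> working_space(pool.get_working_size());
//   pool.execute(input, output, working_space.data());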

}  // namespace pooling
}  // namespace arm_conv