xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 /* Depthwise kernel drivers commonly require a per-thread blob of working space
26  * in which to store parameters required by the depthwise implementations. The
27  * composition of this working space varies with the driver, kernel, and data
28  * types -- but the tasks of requesting sufficient space, allocating buffer
29  * space, and performing initialisation of the working space are common.
30  *
31  * The classes in this file consist of a number of working space "Elements"
32  * (which are logical units of functionality) and a Workspace type which allows
33  * for compile time composition of elements into a single working space type.
34  *
35  * Creating a workspace
36  * ====================
37  *
38  * A new workspace type can be created by combining Elements as an argument to
39  * the Workspace class. For instance:
40  *
41  *   Workspace<
42  *     depthwise_depthfirst::InputArrayElement<float>,
43  *     InputBufferElement<float>,
44  *     OutputArrayElement<float>
45  *   >
46  *
47  * Creates a new Workspace consisting of the given elements. The workspace type
48  * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
49  *
50  *   struct WorkspaceType
51  *   {
52  *     const float **inptr_array;  // From InputArrayElement<float>
53  *     float *input_buffer;  // From InputBufferElement<float>
54  *     float **outptr_array;  // From OutputArrayElement<float>
55  *     float *output_buffer;  // From OutputArrayElement<float>
56  *   };
57  *
58  * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
59  * of space required to store the above struct and the elements contained
60  * within it. Once this space has been allocated, the workspace can be
61  * initialised by calling `Workspace<...>::initialise` with a pointer to the
62  * buffer and the same arguments. This will place a struct of type
63  * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
64  * remaining space between the specified elements. As this is all done at
65  * compile time, later code can access elements from the `WorkspaceType` by
66  * name.
67  *
68  * Writing a new element
69  * =====================
70  *
71  * Each Element must provide:
72  *  - A struct called "Workspace" containing the variables contained within
73  *    this portion of the workspace.
74  *  - A static method called `get_element_size` which returns the amount of
75  *    buffer space required by this element of the workspace (NOT including the
76  *    size of the Workspace struct). For example, an element which stores a
77  *    vector of pointers will return the amount of space required to store the
78  *    vector.
79  *  - A static method called `initialise` which accepts a pointer to a struct
80  *    which will be composed of the Element's `Workspace` struct (along with
81  *    other elements), a pointer to the start of the buffer allocated for this
82  *    portion of the workspace, and arguments to be used to initialise the
83  *    workspace. The Element should consume as much of the buffer as it
84  *    requires, initialise the Workspace, and then return the pointer to the
85  *    next free byte of the buffer.
86  *
87  * See the below elements for an example of how this should work.
88  */
89 
90 #pragma once
91 
92 #include "depthwise.hpp"
93 #include "depthfirst_driver.hpp"
94 #include "src/core/NEON/kernels/arm_gemm/utils.hpp"
95 
96 namespace arm_conv {
97 namespace depthwise {
98 namespace {  // anonymous because we expect this to appear in several compilation units
99 
100 /* Arguments to use to size and initialise a workspace.
101  */
102 template <class StratType, class OutputStage=Nothing>
103 struct WorkspaceArgs
104 {
105   const StratType *strategy;
106   const DepthwiseArgs &depthwise_args;
107   const OutputStage &output_stage;
108 
WorkspaceArgsarm_conv::depthwise::__anon783f6b5b0111::WorkspaceArgs109   WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
110   : strategy(strat), depthwise_args(dwargs), output_stage(os)
111   {
112   }
113 };
114 
115 
116 /* Sometimes we use templated structs to fill in workspace types, the Empty
117  * element can be useful for when a blank element is required for some sets of
118  * parameters.
119  */
120 struct EmptyElement
121 {
122   struct Workspace {};
123 
124   template <class StratType, class OutputStage>
get_element_sizearm_conv::depthwise::__anon783f6b5b0111::EmptyElement125   static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
126 
127   template <class WorkspaceType, class StratType, class OutputStage>
initialisearm_conv::depthwise::__anon783f6b5b0111::EmptyElement128   static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
129   {
130     return buffer;
131   }
132 };
133 
134 
135 /* Store fused activations for a kernel.
136  *
137  * Activations are set based on the DepthwiseArgs.
138  */
139 template <typename T, class OutputStage=Nothing>
140 class ActivationsElement
141 {
142   public:
143   struct Workspace
144   {
145     T activation_min, activation_max;
146   };
147 
148   template <typename StratType>
get_element_size(const WorkspaceArgs<StratType,OutputStage> &)149   static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
150   {
151     return 0;
152   }
153 
154   template <class WorkspaceType, class StratType>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<StratType,OutputStage> & args)155   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
156   {
157     ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
158     ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
159 
160     switch (args.depthwise_args.activation.type)
161     {
162       case arm_gemm::Activation::Type::BoundedReLU:
163         ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
164         // Fall through
165       case arm_gemm::Activation::Type::ReLU:
166         ws->activation_min = static_cast<T>(0);
167         break;
168       default:
169         break;
170     }
171 
172     return buffer;
173   }
174 };
175 
176 /* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
177  * output stage is one of these we substitute in an empty workspace element.
178  */
179 template <typename T>
180 class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
181 {
182 };
183 
184 
/* Get the byte with which an input buffer should be filled.
 *
 * For a generic output stage the fill value is zero. It is returned as a
 * `char` because the value is passed on to `memset`.
 */
template <typename OutputStage>
char get_input_buffer_fill_value(const OutputStage &)
{
  return '\0';
}
193 
194 /* In the case of kernels operating on quantized data, we need to fill the
195  * input buffer with the zero offset of the input tensor.
196  */
197 template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
get_input_buffer_fill_value(const arm_gemm::Requantize32 & qp)198 template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
199 {
200   return qp.a_offset;
201 }
202 
203 
204 /* Container for a vector of padding values which can be safely consumed by the
205  * depthwise kernel. The padding values are initialised to either `0` or the
206  * zero offset of the input tensor (if quantized).
207  */
208 template <typename T>
209 class InputBufferElement
210 {
211   public:
212   struct Workspace
213   {
214     T *input_buffer;
215   };
216 
217   template <typename StratType, typename OutputStage>
get_element_size(const WorkspaceArgs<StratType,OutputStage> & args)218   static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
219   {
220     return sizeof(T) * args.depthwise_args.input_channels;
221   }
222 
223   template <class WorkspaceType, typename StratType, typename OutputStage>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<StratType,OutputStage> & args)224   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
225   {
226     ws->input_buffer = reinterpret_cast<T*>(buffer);
227     memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
228     return reinterpret_cast<char *>(buffer) + get_element_size(args);
229   }
230 };
231 
232 
233 /* Container for an array of output pointers, and a buffer which can be used as
234  * a destination for unnecessary writes.
235  */
236 template <typename T>
237 class OutputArrayElement
238 {
239   public:
240   struct Workspace
241   {
242     T **outptr_array;
243     T *output_buffer;
244   };
245 
246   template <typename OutputStage>
get_element_size(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)247   static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
248   {
249     return sizeof_outptr_array(args) + sizeof_output_buffer(args);
250   }
251 
252   template <class WorkspaceType, typename OutputStage>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)253   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
254   {
255     char *buffer_bytes = reinterpret_cast<char *>(buffer);
256 
257     ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
258     buffer_bytes += sizeof_outptr_array(args);
259 
260     ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
261     buffer_bytes += sizeof_output_buffer(args);
262 
263     return buffer_bytes;
264   }
265 
266   protected:
267   template <typename OutputStage>
sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)268   static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
269   {
270     return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
271   }
272 
273   template <typename OutputStage>
sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy,OutputStage> & args)274   static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
275   {
276     return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
277   }
278 };
279 
280 
281 /* Container for requantization parameters.
282  *
283  * This removes the distinction between per-layer and per-channel
284  * requantization parameters by providing a vector of requantization parameters
285  * regardless of whether per-layer or per-channel is selected.
286  */
287 class RequantizationParametersElement
288 {
289   public:
290   struct Workspace
291   {
292     const int32_t *bias, *requant_muls, *requant_shifts;
293   };
294 
295   template <typename StratType>
get_element_size(const WorkspaceArgs<StratType,arm_gemm::Requantize32> & args)296   static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
297   {
298     return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
299   }
300 
301   template <typename WorkspaceType, typename StratType>
initialise(WorkspaceType * ws,void * buffer,const WorkspaceArgs<StratType,arm_gemm::Requantize32> & args)302   static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
303   {
304     const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
305     char *buffer_bytes = reinterpret_cast<char *>(buffer);
306 
307     ws->bias = args.output_stage.bias;
308     ws->requant_muls = args.output_stage.per_channel_muls;
309     ws->requant_shifts = args.output_stage.per_channel_right_shifts;
310 
311     if (ws->bias == nullptr)
312     {
313       ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
314       memset(buffer_bytes, 0, sizeof_bias(args));
315       buffer_bytes += sizeof_bias(args);
316     }
317 
318     if (ws->requant_muls == nullptr)
319     {
320       ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
321       auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
322       buffer_bytes += sizeof_requant_muls(args);
323 
324       for (auto n = 0u; n < n_output_channels; n++)
325       {
326         muls[n] = args.output_stage.per_layer_mul;
327       }
328     }
329 
330     if (ws->requant_shifts == nullptr)
331     {
332       ws->requant_shifts = reinterpret_cast<int32_t *>(buffer_bytes);
333       auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
334       buffer_bytes += sizeof_requant_shifts(args);
335 
336       for (auto n = 0u; n < n_output_channels; n++)
337       {
338         shifts[n] = args.output_stage.per_layer_right_shift;
339       }
340     }
341 
342     return buffer_bytes;
343   }
344 
345   protected:
346   template <typename StratType>
sizeof_bias(const WorkspaceArgs<StratType,arm_gemm::Requantize32> & args)347   static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
348   {
349     return args.output_stage.bias != nullptr ?
350       0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
351   }
352 
353   template <typename StratType>
sizeof_requant_muls(const WorkspaceArgs<StratType,arm_gemm::Requantize32> & args)354   static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
355   {
356     return args.output_stage.per_channel_muls != nullptr ?
357       0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
358   }
359 
360   template <typename StratType>
sizeof_requant_shifts(const WorkspaceArgs<StratType,arm_gemm::Requantize32> & args)361   static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
362   {
363     return args.output_stage.per_channel_right_shifts != nullptr ?
364       0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
365   }
366 };
367 
368 
369 template <typename ...Elements>
370 class Workspace;
371 
372 template <typename Element, typename ...Elements>
373 class Workspace<Element, Elements...>
374 {
375   public:
376   struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
377   {
378   };
379 
380   template <class S, class T>
initialise(void * buffer,const WorkspaceArgs<S,T> & args)381   static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
382   {
383     // Allocate sufficient space for the struct, then initialise each of the
384     // elements in turn.
385     auto ws = reinterpret_cast<WorkspaceType *>(buffer);
386     initialise_elements(ws, ws + 1, args);
387   }
388 
389   template <class S, class T=Nothing>
get_sizeof_workspace(const WorkspaceArgs<S,T> & args)390   static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
391   {
392     return sizeof(WorkspaceType) + get_element_sizes(args);
393   }
394 
395   template <class S, class T>
get_element_sizes(const WorkspaceArgs<S,T> & args)396   static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
397   {
398     return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
399   }
400 
401   template <class WorkspaceType, class S, class T>
initialise_elements(WorkspaceType * ws,void * buffer,const WorkspaceArgs<S,T> & args)402   static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
403   {
404     buffer = Element::initialise(ws, buffer, args);  // Get the next buffer
405     Workspace<Elements...>::initialise_elements(ws, buffer, args);
406   }
407 };
408 
409 template <>
410 class Workspace<>
411 {
412   public:
413   struct WorkspaceType
414   {
415   };
416 
417   template <class S, class T>
get_element_sizes(const WorkspaceArgs<S,T> &)418   static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
419   {
420     return 0;
421   }
422 
423   template <class WorkspaceType, class S, class T>
initialise_elements(WorkspaceType *,void *,const WorkspaceArgs<S,T> &)424   static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
425   {
426   }
427 };
428 
429 }  // namespace {anonymous}
430 }  // namespace depthwise
431 }  // namespace arm_conv
432