xref: /aosp_15_r20/external/ComputeLibrary/src/runtime/NEON/functions/NELSTMLayer.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole Faust /*
2*c217d954SCole Faust  * Copyright (c) 2018-2022 Arm Limited.
3*c217d954SCole Faust  *
4*c217d954SCole Faust  * SPDX-License-Identifier: MIT
5*c217d954SCole Faust  *
6*c217d954SCole Faust  * Permission is hereby granted, free of charge, to any person obtaining a copy
7*c217d954SCole Faust  * of this software and associated documentation files (the "Software"), to
8*c217d954SCole Faust  * deal in the Software without restriction, including without limitation the
9*c217d954SCole Faust  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10*c217d954SCole Faust  * sell copies of the Software, and to permit persons to whom the Software is
11*c217d954SCole Faust  * furnished to do so, subject to the following conditions:
12*c217d954SCole Faust  *
13*c217d954SCole Faust  * The above copyright notice and this permission notice shall be included in all
14*c217d954SCole Faust  * copies or substantial portions of the Software.
15*c217d954SCole Faust  *
16*c217d954SCole Faust  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*c217d954SCole Faust  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*c217d954SCole Faust  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19*c217d954SCole Faust  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*c217d954SCole Faust  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*c217d954SCole Faust  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*c217d954SCole Faust  * SOFTWARE.
23*c217d954SCole Faust  */
24*c217d954SCole Faust #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
25*c217d954SCole Faust 
26*c217d954SCole Faust #include "arm_compute/core/Utils.h"
27*c217d954SCole Faust #include "arm_compute/core/Validate.h"
28*c217d954SCole Faust #include "arm_compute/core/utils/misc/InfoHelpers.h"
29*c217d954SCole Faust #include "arm_compute/core/utils/misc/ShapeCalculator.h"
30*c217d954SCole Faust #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
31*c217d954SCole Faust #include "arm_compute/runtime/common/LSTMParams.h"
32*c217d954SCole Faust #include "src/common/utils/Log.h"
33*c217d954SCole Faust 
34*c217d954SCole Faust namespace arm_compute
35*c217d954SCole Faust {
36*c217d954SCole Faust using namespace arm_compute::misc::shape_calculator;
37*c217d954SCole Faust using namespace arm_compute::utils::info_helpers;
38*c217d954SCole Faust 
39*c217d954SCole Faust NELSTMLayer::~NELSTMLayer() = default;
40*c217d954SCole Faust 
NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)41*c217d954SCole Faust NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
42*c217d954SCole Faust     : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
43*c217d954SCole Faust       _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
44*c217d954SCole Faust       _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
45*c217d954SCole Faust       _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
46*c217d954SCole Faust       _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
47*c217d954SCole Faust       _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(),
48*c217d954SCole Faust       _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(),
49*c217d954SCole Faust       _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(),
50*c217d954SCole Faust       _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(),
51*c217d954SCole Faust       _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(),
52*c217d954SCole Faust       _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false),
53*c217d954SCole Faust       _is_layer_norm_lstm(false)
54*c217d954SCole Faust {
55*c217d954SCole Faust }
56*c217d954SCole Faust 
configure(const ITensor * input,const ITensor * input_to_forget_weights,const ITensor * input_to_cell_weights,const ITensor * input_to_output_weights,const ITensor * recurrent_to_forget_weights,const ITensor * recurrent_to_cell_weights,const ITensor * recurrent_to_output_weights,const ITensor * forget_gate_bias,const ITensor * cell_bias,const ITensor * output_gate_bias,const ITensor * output_state_in,const ITensor * cell_state_in,ITensor * scratch_buffer,ITensor * output_state_out,ITensor * cell_state_out,ITensor * output,const LSTMParams<ITensor> & lstm_params,const ActivationLayerInfo & activation_info,float cell_threshold,float projection_threshold)57*c217d954SCole Faust void NELSTMLayer::configure(const ITensor *input,
58*c217d954SCole Faust                             const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
59*c217d954SCole Faust                             const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
60*c217d954SCole Faust                             const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
61*c217d954SCole Faust                             const ITensor *output_state_in, const ITensor *cell_state_in,
62*c217d954SCole Faust                             ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
63*c217d954SCole Faust                             const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
64*c217d954SCole Faust {
65*c217d954SCole Faust     ARM_COMPUTE_ERROR_ON_NULLPTR(input,
66*c217d954SCole Faust                                  input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
67*c217d954SCole Faust                                  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
68*c217d954SCole Faust                                  forget_gate_bias, cell_bias, output_gate_bias,
69*c217d954SCole Faust                                  output_state_in, cell_state_in,
70*c217d954SCole Faust                                  scratch_buffer, output_state_out, cell_state_out, output);
71*c217d954SCole Faust     ARM_COMPUTE_LOG_PARAMS(input,
72*c217d954SCole Faust                            input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
73*c217d954SCole Faust                            recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
74*c217d954SCole Faust                            forget_gate_bias, cell_bias, output_gate_bias,
75*c217d954SCole Faust                            output_state_in, cell_state_in,
76*c217d954SCole Faust                            scratch_buffer, output_state_out, cell_state_out, output,
77*c217d954SCole Faust                            lstm_params, activation_info, cell_threshold, projection_threshold);
78*c217d954SCole Faust 
79*c217d954SCole Faust     _is_layer_norm_lstm = lstm_params.use_layer_norm();
80*c217d954SCole Faust 
81*c217d954SCole Faust     // Set lstm parameters
82*c217d954SCole Faust     LSTMParams<ITensorInfo> lstm_params_info{};
83*c217d954SCole Faust     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
84*c217d954SCole Faust 
85*c217d954SCole Faust     // Validate
86*c217d954SCole Faust     ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
87*c217d954SCole Faust                                                      input_to_cell_weights->info(), input_to_output_weights->info(),
88*c217d954SCole Faust                                                      recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
89*c217d954SCole Faust                                                      forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
90*c217d954SCole Faust                                                      output_state_in->info(), cell_state_in->info(),
91*c217d954SCole Faust                                                      scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
92*c217d954SCole Faust                                                      lstm_params_info, activation_info, cell_threshold, projection_threshold));
93*c217d954SCole Faust 
94*c217d954SCole Faust     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
95*c217d954SCole Faust 
96*c217d954SCole Faust     // Configure block that calculates the forget gate
97*c217d954SCole Faust     // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
98*c217d954SCole Faust     // We optimize this as follows:
99*c217d954SCole Faust     // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
100*c217d954SCole Faust     _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
101*c217d954SCole Faust     _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
102*c217d954SCole Faust     _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
103*c217d954SCole Faust 
104*c217d954SCole Faust     std::vector<const ITensor *> inputs_vector;
105*c217d954SCole Faust     inputs_vector.emplace_back(input);
106*c217d954SCole Faust     inputs_vector.emplace_back(output_state_in);
107*c217d954SCole Faust 
108*c217d954SCole Faust     _memory_group.manage(&_forget_gate_out2);
109*c217d954SCole Faust     _concat_inputs_forget_gate.configure(inputs_vector, &_forget_gate_out2, Window::DimX);
110*c217d954SCole Faust 
111*c217d954SCole Faust     std::vector<const ITensor *> weights_vector;
112*c217d954SCole Faust 
113*c217d954SCole Faust     weights_vector.emplace_back(input_to_forget_weights);
114*c217d954SCole Faust     weights_vector.emplace_back(recurrent_to_forget_weights);
115*c217d954SCole Faust 
116*c217d954SCole Faust     _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
117*c217d954SCole Faust 
118*c217d954SCole Faust     _memory_group.manage(&_forget_gate_out5);
119*c217d954SCole Faust     _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
120*c217d954SCole Faust     _memory_group.manage(&_forget_gate_out1);
121*c217d954SCole Faust     _memory_group.manage(&_forget_gate_out3);
122*c217d954SCole Faust     _forget_gate_out6.allocator()->allocate();
123*c217d954SCole Faust 
124*c217d954SCole Faust     Tensor *forget_gate_out = &_forget_gate_out5;
125*c217d954SCole Faust     if(lstm_params.has_peephole_opt())
126*c217d954SCole Faust     {
127*c217d954SCole Faust         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
128*c217d954SCole Faust 
129*c217d954SCole Faust         _run_peephole_opt = true;
130*c217d954SCole Faust         _memory_group.manage(&_forget_gate_out4);
131*c217d954SCole Faust         _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
132*c217d954SCole Faust         _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
133*c217d954SCole Faust         _forget_gate_out4.allocator()->allocate();
134*c217d954SCole Faust         _forget_gate_out5.allocator()->allocate();
135*c217d954SCole Faust         forget_gate_out = &_forget_gate_out3;
136*c217d954SCole Faust     }
137*c217d954SCole Faust     else
138*c217d954SCole Faust     {
139*c217d954SCole Faust         _forget_gate_out3.allocator()->allocate();
140*c217d954SCole Faust     }
141*c217d954SCole Faust     if(_is_layer_norm_lstm)
142*c217d954SCole Faust     {
143*c217d954SCole Faust         _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
144*c217d954SCole Faust         _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
145*c217d954SCole Faust         _memory_group.manage(&_forget_layer_norm_out1);
146*c217d954SCole Faust         _memory_group.manage(&_forget_layer_norm_out2);
147*c217d954SCole Faust         _mean_std_norm_forget_gate.configure(forget_gate_out);
148*c217d954SCole Faust         _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
149*c217d954SCole Faust         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
150*c217d954SCole Faust         forget_gate_out->allocator()->allocate();
151*c217d954SCole Faust         _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
152*c217d954SCole Faust         _forget_layer_norm_out1.allocator()->allocate();
153*c217d954SCole Faust         forget_gate_out = &_forget_layer_norm_out2;
154*c217d954SCole Faust     }
155*c217d954SCole Faust     _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
156*c217d954SCole Faust 
157*c217d954SCole Faust     // Configure block that calculates the input gate
158*c217d954SCole Faust     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
159*c217d954SCole Faust     // input_gate = 1 - forget_gate, with CIFG
160*c217d954SCole Faust     // We optimize this as follows:
161*c217d954SCole Faust     // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
162*c217d954SCole Faust     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
163*c217d954SCole Faust     Tensor *input_gate_out = &_input_gate_out1;
164*c217d954SCole Faust     if(lstm_params.has_cifg_opt())
165*c217d954SCole Faust     {
166*c217d954SCole Faust         _memory_group.manage(&_input_gate_out1);
167*c217d954SCole Faust         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
168*c217d954SCole Faust         _subtract_input_gate.configure(&_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
169*c217d954SCole Faust         _ones.allocator()->allocate();
170*c217d954SCole Faust         _run_cifg_opt = true;
171*c217d954SCole Faust     }
172*c217d954SCole Faust     else
173*c217d954SCole Faust     {
174*c217d954SCole Faust         _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
175*c217d954SCole Faust         _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
176*c217d954SCole Faust 
177*c217d954SCole Faust         std::vector<const ITensor *> lstm_weights;
178*c217d954SCole Faust         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
179*c217d954SCole Faust         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
180*c217d954SCole Faust 
181*c217d954SCole Faust         _concat_weights_input_gate.configure(lstm_weights, &_input_gate_out2, Window::DimX);
182*c217d954SCole Faust 
183*c217d954SCole Faust         _memory_group.manage(&_input_gate_out1);
184*c217d954SCole Faust         _memory_group.manage(&_input_gate_out4);
185*c217d954SCole Faust 
186*c217d954SCole Faust         _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
187*c217d954SCole Faust         _input_gate_out2.allocator()->allocate();
188*c217d954SCole Faust         input_gate_out = &_input_gate_out3;
189*c217d954SCole Faust 
190*c217d954SCole Faust         if(_run_peephole_opt)
191*c217d954SCole Faust         {
192*c217d954SCole Faust             _memory_group.manage(&_input_gate_out4);
193*c217d954SCole Faust             _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
194*c217d954SCole Faust             _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
195*c217d954SCole Faust             _input_gate_out3.allocator()->allocate();
196*c217d954SCole Faust             _input_gate_out4.allocator()->allocate();
197*c217d954SCole Faust             input_gate_out = &_input_gate_out1;
198*c217d954SCole Faust         }
199*c217d954SCole Faust         else
200*c217d954SCole Faust         {
201*c217d954SCole Faust             _input_gate_out1.allocator()->allocate();
202*c217d954SCole Faust         }
203*c217d954SCole Faust 
204*c217d954SCole Faust         if(_is_layer_norm_lstm)
205*c217d954SCole Faust         {
206*c217d954SCole Faust             _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
207*c217d954SCole Faust             _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
208*c217d954SCole Faust             _memory_group.manage(&_input_layer_norm_out1);
209*c217d954SCole Faust             _memory_group.manage(&_input_layer_norm_out2);
210*c217d954SCole Faust             _mean_std_norm_input_gate.configure(input_gate_out);
211*c217d954SCole Faust             _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
212*c217d954SCole Faust             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
213*c217d954SCole Faust             input_gate_out->allocator()->allocate();
214*c217d954SCole Faust             _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
215*c217d954SCole Faust             _input_layer_norm_out1.allocator()->allocate();
216*c217d954SCole Faust             input_gate_out = &_input_layer_norm_out2;
217*c217d954SCole Faust         }
218*c217d954SCole Faust         _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
219*c217d954SCole Faust     }
220*c217d954SCole Faust 
221*c217d954SCole Faust     // Configure block that calculates the cell state
222*c217d954SCole Faust     // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state_in * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
223*c217d954SCole Faust     TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
224*c217d954SCole Faust     _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
225*c217d954SCole Faust     _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
226*c217d954SCole Faust     _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
227*c217d954SCole Faust     _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
228*c217d954SCole Faust     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
229*c217d954SCole Faust 
230*c217d954SCole Faust     _memory_group.manage(&_cell_state_out1);
231*c217d954SCole Faust     _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
232*c217d954SCole Faust     _memory_group.manage(&_cell_state_out2);
233*c217d954SCole Faust     _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
234*c217d954SCole Faust     _memory_group.manage(&_cell_state_out3);
235*c217d954SCole Faust     _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
236*c217d954SCole Faust     _cell_state_out2.allocator()->allocate();
237*c217d954SCole Faust     _memory_group.manage(&_cell_state_out4);
238*c217d954SCole Faust     _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
239*c217d954SCole Faust     Tensor *cell_state_out_ptr = &_cell_state_out4;
240*c217d954SCole Faust     if(_is_layer_norm_lstm)
241*c217d954SCole Faust     {
242*c217d954SCole Faust         _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
243*c217d954SCole Faust         _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
244*c217d954SCole Faust         _memory_group.manage(&_cell_layer_norm_out1);
245*c217d954SCole Faust         _memory_group.manage(&_cell_layer_norm_out2);
246*c217d954SCole Faust         _mean_std_norm_cell_gate.configure(cell_state_out_ptr);
247*c217d954SCole Faust         _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
248*c217d954SCole Faust         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
249*c217d954SCole Faust         cell_state_out_ptr->allocator()->allocate();
250*c217d954SCole Faust         _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
251*c217d954SCole Faust         _cell_layer_norm_out1.allocator()->allocate();
252*c217d954SCole Faust         cell_state_out_ptr = &_cell_layer_norm_out2;
253*c217d954SCole Faust     }
254*c217d954SCole Faust     _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
255*c217d954SCole Faust     _memory_group.manage(&_cell_state_out5);
256*c217d954SCole Faust     _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
257*c217d954SCole Faust     cell_state_out_ptr->allocator()->allocate();
258*c217d954SCole Faust     _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
259*c217d954SCole Faust     _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
260*c217d954SCole Faust     _cell_state_out3.allocator()->allocate();
261*c217d954SCole Faust     _cell_state_out5.allocator()->allocate();
262*c217d954SCole Faust     // Perform clipping
263*c217d954SCole Faust     if(cell_threshold != 0.f)
264*c217d954SCole Faust     {
265*c217d954SCole Faust         _perform_cell_clipping = true;
266*c217d954SCole Faust         _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
267*c217d954SCole Faust     }
268*c217d954SCole Faust 
269*c217d954SCole Faust     // Configure block that calculates the output
270*c217d954SCole Faust     // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
271*c217d954SCole Faust     // We optimize this as follows:
272*c217d954SCole Faust     // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
273*c217d954SCole Faust     _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
274*c217d954SCole Faust     _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
275*c217d954SCole Faust 
276*c217d954SCole Faust     std::vector<const ITensor *> in_out_weights;
277*c217d954SCole Faust     in_out_weights.emplace_back(input_to_output_weights);
278*c217d954SCole Faust     in_out_weights.emplace_back(recurrent_to_output_weights);
279*c217d954SCole Faust 
280*c217d954SCole Faust     _concat_weights_output.configure(in_out_weights, &_output2, Window::DimX);
281*c217d954SCole Faust     _memory_group.manage(&_output1);
282*c217d954SCole Faust     _memory_group.manage(&_output4);
283*c217d954SCole Faust 
284*c217d954SCole Faust     _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
285*c217d954SCole Faust 
286*c217d954SCole Faust     _output2.allocator()->allocate();
287*c217d954SCole Faust     _forget_gate_out2.allocator()->allocate();
288*c217d954SCole Faust 
289*c217d954SCole Faust     Tensor *output_gate_out = &_output4;
290*c217d954SCole Faust     if(lstm_params.has_peephole_opt())
291*c217d954SCole Faust     {
292*c217d954SCole Faust         _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
293*c217d954SCole Faust 
294*c217d954SCole Faust         _memory_group.manage(&_output3);
295*c217d954SCole Faust         _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
296*c217d954SCole Faust         _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
297*c217d954SCole Faust         _output4.allocator()->allocate();
298*c217d954SCole Faust         output_gate_out = &_output1;
299*c217d954SCole Faust 
300*c217d954SCole Faust         // Allocate intermediate buffers
301*c217d954SCole Faust         _output3.allocator()->allocate();
302*c217d954SCole Faust     }
303*c217d954SCole Faust     else
304*c217d954SCole Faust     {
305*c217d954SCole Faust         _output1.allocator()->allocate();
306*c217d954SCole Faust     }
307*c217d954SCole Faust     if(_is_layer_norm_lstm)
308*c217d954SCole Faust     {
309*c217d954SCole Faust         _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
310*c217d954SCole Faust         _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
311*c217d954SCole Faust         _memory_group.manage(&_output_layer_norm_out1);
312*c217d954SCole Faust         _memory_group.manage(&_output_layer_norm_out2);
313*c217d954SCole Faust         _mean_std_norm_output_gate.configure(output_gate_out);
314*c217d954SCole Faust         _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
315*c217d954SCole Faust         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
316*c217d954SCole Faust         output_gate_out->allocator()->allocate();
317*c217d954SCole Faust         _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
318*c217d954SCole Faust         _output_layer_norm_out1.allocator()->allocate();
319*c217d954SCole Faust         output_gate_out = &_output_layer_norm_out2;
320*c217d954SCole Faust     }
321*c217d954SCole Faust     _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
322*c217d954SCole Faust 
323*c217d954SCole Faust     // Configure block that calculates the output state
324*c217d954SCole Faust     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
325*c217d954SCole Faust      *
326*c217d954SCole Faust      *                      -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
327*c217d954SCole Faust      *                     /
328*c217d954SCole Faust      *  output_state =  --
329*c217d954SCole Faust      *                     \
330*c217d954SCole Faust      *                      -- lstm_res , otherwise
331*c217d954SCole Faust      */
332*c217d954SCole Faust     ITensor *output_state_out_tmp = lstm_params.has_projection() ? &_output_state1 : output_state_out;
333*c217d954SCole Faust     _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
334*c217d954SCole Faust     _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
335*c217d954SCole Faust 
336*c217d954SCole Faust     _memory_group.manage(&_cell_state_activation);
337*c217d954SCole Faust     _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
338*c217d954SCole Faust     _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
339*c217d954SCole Faust     _cell_state_activation.allocator()->allocate();
340*c217d954SCole Faust     output_gate_out->allocator()->allocate();
341*c217d954SCole Faust 
342*c217d954SCole Faust     if(lstm_params.has_projection())
343*c217d954SCole Faust     {
344*c217d954SCole Faust         _has_projection_weights = true;
345*c217d954SCole Faust         _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
346*c217d954SCole Faust         _output_state1.allocator()->allocate();
347*c217d954SCole Faust         // Perform clipping
348*c217d954SCole Faust         if(projection_threshold != 0.f)
349*c217d954SCole Faust         {
350*c217d954SCole Faust             _perform_projection_clipping = true;
351*c217d954SCole Faust             _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
352*c217d954SCole Faust         }
353*c217d954SCole Faust     }
354*c217d954SCole Faust 
355*c217d954SCole Faust     // Copy cell state and output
356*c217d954SCole Faust     _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
357*c217d954SCole Faust     _copy_output.configure(output_state_out, output);
358*c217d954SCole Faust 
359*c217d954SCole Faust     // Vector for holding the tensors to store in scratch buffer
360*c217d954SCole Faust     std::vector<const ITensor *> scratch_inputs;
361*c217d954SCole Faust     if(!lstm_params.has_cifg_opt())
362*c217d954SCole Faust     {
363*c217d954SCole Faust         scratch_inputs.emplace_back(input_gate_out);
364*c217d954SCole Faust     }
365*c217d954SCole Faust     scratch_inputs.emplace_back(&_cell_state_out1);
366*c217d954SCole Faust     scratch_inputs.emplace_back(forget_gate_out);
367*c217d954SCole Faust     scratch_inputs.emplace_back(output_gate_out);
368*c217d954SCole Faust     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer, Window::DimX);
369*c217d954SCole Faust     input_gate_out->allocator()->allocate();
370*c217d954SCole Faust     _cell_state_out1.allocator()->allocate();
371*c217d954SCole Faust     forget_gate_out->allocator()->allocate();
372*c217d954SCole Faust     output_gate_out->allocator()->allocate();
373*c217d954SCole Faust }
374*c217d954SCole Faust 
/** Static check that a set of tensor infos and LSTM parameters forms a valid NELSTMLayer configuration.
 *
 * Only tensor metadata (ITensorInfo) is inspected. The checks run in the order written and the
 * function returns at the first one that fails.
 *
 * @param[in] input                       Input info. Must be single-channel F16 or F32 with at most 2 dimensions; dimension 1 is used as the batch count.
 * @param[in] input_to_forget_weights     At most 2 dimensions, same data type as @p input.
 * @param[in] input_to_cell_weights       At most 2 dimensions, same data type as @p input.
 * @param[in] input_to_output_weights     At most 2 dimensions, same data type as @p input; dimension 1 is used as the cell count.
 * @param[in] recurrent_to_forget_weights At most 2 dimensions, same data type as @p input.
 * @param[in] recurrent_to_cell_weights   At most 2 dimensions, same data type as @p input.
 * @param[in] recurrent_to_output_weights At most 2 dimensions, same data type as @p input.
 * @param[in] forget_gate_bias            At most 1 dimension, same data type as @p input.
 * @param[in] cell_bias                   At most 1 dimension, same data type as @p input.
 * @param[in] output_gate_bias            At most 1 dimension, same data type as @p input.
 * @param[in] output_state_in             At most 2 dimensions, same data type as @p input.
 * @param[in] cell_state_in               At most 2 dimensions, same data type as @p input.
 * @param[in] scratch_buffer              At most 2 dimensions; dimension 0 must be 4x or 3x dimension 0 of @p cell_bias.
 * @param[in] output_state_out            At most 2 dimensions, same data type as @p input.
 * @param[in] cell_state_out              At most 2 dimensions, same data type as @p input.
 * @param[in] output                      At most 2 dimensions, same data type as @p input.
 * @param[in] lstm_params                 Optional-feature description queried here for layer normalization, peephole, CIFG and projection.
 * @param[in] activation_info             Activation descriptor used for the cell-state and output-state activation checks.
 * @param[in] cell_threshold              When non-zero, a bounded activation on the cell state is validated as well.
 * @param[in] projection_threshold        When non-zero and projection is enabled, a bounded activation on @p output_state_out is validated as well.
 *
 * @return A default-constructed Status when every check passes; otherwise the status produced by the first failing check.
 */
validate(const ITensorInfo * input,const ITensorInfo * input_to_forget_weights,const ITensorInfo * input_to_cell_weights,const ITensorInfo * input_to_output_weights,const ITensorInfo * recurrent_to_forget_weights,const ITensorInfo * recurrent_to_cell_weights,const ITensorInfo * recurrent_to_output_weights,const ITensorInfo * forget_gate_bias,const ITensorInfo * cell_bias,const ITensorInfo * output_gate_bias,const ITensorInfo * output_state_in,const ITensorInfo * cell_state_in,const ITensorInfo * scratch_buffer,const ITensorInfo * output_state_out,const ITensorInfo * cell_state_out,const ITensorInfo * output,const LSTMParams<ITensorInfo> & lstm_params,const ActivationLayerInfo & activation_info,float cell_threshold,float projection_threshold)375*c217d954SCole Faust Status NELSTMLayer::validate(const ITensorInfo *input,
376*c217d954SCole Faust                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
377*c217d954SCole Faust                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
378*c217d954SCole Faust                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
379*c217d954SCole Faust                              const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
380*c217d954SCole Faust                              const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
381*c217d954SCole Faust                              const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
382*c217d954SCole Faust {
    // Every tensor info passed directly as an argument is mandatory.
383*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
384*c217d954SCole Faust                                         input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
385*c217d954SCole Faust                                         recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
386*c217d954SCole Faust                                         forget_gate_bias, cell_bias, output_gate_bias,
387*c217d954SCole Faust                                         output_state_in, cell_state_in,
388*c217d954SCole Faust                                         scratch_buffer, output_state_out, cell_state_out, output);
389*c217d954SCole Faust 
390*c217d954SCole Faust     // Check data types
391*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
392*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
393*c217d954SCole Faust                                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
394*c217d954SCole Faust                                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
395*c217d954SCole Faust                                                        forget_gate_bias, cell_bias, output_gate_bias,
396*c217d954SCole Faust                                                        output_state_in, cell_state_in,
397*c217d954SCole Faust                                                        scratch_buffer, output_state_out, cell_state_out, output);
398*c217d954SCole Faust 
399*c217d954SCole Faust     // Check dimensions
    // Weights and state tensors may have at most 2 dimensions, biases at most 1.
400*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
401*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() > 2);
402*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() > 2);
403*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() > 2);
404*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() > 2);
405*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() > 2);
406*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() > 2);
407*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() > 1);
408*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() > 1);
409*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() > 1);
410*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
411*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() > 2);
412*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() > 2);
413*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
414*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
415*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
    // scratch_buffer's first dimension must be exactly 4x or 3x cell_bias's first dimension.
    // Presumably 4x applies without CIFG and 3x with CIFG, matching the 4 or 3 infos concatenated
    // at the end of this function; the check itself accepts either factor regardless of
    // lstm_params -- TODO confirm.
416*c217d954SCole Faust     ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
417*c217d954SCole Faust                                 && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
418*c217d954SCole Faust 
    // Sizes used below to build the temporary gate / cell-state infos.
419*c217d954SCole Faust     const unsigned int num_batches = input->dimension(1);
420*c217d954SCole Faust     const unsigned int num_cells   = input_to_output_weights->dimension(1);
421*c217d954SCole Faust 
    // Layer normalization: every gate that is present needs a 1D weights info of num_cells
    // elements with the same data type as the input.
422*c217d954SCole Faust     if(lstm_params.use_layer_norm())
423*c217d954SCole Faust     {
424*c217d954SCole Faust         // If CIFG is used, input layer normalization weights tensor is omitted
425*c217d954SCole Faust         if(lstm_params.has_cifg_opt())
426*c217d954SCole Faust         {
427*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
428*c217d954SCole Faust         }
429*c217d954SCole Faust         else
430*c217d954SCole Faust         {
431*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
432*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->num_dimensions() > 1);
433*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights()->dimension(0) != num_cells);
434*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
435*c217d954SCole Faust         }
436*c217d954SCole Faust 
437*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
438*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
439*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
440*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
441*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
442*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->dimension(0) != num_cells);
443*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->dimension(0) != num_cells);
444*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->dimension(0) != num_cells);
445*c217d954SCole Faust     }
446*c217d954SCole Faust 
447*c217d954SCole Faust     // Check peephole optimization
    // Only the forget and output peephole weights are checked here; cell_to_input_weights is
    // checked further down, inside the non-CIFG input gate branch.
448*c217d954SCole Faust     if(lstm_params.has_peephole_opt())
449*c217d954SCole Faust     {
450*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
451*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
452*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() > 1);
453*c217d954SCole Faust     }
454*c217d954SCole Faust 
    // NOTE(review): units_out_transposed_info is consumed by the NEGEMM check in the cell-state
    // section, but num_units_transposed_shape / num_units_transposed_info are not referenced
    // anywhere else in this function -- they look like leftovers; confirm before removing.
455*c217d954SCole Faust     TensorShape      units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
456*c217d954SCole Faust     TensorShape      num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
457*c217d954SCole Faust     const TensorInfo units_out_transposed_info  = TensorInfo(units_out_transposed_shape, 1, input->data_type());
458*c217d954SCole Faust     const TensorInfo num_units_transposed_info  = TensorInfo(num_units_transposed_shape, 1, input->data_type());
459*c217d954SCole Faust 
    // Temporary (num_cells x num_batches) infos standing in for the intermediate gate and
    // cell-state tensors in the sub-function validations below.
460*c217d954SCole Faust     TensorInfo input_gate      = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
461*c217d954SCole Faust     TensorInfo forget_gate     = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
462*c217d954SCole Faust     TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
463*c217d954SCole Faust     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
464*c217d954SCole Faust 
    // Validate the concatenation of input and output_state_in along dimension 0 (Window::DimX).
465*c217d954SCole Faust     std::vector<const ITensorInfo *> inputs_vector;
466*c217d954SCole Faust     inputs_vector.emplace_back(input);
467*c217d954SCole Faust     inputs_vector.emplace_back(output_state_in);
468*c217d954SCole Faust     const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
469*c217d954SCole Faust     TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
470*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
471*c217d954SCole Faust 
472*c217d954SCole Faust     // Validate forget gate
    // NOTE(review): this fully connected check (and the input/cell/output ones below) is fed
    // `input` and the input-to-gate weights, not the concatenated info built above -- confirm
    // this is the intended approximation of what configure() sets up.
    // With layer normalization the bias is not given to the fully connected check; it is
    // covered by the separate addition check in the layer-norm branch instead.
473*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
474*c217d954SCole Faust 
475*c217d954SCole Faust     if(lstm_params.has_peephole_opt())
476*c217d954SCole Faust     {
477*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
        // The same info is passed for both operands and the result; it serves as a
        // metadata stand-in for the intermediate tensors.
478*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
479*c217d954SCole Faust     }
480*c217d954SCole Faust     if(lstm_params.use_layer_norm())
481*c217d954SCole Faust     {
482*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
483*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
484*c217d954SCole Faust                                                                         RoundingPolicy::TO_ZERO));
485*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
486*c217d954SCole Faust     }
487*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
488*c217d954SCole Faust 
489*c217d954SCole Faust     // Validate input gate
    // Without CIFG the input gate has its own weights and bias, all of which are required.
490*c217d954SCole Faust     if(!lstm_params.has_cifg_opt())
491*c217d954SCole Faust     {
492*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
493*c217d954SCole Faust                                             lstm_params.recurrent_to_input_weights(),
494*c217d954SCole Faust                                             lstm_params.input_gate_bias());
495*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
496*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
497*c217d954SCole Faust         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
498*c217d954SCole Faust 
499*c217d954SCole Faust         std::vector<const ITensorInfo *> lstm_weights;
500*c217d954SCole Faust         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
501*c217d954SCole Faust         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
502*c217d954SCole Faust         TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
503*c217d954SCole Faust         TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
504*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
505*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
506*c217d954SCole Faust 
507*c217d954SCole Faust         if(lstm_params.has_peephole_opt())
508*c217d954SCole Faust         {
509*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
510*c217d954SCole Faust             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
511*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
512*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
513*c217d954SCole Faust         }
514*c217d954SCole Faust 
515*c217d954SCole Faust         if(lstm_params.use_layer_norm())
516*c217d954SCole Faust         {
517*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
518*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
519*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
520*c217d954SCole Faust         }
521*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
522*c217d954SCole Faust     }
523*c217d954SCole Faust     else
524*c217d954SCole Faust     {
        // With CIFG only a subtraction is validated, using the forget_gate info as a stand-in
        // for both operands and the result. Presumably this mirrors the input gate being
        // derived from the forget gate -- confirm against configure().
525*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
526*c217d954SCole Faust     }
527*c217d954SCole Faust 
528*c217d954SCole Faust     // Validate cell state
529*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
530*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
531*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
532*c217d954SCole Faust     if(lstm_params.use_layer_norm())
533*c217d954SCole Faust     {
534*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
535*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
536*c217d954SCole Faust                                                                         RoundingPolicy::TO_ZERO));
537*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
538*c217d954SCole Faust     }
539*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
    // The input_gate info is used here in both the CIFG and non-CIFG cases; it always has
    // the (num_cells x num_batches) shape given above.
540*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
541*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
542*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
    // Cell clipping is only validated when a non-zero threshold is supplied.
    // The bounds are passed as (cell_threshold, -cell_threshold).
543*c217d954SCole Faust     if(cell_threshold != 0.f)
544*c217d954SCole Faust     {
545*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
546*c217d954SCole Faust                                                                                                               -cell_threshold)));
547*c217d954SCole Faust     }
548*c217d954SCole Faust 
549*c217d954SCole Faust     // Validate output gate tmp
550*c217d954SCole Faust     std::vector<const ITensorInfo *> in_out_weights;
551*c217d954SCole Faust     in_out_weights.emplace_back(input_to_output_weights);
552*c217d954SCole Faust     in_out_weights.emplace_back(recurrent_to_output_weights);
553*c217d954SCole Faust     TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
554*c217d954SCole Faust     TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
555*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
556*c217d954SCole Faust 
557*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
558*c217d954SCole Faust 
559*c217d954SCole Faust     if(lstm_params.has_peephole_opt())
560*c217d954SCole Faust     {
561*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
562*c217d954SCole Faust                                                                         RoundingPolicy::TO_ZERO));
563*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
564*c217d954SCole Faust     }
565*c217d954SCole Faust     if(lstm_params.use_layer_norm())
566*c217d954SCole Faust     {
567*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
568*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
569*c217d954SCole Faust                                                                         RoundingPolicy::TO_ZERO));
570*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
571*c217d954SCole Faust     }
572*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
573*c217d954SCole Faust 
574*c217d954SCole Faust     // Validate output state
575*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
576*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
577*c217d954SCole Faust     if(lstm_params.has_projection())
578*c217d954SCole Faust     {
579*c217d954SCole Faust         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
        // NOTE(review): the bounds here are passed as (-projection_threshold, projection_threshold),
        // the opposite order to the cell clipping check above (cell_threshold, -cell_threshold).
        // One of the two orders is likely swapped relative to ActivationLayerInfo's (a, b)
        // convention -- confirm which argument is the upper bound and align with configure().
580*c217d954SCole Faust         if(projection_threshold != 0.f)
581*c217d954SCole Faust         {
582*c217d954SCole Faust             ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
583*c217d954SCole Faust                                                                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
584*c217d954SCole Faust         }
585*c217d954SCole Faust     }
586*c217d954SCole Faust 
587*c217d954SCole Faust     // Validate copy kernel
588*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(&cell_state_tmp, cell_state_out));
589*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output));
590*c217d954SCole Faust 
591*c217d954SCole Faust     // Validate scratch concatenation
    // Concatenation order along Window::DimX: [input_gate (only without CIFG)], cell_state_tmp,
    // forget_gate, output_gate_tmp -- i.e. 4 infos without CIFG and 3 with it.
592*c217d954SCole Faust     std::vector<const ITensorInfo *> inputs_vector_info_raw;
593*c217d954SCole Faust     if(!lstm_params.has_cifg_opt())
594*c217d954SCole Faust     {
595*c217d954SCole Faust         inputs_vector_info_raw.push_back(&input_gate);
596*c217d954SCole Faust     }
597*c217d954SCole Faust     inputs_vector_info_raw.push_back(&cell_state_tmp);
598*c217d954SCole Faust     inputs_vector_info_raw.push_back(&forget_gate);
599*c217d954SCole Faust     inputs_vector_info_raw.push_back(&output_gate_tmp);
600*c217d954SCole Faust 
601*c217d954SCole Faust     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer, Window::DimX));
602*c217d954SCole Faust     return Status{};
603*c217d954SCole Faust }
604*c217d954SCole Faust 
/** Runs the configured sub-functions of the layer in a fixed order.
 *
 * prepare() is called first, so the one-time steps it guards have been executed before any
 * other sub-function runs. Which optional stages execute is decided by the member flags
 * _run_peephole_opt, _is_layer_norm_lstm, _run_cifg_opt, _perform_cell_clipping,
 * _has_projection_weights and _perform_projection_clipping.
 */
run()605*c217d954SCole Faust void NELSTMLayer::run()
606*c217d954SCole Faust {
607*c217d954SCole Faust     prepare();
608*c217d954SCole Faust 
    // Scoped object tied to _memory_group; presumably acquires the group's memory for the
    // duration of this call and releases it on scope exit -- confirm against MemoryGroupResourceScope.
609*c217d954SCole Faust     MemoryGroupResourceScope scope_mg(_memory_group);
610*c217d954SCole Faust 
    // --- Forget gate ---
611*c217d954SCole Faust     _concat_inputs_forget_gate.run();
612*c217d954SCole Faust     _fully_connected_forget_gate.run();
613*c217d954SCole Faust 
614*c217d954SCole Faust     if(_run_peephole_opt)
615*c217d954SCole Faust     {
616*c217d954SCole Faust         _pixelwise_mul_forget_gate.run();
617*c217d954SCole Faust         _accum_forget_gate1.run();
618*c217d954SCole Faust     }
619*c217d954SCole Faust     if(_is_layer_norm_lstm)
620*c217d954SCole Faust     {
621*c217d954SCole Faust         _mean_std_norm_forget_gate.run();
622*c217d954SCole Faust         _pixelwise_mul_forget_gate_coeff.run();
623*c217d954SCole Faust         _accum_forget_gate_bias.run();
624*c217d954SCole Faust     }
625*c217d954SCole Faust     _activation_forget_gate.run();
626*c217d954SCole Faust 
    // --- Input gate ---
627*c217d954SCole Faust     if(_run_cifg_opt)
628*c217d954SCole Faust     {
        // Fill every element of _ones with 1 on each call, using the element type that
        // matches its data type (half for F16, float otherwise). The element count is the
        // buffer's total byte size divided by the element size.
629*c217d954SCole Faust         if(_ones.info()->data_type() == DataType::F16)
630*c217d954SCole Faust         {
631*c217d954SCole Faust             std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
632*c217d954SCole Faust         }
633*c217d954SCole Faust         else
634*c217d954SCole Faust         {
635*c217d954SCole Faust             std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
636*c217d954SCole Faust         }
        // Presumably computes (ones - forget_gate) as the input gate -- confirm against configure().
637*c217d954SCole Faust         _subtract_input_gate.run();
638*c217d954SCole Faust     }
639*c217d954SCole Faust     else
640*c217d954SCole Faust     {
641*c217d954SCole Faust         _fully_connected_input_gate.run();
642*c217d954SCole Faust 
643*c217d954SCole Faust         if(_run_peephole_opt)
644*c217d954SCole Faust         {
645*c217d954SCole Faust             _pixelwise_mul_input_gate.run();
646*c217d954SCole Faust             _accum_input_gate1.run();
647*c217d954SCole Faust         }
648*c217d954SCole Faust 
649*c217d954SCole Faust         if(_is_layer_norm_lstm)
650*c217d954SCole Faust         {
651*c217d954SCole Faust             _mean_std_norm_input_gate.run();
652*c217d954SCole Faust             _pixelwise_mul_input_gate_coeff.run();
653*c217d954SCole Faust             _accum_input_gate_bias.run();
654*c217d954SCole Faust         }
655*c217d954SCole Faust         _activation_input_gate.run();
656*c217d954SCole Faust     }
657*c217d954SCole Faust 
    // --- Cell state ---
658*c217d954SCole Faust     _fully_connected_cell_state.run();
659*c217d954SCole Faust     _transpose_cell_state.run();
660*c217d954SCole Faust     _gemm_cell_state1.run();
661*c217d954SCole Faust     _accum_cell_state1.run();
662*c217d954SCole Faust     if(_is_layer_norm_lstm)
663*c217d954SCole Faust     {
664*c217d954SCole Faust         _mean_std_norm_cell_gate.run();
665*c217d954SCole Faust         _pixelwise_mul_cell_gate_coeff.run();
666*c217d954SCole Faust         _accum_cell_gate_bias.run();
667*c217d954SCole Faust     }
668*c217d954SCole Faust 
669*c217d954SCole Faust     _activation_cell_state.run();
670*c217d954SCole Faust     _pixelwise_mul_cell_state1.run();
671*c217d954SCole Faust     _pixelwise_mul_cell_state2.run();
672*c217d954SCole Faust     _accum_cell_state2.run();
673*c217d954SCole Faust 
674*c217d954SCole Faust     if(_perform_cell_clipping)
675*c217d954SCole Faust     {
676*c217d954SCole Faust         _cell_clip.run();
677*c217d954SCole Faust     }
678*c217d954SCole Faust 
    // --- Output gate ---
679*c217d954SCole Faust     _fully_connected_output.run();
680*c217d954SCole Faust     if(_run_peephole_opt)
681*c217d954SCole Faust     {
682*c217d954SCole Faust         _pixelwise_mul_output_state1.run();
683*c217d954SCole Faust         _accum_output1.run();
684*c217d954SCole Faust     }
685*c217d954SCole Faust     if(_is_layer_norm_lstm)
686*c217d954SCole Faust     {
687*c217d954SCole Faust         _mean_std_norm_output_gate.run();
688*c217d954SCole Faust         _pixelwise_mul_output_gate_coeff.run();
689*c217d954SCole Faust         _accum_output_gate_bias.run();
690*c217d954SCole Faust     }
691*c217d954SCole Faust     _activation_output.run();
692*c217d954SCole Faust 
    // --- Output state (followed by the projection and its clipping when enabled) ---
693*c217d954SCole Faust     _activation_output_state.run();
694*c217d954SCole Faust     _pixelwise_mul_output_state2.run();
695*c217d954SCole Faust 
696*c217d954SCole Faust     if(_has_projection_weights)
697*c217d954SCole Faust     {
698*c217d954SCole Faust         _fully_connected_output_state.run();
699*c217d954SCole Faust         if(_perform_projection_clipping)
700*c217d954SCole Faust         {
701*c217d954SCole Faust             _projection_clip.run();
702*c217d954SCole Faust         }
703*c217d954SCole Faust     }
704*c217d954SCole Faust 
    // Copy the results out, then run the scratch buffer concatenation last.
705*c217d954SCole Faust     _copy_cell_state.run();
706*c217d954SCole Faust     _copy_output.run();
707*c217d954SCole Faust 
708*c217d954SCole Faust     _concat_scratch_buffer.run();
709*c217d954SCole Faust }
710*c217d954SCole Faust 
prepare()711*c217d954SCole Faust void NELSTMLayer::prepare()
712*c217d954SCole Faust {
713*c217d954SCole Faust     if(!_is_prepared)
714*c217d954SCole Faust     {
715*c217d954SCole Faust         _concat_weights_forget_gate.run();
716*c217d954SCole Faust         if(!_run_cifg_opt)
717*c217d954SCole Faust         {
718*c217d954SCole Faust             _concat_weights_input_gate.run();
719*c217d954SCole Faust         }
720*c217d954SCole Faust         _concat_weights_output.run();
721*c217d954SCole Faust         _is_prepared = true;
722*c217d954SCole Faust     }
723*c217d954SCole Faust }
724*c217d954SCole Faust } // namespace arm_compute
725