Compute Library 21.02
NEQLSTMLayer.cpp
1 /*
2  * Copyright (c) 2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
25 
26 #include "arm_compute/core/KernelDescriptors.h"
27 #include "arm_compute/core/QuantizationInfo.h"
28 #include "arm_compute/core/Utils.h"
29 #include "arm_compute/core/Validate.h"
30 #include "arm_compute/core/utils/misc/InfoHelpers.h"
31 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
32 #include "arm_compute/runtime/NEON/NEScheduler.h"
33 #include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
34 #include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
35 #include "src/core/helpers/WindowHelpers.h"
42 
43 namespace arm_compute
44 {
45 using namespace arm_compute::utils::info_helpers;
46 namespace
47 {
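 // Helper used by NEQLSTMLayer::validate(): checks one gate contribution, i.e. a GEMMLowp matrix
 // multiplication into an S32 accumulator followed by an output stage that requantizes it with the
 // given effective scale.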
48 Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
49  float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
50 {
51  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
52  ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
53  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
54  return Status{};
55 }
56 } // namespace
57 
58 Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
59 {
60  // Output quantization scale will be different, but ignored here
61  // since it will be configured at configure() stage.
62  const TensorInfo out
63  {
64  in
65  };
66  return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
67 }
68 
69 void NEQLSTMLayer::configure_layer_norm(NEQLSTMLayer::LayerNormGate g, const ITensor *in)
70 {
71  ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
72 
73  Tensor &out = get_layer_norm_output(g);
74  _memory_group.manage(&out);
75  out.allocator()->init(*(in->info()));
76 
77  get_layer_norm(g) = std::make_unique<NEQLSTMLayerNormalizationKernel>();
78  get_layer_norm(g)->configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
79 }
80 
81 NEQLSTMLayer::TensorCopyKernel::~TensorCopyKernel() = default;
82 
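 // TensorCopyKernel copies the overlapping part of each row between tensors whose widths differ;
 // it is used to move data between the hidden/projection intermediates and the output state when
 // the projection changes the output width.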
83 Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const ITensorInfo &dst)
84 {
85  ARM_COMPUTE_RETURN_ERROR_ON(src.tensor_shape().num_dimensions() > max_dimension_supported);
86  ARM_COMPUTE_RETURN_ERROR_ON(dst.tensor_shape().num_dimensions() > max_dimension_supported);
88  ARM_COMPUTE_RETURN_ERROR_ON(dst.tensor_shape().y() != src.tensor_shape().y());
89  return Status{};
90 }
91 
92 void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
93 {
95  _src = &src;
96  _dst = &dst;
97  _row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x());
98  _window = calculate_max_window(*_src->info(), Steps());
99 }
100 
101 void NEQLSTMLayer::TensorCopyKernel::run()
102 {
103  Iterator input_iter{ _src, _window };
104  Iterator output_iter{ _dst, _window };
105 
106  execute_window_loop(_window, [&](const Coordinates &)
107  {
108  memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
109  },
110  input_iter, output_iter);
111 }
112 
113 NEQLSTMLayer::~NEQLSTMLayer() = default;
114 
115 NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
116  : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
117  _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
118  _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
119  _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
120  _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
121  _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
122  _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
123  _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
124  _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
125  _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
126  _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
127  _layer_norm_output()
128 {
129  _memory_group = MemoryGroup(std::move(memory_manager));
130 }
131 
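 // Helper: configures one gate contribution as a GEMMLowp matrix multiplication into a managed S32
 // buffer followed by an output stage that requantizes it to the gate's intermediate QSYMM16 tensor,
 // using a multiplier/shift pair derived from the effective scale.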
132 void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
133  const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias,
134  Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale,
135  const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
136 {
137  _memory_group.manage(mm_res);
138  _memory_group.manage(outstage_res);
139 
140  mm_res->allocator()->init(mm_res_info);
141  outstage_res->allocator()->init(outstage_tensor_info);
142 
143  // Configure matrix-multiplication
144  mm.configure(mm_input, mm_weights, nullptr, mm_res);
145 
146  // Configure output stage
147  quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
148  outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
149  mm_res->allocator()->allocate();
150 }
151 
152 void NEQLSTMLayer::configure(const ITensor *input,
153  const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
154  const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
155  const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
156  const ITensor *cell_state_in, ITensor *output_state_in,
157  ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
158  const LSTMParams<ITensor> &lstm_params)
159 {
160  ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
161  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
162  forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
163 
164  // Set lstm parameters
165  LSTMParams<ITensorInfo> lstm_params_info{};
166  build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
167 
168  // Validate
169  ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
170  recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
171  forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
172  cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
173  lstm_params_info));
174 
175  const int batch_size = input->info()->dimension(1);
176  const int num_units = input_to_output_weights->info()->dimension(1);
177  const int output_size = output_state_out->info()->dimension(_out_state_output_size_dimension_idx);
178 
179  const UniformQuantizationInfo qinput = input->info()->quantization_info().uniform();
180  const UniformQuantizationInfo qcell_state_in = cell_state_in->info()->quantization_info().uniform();
181  const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
182 
183  _projection_bias = lstm_params.projection_bias();
184  _input_to_forget_weights = input_to_forget_weights;
185  _input_to_cell_weights = input_to_cell_weights;
186  _input_to_output_weights = input_to_output_weights;
187  _recurrent_to_forget_weights = recurrent_to_forget_weights;
188  _recurrent_to_cell_weights = recurrent_to_cell_weights;
189  _recurrent_to_output_weights = recurrent_to_output_weights;
190  _projection_weights = lstm_params.projection_weights();
191 
192  // Layer normalization
193  _has_layer_norm = lstm_params.use_layer_norm();
194  if(_has_layer_norm)
195  {
196  set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
197  set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
198  set_layer_norm_weight(lstm_params.input_layer_norm_weights(), LayerNormGate::Input);
199  set_layer_norm_weight(lstm_params.output_layer_norm_weights(), LayerNormGate::Output);
200 
201  set_layer_norm_bias(forget_gate_bias, LayerNormGate::Forget);
202  set_layer_norm_bias(cell_bias, LayerNormGate::Cell);
203  set_layer_norm_bias(lstm_params.input_gate_bias(), LayerNormGate::Input);
204  set_layer_norm_bias(output_gate_bias, LayerNormGate::Output);
205  }
206 
207  _has_cifg = lstm_params.has_cifg_opt();
208  _has_projection = lstm_params.has_projection();
209  _has_peephole = lstm_params.has_peephole_opt();
210 
211  // Calculate and decompose effective scales for optimizing matmul calculation
212  const int32_t cell_shift = log2(qcell_state_in.scale);
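 // The cell state scale is expected to be a power of two, so log2() gives an exact shift that is
 // reused when rescaling the peephole and cell contributions.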
213 
214  // Calculate quantized parameters for clipping.
215  int16_t quantized_cell_clip = 0;
216  if(lstm_params.cell_clip() > 0.0f)
217  {
218  quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
219  }
220  _has_cell_clipping = quantized_cell_clip > 0;
221 
222  // Precompute effective bias for optimizing the matmul computations.
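 // Each reduction sums the weight rows scaled by the negated zero-point of the tensor they multiply;
 // folding this into a bias lets the GEMMs run without per-run offset contributions.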
223  if(!_has_cifg)
224  {
225  _input_to_input_weights = lstm_params.input_to_input_weights();
226  _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
227 
228  _input_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
229  _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
230  _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
231  _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
232  }
233 
234  _input_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
235  _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
236  _input_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
237  _recurrent_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
238  _input_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
239  _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
240 
241  _input_to_forget_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
242  _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
243  _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
244  _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
245  _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
246  _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
247  if(_has_projection)
248  {
249  _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
250  _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
251  if(_projection_bias != nullptr)
252  {
253  _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
254  }
255  }
256 
257  // Pre-transpose weights to be used in GEMM.
258  _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
259  _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
260  _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
261  _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
262  _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
263  _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
264  if(!_has_cifg)
265  {
266  _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
267  _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
268  }
269  if(_has_projection)
270  {
271  _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
272  }
273 
274  GEMMLowpOutputStageInfo gemmlowp_info;
275  gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
276  gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
277  gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
278  gemmlowp_info.output_data_type = DataType::QSYMM16;
279 
280  const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
281  // Forget gate.
282  const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
283  const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
284  configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
285  input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
286  &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
287  mm_out_info, forget_gate_outstage_info);
288 
289  const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
290  configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
291  output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
292  &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
293  mm_out_info, forget_gate_outstage_info);
294 
295  _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
296  _input_to_forget_outstage_res.allocator()->allocate();
297 
298  if(_has_peephole)
299  {
300  _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
301  _memory_group.manage(&_mul_cell_to_forget_res);
302  _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
303  _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
304  _memory_group.manage(&_cell_to_forget_outstage_res);
305  const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
306  quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
307  _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
308  _mul_cell_to_forget_res.allocator()->allocate();
309  _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
310  _cell_to_forget_outstage_res.allocator()->allocate();
311  }
312 
313  Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
314 
315  if(_has_layer_norm)
316  {
317  configure_layer_norm(LayerNormGate::Forget, forget_activation_input);
318  forget_activation_input->allocator()->allocate();
319  forget_activation_input = &get_layer_norm_output(LayerNormGate::Forget);
320  }
321 
322  // Output quantization info of Sigmoid and Tanh activations
323  const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
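 // Scale 1/32768 (Q0.15) maps the 16-bit gate outputs onto approximately [-1, 1).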
324  const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
325 
326  _memory_group.manage(&_forget_gate);
327  _forget_gate.allocator()->init(forget_gate_info);
328  _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
329  forget_activation_input->allocator()->allocate();
330 
331  // Modulation gate.
332  const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
333  const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
334  configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
335  input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
336  &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
337  mm_out_info, cell_outstage_info);
338 
339  const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
340  configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
341  output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
342  &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
343  mm_out_info, cell_outstage_info);
344 
345  _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
346  _input_to_cell_outstage_res.allocator()->allocate();
347 
348  Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
349 
350  if(_has_layer_norm)
351  {
352  configure_layer_norm(LayerNormGate::Cell, cell_activation_input);
353  cell_activation_input->allocator()->allocate();
354  cell_activation_input = &get_layer_norm_output(LayerNormGate::Cell);
355  }
356 
357  const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
358 
359  _memory_group.manage(&_cell_gate);
360  _cell_gate.allocator()->init(cell_gate_info);
361  _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
362  cell_activation_input->allocator()->allocate();
363 
364  // Input gate.
365  const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
366  _input_gate.allocator()->init(input_gate_info);
367  _memory_group.manage(&_input_gate);
368  if(_has_cifg)
369  {
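 // With CIFG the input gate is not computed from weights: it is coupled to the forget gate as
 // input_gate = 1 - forget_gate, using a tensor of ones that is filled in prepare().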
370  _ones.allocator()->init(*_forget_gate.info());
371  _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
372  _ones.allocator()->allocate();
373  }
374  else
375  {
376  const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
377  const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
378  configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
379  input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
380  &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
381  mm_out_info, input_outstage_info);
382 
383  const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
384  configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
385  output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
386  &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
387  mm_out_info, input_outstage_info);
388  _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
389  _input_to_input_outstage_res.allocator()->allocate();
390 
391  if(_has_peephole)
392  {
393  _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
394  _memory_group.manage(&_mul_cell_to_input_res);
395  _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
396  const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
397  quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
398  _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
399  _memory_group.manage(&_cell_to_input_outstage_res);
400  _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
401  _mul_cell_to_input_res.allocator()->allocate();
402  _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
403  _cell_to_input_outstage_res.allocator()->allocate();
404  }
405 
406  Tensor *input_activation_input = &_recurrent_to_input_outstage_res;
407 
408  if(_has_layer_norm)
409  {
410  configure_layer_norm(LayerNormGate::Input, input_activation_input);
411  input_activation_input->allocator()->allocate();
412  input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
413  }
414 
415  _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
416  input_activation_input->allocator()->allocate();
417  }
418  // Cell.
419  // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
420  _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
421  const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
422  const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
423  const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
424  _memory_group.manage(&_mul_input_cell_res);
425  _mul_input_cell_res.allocator()->init(mul_input_cell_info);
426  _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
427  _cell_gate.allocator()->allocate();
428  _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
429  _mul_input_cell_res.allocator()->allocate();
430  _forget_gate.allocator()->allocate();
431  if(_has_cell_clipping)
432  {
433  _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
434  }
435  // Output gate.
436  const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
437  const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
438  configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
439  input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
440  &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
441  mm_out_info, output_outstage_info);
442 
443  const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
444  configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
445  output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
446  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
447  mm_out_info, output_outstage_info);
448 
449  _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
450  _input_to_output_outstage_res.allocator()->allocate();
451 
452  if(_has_peephole)
453  {
454  // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
455  // Here we are not using the output stage because all operations are done in float
456  _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
457  _memory_group.manage(&_mul_cell_to_output_res);
458  _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
459 
460  const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
461  quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
462  _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
463  _memory_group.manage(&_cell_to_output_outstage_res);
464  _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
465  _mul_cell_to_output_res.allocator()->allocate();
466 
467  _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
468  _cell_to_output_outstage_res.allocator()->allocate();
469  }
470 
471  Tensor *output_activation_input = &_recurrent_to_output_outstage_res;
472 
473  if(_has_layer_norm)
474  {
475  configure_layer_norm(LayerNormGate::Output, output_activation_input);
476  output_activation_input->allocator()->allocate();
477  output_activation_input = &get_layer_norm_output(LayerNormGate::Output);
478  }
479  const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
480 
481  _memory_group.manage(&_output_gate);
482  _output_gate.allocator()->init(output_gate_info);
483  _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
484  output_activation_input->allocator()->allocate();
485 
486  // Hidden.
487  _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
488  // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
489  _memory_group.manage(&_hidden_mul_res);
490  const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
491  _hidden_mul_res.allocator()->init(hidden_mul_res);
492  _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
493  _output_gate.allocator()->allocate();
494  _input_gate.allocator()->allocate();
495  const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
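 // The product of two Q0.15 values (tanh(cell) and sigmoid(output gate)) has an effective scale of
 // 2^-30; dividing by the requested hidden state scale yields the requantization factor used below.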
496  quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
497  gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
498  gemmlowp_info.output_data_type = output_state_in->info()->data_type();
499 
500  _projection_tensor_copy_required = (num_units != output_size);
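 // When the projection changes the width (num_units != output_size), intermediate results are staged
 // in temporary tensors and moved with TensorCopyKernel instead of being written to the state directly.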
501  ITensor *hidden_gate_result = output_state_out;
502 
503  _memory_group.manage(&_hidden_gate);
504 
505  if(_projection_tensor_copy_required)
506  {
507  _hidden_gate.allocator()->init(*output_state_out->info());
508  _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
509  hidden_gate_result = &_hidden_gate;
510  }
511 
512  _hidden_outstage.configure(&_hidden_mul_res, nullptr, hidden_gate_result, gemmlowp_info);
513  _hidden_mul_res.allocator()->allocate();
514 
515  // Projection.
516  if(_has_projection)
517  {
518  const TensorInfo projection_outstage_info(*output_state_out->info());
519  const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
520  const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
521  gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
522  gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
523  gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
524  gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
525 
526  TensorInfo projection_mm_out_info{ mm_out_info };
527  projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
528 
529  configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
530  hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
531  &_mm_projection_res, &_projection_outstage_res, projection_scale,
532  projection_mm_out_info, projection_outstage_info);
533 
534  ITensor *accumulate_destination = output_state_out;
535 
536  if(_projection_tensor_copy_required)
537  {
538  _hidden_gate.allocator()->allocate();
539  _projection_accumulate_res.allocator()->init(*output_state_in->info());
540  _projection_accumulate_res.info()->set_tensor_shape(_projection_outstage_res.info()->tensor_shape());
541  _projection_output_to_accumulate_copy.configure(*output_state_in, _projection_accumulate_res);
542  accumulate_destination = &_projection_accumulate_res;
543  }
544 
545  _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
546  _projection_outstage_res.allocator()->allocate();
547 
548  if(_projection_tensor_copy_required)
549  {
550  _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
551  _projection_accumulate_res.allocator()->allocate();
552  }
553 
554  int8_t quantized_projection_clip{ 0 };
555  if(lstm_params.projection_clip() > 0.0f)
556  {
557  quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
558  }
559 
560  if(quantized_projection_clip > 0)
561  {
562  _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
563  _has_projection_clipping = true;
564  }
565  }
566  else
567  {
568  if(_projection_tensor_copy_required)
569  {
570  _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
571  _hidden_gate.allocator()->allocate();
572  }
573  }
574 
575  // Copy output_state_out to output
576  _copy_output.configure(output_state_out, output);
577 }
578 
579 Status NEQLSTMLayer::validate(const ITensorInfo *input,
580  const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
581  const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
582  const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
583  const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
584  const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
585  const LSTMParams<ITensorInfo> &lstm_params)
586 {
587  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
588  recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
589  cell_state_out, output_state_out, output);
590 
592  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
593 
594  const unsigned int input_size = input->dimension(0);
595  const unsigned int batch_size = input->dimension(1);
596  const unsigned int num_units = input_to_output_weights->dimension(1);
597  const unsigned int output_size = output_state_out->dimension(_out_state_output_size_dimension_idx);
598 
599  ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
600  ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
601  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
602  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
603  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
604  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
606  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
607  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
608 
609  ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
610  ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
611  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
613  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, cell_bias, output_gate_bias);
614 
615  ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() != 2);
616  ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(0) != num_units);
617  ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->dimension(1) != batch_size);
619 
620  ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() != 2);
621  ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(0) != output_size);
622  ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->dimension(1) != batch_size);
624 
625  // Check whether peephole weights are all there or none
626  if(lstm_params.has_peephole_opt())
627  {
630  ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
631  ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
634 
635  if(!lstm_params.has_cifg_opt())
636  {
640  }
641  }
642 
643  const UniformQuantizationInfo qinput = input->quantization_info().uniform();
644  const UniformQuantizationInfo qcell_state_in = cell_state_in->quantization_info().uniform();
645  const UniformQuantizationInfo qoutput_state_in = output_state_in->quantization_info().uniform();
646 
647  // Calculate and decompose effective scales for optimizing matmul calculation
648  const int32_t cell_shift = log2(qcell_state_in.scale);
649  ARM_COMPUTE_RETURN_ERROR_ON(cell_shift > -9);
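 // i.e. the cell state scale may not be larger than 2^-9.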
650 
651  // Calculate quantized parameters for clipping.
652  int16_t quantized_cell_clip = 0;
653  if(lstm_params.cell_clip() > 0.0f)
654  {
655  quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
656  }
657 
658  // Precompute effective bias for optimizing the matmul computations.
659  const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
660  const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
661  if(!lstm_params.has_cifg_opt())
662  {
663  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
664  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
665  true)));
666  }
667  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
668  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
669  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
670  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
671  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
672  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
673  if(lstm_params.has_projection())
674  {
675  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
676  lstm_params.hidden_state_zero(),
677  true)));
678  if(lstm_params.projection_bias() != nullptr)
679  {
681  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
682  }
683  }
684 
685  const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
686  const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
687 
688  // Validate weights transpose
689  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed));
690  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed));
691  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed));
692  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
693  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
694  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
695  if(!lstm_params.has_cifg_opt())
696  {
697  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
698  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
699  }
700  if(lstm_params.has_projection())
701  {
702  const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
703  ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
704  }
705 
706  GEMMLowpOutputStageInfo gemmlowp_info;
707  gemmlowp_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
708  gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int16_t>::lowest();
709  gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int16_t>::max();
710  gemmlowp_info.output_data_type = DataType::QSYMM16;
711 
712  const bool has_layer_norm = lstm_params.use_layer_norm();
713 
714  // Forget gate.
716  const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
717  const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
718  const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
719  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
720 
721  const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
722  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
723 
724  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
725 
726  if(lstm_params.has_peephole_opt())
727  {
731  const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
733  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
734  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
735  }
736 
737  if(has_layer_norm)
738  {
739  const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
740  const ITensorInfo *b_info = forget_gate_bias;
741  ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(forget_outstage_info, *w_info, *b_info));
742  }
743 
744  // Output quantization info of Sigmoid and Tanh activations
745  const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
746  const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
747 
749 
750  // Modulation gate.
752  const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
753  const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
754  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
755 
756  const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
757  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
758 
759  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
760 
761  if(has_layer_norm)
762  {
763  const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
764  const ITensorInfo *b_info = cell_bias;
765  ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
766  }
767  const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
768 
770 
771  // Input gate.
772  const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
773  if(lstm_params.has_cifg_opt())
774  {
775  ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
776  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
777  }
778  else
779  {
782  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
783  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
785  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
786 
788  const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
789  const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
790  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
791 
792  const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
793  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
794 
795  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
796 
797  if(lstm_params.has_peephole_opt())
798  {
801  const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
803  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
804  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
805  }
806 
807  if(has_layer_norm)
808  {
809  const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
810  const ITensorInfo *b_info = lstm_params.input_gate_bias();
811  ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
812  }
813 
815  }
816  // Cell.
817  ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
819  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
820  if(quantized_cell_clip > 0)
821  {
822  ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
823  quantized_cell_clip)));
824  }
825  // Output gate.
827  const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
828  const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
829  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
830 
831  const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
832  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
833 
834  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
835  if(lstm_params.has_peephole_opt())
836  {
838  // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
839  // Here we are not using the output stage because all operations are done in float
840  // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
841  // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
844  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
845  }
846 
847  if(has_layer_norm)
848  {
849  const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
850  const ITensorInfo *b_info = output_gate_bias;
851  ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(output_outstage_info, *w_info, *b_info));
852  }
853 
854  const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
856 
857  // Hidden.
859  const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
860  const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
861  ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
862 
864  const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
865  ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
866  gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
867  gemmlowp_info.output_data_type = hidden_out_info.data_type();
868  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
869 
870  const bool projection_tensor_copy_required = num_units != output_size;
871 
872  // Projection.
873  if(lstm_params.has_projection())
874  {
875  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
876  ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
877 
878  const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
879  const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
881  gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
882  gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
883  gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
884  gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
885 
886  const TensorInfo projection_outstage_info(*output_state_out);
887  const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
888 
889  TensorInfo projection_mm_out_info{ mm_out_info };
890  projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
891 
892  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
893  &projection_outstage_info));
894 
895  if(projection_tensor_copy_required)
896  {
897  ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
898  }
899 
900  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
901 
902  if(projection_tensor_copy_required)
903  {
904  ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
905  }
906 
907  int8_t quantized_projection_clip{ 0 };
908  if(lstm_params.projection_clip() > 0.0f)
909  {
910  quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
911  }
912 
913  if(quantized_projection_clip > 0)
914  {
915  ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
916  quantized_projection_clip)));
917  }
918  }
919  else
920  {
921  if(projection_tensor_copy_required)
922  {
923  ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
924  }
925  }
926 
927  if(cell_state_out->total_size() > 0)
928  {
929  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
930  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
931  }
932 
933  if(output_state_out->total_size() > 0)
934  {
936  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
937  }
938 
939  ARM_COMPUTE_RETURN_ON_ERROR(NECopy::validate(output_state_out, output));
940  return Status{};
941 }
942 
943 void NEQLSTMLayer::run()
944 {
945  prepare();
946 
947  // Acquire all the temporaries
948  MemoryGroupResourceScope scope_mg(_memory_group);
949 
950  // Forget gate.
951  _mm_input_to_forget.run();
952  _input_to_forget_outstage.run();
953 
954  _mm_recurrent_to_forget.run();
955  _recurrent_to_forget_outstage.run();
956  _accumulate_input_recurrent_forget.run();
957 
958  if(_has_peephole)
959  {
960  _pixelwise_mul_cell_to_forget.run();
961  _cell_to_forget_outstage.run();
962  _accumulate_cell_forget.run();
963  }
964 
965  if(_has_layer_norm)
966  {
967  NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
968  }
969 
970  _forget_gate_sigmoid.run();
971 
972  // Modulation gate.
973  _mm_input_to_cell.run();
974  _input_to_cell_outstage.run();
975 
976  _mm_recurrent_to_cell.run();
977  _recurrent_to_cell_outstage.run();
978  _accumulate_input_recurrent_modulation.run();
979 
980  if(_has_layer_norm)
981  {
982  NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
983  }
984 
985  _cell_gate_tanh.run();
986 
987  // Input gate
988  if(_has_cifg)
989  {
990  _input_gate_sub.run();
991  }
992  else
993  {
994  _mm_input_to_input.run();
995  _input_to_input_outstage.run();
996  _mm_recurrent_to_input.run();
997  _recurrent_to_input_outstage.run();
998  _accumulate_input_recurrent_input.run();
999 
1000  if(_has_peephole)
1001  {
1002  _pixelwise_mul_cell_to_input.run();
1003  _cell_to_input_outstage.run();
1004  _accumulate_cell_input.run();
1005  }
1006 
1007  if(_has_layer_norm)
1008  {
1009  NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
1010  }
1011 
1012  _input_gate_sigmoid.run();
1013  }
1014 
1015  // Cell.
1016  _pixelwise_mul_forget_cell.run();
1017  _pixelwise_mul_input_cell.run();
1018  _add_forget_cell.run();
1019 
1020  if(_has_cell_clipping)
1021  {
1022  _cell_clip.run();
1023  }
1024 
1025  // Output gate.
1026  _mm_input_to_output.run();
1027  _input_to_output_outstage.run();
1028  _mm_recurrent_to_output.run();
1029  _recurrent_to_output_outstage.run();
1030  _accumulate_input_recurrent_output.run();
1031  if(_has_peephole)
1032  {
1033  _pixelwise_mul_cell_to_output.run();
1034  _cell_to_output_outstage.run();
1035  _accumulate_cell_to_output.run();
1036  }
1037 
1038  if(_has_layer_norm)
1039  {
1040  NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
1041  }
1042 
1043  _output_gate_sigmoid.run();
1044 
1045  // Hidden.
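      // hidden = tanh(cell_state) * output_gate, requantized to the hidden-state scale/offset
      // by the output stage that follows.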
1046  _hidden_tanh.run();
1047  _pixelwise_mul_hidden.run();
1048  _hidden_outstage.run();
1049 
1050  // Projection.
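      // The projection result is requantized by its output stage and accumulated into
      // output_state_out, with staging copies when _projection_tensor_copy_required is set
      // and an optional final clip.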
1051  if(_has_projection)
1052  {
1053  _mm_projection.run();
1054  _projection_outstage.run();
1055 
1056  if(_projection_tensor_copy_required)
1057  {
1058  _projection_output_to_accumulate_copy.run();
1059  }
1060 
1061  _accumulate_projection.run();
1062 
1063  if(_projection_tensor_copy_required)
1064  {
1065  _projection_accumulate_to_output_copy.run();
1066  }
1067 
1068  if(_has_projection_clipping)
1069  {
1070  _projection_clip.run();
1071  }
1072  }
1073  else
1074  {
1075  if(_projection_tensor_copy_required)
1076  {
1077  _hidden_to_output_copy.run();
1078  }
1079  }
1080 
1081  // Copy output_state_out to output
1082  _copy_output.run();
1083 }
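The *_outstage steps in run() execute NEGEMMLowpOutputStage, whose fixed-point multiplier and shift are derived from an effective floating-point rescale. A minimal, self-contained sketch of that decomposition is shown below; the effective_scale value is an assumption for illustration and is not taken from this file.

 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

 int main()
 {
     using namespace arm_compute;

     GEMMLowpOutputStageInfo info{};
     info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;

     // Assumed effective rescale, e.g. (input_scale * weight_scale) / intermediate_scale.
     const float effective_scale = 0.0123f;

     // Decompose the float scale into a 32-bit fixed-point multiplier and a shift,
     // as the quantized output stages require.
     const Status s = quantization::calculate_quantized_multiplier(effective_scale, &info.gemmlowp_multiplier, &info.gemmlowp_shift);
     ARM_COMPUTE_ERROR_THROW_ON(s);
     return 0;
 }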
1084 
1085 void NEQLSTMLayer::prepare()
1086 {
1087  if(!_is_prepared)
1088  {
1089  // Pre-transpose weights to be used in GEMM.
1090  _input_to_forget_weights_transposed.allocator()->allocate();
1091  _input_to_cell_weights_transposed.allocator()->allocate();
1092  _input_to_output_weights_transposed.allocator()->allocate();
1093  _recurrent_to_forget_weights_transposed.allocator()->allocate();
1094  _recurrent_to_cell_weights_transposed.allocator()->allocate();
1095  _recurrent_to_output_weights_transposed.allocator()->allocate();
1096  _transpose_input_to_forget_weights.run();
1097  _transpose_input_to_cell_weights.run();
1098  _transpose_input_to_output_weights.run();
1099  _transpose_recurrent_to_forget_weights.run();
1100  _transpose_recurrent_to_cell_weights.run();
1101  _transpose_recurrent_to_output_weights.run();
1102 
1103  // Precompute effective biases
1104  if(_has_cifg)
1105  {
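      // 32767 represents (just under) 1.0 in the gates' QSYMM16 representation, so the
      // CIFG path can later form the input gate as 1 - forget_gate.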
1106  std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
1107  }
1108  else
1109  {
1110  _input_to_input_eff_bias.allocator()->allocate();
1111  _recurrent_to_input_eff_bias.allocator()->allocate();
1112  NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
1113  NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
1114 
1115  _input_to_input_weights_transposed.allocator()->allocate();
1116  _recurrent_to_input_weights_transposed.allocator()->allocate();
1117  _transpose_input_to_input_weights.run();
1118  _transpose_recurrent_to_input_weights.run();
1119  _input_to_input_weights->mark_as_unused();
1120  _recurrent_to_input_weights->mark_as_unused();
1121  }
1122  _input_to_forget_eff_bias.allocator()->allocate();
1123  _recurrent_to_forget_eff_bias.allocator()->allocate();
1124  _input_to_cell_eff_bias.allocator()->allocate();
1125  _recurrent_to_cell_eff_bias.allocator()->allocate();
1126  _input_to_output_eff_bias.allocator()->allocate();
1127  _recurrent_to_output_eff_bias.allocator()->allocate();
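      // The reductions below compute per-row sums of the weight matrices; combined with the
      // zero points set up in configure(), they fold the offset contributions into the
      // effective biases so the per-timestep GEMMs do not have to recompute them.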
1128  NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
1129  NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
1130  NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
1131  NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
1132  NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
1133  NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
1134 
1135  if(_has_projection)
1136  {
1137  _projection_eff_bias.allocator()->allocate();
1138  NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
1139  if(_projection_bias != nullptr)
1140  {
1141  _projection_bias_add.run();
1142  _projection_bias->mark_as_unused();
1143  }
1144 
1145  _projection_weights_transposed.allocator()->allocate();
1146  _transpose_projection_weights.run();
1147  _projection_weights->mark_as_unused();
1148 
1149  if(!_projection_tensor_copy_required)
1150  {
1151  _hidden_gate.mark_as_unused();
1152  _projection_accumulate_res.mark_as_unused();
1153  }
1154  }
1155 
1156  // Mark weights as unused
1157  _input_to_forget_weights->mark_as_unused();
1158  _input_to_cell_weights->mark_as_unused();
1159  _input_to_output_weights->mark_as_unused();
1160  _recurrent_to_forget_weights->mark_as_unused();
1161  _recurrent_to_cell_weights->mark_as_unused();
1162  _recurrent_to_output_weights->mark_as_unused();
1163 
1164  _is_prepared = true;
1165  }
1166 }
1167 } // namespace arm_compute