24 #ifndef ARM_COMPUTE_CLQLSTMLAYER_H 25 #define ARM_COMPUTE_CLQLSTMLAYER_H 41 class CLCompileContext;
43 class CLQLSTMLayerNormalizationKernel;
49 class ClGemmLowpMatrixAReductionKernel;
70 CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager =
nullptr);
182 const ICLTensor *recurrent_to_forget_weights,
const ICLTensor *recurrent_to_cell_weights,
const ICLTensor *recurrent_to_output_weights,
240 void prepare()
override;
243 enum class LayerNormGate : uint8_t
251 static constexpr uint8_t _layer_norm_count =
static_cast<uint8_t
>(LayerNormGate::Count);
252 static constexpr uint32_t _out_state_output_size_dimension_idx = 0;
271 CLTensor *outstage_res,
float gemmlowp_scale,
277 class TensorCopyKernel
279 static constexpr uint32_t max_dimension_supported = 2;
310 CLTranspose _transpose_recurrent_to_forget_weights{};
311 CLTranspose _transpose_recurrent_to_cell_weights{};
312 CLTranspose _transpose_recurrent_to_output_weights{};
313 CLTranspose _transpose_recurrent_to_input_weights{};
315 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
316 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
317 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
318 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
319 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
320 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
321 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
322 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
323 std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction;
370 std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
373 TensorCopyKernel _projection_bias_copy{};
374 TensorCopyKernel _projection_output_to_accumulate_copy{};
375 TensorCopyKernel _projection_accumulate_to_output_copy{};
376 TensorCopyKernel _hidden_to_output_copy{};
383 const ICLTensor *_recurrent_to_input_weights{
nullptr };
384 const ICLTensor *_projection_bias{
nullptr };
385 const ICLTensor *_input_to_forget_weights{
nullptr };
386 const ICLTensor *_input_to_cell_weights{
nullptr };
387 const ICLTensor *_input_to_output_weights{
nullptr };
388 const ICLTensor *_recurrent_to_forget_weights{
nullptr };
389 const ICLTensor *_recurrent_to_cell_weights{
nullptr };
390 const ICLTensor *_recurrent_to_output_weights{
nullptr };
391 const ICLTensor *_projection_weights{
nullptr };
392 std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{ {} };
393 std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{ {} };
396 inline LayerNormIndexType getGateIndex(LayerNormGate g)
398 return static_cast<LayerNormIndexType
>(g);
401 inline void set_layer_norm_weight(
const ICLTensor *
t, LayerNormGate g)
403 _layer_norm_weights[getGateIndex(g)] =
t;
406 inline void set_layer_norm_bias(
const ICLTensor *t, LayerNormGate g)
408 _layer_norm_bias[getGateIndex(g)] =
t;
411 inline const ICLTensor *get_layer_norm_weight(LayerNormGate g)
413 return _layer_norm_weights[getGateIndex(g)];
416 inline const ICLTensor *get_layer_norm_bias(LayerNormGate g)
418 return _layer_norm_bias[getGateIndex(g)];
423 return *_layer_norms[getGateIndex(g)];
426 inline void configure_layer_norm(LayerNormGate g,
const ICLTensor *in);
430 CLTensor _input_to_forget_weights_transposed{
nullptr };
431 CLTensor _input_to_cell_weights_transposed{
nullptr };
432 CLTensor _input_to_output_weights_transposed{
nullptr };
433 CLTensor _input_to_input_weights_transposed{
nullptr };
434 CLTensor _recurrent_to_forget_weights_transposed{
nullptr };
435 CLTensor _recurrent_to_cell_weights_transposed{
nullptr };
436 CLTensor _recurrent_to_output_weights_transposed{
nullptr };
437 CLTensor _recurrent_to_input_weights_transposed{
nullptr };
438 CLTensor _projection_weights_transposed{
nullptr };
439 CLTensor _input_to_input_eff_bias{
nullptr };
440 CLTensor _recurrent_to_input_eff_bias{
nullptr };
441 CLTensor _input_to_forget_eff_bias{
nullptr };
442 CLTensor _recurrent_to_forget_eff_bias{
nullptr };
443 CLTensor _input_to_cell_eff_bias{
nullptr };
444 CLTensor _recurrent_to_cell_eff_bias{
nullptr };
445 CLTensor _input_to_output_eff_bias{
nullptr };
446 CLTensor _recurrent_to_output_eff_bias{
nullptr };
447 CLTensor _projection_reduction_res{
nullptr };
448 CLTensor _projection_eff_bias{
nullptr };
449 CLTensor _mm_input_to_forget_res{
nullptr };
450 CLTensor _mm_recurrent_to_forget_res{
nullptr };
451 CLTensor _mul_cell_to_forget_res{
nullptr };
452 CLTensor _input_to_forget_outstage_res{
nullptr };
453 CLTensor _cell_to_forget_outstage_res{
nullptr };
454 CLTensor _recurrent_to_forget_outstage_res{
nullptr };
456 CLTensor _mm_input_to_cell_res{
nullptr };
457 CLTensor _input_to_cell_outstage_res{
nullptr };
458 CLTensor _mm_recurrent_to_cell_res{
nullptr };
459 CLTensor _recurrent_to_cell_outstage_res{
nullptr };
461 CLTensor _mul_input_cell_res{
nullptr };
462 CLTensor _mm_input_to_input_res{
nullptr };
463 CLTensor _input_to_input_outstage_res{
nullptr };
464 CLTensor _mm_recurrent_to_input_res{
nullptr };
465 CLTensor _mul_cell_to_input_res{
nullptr };
466 CLTensor _cell_to_input_outstage_res{
nullptr };
467 CLTensor _recurrent_to_input_outstage_res{
nullptr };
469 CLTensor _mm_input_to_output_res{
nullptr };
470 CLTensor _input_to_output_outstage_res{
nullptr };
471 CLTensor _mm_recurrent_to_output_res{
nullptr };
472 CLTensor _mul_cell_to_output_res{
nullptr };
473 CLTensor _cell_to_output_outstage_res{
nullptr };
474 CLTensor _recurrent_to_output_outstage_res{
nullptr };
476 CLTensor _hidden_mul_res{
nullptr };
478 CLTensor _mm_projection_res{
nullptr };
479 CLTensor _projection_outstage_res{
nullptr };
480 CLTensor _projection_out_res{
nullptr };
481 CLTensor _projection_accumulate_res{
nullptr };
483 std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };
485 inline CLTensor &get_layer_norm_output(LayerNormGate g)
487 return _layer_norm_output[getGateIndex(g)];
490 bool _is_prepared{
false };
491 bool _has_cifg{
false };
492 bool _has_cell_clipping{
false };
493 bool _has_projection{
false };
494 bool _has_projection_clipping{
false };
495 bool _has_peephole{
false };
496 bool _has_layer_norm{
false };
497 bool _projection_tensor_copy_required{
false };
Base class for all functions.
Basic function to run opencl::kernels::ClCopyKernel.
Basic function to execute GEMMLowpQuantizeDown kernels on CL.
Basic function to run opencl::kernels::ClSaturatedArithmeticKernel for addition.
auto recurrent_to_forget_weights
Store the tensor's metadata.
Basic function to run opencl::kernels::ClActivationKernel.
decltype(strategy::transforms) typedef type
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
auto input_to_cell_weights
auto recurrent_to_output_weights
auto input_to_output_weights
GEMMLowp output stage info.
auto recurrent_to_cell_weights
Interface for the kernel to do layer normalization.
Basic function to run opencl::ClMul.
Interface for OpenCL tensor.
Basic function to execute an opencl::kernels::ClTransposeKernel.
Basic function to run CLQLSTMLayer.
Store the tensor's metadata.
Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL.
im2col_func configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias)
Basic function to run opencl::kernels::ClSaturatedArithmeticKernel for subtraction.
auto input_to_forget_weights
Describe a multidimensional execution window.
Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
Basic implementation of the OpenCL tensor interface.