41 : _memory_group(
std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape(), _num_of_stages(), _reduction_axis(), _is_serial(),
42 _is_reshape_required(false)
56 const bool is_reshape_required = !keep_dims;
58 if(is_reshape_required && output->
total_size() != 0)
64 auto *output_internal = output;
68 const auto input_data_type = input->
data_type();
71 const auto output_data_type = output->
data_type();
78 if(is_reshape_required)
81 shape_before_reshape.
set(axis, 1);
82 initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
83 output_internal = &output_before_reshape;
93 std::vector<TensorInfo> sums_vector(num_of_stages - 1);
98 shape.
set(0, ceil(shape.x() / 128.f));
100 for(
unsigned int i = 0; i < num_of_stages - 1; i++)
102 initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channles, input_qinfo);
144 for(
unsigned int i = 1; i < num_of_stages - 1; ++i)
150 const unsigned int last_stage = num_of_stages - 1;
154 if(is_reshape_required)
164 if(!_is_reshape_required && _is_serial)
169 auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages;
171 if(!_is_reshape_required)
173 --intermediate_result_vector_size;
176 _results_vector.resize(intermediate_result_vector_size);
179 shape.set(_reduction_axis, _is_serial ? 1 : ceil(
shape.x() / 128.f));
181 for(
auto &v : _results_vector)
183 if(&v == &_results_vector.back() && _is_reshape_required)
185 shape.set(_reduction_axis, 1);
187 v.allocator()->init(input->
info()->
clone()->set_tensor_shape(
shape));
190 return _is_reshape_required ? &_results_vector.back() : output;
202 _reduction_axis = axis;
204 _is_reshape_required = !keep_dims;
206 auto *output_internal = configure_intermediate_result_vector(input, output);
208 if(_is_reshape_required)
212 auto_init_if_empty(*output->
info(), input->
info()->
clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(
true));
216 _reduction_kernels_vector.reserve(_num_of_stages);
221 if(_is_reshape_required)
223 _memory_group.
manage(&_results_vector.back());
226 _reduction_kernels_vector.emplace_back(std::make_unique<CLReductionOperationKernel>());
227 _reduction_kernels_vector[0]->configure(compile_context, input, output_internal, axis, op, 0);
231 _border_handlers_vector.reserve(_num_of_stages);
232 _memory_group.
manage(&_results_vector[0]);
275 _reduction_kernels_vector.emplace_back(std::make_unique<CLReductionOperationKernel>());
276 _reduction_kernels_vector[0]->configure(compile_context, input, &_results_vector[0], axis, first_kernel_op);
278 _border_handlers_vector.emplace_back(std::make_unique<CLFillBorderKernel>());
279 _border_handlers_vector[0]->configure(compile_context, input, _reduction_kernels_vector[0]->border_size(),
BorderMode::CONSTANT, pixelValue);
282 for(
unsigned int i = 1; i < _num_of_stages - 1; ++i)
284 _memory_group.
manage(&_results_vector[i]);
286 _reduction_kernels_vector.emplace_back(std::make_unique<CLReductionOperationKernel>());
287 _reduction_kernels_vector[i]->configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
289 _border_handlers_vector.emplace_back(std::make_unique<CLFillBorderKernel>());
290 _border_handlers_vector[i]->configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i]->border_size(),
BorderMode::CONSTANT, pixelValue);
292 _results_vector[i - 1].allocator()->allocate();
296 const unsigned int last_stage = _num_of_stages - 1;
299 if(_is_reshape_required)
301 _memory_group.
manage(&_results_vector.back());
304 _reduction_kernels_vector.emplace_back(std::make_unique<CLReductionOperationKernel>());
305 _reduction_kernels_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
307 _border_handlers_vector.emplace_back(std::make_unique<CLFillBorderKernel>());
308 _border_handlers_vector[last_stage]->configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage]->border_size(),
BorderMode::CONSTANT, pixelValue);
310 _results_vector[last_stage - 1].allocator()->allocate();
313 if(_is_reshape_required)
315 _reshape.
configure(compile_context, &_results_vector.back(), output);
316 _results_vector.back().allocator()->allocate();
330 for(
unsigned int i = 0; i < _num_of_stages; ++i)
337 if(_is_reshape_required)
unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis)
Calculate number of stages for parallel implementations.
virtual ITensorInfo & set_num_channels(int num_channels)=0
Set the number of channels to the specified value.
bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis)
Check if the given reduction operation should be handled in a serial way.
Class describing the value of a pixel for any image format.
static Status validate(const ITensorInfo *input, const ITensorInfo *output)
Static function to check if given info will lead to a valid configuration of CLReshapeLayer.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
ReductionOperation
Available reduction operations.
static CLScheduler & get()
Access the scheduler singleton.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
virtual ITensorInfo & set_tensor_shape(const TensorShape &shape)=0
Set the shape of an already initialized tensor.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
ITensorInfo & set_data_type(DataType data_type) override
Set the data type to the specified value.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
Copyright (c) 2017-2021 Arm Limited.
void run() override
Run the kernels contained in the function.
void manage(IMemoryManageable *obj) override
Sets a object to be managed by the given memory group.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
Quantization information.
TensorShape input_shape
Validate test suite is to test ARM_COMPUTE_RETURN_ON_* macros we use to check the validity of given arguments.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
void run() override
Run the kernels contained in the function.
TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims=true)
Calculate the reduced shape of a tensor given an axis.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width=0)
Static function to check if given info will lead to a valid configuration of CLReductionOperationKernel.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
virtual ITensorInfo & set_quantization_info(const QuantizationInfo &quantization_info)=0
Set the quantization settings (scale and offset) of the tensor.
CLReductionOperation(std::shared_ptr< IMemoryManager > memory_manager=nullptr)
Default Constructor.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims=true)
Static function to check if given info will lead to a valid configuration of CLReductionOperation.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
Memory group resources scope handling class.
Interface for OpenCL tensor.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
const QuantizationInfo qinfo
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims=true)
Set the input and output tensors.
void configure(const ICLTensor *input, ICLTensor *output)
Initialise the kernel's inputs and outputs.
~CLReductionOperation()
Default Destructor.
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
DataType
Available data types.
std::tuple< PixelValue, PixelValue > get_min_max(DataType dt)
Compute the minimum and maximum values a data type can take.
virtual size_t num_channels() const =0
The number of channels for each tensor element.
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.