Compute Library
 20.05
CLReductionOperation.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/Utils.h"
#include "support/MemorySupport.h"
38 
39 namespace arm_compute
40 {
// Default constructor.
// Only wires up the memory group; all remaining members (kernels, intermediate
// tensors, axis/op/stage bookkeeping) are populated later by configure().
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
      _is_reshape_required(false)
{
}
46 
47 Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
48 {
50  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
51  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
52 
53  const unsigned int num_of_stages = calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
54  const bool is_serial = needs_serialized_reduction(op, input->data_type(), axis);
55  const bool is_reshape_required = !keep_dims;
56 
57  if(is_reshape_required && output->total_size() != 0)
58  {
59  const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
60  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
61  }
62 
63  auto *output_internal = output;
64 
65  TensorInfo output_before_reshape;
66  const auto input_shape = input->tensor_shape();
67  const auto input_data_type = input->data_type();
68  const auto input_num_channles = input->num_channels();
69  const auto input_qinfo = input->quantization_info();
70  const auto output_data_type = output->data_type();
71 
72  auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
73  {
75  };
76 
77  if(is_reshape_required)
78  {
79  auto shape_before_reshape = input_shape;
80  shape_before_reshape.set(axis, 1);
81  initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
82  output_internal = &output_before_reshape;
83  }
84 
85  if(is_serial)
86  {
88  }
89  else
90  {
91  // Create temporary tensor infos
92  std::vector<TensorInfo> sums_vector(num_of_stages - 1);
93 
94  // Create intermediate tensor info
96 
97  shape.set(0, ceil(shape.x() / 128.f));
98 
99  for(unsigned int i = 0; i < num_of_stages - 1; i++)
100  {
101  initialize_tensorinfo(sums_vector[i], shape, input_data_type, input_num_channles, input_qinfo);
102  }
103 
104  ReductionOperation first_kernel_op;
105  ReductionOperation intermediate_kernel_op;
106  ReductionOperation last_kernel_op;
107  switch(op)
108  {
111  first_kernel_op = ReductionOperation::SUM;
112  intermediate_kernel_op = ReductionOperation::SUM;
113  last_kernel_op = op;
114  break;
116  first_kernel_op = ReductionOperation::SUM_SQUARE;
117  intermediate_kernel_op = ReductionOperation::SUM;
118  last_kernel_op = ReductionOperation::SUM;
119  break;
121  first_kernel_op = ReductionOperation::PROD;
122  intermediate_kernel_op = ReductionOperation::PROD;
123  last_kernel_op = ReductionOperation::PROD;
124  break;
126  first_kernel_op = ReductionOperation::MIN;
127  intermediate_kernel_op = ReductionOperation::MIN;
128  last_kernel_op = ReductionOperation::MIN;
129  break;
131  first_kernel_op = ReductionOperation::MAX;
132  intermediate_kernel_op = ReductionOperation::MAX;
133  last_kernel_op = ReductionOperation::MAX;
134  break;
135  default:
136  ARM_COMPUTE_ERROR("Not supported");
137  }
138 
139  // Validate ReductionOperation only on first kernel
140  ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));
141 
142  // Validate ReductionOperation on intermediate stages
143  for(unsigned int i = 1; i < num_of_stages - 1; ++i)
144  {
145  ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
146  }
147 
148  // Validate ReductionOperation on the last stage
149  const unsigned int last_stage = num_of_stages - 1;
150  ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output_internal, axis, last_kernel_op, input->dimension(0)));
151  }
152 
153  if(is_reshape_required)
154  {
156  }
157 
158  return Status{};
159 }
160 
161 ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
162 {
163  if(!_is_reshape_required && _is_serial)
164  {
165  return output;
166  }
167 
168  auto intermediate_result_vector_size = _is_serial ? 1 : _num_of_stages;
169 
170  if(!_is_reshape_required)
171  {
172  --intermediate_result_vector_size;
173  }
174 
175  _results_vector.resize(intermediate_result_vector_size);
176  auto shape = input->info()->tensor_shape();
177 
178  shape.set(_reduction_axis, _is_serial ? 1 : ceil(shape.x() / 128.f));
179 
180  for(auto &v : _results_vector)
181  {
182  if(&v == &_results_vector.back() && _is_reshape_required)
183  {
184  shape.set(_reduction_axis, 1);
185  }
186  v.allocator()->init(input->info()->clone()->set_tensor_shape(shape));
187  }
188 
189  return _is_reshape_required ? &_results_vector.back() : output;
190 }
191 
// Convenience overload: configures the function using the default CL compile
// context from the kernel library singleton.
void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
    configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
}
196 
197 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
198 {
200  _op = op;
201  _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
202  _reduction_axis = axis;
203  _is_serial = needs_serialized_reduction(op, input->info()->data_type(), axis);
204  _is_reshape_required = !keep_dims;
205 
206  auto *output_internal = configure_intermediate_result_vector(input, output);
207 
208  if(_is_reshape_required)
209  {
211  const auto output_data_type = input->info()->data_type();
212  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
213  }
214 
215  // Configure reduction operation kernels
216  _reduction_kernels_vector.resize(_num_of_stages);
217 
218  // Create temporary tensors
219  if(_is_serial)
220  {
221  if(_is_reshape_required)
222  {
223  _memory_group.manage(&_results_vector.back());
224  }
225 
226  _reduction_kernels_vector[0].configure(compile_context, input, output_internal, axis, op, 0);
227  }
228  else
229  {
230  _border_handlers_vector.resize(_num_of_stages);
231  _memory_group.manage(&_results_vector[0]);
232 
233  ReductionOperation first_kernel_op;
234  ReductionOperation intermediate_kernel_op;
235  ReductionOperation last_kernel_op;
236  PixelValue pixelValue;
237  switch(op)
238  {
241  first_kernel_op = ReductionOperation::SUM;
242  intermediate_kernel_op = ReductionOperation::SUM;
243  last_kernel_op = op;
244  pixelValue = PixelValue();
245  break;
247  first_kernel_op = ReductionOperation::SUM_SQUARE;
248  intermediate_kernel_op = ReductionOperation::SUM;
249  last_kernel_op = ReductionOperation::SUM;
250  pixelValue = PixelValue();
251  break;
253  first_kernel_op = ReductionOperation::PROD;
254  intermediate_kernel_op = ReductionOperation::PROD;
255  last_kernel_op = ReductionOperation::PROD;
256  pixelValue = PixelValue(1, input->info()->data_type());
257  break;
259  first_kernel_op = ReductionOperation::MIN;
260  intermediate_kernel_op = ReductionOperation::MIN;
261  last_kernel_op = ReductionOperation::MIN;
262  switch(input->info()->data_type())
263  {
264  case DataType::F32:
265  {
266  pixelValue = PixelValue(std::numeric_limits<float>::max());
267  break;
268  }
269  case DataType::F16:
270  {
271  pixelValue = PixelValue(static_cast<half>(65504.0f));
272  break;
273  }
274  case DataType::QASYMM8:
275  {
276  pixelValue = std::get<1>(get_min_max(input->info()->data_type()));
277  break;
278  }
280  {
281  pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info());
282  break;
283  }
284  default:
285  {
286  ARM_COMPUTE_ERROR("Unsupported DataType");
287  }
288  }
289  break;
291  first_kernel_op = ReductionOperation::MAX;
292  intermediate_kernel_op = ReductionOperation::MAX;
293  last_kernel_op = ReductionOperation::MAX;
294  switch(input->info()->data_type())
295  {
296  case DataType::F32:
297  {
298  pixelValue = PixelValue(-std::numeric_limits<float>::max());
299  break;
300  }
301  case DataType::F16:
302  {
303  pixelValue = PixelValue(static_cast<half>(-65504.0f));
304  break;
305  }
306  case DataType::QASYMM8:
307  {
308  pixelValue = std::get<0>(get_min_max(input->info()->data_type()));
309  break;
310  }
312  {
313  pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info());
314  break;
315  }
316  default:
317  {
318  ARM_COMPUTE_ERROR("Unsupported DataType");
319  }
320  }
321  break;
322  default:
323  ARM_COMPUTE_ERROR("Not supported");
324  }
325 
326  _reduction_kernels_vector[0].configure(compile_context, input, &_results_vector[0], axis, first_kernel_op);
327  _border_handlers_vector[0].configure(compile_context, input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
328 
329  // Apply ReductionOperation on intermediate stages
330  for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
331  {
332  _memory_group.manage(&_results_vector[i]);
333  _reduction_kernels_vector[i].configure(compile_context, &_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
334  _border_handlers_vector[i].configure(compile_context, &_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
335  _results_vector[i - 1].allocator()->allocate();
336  }
337 
338  // Apply ReductionOperation on the last stage
339  const unsigned int last_stage = _num_of_stages - 1;
340  const unsigned int input_width = input->info()->dimension(0);
341 
342  if(_is_reshape_required)
343  {
344  _memory_group.manage(&_results_vector.back());
345  }
346 
347  _reduction_kernels_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], output_internal, axis, last_kernel_op, input_width);
348  _border_handlers_vector[last_stage].configure(compile_context, &_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
349  _results_vector[last_stage - 1].allocator()->allocate();
350  }
351 
352  if(_is_reshape_required)
353  {
354  _reshape_kernel.configure(compile_context, &_results_vector.back(), output);
355  _results_vector.back().allocator()->allocate();
356  }
357 }
358 
360 {
361  MemoryGroupResourceScope scope_mg(_memory_group);
362 
363  if(_is_serial)
364  {
365  CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
366  }
367  else
368  {
369  for(unsigned int i = 0; i < _num_of_stages; ++i)
370  {
371  CLScheduler::get().enqueue(_border_handlers_vector[i], false);
372  CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
373  }
374  }
375 
376  if(_is_reshape_required)
377  {
378  CLScheduler::get().enqueue(_reshape_kernel, false);
379  }
380 }
381 } // namespace arm_compute
virtual ITensorInfo & set_num_channels(int num_channels)=0
Set the number of channels to the specified value.
bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int axis)
Check if the given reduction operation should be handled in a serial way.
Definition: Utils.cpp:429
Class describing the value of a pixel for any image format.
Definition: PixelValue.h:34
Shape of a tensor.
Definition: TensorShape.h:39
ReductionOperation
Available reduction operations.
Definition: Types.h:498
static CLScheduler & get()
Access the scheduler singleton.
Definition: CLScheduler.cpp:99
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
virtual ITensorInfo & set_tensor_shape(const TensorShape &shape)=0
Set the shape of an already initialized tensor.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
ITensorInfo & set_data_type(DataType data_type) override
Set the data type to the specified value.
Definition: TensorInfo.cpp:319
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
Definition: ITensorInfo.h:40
Status class.
Definition: Error.h:52
Copyright (c) 2017-2020 ARM Limited.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: Helpers.inl:202
1 channel, 1 F16 per channel
void manage(IMemoryManageable *obj) override
Sets a object to be managed by the given memory group.
Definition: MemoryGroup.h:79
Quantization information.
unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis)
Calculate number of stages for parallel implementations.
Definition: Utils.cpp:66
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
void run() override
Run the kernels contained in the function.
quantized, asymmetric fixed-point 8-bit number unsigned
TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims=true)
Calculate the reduced shape of a tensor given an axis.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width=0)
Static function to check if given info will lead to a valid configuration of CLReductionOperationKern...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
virtual ITensorInfo & set_quantization_info(const QuantizationInfo &quantization_info)=0
Set the quantization settings (scale and offset) of the tensor.
CLReductionOperation(std::shared_ptr< IMemoryManager > memory_manager=nullptr)
Default Constructor.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims=true)
Static function to check if given info will lead to a valid configuration of CLReductionOperation.
static Status validate(const ITensorInfo *input, const ITensorInfo *output)
Static function to check if given info will lead to a valid configuration of CLReshapeLayerKernel.
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
CLCompileContext class.
void configure(const ICLTensor *input, ICLTensor *output)
Set the input and output of the kernel.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Memory group resources scope handling class.
Definition: IMemoryGroup.h:82
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
const QuantizationInfo qinfo
Definition: Im2Col.cpp:150
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true)
Accessor to set the value of one of the dimensions.
Definition: TensorShape.h:78
Store the tensor's metadata.
Definition: TensorInfo.h:45
void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims=true)
Set the input and output tensors.
quantized, asymmetric fixed-point 8-bit number signed
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
Definition: Dimensions.h:45
DataType
Available data types.
Definition: Types.h:77
std::tuple< PixelValue, PixelValue > get_min_max(DataType dt)
Compute the mininum and maximum values a data type can take.
Definition: Utils.h:560