Compute Library 19.08
CLReductionOperation.cpp
/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;

namespace
{
unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
{
    // Only a single stage is needed for every axis other than the x-axis, and for the x-axis when the data type is quantized.
    if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
    {
        return 1;
    }
    // Calculate number of WGs: 16 elements per thread and 8 threads per WG, so each WG covers 128 elements
    const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);

    // Calculate number of stages. The first stage performs the reduction op itself and the
    // remaining stages sum the partial results; the count depends on the input size.
    // The last stage should have only 1 WG.
    const unsigned int num_of_stages = num_of_wg / 128 + 2;

    return num_of_stages;
}
} // namespace
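
// Worked example (illustrative, not part of the upstream file): for a non-quantized
// x-axis reduction over 100000 elements,
//   num_of_wg     = ceil(100000 / 128.f) = 782
//   num_of_stages = 782 / 128 + 2        = 8
// The first launch applies the reduction op itself; each follow-up launch divides the
// x-dimension by a further factor of 128 (see the shape.set(...) loops below).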

CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
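
// Construction sketch (illustrative, not from the upstream file; assumes the standard
// Compute Library memory-manager classes BlobLifetimeManager, PoolManager and
// MemoryManagerOnDemand). Note a managed function also needs its manager's pools
// populated before run() is called.
//
//   auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
//   auto pool_mgr     = std::make_shared<PoolManager>();
//   auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
//
//   CLReductionOperation managed_reduction(memory_mgr); // intermediate tensors are pooled
//   CLReductionOperation plain_reduction;               // default: no memory manager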

Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
    const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
    bool               is_serial     = is_data_type_quantized(input->data_type()) || axis != 0;
    if(is_serial)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
    }
    else
    {
        // Create temporary tensor infos
        std::vector<TensorInfo> sums_vector(num_of_stages - 1);

        // Create intermediate tensor info
        TensorShape shape{ input->tensor_shape() };

        for(unsigned int i = 0; i < num_of_stages - 1; i++)
        {
            shape.set(0, ceil(shape.x() / 128.f));
            sums_vector[i].set_data_type(input->data_type());
            sums_vector[i].set_tensor_shape(shape);
            sums_vector[i].set_num_channels(input->num_channels());
        }

        ReductionOperation first_kernel_op;
        ReductionOperation intermediate_kernel_op;
        ReductionOperation last_kernel_op;
        switch(op)
        {
            case ReductionOperation::SUM:
            case ReductionOperation::MEAN_SUM:
                first_kernel_op        = ReductionOperation::SUM;
                intermediate_kernel_op = ReductionOperation::SUM;
                last_kernel_op         = op;
                break;
            case ReductionOperation::SUM_SQUARE:
                first_kernel_op        = ReductionOperation::SUM_SQUARE;
                intermediate_kernel_op = ReductionOperation::SUM;
                last_kernel_op         = ReductionOperation::SUM;
                break;
            case ReductionOperation::PROD:
                first_kernel_op        = ReductionOperation::PROD;
                intermediate_kernel_op = ReductionOperation::PROD;
                last_kernel_op         = ReductionOperation::PROD;
                break;
            case ReductionOperation::MIN:
                first_kernel_op        = ReductionOperation::MIN;
                intermediate_kernel_op = ReductionOperation::MIN;
                last_kernel_op         = ReductionOperation::MIN;
                break;
            case ReductionOperation::MAX:
                first_kernel_op        = ReductionOperation::MAX;
                intermediate_kernel_op = ReductionOperation::MAX;
                last_kernel_op         = ReductionOperation::MAX;
                break;
            default:
                ARM_COMPUTE_ERROR("Not supported");
        }

        // Validate ReductionOperation only on first kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, &sums_vector[0], axis, first_kernel_op));

        // Validate ReductionOperation on intermediate stages
        for(unsigned int i = 1; i < num_of_stages - 1; ++i)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[i - 1], &sums_vector[i], axis, intermediate_kernel_op));
        }

        // Validate ReductionOperation on the last stage
        const unsigned int last_stage = num_of_stages - 1;
        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(&sums_vector[last_stage - 1], output, axis, last_kernel_op, input->dimension(0)));
    }

    return Status{};
}
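
// Usage sketch (illustrative, not from the upstream file): validate() mirrors
// configure() but only inspects TensorInfo metadata, so it can run before any
// OpenCL resources exist:
//
//   const Status status = CLReductionOperation::validate(input_info, output_info,
//                                                        0 /* axis */, ReductionOperation::SUM);
//   ARM_COMPUTE_ERROR_THROW_ON(status);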

void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
{
    _num_of_stages  = calculate_number_of_stages(input->info(), axis);
    _reduction_axis = axis;
    _is_serial      = is_data_type_quantized(input->info()->data_type()) || axis != 0;

    // Configure reduction operation kernels
    _reduction_kernels_vector.resize(_num_of_stages);

    // Create temporary tensors
    if(_is_serial)
    {
        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
    }
    else
    {
        _border_handlers_vector.resize(_num_of_stages);
        _results_vector.resize(_num_of_stages - 1);
        TensorShape shape{ input->info()->tensor_shape() };
        for(unsigned int i = 0; i < _num_of_stages - 1; i++)
        {
            shape.set(0, ceil(shape.x() / 128.f));
            _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
        }

        // Apply ReductionOperation only on first kernel
        _memory_group.manage(&_results_vector[0]);

        ReductionOperation first_kernel_op;
        ReductionOperation intermediate_kernel_op;
        ReductionOperation last_kernel_op;
        PixelValue         pixelValue;
        switch(op)
        {
            case ReductionOperation::SUM:
            case ReductionOperation::MEAN_SUM:
                first_kernel_op        = ReductionOperation::SUM;
                intermediate_kernel_op = ReductionOperation::SUM;
                last_kernel_op         = op;
                pixelValue             = PixelValue();
                break;
            case ReductionOperation::SUM_SQUARE:
                first_kernel_op        = ReductionOperation::SUM_SQUARE;
                intermediate_kernel_op = ReductionOperation::SUM;
                last_kernel_op         = ReductionOperation::SUM;
                pixelValue             = PixelValue();
                break;
            case ReductionOperation::PROD:
                first_kernel_op        = ReductionOperation::PROD;
                intermediate_kernel_op = ReductionOperation::PROD;
                last_kernel_op         = ReductionOperation::PROD;
                pixelValue             = PixelValue(1, input->info()->data_type());
                break;
            case ReductionOperation::MIN:
                first_kernel_op        = ReductionOperation::MIN;
                intermediate_kernel_op = ReductionOperation::MIN;
                last_kernel_op         = ReductionOperation::MIN;
                switch(input->info()->data_type())
                {
                    case DataType::F32:
                    {
                        pixelValue = PixelValue(std::numeric_limits<float>::max());
                        break;
                    }
                    case DataType::F16:
                    {
                        pixelValue = PixelValue(static_cast<half>(65504.0f));
                        break;
                    }
                    case DataType::QASYMM8:
                    {
                        pixelValue = PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
                        break;
                    }
                    default:
                    {
                        ARM_COMPUTE_ERROR("Unsupported DataType");
                    }
                }
                break;
            case ReductionOperation::MAX:
                first_kernel_op        = ReductionOperation::MAX;
                intermediate_kernel_op = ReductionOperation::MAX;
                last_kernel_op         = ReductionOperation::MAX;
                switch(input->info()->data_type())
                {
                    case DataType::F32:
                    {
                        pixelValue = PixelValue(-std::numeric_limits<float>::max());
                        break;
                    }
                    case DataType::F16:
                    {
                        pixelValue = PixelValue(static_cast<half>(-65504.0f));
                        break;
                    }
                    case DataType::QASYMM8:
                    {
                        pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
                        break;
                    }
                    default:
                    {
                        ARM_COMPUTE_ERROR("Unsupported DataType");
                    }
                }
                break;
            default:
                ARM_COMPUTE_ERROR("Not supported");
        }

        _reduction_kernels_vector[0].configure(input, &_results_vector[0], axis, first_kernel_op);
        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);

        // Apply ReductionOperation on intermediate stages
        for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
        {
            _memory_group.manage(&_results_vector[i]);
            _reduction_kernels_vector[i].configure(&_results_vector[i - 1], &_results_vector[i], axis, intermediate_kernel_op);
            _border_handlers_vector[i].configure(&_results_vector[i - 1], _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
            _results_vector[i - 1].allocator()->allocate();
        }

        // Apply ReductionOperation on the last stage
        const unsigned int last_stage  = _num_of_stages - 1;
        const unsigned int input_width = input->info()->dimension(0);
        _reduction_kernels_vector[last_stage].configure(&_results_vector[last_stage - 1], output, axis, last_kernel_op, input_width);
        _border_handlers_vector[last_stage].configure(&_results_vector[last_stage - 1], _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
        _results_vector[last_stage - 1].allocator()->allocate();
    }
}
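
// Note on the pattern above: each intermediate tensor is registered with the memory
// group (manage()) before the kernel that writes it is configured, and allocated
// (allocate()) only once the kernel that reads it has also been configured. This
// ordering lets the memory group compute tensor lifetimes and reuse backing memory
// across the reduction stages.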

void CLReductionOperation::run()
{
    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_serial)
    {
        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
    }
    else
    {
        for(unsigned int i = 0; i < _num_of_stages; ++i)
        {
            CLScheduler::get().enqueue(_border_handlers_vector[i], false);
            CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
        }
    }
}
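
// End-to-end sketch (illustrative, not from the upstream file; tensor shapes and the
// fill step are placeholders, and an OpenCL context is assumed to be available):
//
//   CLScheduler::get().default_init();
//
//   CLTensor input, output;
//   input.allocator()->init(TensorInfo(TensorShape(4096U, 16U), 1, DataType::F32));
//   output.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::F32));
//
//   CLReductionOperation reduction;
//   reduction.configure(&input, &output, 0 /* axis */, ReductionOperation::SUM);
//
//   input.allocator()->allocate();
//   output.allocator()->allocate();
//   // ... map input, write values, unmap ...
//   reduction.run();
//   CLScheduler::get().sync();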