Compute Library 21.02
CpuAddKernel.cpp
/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/cpu/kernels/CpuAddKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/common/Registrars.h"
#include "src/core/cpu/kernels/add/neon/list.h"
#include "src/core/cpu/kernels/add/sve/list.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <array>

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
struct AddSelectorData
{
    DataType dt1;
    DataType dt2;
    DataType dt3;
};

using AddSelectorPtr = std::add_pointer<bool(const AddSelectorData &data)>::type;
using AddKernelPtr   = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;

struct AddKernel
{
    const char          *name;
    const AddSelectorPtr is_selected;
    AddKernelPtr         ukernel;
};

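// Registry of candidate micro-kernels: get_implementation() below walks this
// table in order and returns the first entry whose selector accepts the
// (dt1, dt2, dt3) triple. The REGISTER_* macros from Registrars.h resolve to
// the named function pointer when the matching data-type support is compiled
// in, and to nullptr otherwise.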
static const AddKernel available_kernels[] =
{
#if defined(__ARM_FEATURE_SVE)
    {
        "add_same_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
        REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
    },
    {
        "add_same_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
        REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
    },
    {
        "add_same_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
    },
    {
        "add_same_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
    },
    {
        "add_same_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
    },
    {
        "add_u8_s16_s16_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
    },
    {
        "add_s16_u8_s16_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
    },
    {
        "add_u8_u8_s16_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
    },
#else /* !defined(__ARM_FEATURE_SVE) */
    {
        "add_same_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
        REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
    },
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "add_same_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
        REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
    },
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
    {
        "add_same_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
    },
    {
        "add_same_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
    },
    {
        "add_same_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
    },
    {
        "add_u8_s16_s16_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
    },
    {
        "add_s16_u8_s16_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
    },
    {
        "add_u8_u8_s16_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
    },
#endif /* defined(__ARM_FEATURE_SVE) */

#if defined(__ARM_FEATURE_SVE2)
    {
        "add_qasymm8_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
        REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
    },
    {
        "add_qasymm8_signed_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
    },
    {
        "add_qsymm16_sve",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
        REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
    },
#else /* !defined(__ARM_FEATURE_SVE2) */
    {
        "add_qasymm8_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
        REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
    },
    {
        "add_qasymm8_signed_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
    },
    {
        "add_qsymm16_neon",
        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
        REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
    },
#endif /* defined(__ARM_FEATURE_SVE2) */

};

/** Micro-kernel selector
 *
 * @param[in] dt1 Data type of the first source tensor
 * @param[in] dt2 Data type of the second source tensor
 * @param[in] dt3 Data type of the destination tensor
 *
 * @return A matching micro-kernel, otherwise nullptr
 */
const AddKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected({ dt1, dt2, dt3 }))
        {
            return &uk;
        }
    }
    return nullptr;
}
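// For example, get_implementation(DataType::U8, DataType::U8, DataType::S16)
// selects the "add_u8_u8_s16_*" entry of the active backend, while a triple no
// selector accepts, such as (F32, S16, F32), falls through and yields nullptr.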

Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16,
                                                         DataType::QSYMM16, DataType::F16, DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16,
                                                         DataType::QSYMM16, DataType::F16, DataType::S32, DataType::F32);

    const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
                                                                                             || (src1.data_type() != dst.data_type())),
                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");

    // Validate in case of configured dst
    if(dst.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16)
            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16),
            "You called addition with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                        "Wrong shape for dst");
    }

    // Fail validation early if no micro-kernel in the table supports this data-type combination
    const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src0, src1);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize dst if not initialized
    {
        set_shape_if_empty(dst, out_shape);

        if(src0.data_type() == DataType::S16 || src1.data_type() == DataType::S16)
        {
            set_format_if_unknown(dst, Format::S16);
        }
        if(src0.data_type() == DataType::S32 || src1.data_type() == DataType::S32)
        {
            set_format_if_unknown(dst, Format::S32);
        }
        else if(src0.data_type() == DataType::F16 || src1.data_type() == DataType::F16)
        {
            set_format_if_unknown(dst, Format::F16);
        }
        else if(src0.data_type() == DataType::F32 || src1.data_type() == DataType::F32)
        {
            set_format_if_unknown(dst, Format::F32);
        }
        else if(src0.data_type() == DataType::QASYMM8 || src1.data_type() == DataType::QASYMM8)
        {
            set_data_type_if_unknown(dst, DataType::QASYMM8);
        }
        else if(src0.data_type() == DataType::QASYMM8_SIGNED || src1.data_type() == DataType::QASYMM8_SIGNED)
        {
            set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
        }
        else if(src0.data_type() == DataType::QSYMM16 || src1.data_type() == DataType::QSYMM16)
        {
            set_data_type_if_unknown(dst, DataType::QSYMM16);
        }
    }

    Window win = calculate_max_window(valid_region, Steps());

    // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(dst.num_dimensions());
    dst.set_valid_region(valid_region);
    return std::make_pair(Status{}, win);
}
} // namespace

void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));

    _policy = policy;

    // Configure kernel window
    auto win_config = validate_and_configure_window(*src0, *src1, *dst);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICpuKernel::configure(win_config.second);
}

Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);

    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first);

    return Status{};
}

void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    ARM_COMPUTE_ERROR_ON(tensors.empty());

    const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    ITensor       *dst  = tensors.get_tensor(TensorType::ACL_DST);

    // Select the micro-kernel from the run-time data types and dispatch
    const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);

    uk->ukernel(src0, src1, dst, _policy, window);
}

const char *CpuAddKernel::name() const
{
    return "CpuAddKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
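
A minimal usage sketch follows. It is illustrative and not part of the file: run_add_f32 is a hypothetical helper, it assumes the internal src/ headers are reachable from the build (this kernel is not public API), that the source and destination tensors are already allocated, and that ITensorPack exposes the add_const_tensor/add_tensor setters of this release. In the library proper the kernel is driven through an operator and a scheduler, which splits the window across threads; here it runs single-threaded over the kernel's maximum window.

#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "src/core/cpu/kernels/CpuAddKernel.h"

using namespace arm_compute;

// Hypothetical helper: add two pre-allocated F32 tensors on the calling thread.
void run_add_f32(const ITensor *a, const ITensor *b, ITensor *out)
{
    // Check the configuration first; validate() reports problems as a Status
    // instead of asserting inside configure().
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuAddKernel::validate(a->info(), b->info(), out->info(), ConvertPolicy::SATURATE));

    cpu::kernels::CpuAddKernel kernel;
    kernel.configure(a->info(), b->info(), out->info(), ConvertPolicy::SATURATE);

    // The kernel is configured from metadata only; the actual tensors travel in
    // an ITensorPack keyed by the ACL_SRC_0 / ACL_SRC_1 / ACL_DST slots that
    // run_op() reads back out above.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, b);
    pack.add_tensor(TensorType::ACL_DST, out);

    // Execute over the kernel's full window on the calling thread.
    ThreadInfo info{};
    kernel.run_op(pack, kernel.window(), info);
}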