Compute Library
 20.05
NEPixelWiseMultiplicationKernel.cpp
1 /*
2  * Copyright (c) 2016-2020 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
31 
32 #include <arm_neon.h>
33 
34 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
35 #include <arm_fp16.h> // needed for float16_t
36 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
37 
38 namespace arm_compute
39 {
40 namespace
41 {
42 const float scale255_constant = 1.f / 255.f;
43 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
44 const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
45 
46 constexpr unsigned int num_elems_processed_per_iteration = 16;
47 
48 inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
49 {
50  ARM_COMPUTE_UNUSED(overflow_policy);
51  ARM_COMPUTE_UNUSED(rounding_policy);
52 
59  if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
60  {
62  ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
63  }
64 
65  if(output->total_size() > 0)
66  {
67  if(is_data_type_quantized(output->data_type()))
68  {
70  }
71 
72  const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
73  ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
74  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
75 
76  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
77  "Output can only be U8 if both inputs are U8");
78  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
79  "Output can only be S32 if both inputs are QSYMM16");
80  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
81  }
82 
83  if(std::abs(scale - scale255_constant) < 0.00001f)
84  {
86  }
87  else
88  {
90 
91  int exponent = 0;
92  const float normalized_mantissa = std::frexp(scale, &exponent);
93 
94  // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
95  // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1
96  // Moreover, it will be negative as we deal with 1/2^n
97  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
98  }
99 
100  return Status{};
101 }
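// Illustrative note on the scale check above: std::frexp() decomposes scale as
// mantissa * 2^exponent with mantissa in [0.5, 1), so only exact powers of two
// yield a mantissa of exactly 0.5f. For example:
//   std::frexp(0.125f, &e) -> mantissa 0.5f, e = -2  (0.125 = 0.5 * 2^-2) -> accepted
//   std::frexp(1.0f,   &e) -> mantissa 0.5f, e =  1  (1.0   = 0.5 * 2^1)  -> accepted
//   std::frexp(0.3f,   &e) -> mantissa 0.6f, e = -1                       -> rejected
// 1/255 has no 0.5f mantissa, which is why it is special-cased separately above.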
102 
103 inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
104 {
105  const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
106  const ValidRegion &valid_region = broadcast_pair.second;
107 
108  // Auto initialize output if not initialized
109  {
110  ARM_COMPUTE_UNUSED(set_shape_if_empty(*output, input1->tensor_shape()));
111 
112  if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
113  {
114  set_format_if_unknown(*output, Format::S16);
115  }
116  else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
117  {
118  set_format_if_unknown(*output, Format::F32);
119  }
120  else if(input1->data_type() == DataType::F16 || input2->data_type() == DataType::F16)
121  {
122  set_format_if_unknown(*output, Format::F16);
123  }
124  else if(input1->data_type() == DataType::QASYMM8 || input2->data_type() == DataType::QASYMM8)
125  {
126  set_data_type_if_unknown(*output, DataType::QASYMM8);
127  }
128  else if(input1->data_type() == DataType::QASYMM8_SIGNED || input2->data_type() == DataType::QASYMM8_SIGNED)
129  {
130  set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
131  }
132  else if(input1->data_type() == DataType::QSYMM16 || input2->data_type() == DataType::QSYMM16)
133  {
134  set_data_type_if_unknown(*output, DataType::QSYMM16);
135  }
136  }
137 
138  // Configure kernel window
139  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
140  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
141  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
142 
143  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
144  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
145  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
146 
147  bool window_changed = update_window_and_padding(win_input1, input1_access)
148  || update_window_and_padding(win_input2, input2_access)
149  || update_window_and_padding(win, output_access);
150 
151  output_access.set_valid_region(win, valid_region);
152 
153  Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
154  return std::make_pair(err, win);
155 }
156 
157 /* Scales a given vector by 1/255.
158  *
159  * @note This does not work for all cases, e.g. for a float of 0.49999999999999994 and for large floats.
160  *
161  * @param in Input vector to scale.
162  * @return Scaled output rounded to nearest (round half up).
163  */
164 inline int32x4_t scale255_S32_S32(int32x4_t in)
165 {
166  // Scale
167  const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
168  // Round to nearest (round half up)
169  // Add +0.5 for all values
170  // Afterwards vcvt rounds toward zero
171  return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
172 }
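// For reference, a minimal scalar sketch of what scale255_S32_S32() computes per
// lane (illustrative only; the kernel uses the vector form above):
//
//   int32_t scale255_scalar(int32_t in)
//   {
//       // multiply by 1/255, add 0.5 and truncate toward zero (round half up)
//       return static_cast<int32_t>(in * (1.f / 255.f) + 0.5f); // e.g. 382 -> 1.498f + 0.5f -> 1
//   }
//
// As the note above says, adding 0.5f before truncation is not exact for every
// possible float, but it matches the value ranges produced by these kernels.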
173 
174 inline uint16x8_t scale255_U16_U16(uint16x8_t in)
175 {
176  const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
177  const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
178  return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
179 }
180 
181 inline void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n_opt(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr,
182  float32x4_t input1_vscale, int32x4_t input1_voffset, float32x4_t input2_vscale, int32x4_t input2_voffset, float32x4_t output_voffset, float32x4_t vinvscale)
183 {
184  const auto input1 = static_cast<const qasymm8_t *__restrict>(input1_ptr);
185  const auto input2 = static_cast<const qasymm8_t *__restrict>(input2_ptr);
186  const auto output = static_cast<qasymm8_t *__restrict>(output_ptr);
187 
188  const qasymm8x16_t input1_q = vld1q_u8(input1);
189  const qasymm8x16_t input2_q = vld1q_u8(input2);
190 
191  // Dequantize inputs
192  float32x4x4_t in1_f32x4x4;
193  float32x4x4_t in2_f32x4x4;
194  in1_f32x4x4.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(input1_q))))), input1_voffset)), input1_vscale);
195  in1_f32x4x4.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(input1_q))))), input1_voffset)), input1_vscale);
196  in1_f32x4x4.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(input1_q))))), input1_voffset)), input1_vscale);
197  in1_f32x4x4.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(input1_q))))), input1_voffset)), input1_vscale);
198 
199  in2_f32x4x4.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(input2_q))))), input2_voffset)), input2_vscale);
200  in2_f32x4x4.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(input2_q))))), input2_voffset)), input2_vscale);
201  in2_f32x4x4.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(input2_q))))), input2_voffset)), input2_vscale);
202  in2_f32x4x4.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(input2_q))))), input2_voffset)), input2_vscale);
203 
204  float32x4x4_t out_f32x4x4;
205  out_f32x4x4.val[0] = vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]);
206  out_f32x4x4.val[1] = vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]);
207  out_f32x4x4.val[2] = vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]);
208  out_f32x4x4.val[3] = vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]);
209 
210  int32x4x4_t rf;
211 #ifdef __aarch64__
212  rf.val[0] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[0], vinvscale));
213  rf.val[1] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[1], vinvscale));
214  rf.val[2] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[2], vinvscale));
215  rf.val[3] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[3], vinvscale));
216 #else //__aarch64__
217  rf.val[0] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[0], vinvscale));
218  rf.val[1] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[1], vinvscale));
219  rf.val[2] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[2], vinvscale));
220  rf.val[3] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[3], vinvscale));
221 #endif //__aarch64__
222  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
223  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
224 
225  vst1q_u8(output, vcombine_u8(pa, pb));
226 }
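// Illustrative note: per lane the routine above performs the usual
// dequantize -> multiply -> requantize chain,
//
//   out_q = round( (in1_q - offset1) * scale1 * (in2_q - offset2) * scale2
//                  * (scale / output_scale) + output_offset )
//
// The caller passes vinvscale = 1 / (output_scale / scale) = scale / output_scale
// and output_voffset = output_offset, so the final vmlaq_f32 applies the
// "* vinvscale + offset" step as one multiply-accumulate before narrowing.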
227 
228 inline void mul_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED_n(
229  const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr,
230  float scale, const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info,
231  const UniformQuantizationInfo &output_qua_info)
232 
233 {
234  const auto input1 = static_cast<const qasymm8_signed_t *__restrict>(input1_ptr);
235  const auto input2 = static_cast<const qasymm8_signed_t *__restrict>(input2_ptr);
236  const auto output = static_cast<qasymm8_signed_t *__restrict>(output_ptr);
237  const qasymm8x16_signed_t input1_q = vld1q_s8(input1);
238  const qasymm8x16_signed_t input2_q = vld1q_s8(input2);
239  // Dequantize inputs
240  const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
241  const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
242  const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
243  const float32x4x4_t out_f32x4x4 =
244  {
245  vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
246  vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
247  vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
248  vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
249  };
250  const int8x16_t result = vquantize_signed(out_f32x4x4, tmp_qua_info);
251  vst1q_s8(output, result);
252 }
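// Illustrative note: the quantized saturating paths fold the user scale into the
// output quantization info instead of scaling the products explicitly. Since
// quantize(x, {s, o}) computes round(x / s) + o, using
// tmp_qua_info = { output_scale / scale, output_offset } yields
// round(x * scale / output_scale) + output_offset, i.e. the scaled product
// requantized with the output tensor's own (scale, offset).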
253 
254 void mul_saturate_QSYMM16_QSYMM16_QSYMM16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
255  const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info)
256 {
257  const auto input1 = static_cast<const qsymm16_t *__restrict>(input1_ptr);
258  const auto input2 = static_cast<const qsymm16_t *__restrict>(input2_ptr);
259  const auto output = static_cast<qsymm16_t *__restrict>(output_ptr);
260 
261  const qsymm16x8x2_t input1_q =
262  {
263  {
264  vld1q_s16(input1),
265  vld1q_s16(input1 + 8),
266  }
267  };
268  const qsymm16x8x2_t input2_q =
269  {
270  {
271  vld1q_s16(input2),
272  vld1q_s16(input2 + 8),
273  }
274  };
275 
276  // Dequantize inputs
277  const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
278  const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
279 
280  const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
281 
282  const float32x4x4_t out_f32x4x4 =
283  {
284  vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
285  vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
286  vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
287  vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
288  };
289 
290  const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
291  vst1q_s16(output, result.val[0]);
292  vst1q_s16(output + 8, result.val[1]);
293 }
294 
295 void mul_QSYMM16_QSYMM16_S32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale)
296 {
297  ARM_COMPUTE_UNUSED(scale);
298  const auto input1 = static_cast<const qsymm16_t *__restrict>(input1_ptr);
299  const auto input2 = static_cast<const qsymm16_t *__restrict>(input2_ptr);
300  const auto output = static_cast<int32_t *__restrict>(output_ptr);
301 
302  const qsymm16x8x2_t input1_q =
303  {
304  {
305  vld1q_s16(input1),
306  vld1q_s16(input1 + 8),
307  }
308  };
309  const qsymm16x8x2_t input2_q =
310  {
311  {
312  vld1q_s16(input2),
313  vld1q_s16(input2 + 8),
314  }
315  };
316 
317  const int32x4x4_t in1_s32 =
318  {
319  {
320  vmovl_s16(vget_low_s16(input1_q.val[0])),
321  vmovl_s16(vget_high_s16(input1_q.val[0])),
322  vmovl_s16(vget_low_s16(input1_q.val[1])),
323  vmovl_s16(vget_high_s16(input1_q.val[1])),
324  }
325  };
326  const int32x4x4_t in2_s32 =
327  {
328  {
329  vmovl_s16(vget_low_s16(input2_q.val[0])),
330  vmovl_s16(vget_high_s16(input2_q.val[0])),
331  vmovl_s16(vget_low_s16(input2_q.val[1])),
332  vmovl_s16(vget_high_s16(input2_q.val[1])),
333  }
334  };
335 
336  const int32x4x4_t result =
337  {
338  {
339  vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
340  vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
341  vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
342  vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
343  }
344  };
345 
346  vst1q_s32(output, result.val[0]);
347  vst1q_s32(output + 4, result.val[1]);
348  vst1q_s32(output + 8, result.val[2]);
349  vst1q_s32(output + 12, result.val[3]);
350 }
351 
352 template <bool is_scale255, bool is_sat>
353 void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
354 {
355  const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
356  const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
357  const auto output = static_cast<uint8_t *__restrict>(output_ptr);
358 
359  const uint8x16_t ta1 = vld1q_u8(input1);
360  const uint8x16_t ta2 = vld1q_u8(input2);
361 
362  uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
363  const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
364  uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
365  const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
366 
367  tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
368  tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
369 
370  if(is_scale255)
371  {
372  tmp1_high = scale255_U16_U16(tmp1_high);
373  tmp1_low = scale255_U16_U16(tmp1_low);
374  }
375  else
376  {
377  const int16x8_t vn = vdupq_n_s16(-n);
378 
379  if(is_sat)
380  {
381  tmp1_high = vqshlq_u16(tmp1_high, vn);
382  tmp1_low = vqshlq_u16(tmp1_low, vn);
383  }
384  else
385  {
386  tmp1_high = vshlq_u16(tmp1_high, vn);
387  tmp1_low = vshlq_u16(tmp1_low, vn);
388  }
389  }
390 
391  if(is_sat)
392  {
393  vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
394  }
395  else
396  {
397  vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
398  }
399 }
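// For reference, a scalar sketch of the non-1/255 path of mul_U8_U8_U8_n above
// (illustrative only):
//
//   uint16_t prod    = uint16_t(a) * uint16_t(b);  // exact, at most 255 * 255 = 65025
//   uint16_t shifted = prod >> n;                  // scale = 1 / 2^n
//   uint8_t  out     = is_sat ? uint8_t(std::min<uint16_t>(shifted, 255)) : uint8_t(shifted);
//
// The widening to 16 bits keeps the intermediate product exact; the clamp to 255
// corresponds to the vqmovn_u16 narrowing in the vector code.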
400 
401 template <bool is_scale255, bool is_sat>
402 inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
403 {
404  int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
405  const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
406  int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1));
407  const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2));
408 
409  tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
410  tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
411 
412  if(is_scale255)
413  {
414  tmp1_high = scale255_S32_S32(tmp1_high);
415  tmp1_low = scale255_S32_S32(tmp1_low);
416  }
417  else
418  {
419  // Right shift amount
420  const int32x4_t vn = vdupq_n_s32(-n);
421  // Left shift amount
422  const int32x4_t vnl = vdupq_n_s32(n);
423  // Calculate conversion bit
424  const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
425  const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
426  const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
427  const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
428  const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
429  const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
430  const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
431  const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
432  if(is_sat)
433  {
434  tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
435  tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
436  }
437  else
438  {
439  tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
440  tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
441  }
442  }
443 
444  if(is_sat)
445  {
446  return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
447  }
448  else
449  {
450  return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
451  }
452 }
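// Illustrative note: the "conversion bit" above turns the arithmetic right shift
// (which rounds toward -infinity) into a round toward zero. For negative values
// it adds 2^n - 1 before shifting; e.g. with n = 1:
//
//   -7 >> 1             = -4   (floor of -3.5)
//   (-7 + (2 - 1)) >> 1 = -3   (truncation of -3.5, i.e. toward zero)
//
// For non-negative values sign_high/sign_low are zero, so nothing is added.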
453 
454 template <bool is_scale255, bool is_sat>
455 inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
456 {
457  const int16x8x2_t result =
458  {
459  {
460  // First 8 elements
461  mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
462  // Second 8 elements
463  mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
464  }
465  };
466 
467  return result;
468 }
469 
470 template <bool is_scale255, bool is_sat>
471 void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
472 {
473  const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
474  const auto input2 = static_cast<const int16_t *__restrict>(input2_ptr);
475  const auto output = static_cast<int16_t *__restrict>(output_ptr);
476 
477  const int16x8x2_t ta1 =
478  {
479  {
480  vld1q_s16(input1),
481  vld1q_s16(input1 + 8),
482  }
483  };
484  const int16x8x2_t ta2 =
485  {
486  {
487  vld1q_s16(input2),
488  vld1q_s16(input2 + 8),
489  }
490  };
491  const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
492 
493  vst1q_s16(output, result.val[0]);
494  vst1q_s16(output + 8, result.val[1]);
495 }
496 
497 void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
498 {
499  const auto input1 = static_cast<const float *__restrict>(input1_ptr);
500  const auto input2 = static_cast<const float *__restrict>(input2_ptr);
501  const auto output = static_cast<float *__restrict>(output_ptr);
502 
503  const float32x4x4_t ta1 = vld4q_f32(input1);
504  const float32x4x4_t ta2 = vld4q_f32(input2);
505  const float32x4_t scale_vec = vdupq_n_f32(scale);
506  const float32x4x4_t result =
507  {
508  {
509  vmulq_f32(vmulq_f32(ta1.val[0], ta2.val[0]), scale_vec),
510  vmulq_f32(vmulq_f32(ta1.val[1], ta2.val[1]), scale_vec),
511  vmulq_f32(vmulq_f32(ta1.val[2], ta2.val[2]), scale_vec),
512  vmulq_f32(vmulq_f32(ta1.val[3], ta2.val[3]), scale_vec)
513  }
514  };
515  vst4q_f32(output, result);
516 }
517 
518 void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
519 {
520  const auto input1 = static_cast<const float *__restrict>(input1_ptr);
521  const auto input2 = static_cast<const float *__restrict>(input2_ptr);
522  const auto output = static_cast<float *__restrict>(output_ptr);
523 
524  const float32x4_t a = wrapper::vloadq(input1);
525  float32x4_t b = wrapper::vloadq(input2);
526 
527  using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
528 
529  const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
530  const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
531  const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
532  const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
533  const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
534 
535  const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
536  const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
537 
538  float32x4_t res = wrapper::vmul(tmp0, b);
539 
540  b = wrapper::vrev64(b);
541  b = wrapper::vmul(b, mask);
542 
543  res = wrapper::vmla(res, tmp1, b);
544  wrapper::vstore(output, res);
545 }
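// Illustrative note: c_mul_F32_F32_F32_n treats each 128-bit register as two
// complex numbers stored as interleaved { real, imag } pairs and computes the
// standard product
//
//   (ar + ai*j) * (br + bi*j) = (ar*br - ai*bi) + (ar*bi + ai*br)*j
//
// res = tmp0 * b produces { ar*br, ar*bi, ... }; vrev64 swaps b into
// { bi, br, ... }, the { -1, 1, -1, 1 } mask negates the bi values that land in
// the real lanes, and the final vmla adds { -ai*bi, ai*br, ... }.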
546 
547 void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
548 {
549 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
550  const auto input1 = static_cast<const float16_t *__restrict>(input1_ptr);
551  const auto input2 = static_cast<const float16_t *__restrict>(input2_ptr);
552  const auto output = static_cast<float16_t *__restrict>(output_ptr);
553  const float16x8x2_t ta1 =
554  {
555  {
556  vld1q_f16(input1),
557  vld1q_f16(input1 + 8),
558  }
559  };
560  const float16x8x2_t ta2 =
561  {
562  {
563  vld1q_f16(input2),
564  vld1q_f16(input2 + 8),
565  }
566  };
567  const float16x8_t scale_vec = vdupq_n_f16(scale);
568  const float16x8x2_t result =
569  {
570  {
571  vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
572  vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
573  }
574  };
575  vst1q_f16(output, result.val[0]);
576  vst1q_f16(output + 8, result.val[1]);
577 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
578  ARM_COMPUTE_UNUSED(input1_ptr);
579  ARM_COMPUTE_UNUSED(input2_ptr);
580  ARM_COMPUTE_UNUSED(output_ptr);
581  ARM_COMPUTE_UNUSED(scale);
582  ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
583 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
584 }
585 
586 template <bool is_scale255, bool is_sat>
587 void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
588 {
589  const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
590  const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
591  const auto output = static_cast<int16_t *__restrict>(output_ptr);
592 
593  const uint8x16_t bv = vld1q_u8(input2);
594  const uint8x16_t av = vld1q_u8(input1);
595 
596  uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
597  uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
598  tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
599  tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
600 
601  if(is_scale255)
602  {
603  tmp_low = scale255_U16_U16(tmp_low);
604  tmp_high = scale255_U16_U16(tmp_high);
605  }
606  else
607  {
608  const int16x8_t vn = vdupq_n_s16(-n);
609 
610  if(is_sat)
611  {
612  tmp_low = vqshlq_u16(tmp_low, vn);
613  tmp_high = vqshlq_u16(tmp_high, vn);
614  }
615  else
616  {
617  tmp_low = vshlq_u16(tmp_low, vn);
618  tmp_high = vshlq_u16(tmp_high, vn);
619  }
620  }
621 
622  if(is_sat)
623  {
624  static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
625 
626  tmp_low = vminq_u16(tmp_low, max);
627  tmp_high = vminq_u16(tmp_high, max);
628  }
629 
630  vst1q_s16(output, vreinterpretq_s16_u16(tmp_low));
631  vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high));
632 }
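// Illustrative note: unlike the U8 -> U8 kernel, the product here stays 16-bit
// wide, and an unscaled u8 * u8 product can reach 255 * 255 = 65025, which does
// not fit in a signed int16 (SHRT_MAX = 32767). The extra vminq_u16 clamp in the
// saturating path keeps the value representable before the bits are
// reinterpreted as s16.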
633 
634 template <bool is_scale255, bool is_sat>
635 void mul_S16_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
636 {
637  const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
638  const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
639  const auto output = static_cast<int16_t *__restrict>(output_ptr);
640 
641  const int16x8x2_t ta1 =
642  {
643  {
644  vld1q_s16(input1),
645  vld1q_s16(input1 + 8),
646  }
647  };
648  const uint8x8x2_t ta2u =
649  {
650  {
651  vld1_u8(input2),
652  vld1_u8(input2 + 8),
653  }
654  };
655  const int16x8x2_t ta2 =
656  {
657  {
658  vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
659  vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
660  }
661  };
662 
663  const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
664 
665  vst1q_s16(output, result.val[0]);
666  vst1q_s16(output + 8, result.val[1]);
667 }
668 
669 template <bool is_scale255, bool is_sat>
670 void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
671 {
672  // Simply swap the two input buffers
673  mul_S16_U8_S16_n<is_scale255, is_sat>(input2_ptr, input1_ptr, output_ptr, n);
674 }
675 } // namespace
676 
677 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
678  : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }, _run_optimized_qasymm8(false)
679 {
680 }
681 
682 void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
683 {
684  ARM_COMPUTE_UNUSED(rounding_policy);
685  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
686 
687  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
688 
689  // Configure kernel window
690  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
691  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
692 
693  _input1 = input1;
694  _input2 = input2;
695  _output = output;
696  _scale = scale;
697  _scale_exponent = 0;
698  _func_quantized = nullptr;
699  _func_int = nullptr;
700  _func_float = nullptr;
701  _run_optimized_qasymm8 = false;
702 
703  bool is_scale_255 = false;
704  // Check and validate scaling factor
705  if(std::abs(scale - scale255_constant) < 0.00001f)
706  {
707  is_scale_255 = true;
708  }
709  else
710  {
711  int exponent = 0;
712 
713  std::frexp(scale, &exponent);
714 
715  // Store the positive exponent. We know that we compute 1/2^n
716  // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
717  _scale_exponent = std::abs(exponent - 1);
718  }
719 
720  const DataType dt_input1 = input1->info()->data_type();
721  const DataType dt_input2 = input2->info()->data_type();
722  const DataType dt_output = output->info()->data_type();
723  const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
724 
725  if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8)
726  {
727  _run_optimized_qasymm8 = true;
728  }
729  else if(dt_input1 == DataType::QASYMM8_SIGNED && dt_input2 == DataType::QASYMM8_SIGNED)
730  {
731  _func_quantized = &mul_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED_n;
732  }
733  else if(dt_input1 == DataType::QSYMM16 && dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
734  {
735  _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16_n;
736  }
737  else if(dt_input1 == DataType::QSYMM16 && dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
738  {
739  _func_int = &mul_QSYMM16_QSYMM16_S32_n;
740  }
741  else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
742  {
743  if(is_scale_255)
744  {
745  _func_int = is_sat ? &mul_U8_U8_U8_n<true, true> : &mul_U8_U8_U8_n<true, false>;
746  }
747  else
748  {
749  _func_int = is_sat ? &mul_U8_U8_U8_n<false, true> : &mul_U8_U8_U8_n<false, false>;
750  }
751  }
752  else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
753  {
754  if(is_scale_255)
755  {
756  _func_int = is_sat ? &mul_S16_S16_S16_n<true, true> : &mul_S16_S16_S16_n<true, false>;
757  }
758  else
759  {
760  _func_int = is_sat ? &mul_S16_S16_S16_n<false, true> : &mul_S16_S16_S16_n<false, false>;
761  }
762  }
763  else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
764  {
765  if(is_scale_255)
766  {
767  _func_int = is_sat ? &mul_S16_U8_S16_n<true, true> : &mul_S16_U8_S16_n<true, false>;
768  }
769  else
770  {
771  _func_int = is_sat ? &mul_S16_U8_S16_n<false, true> : &mul_S16_U8_S16_n<false, false>;
772  }
773  }
774  else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
775  {
776  if(is_scale_255)
777  {
778  _func_int = is_sat ? &mul_U8_S16_S16_n<true, true> : &mul_U8_S16_S16_n<true, false>;
779  }
780  else
781  {
782  _func_int = is_sat ? &mul_U8_S16_S16_n<false, true> : &mul_U8_S16_S16_n<false, false>;
783  }
784  }
785  else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
786  {
787  if(is_scale_255)
788  {
789  _func_int = is_sat ? &mul_U8_U8_S16_n<true, true> : &mul_U8_U8_S16_n<true, false>;
790  }
791  else
792  {
793  _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
794  }
795  }
796  else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
797  {
798  _func_float = &mul_F16_F16_F16_n;
799  _func_int = nullptr;
800  }
801  else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
802  {
803  _func_float = &mul_F32_F32_F32_n;
804  _func_int = nullptr;
805  }
806  else
807  {
808  ARM_COMPUTE_ERROR("You called with the wrong img formats");
809  }
810 
811  INEKernel::configure(win_config.second);
812 }
813 
814 Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
815  RoundingPolicy rounding_policy)
816 {
817  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
818  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
819  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
820 
821  return Status{};
822 }
823 
824 void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
825 {
829 
830  const TensorShape &in_shape1 = _input1->info()->tensor_shape();
831  const TensorShape &in_shape2 = _input2->info()->tensor_shape();
832  const TensorShape &out_shape = _output->info()->tensor_shape();
833 
834  bool can_collapse = true;
835  if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
836  {
837  can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
838  for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
839  {
840  can_collapse = (in_shape1[d] == in_shape2[d]);
841  }
842  }
843 
844  bool has_collapsed = false;
845  Window collapsed = can_collapse ? window.collapse_if_possible(INEKernel::window(), Window::DimZ, &has_collapsed) : window;
846 
847  const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
848  const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
849 
850  Window slice = collapsed.first_slice_window_3D();
851  Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
852  Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
853 
854  Iterator input1(_input1, slice_input1);
855  Iterator input2(_input2, slice_input2);
856  Iterator output(_output, slice);
857 
858  if((_run_optimized_qasymm8) || (_func_quantized != nullptr))
859  {
860  if(_run_optimized_qasymm8)
861  {
862  const int32x4_t input1_voffset = vdupq_n_s32(_input1->info()->quantization_info().uniform().offset);
863  const float32x4_t input1_vscale = vdupq_n_f32(_input1->info()->quantization_info().uniform().scale);
864  const int32x4_t input2_voffset = vdupq_n_s32(_input2->info()->quantization_info().uniform().offset);
865  const float32x4_t input2_vscale = vdupq_n_f32(_input2->info()->quantization_info().uniform().scale);
866  const float32x4_t output_voffset = vdupq_n_f32(static_cast<float>(_output->info()->quantization_info().uniform().offset));
867  const float output_scale = _output->info()->quantization_info().uniform().scale;
868  const float32x4_t vinvscale = vdupq_n_f32(1.f / (output_scale / _scale));
869 
870  execute_window_loop(collapsed, [&](const Coordinates &)
871  {
872  mul_saturate_QASYMM8_QASYMM8_QASYMM8_n_opt(input1.ptr(), input2.ptr(), output.ptr(),
873  input1_vscale, input1_voffset, input2_vscale, input2_voffset, output_voffset, vinvscale);
874  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
875  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
876  },
877  input1, input2, output);
878  }
879  else
880  {
881  execute_window_loop(collapsed, [&](const Coordinates &)
882  {
883  (*_func_quantized)(input1.ptr(), input2.ptr(), output.ptr(), _scale,
884  _input1->info()->quantization_info().uniform(), _input2->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
885  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
886  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
887  },
888  input1, input2, output);
889  }
890  }
891  else if(_func_int != nullptr)
892  {
893  execute_window_loop(collapsed, [&](const Coordinates &)
894  {
895  (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
896  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
897  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
898  },
899  input1, input2, output);
900  }
901  else
902  {
903  ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
904  execute_window_loop(collapsed, [&](const Coordinates &)
905  {
906  (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
907  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
908  ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
909  },
910  input1, input2, output);
911  }
912 }
913 
914 BorderSize NEPixelWiseMultiplicationKernel::border_size() const
915 {
916  const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
917  const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
918  return BorderSize{ 0, border, 0, 0 };
919 }
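// Illustrative note: the border accounts for x-axis broadcasting. When one input
// has dimension(0) == 1 it must remain readable across a full 16-element step;
// e.g. with an output of width 24 and a broadcast input of width 1,
// replicateSize = 24 - 1 = 23 and border = min(16 - 1, 23) = 15 elements of
// right padding.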
920 
921 namespace
922 {
923 constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
924 
925 Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
926 {
929 
930  const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
931 
932  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
933 
934  // Validate in case of configured output
935  if(output->total_size() > 0)
936  {
938  ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
939  }
940 
941  return Status{};
942 }
943 
944 std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
945 {
946  const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
947  const TensorShape &out_shape = broadcast_pair.first;
948  const ValidRegion &valid_region = broadcast_pair.second;
949 
950  // Auto initialize output if not initialized
951  const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
952  auto_init_if_empty(*output, out_info);
953 
954  Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
955  Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
956  Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
957 
958  AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
959  AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
960  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);
961 
962  bool window_changed = update_window_and_padding(win_input1, input1_access)
963  || update_window_and_padding(win_input2, input2_access)
964  || update_window_and_padding(win, output_access);
965 
966  output_access.set_valid_region(win, valid_region);
967 
968  Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
969  return std::make_pair(err, win);
970 }
971 } // namespace
972 
973 NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
974  : _input1(nullptr), _input2(nullptr), _output(nullptr)
975 {
976 }
977 
978 void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
979 {
980  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
981  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
982 
983  // Configure kernel window
984  auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
985  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
986 
987  _input1 = input1;
988  _input2 = input2;
989  _output = output;
990 
991  // Create kernel
992  INEKernel::configure(win_config.second);
993 }
994 
995 Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
996 {
997  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
998  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
999  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
1000 
1001  return Status{};
1002 }
1003 
1004 void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
1005 {
1009 
1010  Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
1011  Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
1012  Iterator output(_output, window);
1013 
1014  execute_window_loop(window, [&](const Coordinates &)
1015  {
1016  c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
1017  },
1018  input1, input2, output);
1019 }
1020 
1021 BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
1022 {
1023  const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
1024  const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
1025  return { 0, border, 0, 0 };
1026 }
1027 } // namespace arm_compute
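A minimal usage sketch of the kernel defined above (for illustration only; the
header paths, the NEScheduler call and the chosen shapes/scale follow the usual
Compute Library conventions and are not taken from this file):

#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    const TensorShape shape(16U, 4U);

    // F32 * F32 -> F32 with scale = 1/4 (a supported 1/2^n scale)
    a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    out.allocator()->init(TensorInfo(shape, 1, DataType::F32));

    // Static check first, then configure (this also sets up the kernel window
    // and any padding requirements before the tensors are allocated)
    ARM_COMPUTE_ERROR_THROW_ON(NEPixelWiseMultiplicationKernel::validate(
        a.info(), b.info(), out.info(), 0.25f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));

    NEPixelWiseMultiplicationKernel mul;
    mul.configure(&a, &b, &out, 0.25f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    // ... fill a and b with data ...

    NEScheduler::get().schedule(&mul, Window::DimY);
    return 0;
}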