Compute Library 22.05
CpuMulKernel.cpp
1 /*
2  * Copyright (c) 2016-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 #include "src/core/CPP/Validate.h"
29 #include "src/core/NEON/NEAsymm.h"
30 #include "src/core/NEON/NESymm.h"
34 
35 #include <arm_neon.h>
36 
37 namespace arm_compute
38 {
39 namespace cpu
40 {
41 namespace kernels
42 {
43 namespace
44 {
45 const float scale255_constant = 1.f / 255.f;
46 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
47 const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
48 
49 inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
50 {
51  ARM_COMPUTE_UNUSED(overflow_policy);
52  ARM_COMPUTE_UNUSED(rounding_policy);
53 
62  if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
63  {
65  ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
66  }
67 
68  if(dst->total_size() > 0)
69  {
70  const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
71  ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
72  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
73  // clang-format off
74  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
75  !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
76  !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
77  !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
78  !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
79  !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
80  !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
81  , "Invalid data type combination");
82  // clang-format on
83  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
84  }
85 
86  if(std::abs(scale - scale255_constant) < 0.00001f)
87  {
89  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
90  "Scale == 1/255 is not supported if input and dst are of data type S32");
91  }
92  else
93  {
95 
96  int exponent = 0;
97  const float normalized_mantissa = std::frexp(scale, &exponent);
98 
99  // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
100  // frexp returns 0.5 as mantissa, so for scale = 1/2^n the exponent is in the range -14 <= e <= 1
101  // Moreover, it is at most 1 and becomes negative for n >= 2 as we deal with 1/2^n
102  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
103  }
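// Worked example for the check above (values are illustrative, not from the source):
//   scale = 1/8  -> std::frexp(0.125f, &e) returns mantissa 0.5f with e = -2, so validation passes
//                   and the integer kernels later shift right by 3;
//   scale = 0.3f -> frexp returns mantissa 0.6f, the test fails and an error Status is returned.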
104 
105  return Status{};
106 }
107 
108 /* Scales a given vector by 1/255.
109  *
110  * @note This does not work for all cases, e.g. for a float of 0.49999999999999994 or for large floats.
111  *
112  * @param in Input vector to scale.
113  * @return Scaled dst rounded to nearest (round half up).
114  */
115 inline int32x4_t scale255_S32_S32(int32x4_t in)
116 {
117  // Scale
118  const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
119  // Round to nearest (round half up)
120  // Add +0.5 for all values
121  // Afterwards vcvt rounds toward zero
122  return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
123 }
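// Example of the rounding above (illustrative): in = 383 -> 383 * (1/255) = 1.50196,
// adding 0.5 gives 2.00196 and vcvtq_s32_f32 truncates toward zero, so the result is 2 (round half up).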
124 
125 inline uint16x8_t scale255_U16_U16(uint16x8_t in)
126 {
127  const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
128  const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
129  return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
130 }
131 
132 template <typename T>
133 inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
134 vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
135 {
136  return vquantize_signed(val, info);
137 }
138 
139 template <typename T>
140 inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
141 vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
142 {
143  return vquantize(val, info);
144 }
145 
146 template <typename T>
147 void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
148 {
149  // Create input windows
150  Window win = window;
151  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
152  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
153 
154  // Clear X Dimension on execution window as we handle manually
155  win.set(Window::DimX, Window::Dimension(0, 1, 1));
156 
157  const int window_step_x = 16 / sizeof(T);
158  const auto window_start_x = static_cast<int>(window.x().start());
159  const auto window_end_x = static_cast<int>(window.x().end());
160  const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
161 
162  const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
163  const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
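// Note: folding the requested multiplication scale into the output quantization scale means that
// quantizing v with scale out_scale/scale equals quantizing v*scale with out_scale, so the vector
// and scalar paths below only need to multiply the dequantized inputs.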
164 
165  if(is_broadcast_across_x)
166  {
167  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
168  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
169  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
170  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
171  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
172  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
173  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
174 
175  // Clear X Dimension on execution window as we handle manually
176  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
177 
178  Iterator broadcast_input(broadcast_tensor, broadcast_win);
179  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
180  Iterator dst(out, win);
181 
182  using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
183 
184  execute_window_loop(win, [&](const Coordinates &)
185  {
186  const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
187  const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
188 
189  const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
190  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
191 
192  // Compute window_step_x elements per iteration
193  int x = window_start_x;
194  for(; x <= (window_end_x - window_step_x); x += window_step_x)
195  {
196  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
197 
198  // Dequantize inputs
199  const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
200  const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
201 
202  const float32x4x4_t out_f32x4x4 =
203  {
204  vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
205  vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
206  vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
207  vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
208  };
209 
210  // Quantize dst
211  const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
212  wrapper::vstore(output_ptr + x, result);
213  }
214 
215  // Compute left-over elements
216  for(; x < window_end_x; ++x)
217  {
218  // Dequantize inputs
219  const T src1 = *(non_broadcast_input_ptr + x);
220  const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
221  const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
222  const float tmp_f = tmp_in1 * tmp_in2;
223 
224  // Quantize dst
225  const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
226  *(output_ptr + x) = tmp_qua;
227  }
228  },
229  broadcast_input, non_broadcast_input, dst);
230  }
231  else
232  {
233  const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
234  const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
235 
236  // Clear X Dimension on execution window as we handle manually
237  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
238  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
239 
240  Iterator input1(src1, input1_win);
241  Iterator input2(src2, input2_win);
242  Iterator dst(out, win);
243 
244  execute_window_loop(win, [&](const Coordinates &)
245  {
246  const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
247  const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
248  const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
249 
250  // Compute window_step_x elements per iteration
251  int x = window_start_x;
252  for(; x <= (window_end_x - window_step_x); x += window_step_x)
253  {
254  const auto input1_q = wrapper::vloadq(input1_ptr + x);
255  const auto input2_q = wrapper::vloadq(input2_ptr + x);
256 
257  // Dequantize inputs
258  const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
259  const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
260 
261  const float32x4x4_t out_f32x4x4 =
262  {
263  vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
264  vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
265  vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
266  vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
267  };
268 
269  // Quantize dst
270  const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
271  wrapper::vstore(output_ptr + x, result);
272  }
273 
274  // Compute left-over elements
275  for(; x < window_end_x; ++x)
276  {
277  // Dequantize inputs
278  const T src1 = *(input1_ptr + x);
279  const T src2 = *(input2_ptr + x);
280  const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
281  const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
282  const float tmp_f = tmp_in1 * tmp_in2;
283 
284  // Quantize dst
285  const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
286  *(output_ptr + x) = tmp_qua;
287  }
288  },
289  input1, input2, dst);
290  }
291 }
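// Reference semantics of the kernel above for one element (an editorial sketch, not part of the source):
//   dst_q = quantize(dequantize(a_q, qa) * dequantize(b_q, qb) * scale, q_out)
// where quantize/dequantize use the uniform (scale, offset) parameters of the respective tensors.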
292 
293 void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
294 {
295  const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
296  const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
297  const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
298 
299  // Create input windows
300  Window win = window;
301  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
302  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
303 
304  // Clear X Dimension on execution window as we handle manually
305  win.set(Window::DimX, Window::Dimension(0, 1, 1));
306  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
307  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
308 
309  Iterator input1(src1, input1_win);
310  Iterator input2(src2, input2_win);
311  Iterator dst(out, win);
312 
313  const int window_step_x = 16;
314  const auto window_start_x = static_cast<int>(window.x().start());
315  const auto window_end_x = static_cast<int>(window.x().end());
316 
317  const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
318 
319  execute_window_loop(win, [&](const Coordinates &)
320  {
321  const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
322  const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
323  const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
324 
325  // Compute window_step_x elements per iteration
326  int x = window_start_x;
327  for(; x <= (window_end_x - window_step_x); x += window_step_x)
328  {
329  const qsymm16x8x2_t input1_q =
330  {
331  {
332  vld1q_s16(input1_ptr + x),
333  vld1q_s16(input1_ptr + x + 8),
334  }
335  };
336  const qsymm16x8x2_t input2_q =
337  {
338  {
339  vld1q_s16(input2_ptr + x),
340  vld1q_s16(input2_ptr + x + 8),
341  }
342  };
343 
344  // Dequantize inputs
345  const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
346  const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
347 
348  const float32x4x4_t out_f32x4x4 =
349  {
350  vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
351  vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
352  vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
353  vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
354  };
355 
356  const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
357  vst1q_s16(output_ptr + x, result.val[0]);
358  vst1q_s16(output_ptr + x + 8, result.val[1]);
359  }
360 
361  // Compute left-over elements
362  for(; x < window_end_x; ++x)
363  {
364  // Dequantize inputs
365  float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
366  float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
367  float tmp_f = tmp_in1 * tmp_in2;
368 
369  // Quantize dst; lrintf() rounds to nearest
370  int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
371  qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
372  *(output_ptr + x) = tmp_qua;
373  }
374  },
375  input1, input2, dst);
376 }
377 
378 void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
379 {
380  ARM_COMPUTE_UNUSED(scale);
381 
382  // Create input windows
383  Window win = window;
384  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
385  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
386 
387  // Clear X Dimension on execution window as we handle manually
388  win.set(Window::DimX, Window::Dimension(0, 1, 1));
389  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
390  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
391 
392  Iterator input1(src1, input1_win);
393  Iterator input2(src2, input2_win);
394  Iterator dst(out, win);
395 
396  const int window_step_x = 16;
397  const auto window_start_x = static_cast<int>(window.x().start());
398  const auto window_end_x = static_cast<int>(window.x().end());
399 
400  execute_window_loop(win, [&](const Coordinates &)
401  {
402  const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
403  const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
404  const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
405 
406  // Compute window_step_x elements per iteration
407  int x = window_start_x;
408  for(; x <= (window_end_x - window_step_x); x += window_step_x)
409  {
410  const qsymm16x8x2_t input1_q =
411  {
412  {
413  vld1q_s16(input1_ptr + x),
414  vld1q_s16(input1_ptr + x + 8),
415  }
416  };
417  const qsymm16x8x2_t input2_q =
418  {
419  {
420  vld1q_s16(input2_ptr + x),
421  vld1q_s16(input2_ptr + x + 8),
422  }
423  };
424 
425  const int32x4x4_t in1_s32 =
426  {
427  {
428  vmovl_s16(vget_low_s16(input1_q.val[0])),
429  vmovl_s16(vget_high_s16(input1_q.val[0])),
430  vmovl_s16(vget_low_s16(input1_q.val[1])),
431  vmovl_s16(vget_high_s16(input1_q.val[1])),
432  }
433  };
434  const int32x4x4_t in2_s32 =
435  {
436  {
437  vmovl_s16(vget_low_s16(input2_q.val[0])),
438  vmovl_s16(vget_high_s16(input2_q.val[0])),
439  vmovl_s16(vget_low_s16(input2_q.val[1])),
440  vmovl_s16(vget_high_s16(input2_q.val[1])),
441  }
442  };
443 
444  const int32x4x4_t result =
445  {
446  {
447  vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
448  vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
449  vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
450  vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
451  }
452  };
453 
454  vst1q_s32(output_ptr + x, result.val[0]);
455  vst1q_s32(output_ptr + x + 4, result.val[1]);
456  vst1q_s32(output_ptr + x + 8, result.val[2]);
457  vst1q_s32(output_ptr + x + 12, result.val[3]);
458  }
459 
460  // Compute left-over elements
461  for(; x < window_end_x; ++x)
462  {
463  int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
464  *(output_ptr + x) = tmp;
465  }
466  },
467  input1, input2, dst);
468 }
469 
470 template <bool is_scale255, bool is_sat>
471 void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
472 {
473  // Create input windows
474  Window win = window;
475  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
476  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
477 
478  // Clear X Dimension on execution window as we handle manually
479  win.set(Window::DimX, Window::Dimension(0, 1, 1));
480  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
481  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
482 
483  Iterator input1(src1, input1_win);
484  Iterator input2(src2, input2_win);
485  Iterator dst(out, win);
486 
487  const int window_step_x = 16 / sizeof(uint8_t);
488  const auto window_start_x = static_cast<int>(window.x().start());
489  const auto window_end_x = static_cast<int>(window.x().end());
490 
491  execute_window_loop(win, [&](const Coordinates &)
492  {
493  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
494  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
495  const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
496 
497  // Compute window_step_x elements per iteration
498  int x = window_start_x;
499  for(; x <= (window_end_x - window_step_x); x += window_step_x)
500  {
501  const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
502  const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
503 
504  uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
505  const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
506  uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
507  const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
508 
509  tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
510  tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
511 
512  if(is_scale255)
513  {
514  tmp1_high = scale255_U16_U16(tmp1_high);
515  tmp1_low = scale255_U16_U16(tmp1_low);
516  }
517  else
518  {
519  const int16x8_t vn = vdupq_n_s16(-n);
520 
521  if(is_sat)
522  {
523  tmp1_high = vqshlq_u16(tmp1_high, vn);
524  tmp1_low = vqshlq_u16(tmp1_low, vn);
525  }
526  else
527  {
528  tmp1_high = vshlq_u16(tmp1_high, vn);
529  tmp1_low = vshlq_u16(tmp1_low, vn);
530  }
531  }
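// Note: NEON vshlq_u16/vqshlq_u16 shift left by a signed per-lane amount, so duplicating -n
// performs a right shift by n.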
532  if(is_sat)
533  {
534  vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
535  }
536  else
537  {
538  vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
539  }
540  }
541 
542  // Compute left-over elements
543  for(; x < window_end_x; ++x)
544  {
545  uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
546 
547  if(is_scale255)
548  {
549  float tmp_f = static_cast<float>(tmp) * scale255_constant;
550  tmp = static_cast<uint16_t>(tmp_f + 0.5f);
551  }
552  else
553  {
554  tmp >>= n;
555  }
556  if(is_sat && tmp > 255)
557  {
558  tmp = 255;
559  }
560  *(output_ptr + x) = static_cast<uint8_t>(tmp);
561  }
562  },
563  input1, input2, dst);
564 }
565 
566 template <bool is_scale255, bool is_sat>
567 inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
568 {
569  int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
570  const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
571  int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
572  const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
573 
574  tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
575  tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
576 
577  if(is_scale255)
578  {
579  tmp1_high = scale255_S32_S32(tmp1_high);
580  tmp1_low = scale255_S32_S32(tmp1_low);
581  }
582  else
583  {
584  // Right shift amount
585  const int32x4_t vn = vdupq_n_s32(-n);
586  // Left shift amount
587  const int32x4_t vnl = vdupq_n_s32(n);
588  // Calculate conversion bit
589  const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
590  const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
591  const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
592  const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
593  const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
594  const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
595  const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
596  const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
597  if(is_sat)
598  {
599  tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
600  tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
601  }
602  else
603  {
604  tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
605  tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
606  }
607  }
608 
609  if(is_sat)
610  {
611  return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
612  }
613  else
614  {
615  return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
616  }
617 }
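// The "conversion bit" above implements rounding toward zero for the arithmetic right shift:
// sign is the extracted sign bit, so convert = (sign << n) - sign = sign * (2^n - 1), and for
// negative products adding it before the shift gives, e.g. with n = 1, (-5 + 1) >> 1 = -2
// instead of -5 >> 1 = -3.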
618 
619 template <bool is_scale255, bool is_sat>
620 inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
621 {
622  const int16x8x2_t result =
623  {
624  {
625  // First 8 elements
626  mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
627  // Second 8 elements
628  mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
629  }
630  };
631 
632  return result;
633 }
634 
635 template <bool is_scale255, bool is_sat>
636 void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
637 {
638  // Create input windows
639  Window win = window;
640  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
641  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
642 
643  // Clear X Dimension on execution window as we handle manually
644  win.set(Window::DimX, Window::Dimension(0, 1, 1));
645  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
646  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
647 
648  Iterator input1(src1, input1_win);
649  Iterator input2(src2, input2_win);
650  Iterator dst(out, win);
651 
652  const int window_step_x = 16;
653  const auto window_start_x = static_cast<int>(window.x().start());
654  const auto window_end_x = static_cast<int>(window.x().end());
655 
656  execute_window_loop(win, [&](const Coordinates &)
657  {
658  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
659  const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
660  const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
661 
662  // Compute window_step_x elements per iteration
663  int x = window_start_x;
664  for(; x <= (window_end_x - window_step_x); x += window_step_x)
665  {
666  const int16x8x2_t ta1 =
667  {
668  {
669  vld1q_s16(input1_ptr + x),
670  vld1q_s16(input1_ptr + x + 8),
671  }
672  };
673  const int16x8x2_t ta2 =
674  {
675  {
676  vld1q_s16(input2_ptr + x),
677  vld1q_s16(input2_ptr + x + 8),
678  }
679  };
680  const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
681 
682  vst1q_s16(output_ptr + x, result.val[0]);
683  vst1q_s16(output_ptr + x + 8, result.val[1]);
684  }
685 
686  // Compute left-over elements
687  for(; x < window_end_x; ++x)
688  {
689  int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
690 
691  if(is_scale255)
692  {
693  float tmp_f = static_cast<float>(tmp) * scale255_constant;
694 
695  tmp = static_cast<int32_t>(tmp_f + 0.5f);
696  }
697  else
698  {
699  if(tmp >= 0)
700  {
701  tmp >>= n;
702  }
703  else
704  {
705  uint32_t mask = (1u << n) - 1;
706  tmp = (tmp + static_cast<int32_t>(mask)) >> n;
707  }
708  }
709  if(is_sat)
710  {
711  tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
712  }
713  *(output_ptr + x) = static_cast<int16_t>(tmp);
714  }
715  },
716  input1, input2, dst);
717 }
718 
719 template <bool is_sat>
720 inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
721 {
722  const int32x2_t input1_1 = vget_low_s32(src1);
723  const int32x2_t input2_1 = vget_low_s32(src2);
724  const int32x2_t input1_2 = vget_high_s32(src1);
725  const int32x2_t input2_2 = vget_high_s32(src2);
726 
727  int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
728  int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
729 
730  // Apply scaling, conversion and rounding (round to zero)
731  // Right shift amount
732  const int64x2_t vn = vdupq_n_s64(-n);
733  // Left shift amount
734  const int64x2_t vnl = vdupq_n_s64(n);
735  // Calculate conversion bit
736  const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
737  const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
738  const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
739  const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
740 
741  const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
742  const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
743  const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
744  const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
745  if(is_sat)
746  {
747  tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
748  tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
749  return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
750  }
751  else
752  {
753  tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
754  tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
755  return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
756  }
757 }
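// Same rounding-toward-zero trick as mul_S16_S16_S16_n_loop, but the products are widened to
// 64 bits first (vmull_s32) so multiplying two int32 values cannot overflow before the shift.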
758 
759 template <bool is_sat>
760 inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
761 {
762  const int32x4x2_t result =
763  {
764  {
765  // First 4 elements
766  mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
767  // Second 4 elements
768  mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
769  }
770  };
771 
772  return result;
773 }
774 
775 template <bool is_sat>
776 void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
777 {
778  // Create input windows
779  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
780  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
781 
782  // Clear X Dimension on execution window as we handle manually
783  Window win = window;
784  win.set(Window::DimX, Window::Dimension(0, 1, 1));
785 
786  const int window_step_x = 8;
787  const auto window_start_x = static_cast<int>(window.x().start());
788  const auto window_end_x = static_cast<int>(window.x().end());
789  const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
790 
791  if(is_broadcast_across_x)
792  {
793  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
794  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
795  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
796  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
797  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
798 
799  // Clear X Dimension on execution window as we handle manually
800  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
801 
802  Iterator broadcast_input(broadcast_tensor, broadcast_win);
803  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
804  Iterator dst(out, win);
805 
806  execute_window_loop(win, [&](const Coordinates &)
807  {
808  const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
809  const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
810 
811  const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
812  const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
813 
814  // Compute window_step_x elements per iteration
815  int x = window_start_x;
816  for(; x <= (window_end_x - window_step_x); x += window_step_x)
817  {
818  const int32x4x2_t broadcast_v =
819  {
820  {
821  broadcast_value_vec,
822  broadcast_value_vec,
823  }
824  };
825  const int32x4x2_t non_broadcast_v =
826  {
827  {
828  vld1q_s32(non_broadcast_input_ptr + x),
829  vld1q_s32(non_broadcast_input_ptr + x + 4),
830  }
831  };
832  const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
833 
834  vst1q_s32(output_ptr + x, result.val[0]);
835  vst1q_s32(output_ptr + x + 4, result.val[1]);
836  }
837 
838  // Compute left-over elements
839  for(; x < window_end_x; ++x)
840  {
841  int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
842 
843  if(tmp >= 0)
844  {
845  tmp >>= n;
846  }
847  else
848  {
849  uint64_t mask = ((uint64_t)1u << n) - 1;
850  tmp = (tmp + static_cast<int64_t>(mask)) >> n;
851  }
852  if(is_sat)
853  {
854  tmp = utility::clamp<int64_t, int32_t>(tmp);
855  }
856  *(output_ptr + x) = static_cast<int32_t>(tmp);
857  }
858  },
859  broadcast_input, non_broadcast_input, dst);
860  }
861  else
862  {
863  // Clear X Dimension on execution window as we handle manually
864  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
865  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
866 
867  Iterator input1(src1, input1_win);
868  Iterator input2(src2, input2_win);
869  Iterator dst(out, win);
870 
871  execute_window_loop(win, [&](const Coordinates &)
872  {
873  const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
874  const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
875  const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
876 
877  // Compute window_step_x elements per iteration
878  int x = window_start_x;
879  for(; x <= (window_end_x - window_step_x); x += window_step_x)
880  {
881  const int32x4x2_t ta1 =
882  {
883  {
884  vld1q_s32(input1_ptr + x),
885  vld1q_s32(input1_ptr + x + 4),
886  }
887  };
888  const int32x4x2_t ta2 =
889  {
890  {
891  vld1q_s32(input2_ptr + x),
892  vld1q_s32(input2_ptr + x + 4),
893  }
894  };
895  const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
896 
897  vst1q_s32(output_ptr + x, result.val[0]);
898  vst1q_s32(output_ptr + x + 4, result.val[1]);
899  }
900 
901  // Compute left-over elements
902  for(; x < window_end_x; ++x)
903  {
904  int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
905 
906  if(tmp >= 0)
907  {
908  tmp >>= n;
909  }
910  else
911  {
912  uint64_t mask = ((uint64_t)1u << n) - 1;
913  tmp = (tmp + static_cast<int64_t>(mask)) >> n;
914  }
915  if(is_sat)
916  {
917  tmp = utility::clamp<int64_t, int32_t>(tmp);
918  }
919  *(output_ptr + x) = static_cast<int32_t>(tmp);
920  }
921  },
922  input1, input2, dst);
923  }
924 }
925 
926 void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
927 {
928  // Create input windows
929  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
930  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
931 
932  // Clear X Dimension on execution window as we handle manually
933  Window win = window;
934  win.set(Window::DimX, Window::Dimension(0, 1, 1));
935 
936  constexpr int window_step_x = 16 / sizeof(float);
937  const auto window_start_x = static_cast<int>(window.x().start());
938  const auto window_end_x = static_cast<int>(window.x().end());
939  const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
940 
941  using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
942 
943  if(is_broadcast_across_x)
944  {
945  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
946  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
947  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
948  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
949  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
950 
951  // Clear X Dimension on execution window as we handle manually
952  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
953 
954  Iterator broadcast_input(broadcast_tensor, broadcast_win);
955  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
956  Iterator dst(out, win);
957 
958  execute_window_loop(win, [&](const Coordinates &)
959  {
960  const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
961  const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
962 
963  const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
964  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
965  const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
966 
967  // Compute window_step_x elements per iteration
968  int x = window_start_x;
969  for(; x <= (window_end_x - window_step_x); x += window_step_x)
970  {
971  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
972  auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
973  wrapper::vstore(output_ptr + x, res);
974  }
975 
976  // Compute left-over elements
977  for(; x < window_end_x; ++x)
978  {
979  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
980  *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
981  }
982  },
983  broadcast_input, non_broadcast_input, dst);
984  }
985  else
986  {
987  // Clear X Dimension on execution window as we handle manually
988  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
989  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
990 
991  Iterator input1(src1, input1_win);
992  Iterator input2(src2, input2_win);
993  Iterator dst(out, win);
994 
995  execute_window_loop(win, [&](const Coordinates &)
996  {
997  const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
998  const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
999  const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
1000 
1001  // Compute window_step_x elements per iteration
1002  int x = window_start_x;
1003  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1004  {
1005  const auto ta1 = wrapper::vloadq(input1_ptr + x);
1006  const auto ta2 = wrapper::vloadq(input2_ptr + x);
1007  const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1008  const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
1009  wrapper::vstore(output_ptr + x, res);
1010  }
1011 
1012  // Compute left-over elements
1013  for(; x < window_end_x; ++x)
1014  {
1015  const auto ta1 = *(input1_ptr + x);
1016  const auto ta2 = *(input2_ptr + x);
1017  *(output_ptr + x) = ta1 * ta2 * scale;
1018  }
1019  },
1020  input1, input2, dst);
1021  }
1022 }
1023 
1024 void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
1025 {
1026  // Create input windows
1027  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1028  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
1029 
1030  // Clear X Dimension on execution window as we handle manually
1031  Window win = window;
1032  win.set(Window::DimX, Window::Dimension(0, 1, 1));
1033 
1034  constexpr int window_step_x = 8 / sizeof(float);
1035  const auto window_start_x = static_cast<int>(window.x().start());
1036  const auto window_end_x = static_cast<int>(window.x().end());
1037  const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
1038 
1039  using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
1040 
1041  if(is_broadcast_across_x)
1042  {
1043  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1044  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1045  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
1046  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1047  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
1048 
1049  // Clear X Dimension on execution window as we handle manually
1050  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1051 
1052  Iterator broadcast_input(broadcast_tensor, broadcast_win);
1053  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
1054  Iterator dst(out, win);
1055 
1056  execute_window_loop(win, [&](const Coordinates &)
1057  {
1058  const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
1059  const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
1060 
1061  const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1062 
1063  // Compute window_step_x elements per iteration
1064  int x = window_start_x;
1065  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1066  {
1067  const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
1068  float32x4_t b = vdupq_n_f32(broadcast_value);
1069 
1070  const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1071  const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1072  const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1073  const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1074  const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1075 
1076  const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1077  const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1078 
1079  float32x4_t res = wrapper::vmul(tmp0, b);
1080  b = wrapper::vmul(b, mask);
1081 
1082  res = wrapper::vmla(res, tmp1, b);
1083  wrapper::vstore(output_ptr + 2 * x, res);
1084  }
1085 
1086  // Compute left-over elements
1087  for(; x < window_end_x; ++x)
1088  {
1089  const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
1090  const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
1091  auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
1092  auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
1093  *(output_ptr + 2 * x) = res1;
1094  *(output_ptr + 2 * x + 1) = res2;
1095  }
1096  },
1097  broadcast_input, non_broadcast_input, dst);
1098  }
1099  else
1100  {
1101  // Clear X Dimension on execution window as we handle manually
1102  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1103  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1104 
1105  Iterator input1(src1, input1_win);
1106  Iterator input2(src2, input2_win);
1107  Iterator dst(out, win);
1108 
1109  execute_window_loop(win, [&](const Coordinates &)
1110  {
1111  const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1112  const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
1113  const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
1114 
1115  // Compute window_step_x elements per iteration
1116  int x = window_start_x;
1117  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1118  {
1119  const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
1120  float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
1121 
1122  const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1123  const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1124  const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1125  const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1126  const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1127 
1128  const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1129  const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1130 
1131  float32x4_t res = wrapper::vmul(tmp0, b);
1132 
1133  b = wrapper::vrev64(b);
1134  b = wrapper::vmul(b, mask);
1135 
1136  res = wrapper::vmla(res, tmp1, b);
1137  wrapper::vstore(output_ptr + 2 * x, res);
1138  }
1139 
1140  // Compute left-over elements
1141  for(; x < window_end_x; ++x)
1142  {
1143  const auto a0 = *(input1_ptr + 2 * x);
1144  const auto a1 = *(input1_ptr + 2 * x + 1);
1145  const auto b0 = *(input2_ptr + 2 * x);
1146  const auto b1 = *(input2_ptr + 2 * x + 1);
1147  auto res1 = a0 * b0 - a1 * b1;
1148  auto res2 = a0 * b1 + a1 * b0;
1149  *(output_ptr + 2 * x) = res1;
1150  *(output_ptr + 2 * x + 1) = res2;
1151  }
1152  },
1153  input1, input2, dst);
1154  }
1155 }
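// The vector path above computes the complex product (a0 + i*a1) * (b0 + i*b1)
//   = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0)
// per pair of floats: tmp0 duplicates the real parts of a, tmp1 the imaginary parts, vrev64
// swaps (b0, b1) and the {-1, 1} mask supplies the sign of the -a1*b1 term. In the broadcast
// branch b holds the same value in every lane, so the vrev64 step is unnecessary.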
1156 
1157 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1158 void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
1159 {
1160  // Create input windows
1161  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1162  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
1163 
1164  // Clear X Dimension on execution window as we handle manually
1165  Window win = window;
1166  win.set(Window::DimX, Window::Dimension(0, 1, 1));
1167  constexpr int window_step_x = 16;
1168  const auto window_start_x = static_cast<int>(window.x().start());
1169  const auto window_end_x = static_cast<int>(window.x().end());
1170  const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
1171  if(is_broadcast_across_x)
1172  {
1173  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1174  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1175  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
1176  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1177  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
1178  // Clear X Dimension on execution window as we handle manually
1179  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1180  Iterator broadcast_input(broadcast_tensor, broadcast_win);
1181  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
1182  Iterator dst(out, win);
1183  execute_window_loop(win, [&](const Coordinates &)
1184  {
1185  const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
1186  const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
1187  const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
1188  const float16x8x2_t broadcast_value_vec =
1189  {
1190  {
1191  vdupq_n_f16(broadcast_value),
1192  vdupq_n_f16(broadcast_value),
1193  }
1194  };
1195  const auto scale_vec = vdupq_n_f16(scale);
1196  // Compute window_step_x elements per iteration
1197  int x = window_start_x;
1198  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1199  {
1200  const float16x8x2_t non_broadcast_v =
1201  {
1202  {
1203  vld1q_f16(non_broadcast_input_ptr + x),
1204  vld1q_f16(non_broadcast_input_ptr + x + 8),
1205  }
1206  };
1207  const float16x8x2_t result =
1208  {
1209  {
1210  vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
1211  vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
1212  }
1213  };
1214  vst1q_f16(output_ptr + x, result.val[0]);
1215  vst1q_f16(output_ptr + x + 8, result.val[1]);
1216  }
1217  // Compute left-over elements
1218  for(; x < window_end_x; ++x)
1219  {
1220  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1221  *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1222  }
1223  },
1224  broadcast_input, non_broadcast_input, dst);
1225  }
1226  else
1227  {
1228  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1229  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1230  Iterator input1(src1, input1_win);
1231  Iterator input2(src2, input2_win);
1232  Iterator dst(out, win);
1233  execute_window_loop(win, [&](const Coordinates &)
1234  {
1235  const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
1236  const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
1237  const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
1238  // Compute window_step_x elements per iteration
1239  int x = window_start_x;
1240  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1241  {
1242  const float16x8x2_t ta1 =
1243  {
1244  {
1245  vld1q_f16(input1_ptr + x),
1246  vld1q_f16(input1_ptr + x + 8),
1247  }
1248  };
1249  const float16x8x2_t ta2 =
1250  {
1251  {
1252  vld1q_f16(input2_ptr + x),
1253  vld1q_f16(input2_ptr + x + 8),
1254  }
1255  };
1256  const float16x8_t scale_vec = vdupq_n_f16(scale);
1257  const float16x8x2_t result =
1258  {
1259  {
1260  vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
1261  vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
1262  }
1263  };
1264  vst1q_f16(output_ptr + x, result.val[0]);
1265  vst1q_f16(output_ptr + x + 8, result.val[1]);
1266  }
1267  // Compute left-over elements
1268  for(; x < window_end_x; ++x)
1269  {
1270  const auto ta1 = *(input1_ptr + x);
1271  const auto ta2 = *(input2_ptr + x);
1272  *(output_ptr + x) = ta1 * ta2 * scale;
1273  }
1274  },
1275  input1, input2, dst);
1276  }
1277 }
1278 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
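// The FP16 path above is compiled only when the toolchain defines __ARM_FEATURE_FP16_VECTOR_ARITHMETIC,
// i.e. when native float16 vector arithmetic (vmulq_f16 and friends) is available on the target.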
1279 
1280 template <bool is_scale255, bool is_sat>
1281 void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
1282 {
1283  // Create input windows
1284  Window win = window;
1285  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1286  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
1287 
1288  // Clear X Dimension on execution window as we handle manually
1289  win.set(Window::DimX, Window::Dimension(0, 1, 1));
1290  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1291  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1292 
1293  Iterator input1(src1, input1_win);
1294  Iterator input2(src2, input2_win);
1295  Iterator dst(out, win);
1296 
1297  const int window_step_x = 16 / sizeof(uint8_t);
1298  const auto window_start_x = static_cast<int>(window.x().start());
1299  const auto window_end_x = static_cast<int>(window.x().end());
1300 
1301  execute_window_loop(win, [&](const Coordinates &)
1302  {
1303  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
1304  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
1305  const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
1306 
1307  // Compute window_step_x elements per iteration
1308  int x = window_start_x;
1309  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1310  {
1311  const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
1312  const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
1313 
1314  uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
1315  uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
1316  tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
1317  tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
1318 
1319  if(is_scale255)
1320  {
1321  tmp_low = scale255_U16_U16(tmp_low);
1322  tmp_high = scale255_U16_U16(tmp_high);
1323  }
1324  else
1325  {
1326  const int16x8_t vn = vdupq_n_s16(-n);
1327 
1328  if(is_sat)
1329  {
1330  tmp_low = vqshlq_u16(tmp_low, vn);
1331  tmp_high = vqshlq_u16(tmp_high, vn);
1332  }
1333  else
1334  {
1335  tmp_low = vshlq_u16(tmp_low, vn);
1336  tmp_high = vshlq_u16(tmp_high, vn);
1337  }
1338  }
1339 
1340  if(is_sat)
1341  {
1342  static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
1343 
1344  tmp_low = vminq_u16(tmp_low, max);
1345  tmp_high = vminq_u16(tmp_high, max);
1346  }
1347 
1348  vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
1349  vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
1350  }
1351 
1352  // Compute left-over elements
1353  for(; x < window_end_x; ++x)
1354  {
1355  int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1356 
1357  if(is_scale255)
1358  {
1359  float tmp_f = static_cast<float>(tmp) * scale255_constant;
1360  tmp = static_cast<int32_t>(tmp_f + 0.5f);
1361  }
1362  else
1363  {
1364  tmp >>= n;
1365  }
1366 
1367  if(is_sat)
1368  {
1369  tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
1370  }
1371 
1372  *(output_ptr + x) = static_cast<int16_t>(tmp);
1373  }
1374  },
1375  input1, input2, dst);
1376 }
1377 
1378 template <bool is_scale255, bool is_sat>
1379 void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
1380 {
1381  // Create input windows
1382  Window win = window;
1383  Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1384  Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
1385 
1386  // Clear X Dimension on execution window as we handle manually
1387  win.set(Window::DimX, Window::Dimension(0, 1, 1));
1388  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1389  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1390 
1391  Iterator input1(src1, input1_win);
1392  Iterator input2(src2, input2_win);
1393  Iterator dst(out, win);
1394 
1395  const int window_step_x = 16;
1396  const auto window_start_x = static_cast<int>(window.x().start());
1397  const auto window_end_x = static_cast<int>(window.x().end());
1398 
1399  execute_window_loop(win, [&](const Coordinates &)
1400  {
1401  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
1402  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
1403  const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
1404 
1405  // Compute window_step_x elements per iteration
1406  int x = window_start_x;
1407  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1408  {
1409  const int16x8x2_t ta1 =
1410  {
1411  {
1412  vld1q_s16(input1_ptr + x),
1413  vld1q_s16(input1_ptr + x + 8),
1414  }
1415  };
1416  const uint8x8x2_t ta2u =
1417  {
1418  {
1419  vld1_u8(input2_ptr + x),
1420  vld1_u8(input2_ptr + x + 8),
1421  }
1422  };
1423  const int16x8x2_t ta2 =
1424  {
1425  {
1426  vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
1427  vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
1428  }
1429  };
1430 
1431  const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
1432 
1433  vst1q_s16(output_ptr + x, result.val[0]);
1434  vst1q_s16(output_ptr + x + 8, result.val[1]);
1435  }
1436 
1437  // Compute left-over elements
1438  for(; x < window_end_x; ++x)
1439  {
1440  int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1441 
1442  if(is_scale255)
1443  {
1444  float tmp_f = static_cast<float>(tmp) * scale255_constant;
1445 
1446  tmp = static_cast<int32_t>(tmp_f + 0.5f);
1447  }
1448  else
1449  {
1450  if(tmp >= 0)
1451  {
1452  tmp >>= n;
1453  }
1454  else
1455  {
1456  uint32_t mask = (1u << n) - 1;
1457  tmp = (tmp + static_cast<int32_t>(mask)) >> n;
1458  }
1459  }
1460  if(is_sat)
1461  {
1462  tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
1463  }
1464  *(output_ptr + x) = static_cast<int16_t>(tmp);
1465  }
1466  },
1467  input1, input2, dst);
1468 }
1469 
1470 template <bool is_scale255, bool is_sat>
1471 void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
1472 {
1473  // Simply swap the two input buffers
1474  mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
1475 }
1476 } // namespace
1477 
1478 void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
1479 {
1480  ARM_COMPUTE_UNUSED(rounding_policy);
1481  ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1482 
1483  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
1484 
1485  const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
1486 
1487  // Auto initialize dst if not initialized
1488  set_shape_if_empty(*dst, out_shape);
1489 
1490  _scale = scale;
1491  _scale_exponent = 0;
1492  _func_quantized = nullptr;
1493  _func_int = nullptr;
1494  _func_float = nullptr;
1495 
1496  bool is_scale_255 = false;
1497  // Check and validate scaling factor
1498  if(std::abs(scale - scale255_constant) < 0.00001f)
1499  {
1500  is_scale_255 = true;
1501  }
1502  else
1503  {
1504  int exponent = 0;
1505 
1506  std::frexp(scale, &exponent);
1507 
1508  // Store the positive exponent. We know that we compute 1/2^n
1509  // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
1510  _scale_exponent = std::abs(exponent - 1);
1511  }
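// Example (illustrative): scale = 1/4 -> frexp gives exponent -1, so _scale_exponent = |-1 - 1| = 2
// and the integer kernels shift the widened product right by 2, i.e. divide by 4.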
1512 
1513  const DataType dt_input1 = src1->data_type();
1514  const DataType dt_input2 = src2->data_type();
1515  const DataType dt_output = dst->data_type();
1516  const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1517 
1518  switch(dt_input1)
1519  {
1520  case DataType::QASYMM8:
1521  if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
1522  {
1523  _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1524  }
1525  break;
1526  case DataType::QASYMM8_SIGNED:
1527  if(dt_input2 == DataType::QASYMM8_SIGNED)
1528  {
1529  _func_quantized = &mul_saturate_quantized_8<int8_t>;
1531  }
1532  break;
1533  case DataType::QSYMM16:
1534  if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
1535  {
1536  _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1537  }
1538  else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
1539  {
1540  _func_int = &mul_QSYMM16_QSYMM16_S32;
1541  }
1542  break;
1543  case DataType::S16:
1544  if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1545  {
1546  if(is_scale_255)
1547  {
1548  _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1549  }
1550  else
1551  {
1552  _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1553  }
1554  }
1555  if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1556  {
1557  if(is_scale_255)
1558  {
1559  _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1560  }
1561  else
1562  {
1563  _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1564  }
1565  }
1566  break;
1567  case DataType::S32:
1568  if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
1569  {
1570  _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1571  }
1572  break;
1573  case DataType::U8:
1574  if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
1575  {
1576  if(is_scale_255)
1577  {
1578  _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1579  }
1580  else
1581  {
1582  _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1583  }
1584  }
1585  else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1586  {
1587  if(is_scale_255)
1588  {
1589  _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1590  }
1591  else
1592  {
1593  _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1594  }
1595  }
1596  else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1597  {
1598  if(is_scale_255)
1599  {
1600  _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1601  }
1602  else
1603  {
1604  _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1605  }
1606  }
1607  break;
1608 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1609  case DataType::F16:
1610  _func_float = &mul_F16_F16_F16;
1611  break;
1612 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1613  case DataType::F32:
1614  _func_float = &mul_F32_F32_F32;
1615  break;
1616  default:
1617  ARM_COMPUTE_ERROR("You called with the wrong img formats");
1618  }
1619 
1620  // Configure kernel window
1621  Window win = calculate_max_window(out_shape);
1622 
1623  ICpuKernel::configure(win);
1624 }
1625 
1626 Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
1627  RoundingPolicy rounding_policy)
1628 {
1629  ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1630  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
1631 
1632  return Status{};
1633 }
1634 
1635 void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
1636 {
1637  ARM_COMPUTE_UNUSED(info);
1638  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1639  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
1640 
1641  auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1642  auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1643  auto dst = tensors.get_tensor(TensorType::ACL_DST);
1644 
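  // Dispatch to the function pointer selected in configure(): the quantized and float paths take the
  // raw scale, while the integer path takes the precomputed right-shift exponent.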
1645  if(_func_quantized != nullptr)
1646  {
1647  (*_func_quantized)(src1, src2, dst, window, _scale);
1648  }
1649  else if(_func_int != nullptr)
1650  {
1651  (*_func_int)(src1, src2, dst, window, _scale_exponent);
1652  }
1653  else
1654  {
1655  ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
1656  (*_func_float)(src1, src2, dst, window, _scale);
1657  }
1658 }
1659 const char *CpuMulKernel::name() const
1660 {
1661  return "CpuMulKernel";
1662 }
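// Minimal usage sketch (illustrative only, not part of this file): configure the kernel from the tensor
// metadata and run it over its maximum window through an ITensorPack. Tensor allocation, filling and
// scheduling are assumed to be handled elsewhere; src1, src2 and dst are placeholder tensors.
//
//   CpuMulKernel k;
//   k.configure(src1.info(), src2.info(), dst.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
//   ITensorPack pack;
//   pack.add_const_tensor(TensorType::ACL_SRC_0, &src1);
//   pack.add_const_tensor(TensorType::ACL_SRC_1, &src2);
//   pack.add_tensor(TensorType::ACL_DST, &dst);
//   k.run_op(pack, k.window(), ThreadInfo{});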
1663 namespace
1664 {
1665 Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
1666 {
1667  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
1668  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
1669 
1670  const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
1671 
1672  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
1673 
1674  // Validate in case of configured dst
1675  if(dst->total_size() > 0)
1676  {
1677  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
1678  ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
1679  }
1680 
1681  return Status{};
1682 }
1683 } // namespace
1684 
1685 void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
1686 {
1687  ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1688  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
1689 
1690  const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
1691 
1692  // Auto initialize dst if not initialized
1693  const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
1694  auto_init_if_empty(*dst, out_info);
1695 
1696  // Configure kernel window
1697  Window win = calculate_max_window(out_shape);
1698 
1699  ICpuKernel::configure(win);
1700 }
1701 
1702 Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
1703 {
1704  ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1705  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
1706 
1707  return Status{};
1708 }
1709 
1710 void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
1711 {
1712  ARM_COMPUTE_UNUSED(info);
1713  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1714  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
1715 
1716  auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1717  auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1718  auto dst = tensors.get_tensor(TensorType::ACL_DST);
1719 
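  // The inputs are 2-channel F32 tensors interpreted as interleaved complex numbers; each element pair
  // is multiplied as (a + bi)(c + di) = (ac - bd) + (ad + bc)i.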
1720  c_mul_F32_F32_F32_n(src1, src2, dst, window);
1721 }
1722 
1723 const char *CpuComplexMulKernel::name() const
1724 {
1725  return "CpuComplexMulKernel";
1726 }
1727 } // namespace kernels
1728 } // namespace cpu
1729 } // namespace arm_compute