Compute Library
 22.11
impl.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 #include "src/core/NEON/NEMath.h"
27 #include "support/SaturateCast.h"
28 
29 namespace arm_compute
30 {
31 namespace cpu
32 {
// First stage of softmax: compute the maximum value of each input row along
// the X axis and store one scalar maximum per row in the output tensor.
//
// @param[in]  in     Input tensor (logits); read element-wise along X.
// @param[out] out    Output tensor receiving one max value per processed row.
// @param[in]  window Execution window describing the region to process.
template <typename T>
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
{
    /** SIMD vector tag type. */
    // NOTE(review): the `using ExactTagType = ...` alias declaration appears to
    // have been dropped by the documentation extraction here — confirm against
    // the original source file before editing.

    // Number of T elements that fit in one 128-bit vector.
    constexpr int window_step_x  = 16 / sizeof(T);
    const auto    window_start_x = static_cast<int>(window.x().start());
    const auto    window_end_x   = static_cast<int>(window.x().end());

    // Collapse the X dimension so each window iteration handles one whole row.
    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    Iterator input(in, win);
    Iterator output(out, win);

    // Number of pairwise-max stages needed to fold a half-width vector down to
    // a single lane (log2 of the half-vector element count).
    const int sum_stages = log2(window_step_x / 2);
    execute_window_loop(win, [&](const Coordinates &)
    {
        // Get pointers
        const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
        const auto out_ptr = reinterpret_cast<T *>(output.ptr());

        // Init max value to the lowest representable value of T.
        auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
        int  x       = window_start_x;

        // Vectorized pass: element-wise max over full vectors.
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const auto current_value = wrapper::vloadq(in_ptr + x);
            vec_max                  = wrapper::vmax(vec_max, current_value);
        }
        // Horizontal reduction: pairwise max of the high and low halves, then
        // repeated pairwise max until the row maximum sits in lane 0.
        auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));

        for(int i = 0; i < sum_stages; ++i)
        {
            carry_max = wrapper::vpmax(carry_max, carry_max);
        }
        T max_val = wrapper::vgetlane(carry_max, 0);

        // Compute left-over elements that did not fill a whole vector.
        for(; x < window_end_x; ++x)
        {
            max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
        }

        *out_ptr = max_val;
    },
    input, output);
}
82 
// Explicit template instantiations for all supported data types.
// FP16 is only compiled in when the target supports FP16 vector arithmetic.
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void neon_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void neon_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
89 
// Quantized (QASYMM8 / QASYMM8_SIGNED) softmax over one row: given the
// precomputed per-row maximum, computes exp((x - max) * beta * scale) into a
// float scratch buffer, reduces the sum, then normalizes and requantizes the
// result back to T. Supports both regular softmax and log-softmax.
//
// @param[in]  in     Quantized input tensor (logits).
// @param[in]  max    Tensor holding the per-row maximum (same type T).
// @param[in]  tmp    Scratch buffer, reinterpreted as float*, large enough for
//                    one row of exponentials.
// @param[out] out    Quantized output tensor.
// @param[in]  beta   Softmax scaling factor.
// @param[in]  is_log If true compute log-softmax instead of softmax.
// @param[in]  window Execution window (one iteration per row).
template <typename T>
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
                                      ITensor *out, float beta, bool is_log, const Window &window)
{
    static_assert(std::is_same<T, qasymm8_t>::value
                  || std::is_same<T, qasymm8_signed_t>::value,
                  "quantized type should be either qasymm8_t or qasymm8_signed_t.");

    const int start_x     = in->info()->valid_region().anchor.x();
    const int input_width = in->info()->valid_region().shape.x();

    // Negative because the element term below is computed as (max - x), i.e.
    // (x - max) * beta * scale == (max - x) * scale_beta.
    const float scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
    const auto  scale_beta_vec = vdupq_n_f32(scale_beta);

    Iterator      in_it(in, window);
    Iterator      max_it(max, window);
    Iterator      out_it(out, window);
    constexpr int vec_size = 16; // 16 x 8-bit quantized elements per iteration

    execute_window_loop(window, [&](const Coordinates &)
    {
        /* Get pointers */
        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
        const auto tmp_ptr = reinterpret_cast<float *>(tmp);

        float sum{};
        float sum_inversed{};

        /* Compute exponentials and sum */
        {
            /* Get max value */
            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
            const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});

            /* Init sum to zero */
            float32x4x4_t vec_sum =
            {
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
            };

            /* Loop over row and compute exponentials and sum */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_elements = wrapper::vloadq(in_ptr + x);
                // Saturating subtract keeps (max - x) non-negative in the
                // integer domain before conversion to float.
                vec_elements          = wrapper::vqsub(vec_max, vec_elements);
                auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);

                if(is_log)
                {
                    // Log-softmax: stash the scaled differences in tmp, but
                    // still accumulate exp() of them for the normalizer.
                    vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
                    vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
                    vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
                    vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
                }
                else
                {
                    // Regular softmax: stash exp() values in tmp and sum them.
                    vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
                    vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
                    vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
                    vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
                }

                // NOTE: interleaved st4 store — must be read back with vld4q
                // (as done in the normalization pass below).
                vst4q_f32(tmp_ptr + x, vec_elements_flt);
            }

            /* Reduce sum */
            const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
            auto       sum_res     = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
            sum_res                = vpadd_f32(sum_res, sum_res);
            sum                    = wrapper::vgetlane(sum_res, 0);

            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                float element{};
                if(is_log)
                {
                    element = (max_val - in_ptr[x]) * scale_beta;
                    sum += std::exp(element);
                }
                else
                {
                    element = std::exp((max_val - in_ptr[x]) * scale_beta);
                    sum += element;
                }

                tmp_ptr[x] = element;
            }

            if(!is_log)
            {
                // Fold the requantization to the 8-bit range (x256) into the
                // normalization factor.
                sum_inversed = 256.f / sum;
            }
            else
            {
                sum = std::log(sum);
            }
        }

        /* Normalize exponentials */
        {
            constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
            /* Loop over row and compute softmax */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
                float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
                int_vec_type normalized_value{};
                if(is_log)
                {
                    // log-softmax: subtract log(sum) from the stored terms.
                    const float32x4x4_t sub =
                    {
                        vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
                    };
                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
                }
                else
                {
                    float32x4x4_t mul =
                    {
                        vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
                    };

                    if(is_qasymm8_signed)
                    {
                        // Shift [0, 256) down to the signed 8-bit range.
                        const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
                        mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
                        mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
                        mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
                        mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
                    }

                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
                }
                wrapper::vstore(out_ptr + x, normalized_value);
            }
            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                if(is_log)
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
                }
                else
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
                }
            }
        }
    },
    in_it, max_it, out_it);
}
262 
// Explicit instantiations: the quantized kernel supports exactly the two
// types admitted by its static_assert (QASYMM8_SIGNED and QASYMM8).
template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
                                                                 ITensor *out, float beta, bool is_log, const Window &window);
template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
                                                          ITensor *out, float beta, bool is_log, const Window &window);
// Floating-point (FP32 / FP16) softmax over one row: given the precomputed
// per-row maximum, computes exp((x - max) * beta) into a scratch buffer of T,
// reduces the sum, then normalizes. Supports regular and log-softmax.
//
// @param[in]  in     Input tensor (logits).
// @param[in]  max    Tensor holding the per-row maximum (same type T).
// @param[in]  tmp    Scratch buffer, reinterpreted as T*, large enough for one row.
// @param[out] out    Output tensor.
// @param[in]  beta   Softmax scaling factor.
// @param[in]  is_log If true compute log-softmax instead of softmax.
// @param[in]  window Execution window (one iteration per row).
template <typename T>
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
                                  ITensor *out, const float beta, bool is_log, const Window &window)
{
    const int start_x     = in->info()->valid_region().anchor.x();
    const int input_width = in->info()->valid_region().shape.x();

    Iterator in_it(in, window);
    Iterator max_it(max, window);
    Iterator out_it(out, window);

    /** SIMD vector tag type. */
    // NOTE(review): the `using ExactTagType = ...` alias declaration appears to
    // have been dropped by the documentation extraction here — confirm against
    // the original source file before editing.

    // Elements per 128-bit vector, and the number of pairwise-add stages to
    // fold a half-vector down to a single lane.
    constexpr int vec_size   = 16 / sizeof(T);
    const int     sum_stages = log2(vec_size / 2);

    execute_window_loop(window, [&](const Coordinates &)
    {
        /* Get pointers */
        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
        const auto tmp_ptr = reinterpret_cast<T *>(tmp);

        T sum{};
        T sum_inversed{};

        /* Compute exponentials and sum */
        {
            /* Get max value */
            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
            const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});

            /* Init sum to zero */
            auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});

            /* Loop over row and compute exponentials and sum */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_elements = wrapper::vloadq(in_ptr + x);
                // Subtract the row max for numerical stability of exp().
                vec_elements      = wrapper::vsub(vec_elements, vec_max);
                if(is_log)
                {
                    // Log-softmax: keep (x - max) * beta in tmp, accumulate
                    // exp() of it for the normalizer.
                    vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
                    vec_sum      = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                }
                else
                {
                    // Regular softmax: keep exp((x - max) * beta) in tmp.
                    vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
                    vec_sum      = wrapper::vadd(vec_sum, vec_elements);
                }
                wrapper::vstore(tmp_ptr + x, vec_elements);
            }

            /* Reduce sum */
            // Pairwise add high/low halves, then repeat until lane 0 holds
            // the full horizontal sum.
            auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
            for(int i = 0; i < sum_stages; ++i)
            {
                sum_res = wrapper::vpadd(sum_res, sum_res);
            }
            sum = wrapper::vgetlane(sum_res, 0);

            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                T element{};

                if(is_log)
                {
                    element = (in_ptr[x] - max_val) * beta;
                    sum += std::exp(element);
                }
                else
                {
                    element = std::exp((in_ptr[x] - max_val) * beta);
                    sum += element;
                }
                tmp_ptr[x] = element;
            }

            if(!is_log)
            {
                // Precompute the reciprocal so normalization is a multiply.
                sum_inversed = T(1) / sum;
            }
            else
            {
                sum = static_cast<T>(std::log(sum));
            }
        }

        /* Normalize exponentials */
        {
            /* Loop over row and compute softmax */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_in           = wrapper::vloadq(tmp_ptr + x);
                auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
                if(is_log)
                {
                    normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
                }
                else
                {
                    normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
                }
                wrapper::vstore(out_ptr + x, normalized_value);
            }
            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                if(is_log)
                {
                    out_ptr[x] = tmp_ptr[x] - sum;
                }
                else
                {
                    out_ptr[x] = tmp_ptr[x] * sum_inversed;
                }
            }
        }
    },
    in_it, max_it, out_it);
}
// Explicit instantiations for the floating-point kernel; FP16 is gated on
// hardware FP16 vector arithmetic support.
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void neon_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
                                                      ITensor *out, const float beta, bool is_log, const Window &window);
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void neon_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
                                                  ITensor *out, const float beta, bool is_log, const Window &window);
398 } // namespace cpu
399 } // namespace arm_compute
template void neon_logits_1d_max< float >(const ITensor *in, ITensor *out, const Window &window)
template void neon_logits_1d_max< qasymm8_t >(const ITensor *in, ITensor *out, const Window &window)
TensorShape shape
Shape of the valid region.
Definition: Types.h:266
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
Definition: impl.cpp:34
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
template void neon_softmax_logits_1d_float< float >(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:79
Interface for CPU tensor.
Definition: ITensor.h:36
const size_t input_width
Definition: impl.cpp:62
Copyright (c) 2017-2022 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
Definition: traits.h:132
T x() const
Alias to access the size of the first dimension.
Definition: Dimensions.h:87
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:187
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Definition: getlane.h:91
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
typename neon_vector< T, S >::type neon_vector_t
Helper type template to get the type of a neon vector.
Definition: traits.h:80
template void neon_softmax_logits_1d_quantized< qasymm8_signed_t >(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
Coordinates of an item.
Definition: Coordinates.h:37
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
template void neon_logits_1d_max< qasymm8_signed_t >(const ITensor *in, ITensor *out, const Window &window)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
template void neon_softmax_logits_1d_quantized< qasymm8_t >(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Definition: mul.h:39
uint8x8_t vqsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:74
float32x4_t vexpq_f32(float32x4_t x)
Calculate exponential.
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: pmax.h:39
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:102
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: max.h:39
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
Definition: impl.cpp:91
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:97
float32x4_t vexpq(const float32x4_t &a)
Definition: exp.h:47
Describe a multidimensional execution window.
Definition: Window.h:39
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
Definition: impl.cpp:268
Coordinates anchor
Anchor for the start of the valid region.
Definition: Types.h:265
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:159