Compute Library 21.11
list.h
/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H

#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"

namespace arm_compute
{
namespace cpu
{
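/** Find the per-row maximum of the input along the X axis.
 *
 * This is the first pass of the softmax computation: the row maximum is later subtracted
 * from every element so that the exponentials stay numerically stable.
 *
 * @param[in]  in     Input tensor.
 * @param[out] out    Output tensor receiving one maximum value per row.
 * @param[in]  window Execution window.
 */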
template <typename T>
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
{
    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    constexpr int window_step_x  = 16 / sizeof(T);
    const auto    window_start_x = static_cast<int>(window.x().start());
    const auto    window_end_x   = static_cast<int>(window.x().end());

    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    Iterator input(in, win);
    Iterator output(out, win);

    const int sum_stages = log2(window_step_x / 2);
    execute_window_loop(win, [&](const Coordinates &)
    {
        // Get pointers
        const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
        const auto out_ptr = reinterpret_cast<T *>(output.ptr());

        // Init max value
        auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
        int  x       = window_start_x;

        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const auto current_value = wrapper::vloadq(in_ptr + x);
            vec_max                  = wrapper::vmax(vec_max, current_value);
        }
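        // Pairwise-reduce the vector maximum down to a single lane: the first vpmax folds the
        // high and low halves together, then each remaining stage halves the candidate lanes again.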
        auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));

        for(int i = 0; i < sum_stages; ++i)
        {
            carry_max = wrapper::vpmax(carry_max, carry_max);
        }
        T max_val = wrapper::vgetlane(carry_max, 0);

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
        }

        *out_ptr = max_val;
    },
    input, output);
}

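/** Compute softmax (or log-softmax) along one row of a quantized tensor.
 *
 * Exponentials of (element - row max) * beta * input_scale are computed in float and cached in
 * the temporary buffer, then normalized back into the 8-bit output range.
 *
 * @param[in]  in     Input tensor (QASYMM8 or QASYMM8_SIGNED).
 * @param[in]  max    Tensor holding the per-row maximum values.
 * @param[in]  tmp    Temporary float buffer holding one row of intermediate values.
 * @param[out] out    Output tensor.
 * @param[in]  beta   Scaling factor for the exponent.
 * @param[in]  is_log True to compute log-softmax instead of softmax.
 * @param[in]  window Execution window.
 */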
template <typename T>
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
                                      ITensor *out, float beta, bool is_log, const Window &window)
{
    static_assert(std::is_same<T, qasymm8_t>::value
                  || std::is_same<T, qasymm8_signed_t>::value,
                  "quantized type should be either qasymm8_t or qasymm8_signed_t.");

    const int start_x     = in->info()->valid_region().anchor.x();
    const int input_width = in->info()->valid_region().shape.x();

    const float scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
    const auto  scale_beta_vec = vdupq_n_f32(scale_beta);

    Iterator      in_it(in, window);
    Iterator      max_it(max, window);
    Iterator      out_it(out, window);
    constexpr int vec_size = 16;

    execute_window_loop(window, [&](const Coordinates &)
    {
        /* Get pointers */
        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
        const auto tmp_ptr = reinterpret_cast<float *>(tmp);

        float sum{};
        float sum_inversed{};

        /* Compute exponentials and sum */
        {
            /* Get max value */
            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
            const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});

            /* Init sum to zero */
            float32x4x4_t vec_sum =
            {
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
            };

            /* Loop over row and compute exponentials and sum */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_elements     = wrapper::vloadq(in_ptr + x);
                vec_elements          = wrapper::vqsub(vec_max, vec_elements);
                auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);

                if(is_log)
                {
                    vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
                    vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
                    vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
                    vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
                }
                else
                {
                    vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
                    vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
                    vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
                    vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
                }

                vst4q_f32(tmp_ptr + x, vec_elements_flt);
            }

            /* Reduce sum */
            const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
            auto       sum_res     = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
            sum_res                = vpadd_f32(sum_res, sum_res);
            sum                    = wrapper::vgetlane(sum_res, 0);

            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                float element{};
                if(is_log)
                {
                    element = (max_val - in_ptr[x]) * scale_beta;
                    sum += std::exp(element);
                }
                else
                {
                    element = std::exp((max_val - in_ptr[x]) * scale_beta);
                    sum += element;
                }

                tmp_ptr[x] = element;
            }

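            /* Scaling by 256/sum maps the normalized exponentials onto the full 8-bit output
               range (the quantized softmax output uses a fixed scale of 1/256). */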
            if(!is_log)
            {
                sum_inversed = 256.f / sum;
            }
            else
            {
                sum = std::log(sum);
            }
        }

        /* Normalize exponentials */
        {
            constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
            /* Loop over row and compute softmax */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
                float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
                int_vec_type  normalized_value{};
                if(is_log)
                {
                    const float32x4x4_t sub =
                    {
                        vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
                    };
                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
                }
                else
                {
                    float32x4x4_t mul =
                    {
                        vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
                    };

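                    /* For QASYMM8_SIGNED the output range is [-128, 127], so shift the [0, 256)
                       result down by 128 before converting back to integers. */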
                    if(is_qasymm8_signed)
                    {
                        const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
                        mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
                        mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
                        mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
                        mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
                    }

                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
                }
                wrapper::vstore(out_ptr + x, normalized_value);
            }
            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                if(is_log)
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
                }
                else
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
                }
            }
        }
    },
    in_it, max_it, out_it);
}

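/** Compute softmax (or log-softmax) along one row of a floating-point tensor.
 *
 * Exponentials of (element - row max) * beta are accumulated and cached in the temporary
 * buffer, then normalized by the row sum (or, for log-softmax, shifted by log(sum)).
 *
 * @param[in]  in     Input tensor.
 * @param[in]  max    Tensor holding the per-row maximum values.
 * @param[in]  tmp    Temporary buffer of the same element type, holding one row of intermediate values.
 * @param[out] out    Output tensor.
 * @param[in]  beta   Scaling factor for the exponent.
 * @param[in]  is_log True to compute log-softmax instead of softmax.
 * @param[in]  window Execution window.
 */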
template <typename T>
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
                                  ITensor *out, const float beta, bool is_log, const Window &window)
{
    const int start_x     = in->info()->valid_region().anchor.x();
    const int input_width = in->info()->valid_region().shape.x();

    Iterator in_it(in, window);
    Iterator max_it(max, window);
    Iterator out_it(out, window);

    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    constexpr int vec_size   = 16 / sizeof(T);
    const int     sum_stages = log2(vec_size / 2);

    execute_window_loop(window, [&](const Coordinates &)
    {
        /* Get pointers */
        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
        const auto tmp_ptr = reinterpret_cast<T *>(tmp);

        T sum{};
        T sum_inversed{};

        /* Compute exponentials and sum */
        {
            /* Get max value */
            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
            const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});

            /* Init sum to zero */
            auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});

            /* Loop over row and compute exponentials and sum */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_elements = wrapper::vloadq(in_ptr + x);
                vec_elements      = wrapper::vsub(vec_elements, vec_max);
                if(is_log)
                {
                    vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
                    vec_sum      = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
                }
                else
                {
                    vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
                    vec_sum      = wrapper::vadd(vec_sum, vec_elements);
                }
                wrapper::vstore(tmp_ptr + x, vec_elements);
            }

            /* Reduce sum */
            auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
            for(int i = 0; i < sum_stages; ++i)
            {
                sum_res = wrapper::vpadd(sum_res, sum_res);
            }
            sum = wrapper::vgetlane(sum_res, 0);

            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                T element{};

                if(is_log)
                {
                    element = (in_ptr[x] - max_val) * beta;
                    sum += std::exp(element);
                }
                else
                {
                    element = std::exp((in_ptr[x] - max_val) * beta);
                    sum += element;
                }
                tmp_ptr[x] = element;
            }

            if(!is_log)
            {
                sum_inversed = T(1) / sum;
            }
            else
            {
                sum = static_cast<T>(std::log(sum));
            }
        }

351  {
352  /* Loop over row and compute softmax */
353  int x = 0;
354  for(; x <= (input_width - vec_size); x += vec_size)
355  {
356  auto vec_in = wrapper::vloadq(tmp_ptr + x);
357  auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
358  if(is_log)
359  {
360  normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
361  }
362  else
363  {
364  normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
365  }
366  wrapper::vstore(out_ptr + x, normalized_value);
367  }
368  /* Run remaining elements */
369  for(; x < input_width; ++x)
370  {
371  if(is_log)
372  {
373  out_ptr[x] = tmp_ptr[x] - sum;
374  }
375  else
376  {
377  out_ptr[x] = tmp_ptr[x] * sum_inversed;
378  }
379  }
380  }
381  },
382  in_it, max_it, out_it);
383 }
384 
} // namespace cpu
} // namespace arm_compute

#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */