Compute Library
 21.02
list.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
25 #define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
26 
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
31 
32 namespace arm_compute
33 {
34 namespace cpu
35 {
36 namespace
37 {
/** Convert a float vector to a quantized integer vector (saturating).
 *  Only the specializations below are defined; other instantiations will not link. */
template <typename float_vec_type, typename int_vec_type>
int_vec_type convert_float_to_int(const float_vec_type &in);

/** Convert a quantized integer vector to a float vector.
 *  Only the specializations below are defined; other instantiations will not link. */
template <typename float_vec_type, typename int_vec_type>
float_vec_type convert_int_to_float(const int_vec_type &in);
43 
44 template <>
45 uint8x16_t convert_float_to_int<float32x4x4_t, uint8x16_t>(const float32x4x4_t &in)
46 {
47  uint8x16_t out;
49  return out;
50 }
51 
52 template <>
53 int8x16_t convert_float_to_int<float32x4x4_t, int8x16_t>(const float32x4x4_t &in)
54 {
55  int8x16_t out;
57  return out;
58 }
59 
60 template <>
61 float32x4x4_t convert_int_to_float<float32x4x4_t, uint8x16_t>(const uint8x16_t &in)
62 {
64 }
65 
66 template <>
67 float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(const int8x16_t &in)
68 {
70 }
71 } // namespace
72 
73 template <typename T>
74 void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
75 {
76  /** Neon vector tag type. */
78 
79  constexpr int window_step_x = 16 / sizeof(T);
80  const auto window_start_x = static_cast<int>(window.x().start());
81  const auto window_end_x = static_cast<int>(window.x().end());
82 
83  Window win{ window };
84  win.set(Window::DimX, Window::Dimension(0, 1, 1));
85  Iterator input(in, win);
86  Iterator output(out, win);
87 
88  const int sum_stages = log2(window_step_x / 2);
89  execute_window_loop(win, [&](const Coordinates &)
90  {
91  // Get pointers
92  const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
93  const auto out_ptr = reinterpret_cast<T *>(output.ptr());
94 
95  // Init max value
96  auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
97  int x = window_start_x;
98 
99  for(; x <= (window_end_x - window_step_x); x += window_step_x)
100  {
101  const auto current_value = wrapper::vloadq(in_ptr + x);
102  vec_max = wrapper::vmax(vec_max, current_value);
103  }
104  auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
105 
106  for(int i = 0; i < sum_stages; ++i)
107  {
108  carry_max = wrapper::vpmax(carry_max, carry_max);
109  }
110  T max_val = wrapper::vgetlane(carry_max, 0);
111 
112  // Compute left-over elements
113  for(; x < window_end_x; ++x)
114  {
115  max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
116  }
117 
118  *out_ptr = max_val;
119  },
120  input, output);
121 }
122 
/** Compute (log-)softmax along the X dimension of a quantized (QASYMM8 / QASYMM8_SIGNED) tensor.
 *
 * Two passes per row: (1) exponentials of beta-scaled, max-subtracted values accumulated
 * into a float sum (stored interleaved in @p tmp via vst4q), (2) normalization back to the
 * quantized output type.
 *
 * @param[in]  in     Quantized input tensor.
 * @param[in]  max    Per-row max values (from neon_logits_1d_max).
 * @param[in]  tmp    Scratch buffer holding one row of floats.
 * @param[out] out    Quantized output tensor.
 * @param[in]  beta   Softmax scaling factor.
 * @param[in]  is_log If true, compute log-softmax instead of softmax.
 * @param[in]  window Execution window.
 */
template <typename T>
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
                                      ITensor *out, float beta, bool is_log, const Window &window)
{
    static_assert(std::is_same<T, qasymm8_t>::value
                  || std::is_same<T, qasymm8_signed_t>::value,
                  "quantized type should be either qasymm8_t or qasymm8_signed_t.");

    const int start_x     = in->info()->valid_region().anchor.x();
    const int input_width = in->info()->valid_region().shape.x();

    // Negative because exponent arguments are built as (max - x), not (x - max).
    const float scale_beta     = -beta * in->info()->quantization_info().uniform().scale;
    const auto  scale_beta_vec = vdupq_n_f32(scale_beta);

    Iterator in_it(in, window);
    Iterator max_it(max, window);
    Iterator out_it(out, window);
    constexpr int vec_size = 16; // 16 quantized elements per iteration

    execute_window_loop(window, [&](const Coordinates &)
    {
        /* Get pointers */
        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
        const auto tmp_ptr = reinterpret_cast<float *>(tmp);

        float sum{};
        float sum_inversed{};

        /* Compute exponentials and sum */
        {
            /* Get max value */
            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
            const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});

            /* Init sum to zero */
            float32x4x4_t vec_sum =
            {
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
                vdupq_n_f32(0.f),
            };

            /* Loop over row and compute exponentials and sum */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                auto vec_elements     = wrapper::vloadq(in_ptr + x);
                // Saturating subtract keeps (max - x) non-negative in the quantized domain.
                vec_elements          = wrapper::vqsub(vec_max, vec_elements);
                auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);

                if(is_log)
                {
                    // Log path: store the scaled logits, accumulate their exponentials.
                    vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
                    vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
                    vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
                    vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
                }
                else
                {
                    // Softmax path: store and accumulate the exponentials themselves.
                    vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
                    vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
                    vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
                    vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
                    vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
                    vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
                    vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
                    vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
                }

                // Interleaved store; matched by the interleaved vld4q_f32 below,
                // so the round trip through tmp is order-consistent.
                vst4q_f32(tmp_ptr + x, vec_elements_flt);
            }

            /* Reduce sum */
            const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
            auto       sum_res     = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
            sum_res                = vpadd_f32(sum_res, sum_res);
            sum                    = wrapper::vgetlane(sum_res, 0);

            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                float element{};
                if(is_log)
                {
                    element = (max_val - in_ptr[x]) * scale_beta;
                    sum += std::exp(element);
                }
                else
                {
                    element = std::exp((max_val - in_ptr[x]) * scale_beta);
                    sum += element;
                }

                tmp_ptr[x] = element;
            }

            if(!is_log)
            {
                // 256 (not 1) so the normalized result lands on the 8-bit output scale.
                sum_inversed = 256.f / sum;
            }
            else
            {
                sum = std::log(sum);
            }
        }

        /* Normalize exponentials */
        {
            constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
            /* Loop over row and compute softmax */
            int x = 0;
            for(; x <= (input_width - vec_size); x += vec_size)
            {
                using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
                float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
                int_vec_type normalized_value{};
                if(is_log)
                {
                    const float32x4x4_t sub =
                    {
                        vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
                        vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
                    };
                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
                }
                else
                {
                    float32x4x4_t mul =
                    {
                        vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
                        vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
                    };

                    if(is_qasymm8_signed)
                    {
                        // Shift [0,256) into the signed 8-bit range before saturation.
                        const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
                        mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
                        mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
                        mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
                        mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
                    }

                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
                }
                wrapper::vstore(out_ptr + x, normalized_value);
            }
            /* Run remaining elements */
            for(; x < input_width; ++x)
            {
                if(is_log)
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
                }
                else
                {
                    out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
                }
            }
        }
    },
    in_it, max_it, out_it);
}
295 
296 template <typename T>
297 void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
298  ITensor *out, const float beta, bool is_log, const Window &window)
299 {
300  const int start_x = in->info()->valid_region().anchor.x();
301  const int input_width = in->info()->valid_region().shape.x();
302 
303  Iterator in_it(in, window);
304  Iterator max_it(max, window);
305  Iterator out_it(out, window);
306 
307  /** Neon vector tag type. */
309 
310  constexpr int vec_size = 16 / sizeof(T);
311  const int sum_stages = log2(vec_size / 2);
312 
313  execute_window_loop(window, [&](const Coordinates &)
314  {
315  /* Get pointers */
316  const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
317  const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
318  const auto tmp_ptr = reinterpret_cast<T *>(tmp);
319 
320  T sum{};
321  T sum_inversed{};
322 
323  /* Compute exponentials and sum */
324  {
325  /* Get max value */
326  const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
327  const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
328 
329  /* Init sum to zero */
330  auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
331 
332  /* Loop over row and compute exponentials and sum */
333  int x = 0;
334  for(; x <= (input_width - vec_size); x += vec_size)
335  {
336  auto vec_elements = wrapper::vloadq(in_ptr + x);
337  vec_elements = wrapper::vsub(vec_elements, vec_max);
338  if(is_log)
339  {
340  vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
341  vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
342  }
343  else
344  {
345  vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
346  vec_sum = wrapper::vadd(vec_sum, vec_elements);
347  }
348  wrapper::vstore(tmp_ptr + x, vec_elements);
349  }
350 
351  /* Reduce sum */
352  auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
353  for(int i = 0; i < sum_stages; ++i)
354  {
355  sum_res = wrapper::vpadd(sum_res, sum_res);
356  }
357  sum = wrapper::vgetlane(sum_res, 0);
358 
359  /* Run remaining elements */
360  for(; x < input_width; ++x)
361  {
362  T element{};
363 
364  if(is_log)
365  {
366  element = (in_ptr[x] - max_val) * beta;
367  sum += std::exp(element);
368  }
369  else
370  {
371  element = std::exp((in_ptr[x] - max_val) * beta);
372  sum += element;
373  }
374  tmp_ptr[x] = element;
375  }
376 
377  if(!is_log)
378  {
379  sum_inversed = T(1) / sum;
380  }
381  else
382  {
383  sum = static_cast<T>(std::log(sum));
384  }
385  }
386 
387  /* Normalize exponentials */
388  {
389  /* Loop over row and compute softmax */
390  int x = 0;
391  for(; x <= (input_width - vec_size); x += vec_size)
392  {
393  auto vec_in = wrapper::vloadq(tmp_ptr + x);
394  auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
395  if(is_log)
396  {
397  normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
398  }
399  else
400  {
401  normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
402  }
403  wrapper::vstore(out_ptr + x, normalized_value);
404  }
405  /* Run remaining elements */
406  for(; x < input_width; ++x)
407  {
408  if(is_log)
409  {
410  out_ptr[x] = tmp_ptr[x] - sum;
411  }
412  else
413  {
414  out_ptr[x] = tmp_ptr[x] * sum_inversed;
415  }
416  }
417  }
418  },
419  in_it, max_it, out_it);
420 }
421 
422 } // namespace cpu
423 } // namespace arm_compute
424 
425 #endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
TensorShape shape
Shape of the valid region.
Definition: Types.h:261
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
Definition: list.h:297
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
Converts from int8x16 to float32x4x4_t.
Definition: NEMath.inl:336
float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
Converts from uint8x16 to float32x4x4_t.
Definition: NEMath.inl:322
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
Definition: traits.h:132
T x() const
Alias to access the size of the first dimension.
Definition: Dimensions.h:87
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:187
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
Definition: list.h:74
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Definition: getlane.h:91
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
typename neon_vector< T, S >::type neon_vector_t
Helper type template to get the type of a neon vector.
Definition: traits.h:80
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
Definition: list.h:124
void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
Converts from float32x4x4_t to just one int8x16_t.
Definition: NEMath.inl:381
Coordinates of an item.
Definition: Coordinates.h:37
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Definition: mul.h:39
uint8x8_t vqsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:74
float32x4_t vexpq_f32(float32x4_t x)
Calculate exponential.
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
Converts from two float32x4x4_t to just one uint8x16_t.
Definition: NEMath.inl:372
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: pmax.h:39
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: max.h:39
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
float32x4_t vexpq(const float32x4_t &a)
Definition: exp.h:47
Describe a multidimensional execution window.
Definition: Window.h:39
Coordinates anchor
Anchor for the start of the valid region.
Definition: Types.h:260
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145