Compute Library 21.08
qasymm8.cpp
/*
 * Copyright (c) 2020-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
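// Element-wise addition of two QASYMM8 (8-bit asymmetric quantized) tensors
// using Arm NEON: each input is dequantized to float32, summed, and the result
// is requantized with the destination's quantization parameters. The saturating
// narrowing stores always clamp the result to [0, 255], which is why the
// overflow `policy` argument is not consulted.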
void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    ARM_COMPUTE_UNUSED(policy);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle it manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();

    const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = dst->info()->quantization_info().uniform();

    // Output requantization parameters: result = sum / scale_out + offset_out
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
        // One input has extent 1 in X: its single value per row is dequantized
        // once and added against every element of the other input's row.
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        const float32x4_t vscale1  = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t vscale2  = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t   voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t   voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle it manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());

            const uint8_t    broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
            const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);

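            // Dequantize the 16 broadcast lanes: widen u8 -> u16 -> s32, subtract
            // the zero-point offset, then convert to f32 and multiply by the scale.
            // bf_0..bf_3 each hold four of the 16 lanes.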
            const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
            const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
            const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
            const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);

            // Scalar dequantized broadcast value, reused by the left-over loop
            const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;

            // Vectorized loop: 16 elements (window_step_x) per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
                const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
                const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
                const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
                const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

                int32x4_t rf_0{};
                int32x4_t rf_1{};
                int32x4_t rf_2{};
                int32x4_t rf_3{};

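                // AArch64 provides vcvtnq_s32_f32 (round to nearest, ties to even);
                // the Armv7 fallback vcvtq_s32_f32 truncates toward zero, so results
                // can differ by 1 LSB between the two paths.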
#ifdef __aarch64__
                rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
                rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
                rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
                rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
#else  //__aarch64__
                rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
                rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
                rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
                rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
#endif //__aarch64__

                // Saturating narrow s32 -> s16 -> u8 clamps the result to [0, 255]
                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle it manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src0, input1_win);
        Iterator input2(src1, input2_win);
        Iterator output(dst, win);

        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

            // Vectorized loop: 16 elements (window_step_x) per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const uint8x16_t a = vld1q_u8(input1_ptr + x);
                const uint8x16_t b = vld1q_u8(input2_ptr + x);

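                // Same widening dequantization pipeline as in the broadcast
                // branch, applied to all 16 lanes of both inputs.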
                const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
                const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
                const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
                const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);

                const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
                const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
                const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
                const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);

                int32x4_t rf_0{};
                int32x4_t rf_1{};
                int32x4_t rf_2{};
                int32x4_t rf_3{};

#ifdef __aarch64__
                rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
                rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
                rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
                rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
#else  //__aarch64__
                rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
                rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
                rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
                rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
#endif //__aarch64__

                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
                *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
            }
        },
        input1, input2, output);
    }
}
} // namespace cpu
} // namespace arm_compute
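
In short, each output element is computed as out = quantize_out(dequant(a) + dequant(b)), with dequant(q) = (q - offset) * scale. The scalar sketch below models that arithmetic in plain C++ for clarity; the UniformQInfo struct, the helper names, and the sample quantization parameters are illustrative assumptions, not Compute Library API.

// Scalar model of the QASYMM8 addition performed by add_qasymm8_neon above.
// Hypothetical helpers for illustration only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

// Dequantize one QASYMM8 value: real = (q - offset) * scale
static float dequantize(uint8_t q, const UniformQInfo &qi)
{
    return static_cast<float>(static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

// Requantize: q = round(real / scale + offset), saturated to [0, 255].
// std::nearbyint under the default rounding mode matches the
// round-to-nearest-even of vcvtnq_s32_f32 on the AArch64 path.
static uint8_t requantize(float v, const UniformQInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::nearbyint(v / qi.scale + static_cast<float>(qi.offset)));
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
    const UniformQInfo qa{0.5f, 10}; // input 1: scale 0.5, zero-point 10
    const UniformQInfo qb{0.25f, 0}; // input 2: scale 0.25, zero-point 0
    const UniformQInfo qo{1.0f, 5};  // output:  scale 1.0, zero-point 5

    const uint8_t a   = 30, b = 40;                            // quantized inputs
    const float   sum = dequantize(a, qa) + dequantize(b, qb); // 10.0 + 10.0 = 20.0

    // Expected output: round(20.0 / 1.0 + 5) = 25
    std::printf("real sum = %.1f -> quantized = %u\n", sum, static_cast<unsigned>(requantize(sum, qo)));
    return 0;
}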