Compute Library
 22.11
impl.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
29 namespace arm_compute
30 {
31 namespace cpu
32 {
33 template <typename ScalarType>
34 void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
35 {
36  /** SIMD vector tag type. */
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  constexpr int window_step_x = 16 / sizeof(ScalarType);
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  if(is_broadcast_across_x)
53  {
54  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
55  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
56  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
57  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
58  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
59 
60  // Clear X Dimension on execution window as we handle manually
61  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
62 
63  Iterator broadcast_input(broadcast_tensor, broadcast_win);
64  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
65  Iterator output(dst, win);
66 
67  execute_window_loop(win, [&](const Coordinates &)
68  {
69  const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
70  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
71 
72  const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
73  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
74 
75  // Compute S elements per iteration
76  int x = window_start_x;
77  for(; x <= (window_end_x - window_step_x); x += window_step_x)
78  {
79  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
80  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
81  wrapper::vstore(output_ptr + x, res);
82  }
83 
84  // Compute left-over elements
85  for(; x < window_end_x; ++x)
86  {
87  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
88  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
89  }
90  },
91  broadcast_input, non_broadcast_input, output);
92  }
93  else
94  {
95  // Clear X Dimension on execution window as we handle manually
96  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
97  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
98 
99  Iterator input1(src0, input1_win);
100  Iterator input2(src1, input2_win);
101  Iterator output(dst, win);
102 
103  execute_window_loop(win, [&](const Coordinates &)
104  {
105  const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
106  const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
107  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
108 
109  // Compute S elements per iteration
110  int x = window_start_x;
111  for(; x <= (window_end_x - window_step_x); x += window_step_x)
112  {
113  const auto val1 = wrapper::vloadq(input1_ptr + x);
114  const auto val2 = wrapper::vloadq(input2_ptr + x);
115  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
116  wrapper::vstore(output_ptr + x, res);
117  }
118 
119  // Compute left-over elements
120  for(; x < window_end_x; ++x)
121  {
122  const auto val1 = *(input1_ptr + x);
123  const auto val2 = *(input2_ptr + x);
124  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
125  }
126  },
127  input1, input2, output);
128  }
129 }
130 
132 {
133  const auto iq0 = src0->quantization_info().uniform();
134  const auto iq1 = src1->quantization_info().uniform();
135  const auto oq = dst->quantization_info().uniform();
136 
137  const auto scale0 = iq0.scale / oq.scale;
138  const auto scale1 = iq1.scale / oq.scale;
139 
140  if(scale0 < -31.f || scale0 > 31.f || scale1 < -31.f || scale1 > 31.f)
141  {
142  // The scale factor cannot be stored as 6.10 signed fixed-point number.
143  return false;
144  }
145 
146  const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
147  const auto max_acc = (std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset);
148 
149  if(max_acc > 2097151.f) // 2^21 - 1
150  {
151  // It might not be possible to store the result as 22.10 signed fixed-point number.
152  return false;
153  }
154 
155  return true;
156 }
157 
/** Element-wise addition of two quantized 8-bit tensors using fixed-point arithmetic (Neon).
 *
 * Each input is rescaled to the output quantization with a 6.10 signed
 * fixed-point scale factor, accumulated in 22.10 signed fixed-point together
 * with a combined offset, then rounded and narrowed back to 8 bits with
 * saturation. add_q8_neon_fixedpoint_possible checks that these formats are
 * sufficient. Broadcasting of one input along x is supported.
 *
 * @param[in]  src0   First input tensor.
 * @param[in]  src1   Second input tensor.
 * @param[out] dst    Output tensor.
 * @param[in]  policy Unused: the narrowing steps saturate regardless of the policy.
 * @param[in]  window Execution window.
 */
template <typename ScalarType>
void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    ARM_COMPUTE_UNUSED(policy);

    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    // 16 8-bit elements are processed per vectorized iteration.
    constexpr int window_step_x = 16;
    const auto window_start_x = window.x().start();
    const auto window_end_x = window.x().end();
    const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info = dst->info()->quantization_info().uniform();

    // Floating-point rescaling factors and combined offset mapping each input's
    // quantization onto the output's.
    const auto in0_scale = iq0_info.scale / oq_info.scale;
    const auto in1_scale = iq1_info.scale / oq_info.scale;
    const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);

    // Convert to fixed-point: scales as 6.10 (x1024), offset as 22.10 (x1024).
    const auto in0_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in0_scale * 1024.f));
    const auto in1_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in1_scale * 1024.f));
    const auto offset_22p10 = static_cast<int32_t>(support::cpp11::lround(offset * 1024.f));

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        // A window step of 0 on x identifies the broadcast input.
        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
        auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor = is_broadcast_input_1 ? src1 : src0;

        const auto a_scale_6p10 = is_broadcast_input_1 ? in0_scale_6p10 : in1_scale_6p10;
        const auto b_scale = is_broadcast_input_1 ? in1_scale : in0_scale;
        const auto a_vscale_6p10 = wrapper::vdup_n(a_scale_6p10, wrapper::traits::vector_64_tag());

#ifndef __aarch64__
        // The non-aarch64 tail loop works in floating point, so it needs the
        // float scale of the non-broadcast input as well.
        const auto a_scale = is_broadcast_input_1 ? in0_scale : in1_scale;
#endif // __aarch64__

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            // The broadcast element is constant for the row: scale it and fold
            // in the offset once, in 22.10 fixed-point.
            const auto b_val = *b_ptr;
            const auto b_scaled = b_scale * b_val;
            const auto b_scaled_22p10 = static_cast<int32_t>(support::cpp11::lround(b_scaled * 1024.f));
            const auto b_scaled_offseted_22p10 = b_scaled_22p10 + offset_22p10;
            const auto b_vscaled_offseted_22p10 = wrapper::vdup_n(b_scaled_offseted_22p10, wrapper::traits::vector_128_tag());

#ifndef __aarch64__
            const auto b_scaled_offseted = b_scaled + offset;
#endif // __aarch64__

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the input.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
                // Widen and store the result in 32-bit integer.
                const auto vout_22p10_00 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_0), a_vscale_6p10);
                const auto vout_22p10_01 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_0), a_vscale_6p10);
                const auto vout_22p10_10 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_1), a_vscale_6p10);
                const auto vout_22p10_11 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_1), a_vscale_6p10);

                // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
                const auto vout_8p8_0 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
                );
                const auto vout_8p8_1 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
                );

                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
                const auto vout_8p0 = wrapper::vcombine(
                    wrapper::vqrshrn<8>(vout_8p8_0),
                    wrapper::vqrshrn<8>(vout_8p8_1)
                );

                // Store the result.
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<2, ScalarType>(int32_t(a_ptr[x]) * a_scale_6p10 + b_scaled_offseted_22p10));
#else // __aarch64__
                out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
#endif // __aarch64__
            }
        },
        b_input_it, a_input_it, out_it);
    }
    else
    {
        const auto vscale0_6p10 = wrapper::vdup_n(in0_scale_6p10, wrapper::traits::vector_64_tag());
        const auto vscale1_6p10 = wrapper::vdup_n(in1_scale_6p10, wrapper::traits::vector_64_tag());
        const auto voffset_22p10 = wrapper::vdup_n(offset_22p10, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                // Multiply the input elements by the scale factor and add the offset.
                // Widen and store the result in 32-bit integer.
                const auto vscaled0_offseted_22p10_00 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_0), vscale0_6p10);
                const auto vscaled0_offseted_22p10_01 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_0), vscale0_6p10);
                const auto vscaled0_offseted_22p10_10 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_1), vscale0_6p10);
                const auto vscaled0_offseted_22p10_11 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_1), vscale0_6p10);

                // Accumulate the second input on top of the scaled-and-offset first input.
                const auto vout_22p10_00 = wrapper::vmlal(vscaled0_offseted_22p10_00, wrapper::vgetlow(vin1_16p0_0), vscale1_6p10);
                const auto vout_22p10_01 = wrapper::vmlal(vscaled0_offseted_22p10_01, wrapper::vgethigh(vin1_16p0_0), vscale1_6p10);
                const auto vout_22p10_10 = wrapper::vmlal(vscaled0_offseted_22p10_10, wrapper::vgetlow(vin1_16p0_1), vscale1_6p10);
                const auto vout_22p10_11 = wrapper::vmlal(vscaled0_offseted_22p10_11, wrapper::vgethigh(vin1_16p0_1), vscale1_6p10);

                // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
                const auto vout_8p8_0 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
                );
                const auto vout_8p8_1 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
                );

                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
                const auto vout_8p0 = wrapper::vcombine(
                    wrapper::vqrshrn<8>(vout_8p8_0),
                    wrapper::vqrshrn<8>(vout_8p8_1)
                );

                // Store the result.
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<2, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_6p10 + int32_t(in1_ptr[x]) * in1_scale_6p10 + offset_22p10));
#else // __aarch64__
                out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
#endif // __aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}
364 
// Explicit instantiations of the templates defined in this file for every
// supported element type, so the definitions can live in this .cpp file.
template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);

// FP16 support is only instantiated when the target supports FP16 vector
// arithmetic and FP16 kernels are enabled at build time.
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */

template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
376 
377 } // namespace cpu
378 } // namespace arm_compute
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
constexpr int step() const
Return the step of the dimension.
Definition: Window.h:107
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
Store the tensor's metadata.
Definition: ITensorInfo.h:40
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:79
Interface for CPU tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2022 Arm Limited.
void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Definition: impl.cpp:159
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
Definition: traits.h:132
T x() const
Alias to access the size of the first dimension.
Definition: Dimensions.h:87
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
template void add_q8_neon_fixedpoint< int8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
int16x4_t vreinterpret(const uint16x4_t &a)
Definition: reinterpret.h:44
Coordinates of an item.
Definition: Coordinates.h:37
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
UniformQuantizationInfo uniform() const
Return per layer quantization info.
template void add_same_neon< float >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where the shape is less than or equal to 1.
Definition: Window.inl:120
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:76
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
template void add_same_neon< int16_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
template void add_same_neon< uint8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Definition: impl.cpp:34
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
long lround(T value)
Round floating-point value with half value rounding away from zero and cast to long.
Includes all wrapper headers at once.
bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
Definition: impl.cpp:131
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:102
template void add_same_neon< int32_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:97
Describe a multidimensional execution window.
Definition: Window.h:39
ConvertPolicy
Policy to handle integer overflow.
Definition: Types.h:404
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:159
template void add_q8_neon_fixedpoint< uint8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)