// Fragment of the NEON QASYMM8_SIGNED element-wise subtraction kernel
// (sub_qasymm8_signed_neon). Scheme visible in this capture: dequantize both
// int8 inputs to float (subtract input offset, multiply by input scale),
// subtract in float, then requantize into the int8 destination
// (multiply by 1/output_scale, add output offset, narrow with saturation).
//
// NOTE(review): extraction artifacts — every statement carries its original
// source line number as a leading token, statements are wrapped across lines,
// and the line numbers jump (e.g. 49 -> 56, 88 -> 99), so intermediate
// statements (braces, loads, struct initializers) are missing from this view.
// Do not treat this fragment as compilable; it annotates the captured lines only.
//
// 16 int8 elements per vector iteration: the epilogue narrows four int32x4
// lanes into two int8x8 halves (pa/pb), i.e. 16 bytes per step.
47 const int window_step_x = 16;
// Horizontal (X) extent of the execution window, iterated manually below.
48 const auto window_start_x = static_cast<int>(window.
x().
start());
49 const auto window_end_x = static_cast<int>(window.
x().
end());
// Output requantization constants, broadcast across all four float lanes:
// requantized = diff * (1/out_scale) + out_offset  (see the vmlaq_f32 calls).
56 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.
scale);
57 const float32x4_t voffseto = vdupq_n_f32(oq_info.
offset);
// ---- Broadcast path: one input is a single value repeated along X. ----
59 if(is_broadcast_across_x)
// A window X-step of 0 marks input2 as the broadcast operand.
61 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
62 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
// Per-input quantization constants; the ternaries keep vscale1/voffset1
// paired with whichever tensor is NOT being broadcast.
68 const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.
scale) : vdupq_n_f32(iq2_info.
scale);
69 const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.
scale) : vdupq_n_f32(iq1_info.
scale);
70 const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.
offset) : vdupq_n_s32(iq2_info.
offset);
71 const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.
offset) : vdupq_n_s32(iq1_info.
offset);
76 Iterator broadcast_input(broadcast_tensor, broadcast_win);
77 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
// Per-row lambda body (driven by execute_window_loop; opening not captured).
82 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.
ptr());
83 const auto output_ptr = reinterpret_cast<int8_t *>(output.
ptr());
// The broadcast operand contributes a single scalar for the whole row.
85 const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.
ptr());
// bf: broadcast value dequantized into 4x float32x4 (initializer not captured).
88 const float32x4x4_t bf =
// Vectorized main loop over X, 16 elements at a time.
99 int x = window_start_x;
100 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// af: 16 non-broadcast elements dequantized (initializer not captured).
104 const float32x4x4_t af =
// Requantize the difference. Operand order is swapped when input1 is the
// broadcast operand so the result is always src0 - src1.
// Both vcvtnq (round-to-nearest) and vcvtq (truncate toward zero) variants
// appear below — presumably selected by a rounding-policy condition that is
// not captured in this fragment; confirm against the full source.
114 const int32x4x4_t rf =
118 vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
119 vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
120 vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
121 vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
123 vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
124 vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
125 vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
126 vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
// Saturating narrow: 4x int32x4 -> 2x int16x8 -> 2x int8x8 (clamps to int8).
131 const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
132 const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
// Scalar tail for the leftover (< 16) elements: dequantize both operands
// manually; the subtraction/requantize statements are not captured.
137 for(; x < window_end_x; ++x)
139 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.
offset) * non_broadcast_qinfo.
scale;
140 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.
offset) * broadcast_qinfo.
scale;
// Trailing iterator arguments of the execute_window_loop call for this path.
144 broadcast_input, non_broadcast_input, output);
// ---- Non-broadcast path: both inputs advance element-wise along X. ----
// Same requantization scheme, but operand order is fixed (src0 - src1).
148 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.
scale);
149 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.
scale);
150 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.
offset);
151 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.
offset);
// Per-row lambda body (iterator declarations not captured in this view).
163 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.
ptr());
164 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.
ptr());
165 const auto output_ptr = reinterpret_cast<int8_t *>(output.
ptr());
// Vectorized main loop, 16 elements per iteration.
168 int x = window_start_x;
169 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// af/bf: both operands dequantized to 4x float32x4 (initializers not captured).
174 const float32x4x4_t af =
184 const float32x4x4_t bf =
// Requantize af - bf; vcvtnq vs vcvtq variants as in the broadcast path.
194 const int32x4x4_t rf =
198 vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
199 vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
200 vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
201 vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
203 vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
204 vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
205 vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
206 vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
// Saturating narrow to int8, as in the broadcast path.
211 const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
212 const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
// Scalar tail: dequantize each operand with its own qinfo.
217 for(; x < window_end_x; ++x)
219 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.
offset) * iq1_info.
scale;
220 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.
offset) * iq2_info.
scale;
// Trailing iterator arguments of the execute_window_loop call for this path.
225 input1, input2, output);
constexpr int step() const
Return the step of the dimension.
uint8x16_t vloadq(const uint8_t *ptr)
Describe one of the image's dimensions with a start, end and step.
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
T x() const
Alias to access the size of the first dimension.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
int16x4_t vreinterpret(const uint16x4_t &a)
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where shape is less equal to 1.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint8x8_t vgethigh(const uint8x16_t val)
void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
uint16x8_t vmovl(const uint8x8_t &a)
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
ConvertPolicy
Policy to handle overflow.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.