47 const int window_step_x = 8;
48 const auto window_start_x =
static_cast<int>(window.
x().
start());
49 const auto window_end_x =
static_cast<int>(window.
x().
end());
56 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.
scale);
57 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.
scale);
58 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.
scale);
60 if(is_broadcast_across_x)
62 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
63 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
73 Iterator broadcast_input(broadcast_tensor, broadcast_win);
74 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79 const auto non_broadcast_input_ptr =
reinterpret_cast<const int16_t *
>(non_broadcast_input.
ptr());
80 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
82 const int16_t broadcast_value = *
reinterpret_cast<const int16_t *
>(broadcast_input.
ptr());
83 const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
85 const float32x4x2_t bf =
88 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
89 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
92 const float bfs =
static_cast<int32_t
>(broadcast_value) * broadcast_qinfo.
scale;
95 int x = window_start_x;
96 for(; x <= (window_end_x - window_step_x); x += window_step_x)
98 const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
99 const float32x4x2_t af =
102 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
103 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
107 const int32x4x4_t rf =
111 vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
112 vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
114 vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
115 vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
120 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
121 vst1q_s16(output_ptr + x, pa);
125 for(; x < window_end_x; ++x)
127 const float afs =
static_cast<int32_t
>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.
scale;
128 *(output_ptr + x) =
quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs -
bfs), oq_info);
131 broadcast_input, non_broadcast_input, output);
145 const auto input1_ptr =
reinterpret_cast<const int16_t *
>(input1.
ptr());
146 const auto input2_ptr =
reinterpret_cast<const int16_t *
>(input2.
ptr());
147 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
150 int x = window_start_x;
151 for(; x <= (window_end_x - window_step_x); x += window_step_x)
153 const int16x8_t a = vld1q_s16(input1_ptr + x);
154 const int16x8_t
b = vld1q_s16(input2_ptr + x);
156 const float32x4x2_t af =
159 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
160 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
164 const float32x4x2_t bf =
167 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
168 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
172 const int32x4x2_t rf =
176 vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
177 vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
179 vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
180 vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
185 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
186 vst1q_s16(output_ptr + x, pa);
190 for(; x < window_end_x; ++x)
192 const float afs =
static_cast<int32_t
>((*(input1_ptr + x))) * iq1_info.
scale;
193 const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.
scale;
197 input1, input2, output);
int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a 16-bit symmetric quantization scheme.
constexpr int step() const
Return the step of the dimension.
Describe one of the image's dimensions with a start, end and step.
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
T x() const
Alias to access the size of the first dimension.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where shape is less than or equal to 1.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each window element.
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
ConvertPolicy
Policy to handle overflow.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.