74 inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
// Add the GEMMLowp result offset to all 16 int32 values (four int32x4 lanes), in place.
77 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
78 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
79 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
80 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
// Scale every lane by the scalar multiplier result_mult_int (vector-by-scalar multiply).
83 in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
84 in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
85 in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
86 in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
90 inline typename std::enable_if<std::is_same<T, uint8_t>::value,
98 inline typename std::enable_if<std::is_same<T, int8_t>::value,
105 template <
typename T>
110 in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
111 in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
112 in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
113 in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
116 const int16x8x2_t in_s16 =
119 vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
120 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
135 template <
typename T>
140 const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->
gemmlowp_offset);
141 const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->
gemmlowp_shift);
142 const int window_step_x = 16;
143 const auto window_start_x =
static_cast<int>(window.
x().
start());
144 const auto window_end_x =
static_cast<int>(window.
x().
end());
147 const int clamp_max = (_is_bounded_relu) ? _output_stage->
gemmlowp_max_bound : std::numeric_limits<T>::max();
168 int x = window_start_x;
169 for(; x <= (window_end_x - window_step_x); x += window_step_x)
174 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 0),
175 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 4),
176 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 8),
177 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 12)
181 const int32x4x4_t bias_s32 =
184 vld1q_s32(reinterpret_cast<const int32_t *>(bias.
ptr()) + x + 0),
185 vld1q_s32(reinterpret_cast<const int32_t *>(bias.
ptr()) + x + 4),
186 vld1q_s32(reinterpret_cast<const int32_t *>(bias.
ptr()) + x + 8),
187 vld1q_s32(reinterpret_cast<const int32_t *>(bias.
ptr()) + x + 12)
192 in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
193 in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
194 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
195 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
200 wrapper::vstore(reinterpret_cast<T *>(out.
ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
204 for(; x < window_end_x; ++x)
206 const int bias_value = *(
reinterpret_cast<const int *
>(bias.
ptr()) + x);
207 int in_value = *(
reinterpret_cast<const int *
>(in.
ptr()) + x);
213 *(out.
ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
223 int x = window_start_x;
224 for(; x <= (window_end_x - window_step_x); x += window_step_x)
229 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 0),
230 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 4),
231 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 8),
232 vld1q_s32(reinterpret_cast<const int32_t *>(in.
ptr()) + x + 12)
239 wrapper::vstore(reinterpret_cast<T *>(out.
ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
243 for(; x < window_end_x; ++x)
245 int in_value = *(
reinterpret_cast<const int *
>(in.
ptr()) + x);
251 *(out.
ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
259 : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr), _is_bounded_relu(false)
272 (bias !=
nullptr) ? bias->
info() :
nullptr,
279 _output_stage = output_stage;
287 INEKernel::configure(win);
295 _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<uint8_t>;
299 _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<int8_t>;
321 (this->*_func)(window);
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
std::enable_if< std::is_same< T, uint8_t >::value, typename wrapper::traits::neon_vector< T, 16 >::type >::type convert_to_8bit(const int16x8x2_t in_s16)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
Static function to check if given info will lead to a valid configuration of NEGEMMLowpQuantizeDownInt32ScaleKernel.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
int32_t gemmlowp_offset
GEMMLowp output stage offset used for quantizing to QASYMM8.
int32_t gemmlowp_max_bound
GEMMLowp max value used to saturate down the output result before converting back to QASYMM8.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
1 channel, 1 S32 per channel
NEGEMMLowpQuantizeDownInt32ScaleKernel()
Constructor.
uint32x2_t vqmovn(const uint64x2_t &a)
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Create the appropriate Neon vector given its type and size in terms of elements.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
GEMMLowp output stage info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage)
Initialise the kernel's input and output.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
Container for valid region of a window.
constexpr int end() const
Return the end of the dimension.
int32_t gemmlowp_min_bound
GEMMLowp min value used to saturate down the output result before converting back to QASYMM8.
Iterator updated by execute_window_loop for each window element.
DataType output_data_type
Output tensor data type to use if the output is not initialized.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
wrapper::traits::neon_vector< T, 16 >::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector< T, 16 >::type min, typename wrapper::traits::neon_vector< T, 16 >::type max)
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint32x2_t vqmovun(const int64x2_t &a)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.