74 inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
77 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
78 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
79 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
80 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
83 in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
84 in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
85 in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
86 in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
90 inline typename std::enable_if<std::is_same<T, uint8_t>::value,
98 inline typename std::enable_if<std::is_same<T, int8_t>::value,
105 template <
typename T>
110 in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
111 in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
112 in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
113 in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
116 const int16x8x2_t in_s16 =
119 vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
120 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
135 template <
typename T>
140 const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->
gemmlowp_offset);
141 const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->
gemmlowp_shift);
142 const int window_step_x = 16;
143 const auto window_start_x = static_cast<int>(
window.
x().
start());
144 const auto window_end_x = static_cast<int>(
window.
x().
end());
147 const int clamp_max = (_is_bounded_relu) ? _output_stage->
gemmlowp_max_bound : std::numeric_limits<T>::max();
149 VectorType min =
wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
150 VectorType max =
wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});
155 Iterator in(_input, win);
156 Iterator out(_output, win);
161 win_biases.set(
Window::DimX, Window::Dimension(0, 1, 1));
162 win_biases.set(
Window::DimY, Window::Dimension(0, 1, 1));
164 Iterator bias(_bias, win_biases);
168 int x = window_start_x;
169 for(; x <= (window_end_x - window_step_x); x += window_step_x)
174 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
175 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
176 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
177 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
181 const int32x4x4_t bias_s32 =
184 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
185 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
186 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
187 vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
192 in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
193 in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
194 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
195 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
200 wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
204 for(; x < window_end_x; ++x)
206 const int bias_value = *(reinterpret_cast<const int *>(bias.ptr()) + x);
207 int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
213 *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
223 int x = window_start_x;
224 for(; x <= (window_end_x - window_step_x); x += window_step_x)
229 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
230 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
231 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
232 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
239 wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
243 for(; x < window_end_x; ++x)
245 int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
251 *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
259 : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _output_stage(nullptr), _is_bounded_relu(false)
272 (bias !=
nullptr) ? bias->
info() :
nullptr,
279 _output_stage = output_stage;
284 INEKernel::configure(win);
292 _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<uint8_t>;
296 _func = &NEGEMMLowpQuantizeDownInt32ScaleKernel::run<int8_t>;
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
std::enable_if< std::is_same< T, uint8_t >::value, typename wrapper::traits::neon_vector< T, 16 >::type >::type convert_to_8bit(const int16x8x2_t in_s16)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
Static function to check if given info will lead to a valid configuration of NEGEMMLowpQuantizeDownInt32ScaleKernel.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
int32_t gemmlowp_offset
GEMMLowp output stage offset used for quantizing to QASYMM8.
int32_t gemmlowp_max_bound
GEMMLowp max value used to saturate down the output result before converting back to QASYMM8.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 S32 per channel
NEGEMMLowpQuantizeDownInt32ScaleKernel()
Constructor.
uint32x2_t vqmovn(const uint64x2_t &a)
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Create the appropriate SIMD vector given its type and size in terms of elements.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
GEMMLowp output stage info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage)
Initialise the kernel's input and output.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
constexpr int end() const
Return the end of the dimension.
int32_t gemmlowp_min_bound
GEMMLowp min value used to saturate down the output result before converting back to QASYMM8.
DataType output_data_type
Output tensor data type to use if the output is not initialized.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
wrapper::traits::neon_vector< T, 16 >::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector< T, 16 >::type min, typename wrapper::traits::neon_vector< T, 16 >::type max)
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint32x2_t vqmovun(const int64x2_t &a)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.