49 Status validate_arguments(
const ITensorInfo *
src,
const ITensorInfo *bias,
const ITensorInfo *
dst,
const GEMMLowpOutputStageInfo *output_stage)
56 || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
66 if(dst->total_size() != 0)
79 inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
82 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
83 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
84 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
85 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
88 in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
89 in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
90 in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
91 in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
95 inline typename std::enable_if<std::is_same<T, uint8_t>::value,
97 convert_to_8bit(
const int16x8x2_t in_s16)
102 template <
typename T>
103 inline typename std::enable_if<std::is_same<T, int8_t>::value,
105 convert_to_8bit(
const int16x8x2_t in_s16)
110 template <
typename T>
115 in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
116 in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
117 in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
118 in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
121 const int16x8x2_t in_s16 =
124 vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
125 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
139 template <
typename T>
140 void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(
const ITensor *src,
const ITensor *bias, ITensor *dst,
const Window &window)
144 const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->
gemmlowp_offset);
145 const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->
gemmlowp_shift);
146 const int window_step_x = 16;
147 const auto window_start_x =
static_cast<int>(window.x().start());
148 const auto window_end_x =
static_cast<int>(window.x().end());
151 const int clamp_max = (_is_bounded_relu) ? _output_stage->
gemmlowp_max_bound : std::numeric_limits<T>::max();
153 VectorType min =
wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{});
154 VectorType max =
wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{});
159 Iterator in(src, win);
160 Iterator out(dst, win);
165 win_biases.set(
Window::DimX, Window::Dimension(0, 1, 1));
166 win_biases.set(
Window::DimY, Window::Dimension(0, 1, 1));
168 Iterator bias_i(bias, win_biases);
172 int x = window_start_x;
173 for(; x <= (window_end_x - window_step_x); x += window_step_x)
178 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
179 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
180 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
181 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
185 const int32x4x4_t bias_s32 =
188 vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
189 vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
190 vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
191 vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
196 in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
197 in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
198 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
199 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
204 wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
208 for(; x < window_end_x; ++x)
210 const int bias_value = *(
reinterpret_cast<const int *
>(bias_i.ptr()) + x);
211 int in_value = *(
reinterpret_cast<const int *
>(in.ptr()) + x);
217 *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
227 int x = window_start_x;
228 for(; x <= (window_end_x - window_step_x); x += window_step_x)
233 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
234 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
235 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
236 vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
243 wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
247 for(; x < window_end_x; ++x)
249 int in_value = *(
reinterpret_cast<const int *
>(in.ptr()) + x);
255 *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
276 _output_stage = output_stage;
281 ICpuKernel::configure(win);
289 _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
293 _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
317 (this->*_func)(src, bias, dst, window);
322 return "CpuGemmLowpQuantizeDownInt32ScaleKernel";
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
const Window & window() const
The maximum window the kernel can be executed on.
void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
Initialise the kernel's input and output.
bool empty() const
Checks if pack is empty.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
const char * name() const override
Name of the kernel.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
int32_t gemmlowp_offset
GEMMLowp output stage offset used for quantizing to QASYMM8.
int32_t gemmlowp_max_bound
GEMMLowp max value used to saturate down the output result before converting back to QASYMM8...
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
Static function to check if given info will lead to a valid configuration.
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
GEMMLowp output stage info.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
Performs final quantization step on 16 elements.
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
int32_t gemmlowp_min_bound
GEMMLowp min value used to saturate down the output result before converting back to QASYMM8...
DataType output_data_type
Output tensor data type to use if the output is not initialized.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint32x2_t vqmovun(const int64x2_t &a)