// NOTE(review): this text looks like a rendered source listing: every statement carries its
// original line number as a leading token and brace-only lines are absent. Code tokens are
// left untouched here; only comments are added.
// The first line fuses the include guard, the SVE2 feature guard and the start of the
// template header for QuantizedLoopArguments.
24 #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H 25 #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H 27 #if defined(ARM_COMPUTE_ENABLE_SVE2) 38 template <
typename InputScalarType,
typename OutputScalarType,
typename OperatorType>
// Argument bundle handed to the non-broadcast per-vector loop functions
// (arithmetic_op_quantized_loop / comparison_op_quantized_loop).
// NOTE(review): those functions also read `args.op`, but no `op` member is visible here
// (the embedded numbering jumps from 39 to 42) -- presumably an `OperatorType op` member was
// dropped from this listing; confirm against the original header.
39 struct QuantizedLoopArguments
// Quantized inputs; the loop functions pass these to load_quantized().
42 const InputScalarType *input1_ptr;
43 const InputScalarType *input2_ptr;
// Destination; written through store_quantized() or svst1() by the loop functions.
44 OutputScalarType *output_ptr;
// Quantization offsets as 32-bit integer vectors. These are reference members, so the
// referenced vectors must outlive this struct.
46 const svint32_t &in1_offset;
47 const svint32_t &in2_offset;
48 const svint32_t &out_offset;
// Quantization scales as float vectors (also held by reference).
// NOTE(review): elementwise_quantized_op builds the output vector from 1.f / scale and
// store_quantized() multiplies by it, so out_scale appears to carry the reciprocal scale.
49 const svfloat32_t &in1_scale;
50 const svfloat32_t &in2_scale;
51 const svfloat32_t &out_scale;
// Argument bundle handed to the broadcast per-vector loop functions
// (arithmetic_op_broadcast_quantized_loop / comparison_op_broadcast_quantized_loop), where one
// operand is a single value instead of a second pointer.
// NOTE(review): the loop functions also read `args.reorder` and `args.op`, neither of which is
// visible here (embedded numbers 56-57 and 61-62 are missing) -- presumably dropped from this
// listing; confirm against the original header.
54 template <
typename InputScalarType,
typename OutputScalarType,
typename OperatorType>
55 struct BroadcastQuantizedLoopArguments
// Quantized non-broadcast input; passed to load_quantized() by the loop functions.
58 const InputScalarType *input1_ptr;
// Broadcast operand as a float; the loop functions splat it with svdup_n() without any
// further dequantization.
59 float broadcast_value;
// Destination; written through store_quantized() or svst1() by the loop functions.
60 OutputScalarType *output_ptr;
// Offsets/scales for the non-broadcast input and the output, held by reference (the
// referenced vectors must outlive this struct).
63 const svint32_t &in1_offset;
64 const svint32_t &out_offset;
65 const svfloat32_t &in1_scale;
// NOTE(review): store_quantized() multiplies by this value, and elementwise_quantized_op
// passes a vector built from 1.f / scale, so this appears to be the reciprocal output scale.
66 const svfloat32_t &out_scale;
// Fragment of a dequantizing load.
// NOTE(review): the signature (embedded lines 69-70) and the arguments of the `widened`
// svcreate4 call (embedded lines 74-81) are not present in this listing. This is presumably
// the int8_t counterpart of load_quantized(const uint8_t *, ...) -- confirm against the
// original header before relying on this description.
// Predicated load of one vector of quantized values from ptr.
71 auto x = svld1(pg, ptr);
// Four-vector tuple derived from x (construction arguments missing from this listing).
73 const auto widened = svcreate4(
// Dequantize each part: (value - offset) converted to float32, then multiplied by scale.
// The _z forms zero the lanes that are inactive in pg.
82 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
83 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
84 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
85 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
// Loads one vector of uint8_t quantized values and dequantizes it into four float32 vectors.
//   ptr    : source of the quantized values (read with a predicated svld1).
//   pg     : governing predicate used for the load and for every arithmetic step.
//   offset : quantization offset, subtracted from each widened value.
//   scale  : quantization scale, multiplied in after the conversion to float.
// Returns a tuple of four float32 vectors.
// NOTE(review): the arguments of the `widened` svcreate4 call (embedded lines 95-102) are
// missing from this listing, so the lane order of the four parts cannot be confirmed here.
88 svfloat32x4_t
load_quantized(
const uint8_t *ptr, svbool_t pg,
const svint32_t &offset,
const svfloat32_t &scale)
90 auto x = svld1(pg, ptr);
94 const auto widened = svcreate4(
// Each part is reinterpreted as signed 32-bit so the signed offset can be subtracted, then
// converted to float32 and scaled. The _z forms zero the lanes that are inactive in pg.
103 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
104 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
105 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
106 svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
// Quantizes four float32 vectors to uint8_t and stores them as a single vector.
//   ptr       : destination for the quantized values.
//   pg        : governing predicate for the arithmetic and for the final store.
//   data      : tuple of four float32 vectors to quantize.
//   offset    : quantization offset added after rounding.
//   inv_scale : multiplier applied before rounding (the callers in this file pass 1 / scale).
109 void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data,
const svint32_t &offset,
const svfloat32_t &inv_scale)
// Per part: data * inv_scale, rounded by svrinta, converted to signed 32-bit, plus offset.
111 const auto quantized = svcreate4(
112 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
113 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
114 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
115 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
// First narrowing step uses the signed-to-unsigned saturating forms (svqxtunb / svqxtunt),
// pairing parts 0/1 and parts 2/3 into one vector each.
117 const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
118 const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
// Second narrowing step merges both halves into the final vector with saturating narrowing.
119 const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
120 svst1(pg, ptr, narrowed);
// Quantizes four float32 vectors to int8_t and stores them as a single vector.
//   ptr       : destination for the quantized values.
//   pg        : governing predicate for the arithmetic and for the final store.
//   data      : tuple of four float32 vectors to quantize.
//   offset    : quantization offset added after rounding.
//   inv_scale : multiplier applied before rounding (the callers in this file pass 1 / scale).
123 void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data,
const svint32_t &offset,
const svfloat32_t &inv_scale)
// Per part: data * inv_scale, rounded by svrinta, converted to signed 32-bit, plus offset.
125 const auto quantized = svcreate4(
126 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
127 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
128 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
129 svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
// Unlike the uint8_t overload, both narrowing steps use the signed saturating forms
// (svqxtnb / svqxtnt): parts 0/1 and parts 2/3 are paired first, then merged.
131 const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
132 const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
133 const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
135 svst1(pg, ptr, narrowed);
// Processes one vector of a quantized arithmetic operation with two tensor inputs:
// dequantize both inputs, apply args.op on the float32 values, requantize and store.
//   pg   : governing predicate for the loads, the operation and the store.
//   args : pointers plus offset/scale vectors for both inputs and the output.
138 template <
typename InputScalarType,
typename OutputScalarType>
139 inline void arithmetic_op_quantized_loop(svbool_t pg,
const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &
args)
// Each input is dequantized with its own offset and scale.
141 const auto in1 =
load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
142 const auto in2 =
load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
// The operation is applied separately to each of the four float32 parts.
144 const auto result = svcreate4(
145 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
146 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
147 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
148 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
// args.out_scale is forwarded as store_quantized()'s inv_scale parameter.
150 store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
// Processes one vector of a quantized arithmetic operation where one operand is a single
// broadcast value: dequantize the tensor input, splat the broadcast value, apply args.op on
// the float32 values, requantize and store.
//   pg   : governing predicate for the load, the operation and the store.
//   args : input pointer, float broadcast value, operand-order flag and offset/scale vectors.
153 template <
typename InputScalarType,
typename OutputScalarType>
154 inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg,
const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
156 const auto in1 =
load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
// The broadcast value is already a float; replicate it into all four parts.
157 const auto in2 = svcreate4(
158 svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
// args.reorder selects the operand order: when true the broadcast value becomes the first
// operand, which matters for operations that are not commutative.
160 const auto &af = args.reorder ? in2 : in1;
161 const auto &bf = args.reorder ? in1 : in2;
163 const auto result = svcreate4(
164 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op),
165 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op),
166 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op),
167 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op));
// args.out_scale is forwarded as store_quantized()'s inv_scale parameter.
169 store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
// Processes one vector of a quantized comparison with two tensor inputs: dequantize both
// inputs, compare the float32 values with args.op and store the combined result.
// The result is stored directly with svst1; the output offset/scale in args are not used here.
//   pg   : governing predicate for the loads, the comparison and the store.
//   args : pointers plus offset/scale vectors for both inputs.
// NOTE(review): OutputVectorType is used below but its declaration (embedded line 178) is
// missing from this listing -- presumably an alias derived from OutputScalarType; confirm.
172 template <
typename InputScalarType,
typename OutputScalarType>
173 inline void comparison_op_quantized_loop(svbool_t pg,
const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
175 const auto in1 =
load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
176 const auto in2 =
load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
// One comparison per float32 part, each producing an OutputVectorType.
180 const auto result = svcreate4(
181 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
182 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
183 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
184 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
// svzip1 interleaves the low halves of its operands; two levels of zipping fold the four
// partial results into the single vector that is stored.
186 const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
187 const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
188 const auto zipped = svzip1(zipped_bottom, zipped_top);
189 svst1(pg, args.output_ptr, zipped);
// Processes one vector of a quantized comparison where one operand is a single broadcast
// value: dequantize the tensor input, splat the broadcast value, compare with args.op and
// store the combined result directly with svst1 (no requantization).
//   pg   : governing predicate for the load, the comparison and the store.
//   args : input pointer, float broadcast value, operand-order flag and input offset/scale.
// NOTE(review): OutputVectorType is used below but its declaration (embedded line 202) is
// missing from this listing -- presumably an alias derived from OutputScalarType; confirm.
192 template <
typename InputScalarType,
typename OutputScalarType>
193 inline void comparison_op_broadcast_quantized_loop(svbool_t pg,
const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
195 const auto in1 =
load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
// The broadcast value is already a float; replicate it into all four parts.
196 const auto in2 = svcreate4(
197 svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
// args.reorder selects the operand order: when true the broadcast value is the left-hand
// side of the comparison.
199 const auto &af = args.reorder ? in2 : in1;
200 const auto &bf = args.reorder ? in1 : in2;
204 const auto result = svcreate4(
205 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op),
206 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op),
207 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op),
208 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op));
// svzip1 interleaves the low halves of its operands; two levels of zipping fold the four
// partial results into the single vector that is stored.
210 const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
211 const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
212 const auto zipped = svzip1(zipped_bottom, zipped_top);
213 svst1(pg, args.output_ptr, zipped);
// Function-pointer type for the non-broadcast per-vector loop functions: takes the governing
// predicate and a QuantizedLoopArguments bundle, returns nothing.
// arithmetic_op_quantized_loop and comparison_op_quantized_loop match this signature.
216 template <
typename InputScalarType,
typename OutputScalarType,
typename OperatorType>
217 using LoopQuantizedFuncType = void (*)(svbool_t,
const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
// Function-pointer type for the broadcast per-vector loop functions: takes the governing
// predicate and a BroadcastQuantizedLoopArguments bundle, returns nothing.
// arithmetic_op_broadcast_quantized_loop and comparison_op_broadcast_quantized_loop match it.
219 template <
typename InputScalarType,
typename OutputScalarType,
typename OperatorType>
220 using BroadcastQuantizedLoopFuncType = void (*)(svbool_t,
const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
// Drives a quantized elementwise operation over `window`, choosing between the broadcast and
// the non-broadcast per-vector function depending on whether the inputs' X extents differ.
//   in1, in2       : input tensors (shape and quantization info are read from info()).
//   out            : output tensor (its quantization info supplies the output offset/scale).
//   window         : execution window; its X range is walked manually in vector-sized steps.
//   func           : per-vector function used when both inputs have the same X extent.
//   broadcast_func : per-vector function used when one input is broadcast along X.
// NOTE(review): this listing is missing several original lines, so the text below is not a
// complete definition: the remaining template parameters (InputScalarType / OutputScalarType
// are used but not declared here), a parameter between `window` and `func` (the callers in
// this file pass `op` there), the declaration of `win`, the lambda / do-loop scaffolding, and
// the call to `func` in the non-broadcast branch. Confirm against the original header.
222 template <
typename InputVectorType,
typename OutputVectorType,
typename OperatorType,
225 void elementwise_quantized_op(
const ITensor *in1,
const ITensor *in2, ITensor *out,
const Window &window,
227 LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func,
228 BroadcastQuantizedLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
// All-true predicate; used only as the governing predicate of the loop-exit svptest_any.
230 const auto all_true_pg = wrapper::svptrue<InputScalarType>();
// Per-input windows adjusted for broadcasting according to each tensor's shape.
233 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
234 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
240 const auto window_start_x =
static_cast<int>(window.x().start());
241 const auto window_end_x =
static_cast<int>(window.x().end());
// Broadcasting along X is assumed whenever the two inputs differ in X extent.
242 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
// Output quantization parameters splatted into vectors. The scale is stored as its
// reciprocal so that store_quantized() can multiply instead of divide.
244 const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
245 const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
// ---- Broadcast path: one input contributes a single value per row ----
247 if(is_broadcast_across_x)
// Input 2 is treated as the broadcast operand when its X step is zero; otherwise input 1.
249 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
250 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
251 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
252 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
253 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
255 const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
// NOTE(review): broadcast_qinfo is not used in the visible lines; presumably it feeds the
// dequantization of broadcast_value in a line missing from this listing.
256 const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
258 const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
259 const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
// X is set to a single step in the iterator window; the X range is walked manually below.
262 non_broadcast_win.set(
Window::DimX, Window::Dimension(0, 1, 1));
264 Iterator broadcast_input(broadcast_tensor, broadcast_win);
265 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
266 Iterator output(out, win);
270 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.ptr());
271 const auto non_broadcast_input_ptr =
reinterpret_cast<const InputScalarType *
>(non_broadcast_input.ptr());
// The broadcast operand is read once, as a single quantized value.
272 const InputScalarType broadcast_value = *
reinterpret_cast<const InputScalarType *
>(broadcast_input.ptr());
274 int x = window_start_x;
// Predicate with the lanes for which x + lane < window_end_x active, so the tail of the row
// is handled by the same code as full vectors.
276 svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
279 const auto args = BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
282 non_broadcast_input_ptr + x,
// Positioned where the loop functions' `reorder` flag is expected: true when input 1 is the
// broadcast operand, so that the original operand order is kept.
285 !is_broadcast_input_2,
286 non_broadcast_voffset, output_voffset,
287 non_broadcast_vscale, output_vscale
289 broadcast_func(pg, args);
// Advance by one vector's worth of elements and rebuild the predicate.
290 x += wrapper::svcnt<InputScalarType>();
291 pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
// Continue while at least one lane is still active.
293 while(svptest_any(all_true_pg, pg));
// NOTE(review): presumably the trailing iterator arguments of an execute_window_loop call
// whose opening is missing from this listing.
295 broadcast_input, non_broadcast_input, output);
// ---- Non-broadcast path: both inputs have the same X extent ----
// X is set to a single step in both iterator windows; the X range is walked manually below.
300 input1_win.set(
Window::DimX, Window::Dimension(0, 1, 1));
301 input2_win.set(
Window::DimX, Window::Dimension(0, 1, 1));
303 Iterator input1(in1, input1_win);
304 Iterator input2(in2, input2_win);
305 Iterator output(out, win);
// Input quantization parameters splatted into vectors (scales are NOT inverted here, since
// load_quantized() multiplies by the scale).
307 const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
308 const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
310 const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
311 const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
315 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.ptr());
316 const auto input1_ptr =
reinterpret_cast<const InputScalarType *
>(input1.ptr());
317 const auto input2_ptr =
reinterpret_cast<const InputScalarType *
>(input2.ptr());
319 int x = window_start_x;
321 svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
324 const auto args = QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
330 in1_voffset, in2_voffset, output_voffset,
331 in1_vscale, in2_vscale, output_vscale
// NOTE(review): the call to func(pg, args) (embedded line 333) is not present in this listing.
334 x += wrapper::svcnt<InputScalarType>();
335 pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
// Continue while at least one lane is still active.
337 while(svptest_any(all_true_pg, pg));
339 input1, input2, output);
// Entry point for quantized arithmetic operations: forwards to elementwise_quantized_op with
// the arithmetic per-vector functions, using ScalarType for both input and output.
//   op       : arithmetic operation, fixed at compile time and forwarded as a runtime value.
//   in1, in2 : input tensors.
//   out      : output tensor.
//   window   : execution window.
// NOTE(review): VectorType is used below but its declaration (embedded line 346) is missing
// from this listing -- presumably an alias derived from ScalarType; confirm.
343 template <ArithmeticOperation op,
typename ScalarType>
344 void elementwise_arithmetic_quantized_op(
const ITensor *in1,
const ITensor *in2, ITensor *out,
const Window &window)
347 elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
348 &arithmetic_op_quantized_loop<ScalarType, ScalarType>,
349 &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>);
// Entry point for quantized comparison operations: forwards to elementwise_quantized_op with
// the comparison per-vector functions.
//   op       : comparison operation, fixed at compile time and forwarded as a runtime value.
//   in1, in2 : input tensors.
//   out      : output tensor.
//   window   : execution window.
// NOTE(review): the default argument of OutputScalarType is split across two lines in this
// listing ("u" + "int8_t"), presumably uint8_t. InputVectorType / OutputVectorType are used
// below but their declarations (embedded lines 356-357) are missing; confirm against the
// original header.
352 template <ComparisonOperation op,
typename InputScalarType,
typename OutputScalarType = u
int8_t>
353 void elementwise_comparison_quantized_op(
const ITensor *in1,
const ITensor *in2, ITensor *out,
const Window &window)
// Compile-time guard: the output element type must not be wider than the input element type.
355 static_assert(
sizeof(InputScalarType) >=
sizeof(OutputScalarType),
"input data type's width should be equal to or greater than output data type's width");
358 elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
359 &comparison_op_quantized_loop<InputScalarType, OutputScalarType>,
360 &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>);
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
decltype(strategy::transforms) typedef type
Copyright (c) 2017-2021 Arm Limited.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
static float dequantize(QUANTIZED_TYPE value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 8-bit asymmetric quantization scheme.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function.
opencl::ClGemm OperatorType
void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
Sets the macro arm_any if compiling for Aarch32 or Aarch64.