Compute Library
 22.05
impl.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
25 #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
26 
28 namespace arm_compute
29 {
30 namespace cpu
31 {
32 using namespace arm_compute::wrapper;
33 
/**
 * Argument bundle handed to the non-broadcast quantized loop bodies for one
 * vector-length step (named QuantizedLoopArguments in the reference index at
 * the end of this page).
 *
 * NOTE(review): the `struct` declarator and the first member are missing from
 * this listing. The loop bodies read `args.op` and elementwise_quantized_op
 * brace-initialises `op` first, so the missing member is presumably
 * `OperatorType op;` -- confirm against the real header.
 */
34 template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
36 {
38  const InputScalarType *input1_ptr; // first quantized input, already advanced to the current x
39  const InputScalarType *input2_ptr; // second quantized input, already advanced to the current x
40  OutputScalarType *output_ptr; // destination, already advanced to the current x
41 
// The members below are references: the referenced vectors must outlive this object.
42  const svint32_t &in1_offset; // quantization offset of input 1, splatted across the vector
43  const svint32_t &in2_offset; // quantization offset of input 2, splatted across the vector
44  const svint32_t &out_offset; // quantization offset of the output, splatted across the vector
45  const svfloat32_t &in1_scale; // quantization scale of input 1
46  const svfloat32_t &in2_scale; // quantization scale of input 2
47  const svfloat32_t &out_scale; // elementwise_quantized_op passes 1/scale of the output here
48 };
49 
/**
 * Argument bundle handed to the broadcast quantized loop bodies for one
 * vector-length step (named BroadcastQuantizedLoopArguments in the reference
 * index at the end of this page).
 *
 * NOTE(review): the `struct` declarator and two members are missing from this
 * listing. The loop bodies read `args.op` and `args.broadcast_value`, and
 * elementwise_quantized_op initialises them with the operator and the
 * dequantized (float) broadcast scalar -- confirm the exact declarations
 * against the real header.
 */
50 template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
52 {
54  const InputScalarType *input1_ptr; // the non-broadcast quantized input, advanced to the current x
56  OutputScalarType *output_ptr; // destination, advanced to the current x
57  bool reorder; // true: the broadcast value becomes the FIRST operand, the loaded input the second
58 
// The members below are references: the referenced vectors must outlive this object.
59  const svint32_t &in1_offset; // quantization offset of the non-broadcast input
60  const svint32_t &out_offset; // quantization offset of the output
61  const svfloat32_t &in1_scale; // quantization scale of the non-broadcast input
62  const svfloat32_t &out_scale; // elementwise_quantized_op passes 1/scale of the output here
63 };
64 
/**
 * Load one vector of signed 8-bit quantized values and dequantize it to float.
 *
 * @param ptr    Source address of the quantized data.
 * @param pg     Predicate governing the load only; it is overwritten below.
 * @param offset Quantization offset, subtracted from every widened element.
 * @param scale  Quantization scale, multiplied in after conversion to float.
 *
 * @return Four float vectors holding (value - offset) * scale. Given the ACLE
 *         bottom/top semantics (bottom = even-indexed, top = odd-indexed
 *         elements), tuple element 0/1/2/3 holds source elements
 *         4i / 4i+2 / 4i+1 / 4i+3 respectively, i.e. the data is de-interleaved.
 */
65 inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
66 {
67  auto x = svld1(pg, ptr);
68 
// Two rounds of bottom/top widening: 8-bit -> 16-bit -> 32-bit.
69  const auto widened = svcreate4(
70  svmovlb(svmovlb(x)),
71  svmovlt(svmovlb(x)),
72  svmovlb(svmovlt(x)),
73  svmovlt(svmovlt(x)));
74 
// From here on every lane is processed; the caller's predicate only limited the load.
75  pg = svptrue_b8();
76 
77  return svcreate4(
78  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
79  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
80  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
81  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
82 }
83 
/**
 * Load one vector of unsigned 8-bit quantized values and dequantize it to float.
 *
 * Same scheme as the int8_t overload; the only difference is that the widened
 * unsigned 32-bit lanes are reinterpreted as signed before the (signed) offset
 * is subtracted.
 *
 * @param ptr    Source address of the quantized data.
 * @param pg     Predicate governing the load only; it is overwritten below.
 * @param offset Quantization offset, subtracted from every widened element.
 * @param scale  Quantization scale, multiplied in after conversion to float.
 *
 * @return Four float vectors holding (value - offset) * scale, in the
 *         de-interleaved order produced by the bottom/top widening
 *         (source elements 4i / 4i+2 / 4i+1 / 4i+3).
 */
84 inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
85 {
86  auto x = svld1(pg, ptr);
87 
88  //vprint(x);
89 
// Two rounds of bottom/top widening: 8-bit -> 16-bit -> 32-bit (zero-extended, input is unsigned).
90  const auto widened = svcreate4(
91  svmovlb(svmovlb(x)),
92  svmovlt(svmovlb(x)),
93  svmovlb(svmovlt(x)),
94  svmovlt(svmovlt(x)));
95 
// From here on every lane is processed; the caller's predicate only limited the load.
96  pg = svptrue_b8();
97 
98  return svcreate4(
99  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
100  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
101  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
102  svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
103 }
104 
/**
 * Quantize four float vectors to unsigned 8-bit and store them as one vector.
 *
 * @param ptr       Destination address.
 * @param pg        Predicate used both for the zeroing arithmetic and for the store.
 * @param data      Four float vectors in the de-interleaved order produced by load_quantized.
 * @param offset    Quantization offset added after rounding.
 * @param inv_scale Multiplier applied before rounding; the caller in this file
 *                  passes the reciprocal of the output scale.
 *
 * NOTE(review): callers in this file build pg with an 8-bit element predicate
 * while the arithmetic below runs on 32-bit lanes -- confirm this is intended.
 */
105 inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
106 {
// Per part: scale, round (svrinta), convert to s32, then add the offset.
107  const auto quantized = svcreate4(
108  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
109  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
110  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
111  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
112 
// Saturating signed->unsigned narrow to 16 bits; the bottom/top pairing re-interleaves parts 0&1 and 2&3.
113  const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
114  const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
// Saturating narrow to 8 bits, undoing the remaining level of de-interleaving.
115  const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
116  svst1(pg, ptr, narrowed);
117 }
118 
/**
 * Quantize four float vectors to signed 8-bit and store them as one vector.
 *
 * Same scheme as the uint8_t overload, except that both narrowing steps use
 * the signed saturating narrow (svqxtnb / svqxtnt).
 *
 * @param ptr       Destination address.
 * @param pg        Predicate used both for the zeroing arithmetic and for the store.
 * @param data      Four float vectors in the de-interleaved order produced by load_quantized.
 * @param offset    Quantization offset added after rounding.
 * @param inv_scale Multiplier applied before rounding; the caller in this file
 *                  passes the reciprocal of the output scale.
 */
119 inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
120 {
// Per part: scale, round (svrinta), convert to s32, then add the offset.
121  const auto quantized = svcreate4(
122  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
123  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
124  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
125  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
126 
// Saturating narrow 32 -> 16 bits; the bottom/top pairing re-interleaves parts 0&1 and 2&3.
127  const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
128  const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
// Saturating narrow 16 -> 8 bits, undoing the remaining level of de-interleaving.
129  const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
130 
131  svst1(pg, ptr, narrowed);
132 }
133 
/**
 * One vector-length step of a quantized arithmetic operation on two tensors:
 * dequantize both inputs, apply args.op in float, requantize and store.
 *
 * NOTE(review): the signature line is missing from this listing; the reference
 * index at the end of this page gives it as
 * `void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)`.
 */
134 template <typename InputScalarType, typename OutputScalarType>
136 {
137  const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
138  const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
139 
// Both inputs share the same de-interleaved layout, so parts are combined index by index.
140  const auto result = svcreate4(
141  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
142  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
143  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
144  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
145 
146  store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
147 }
148 
/**
 * One vector-length step of a quantized arithmetic operation where one operand
 * is a single broadcast value: dequantize the loaded input, pair it with the
 * splatted broadcast value, apply args.op in float, requantize and store.
 *
 * NOTE(review): the signature line is missing from this listing; the reference
 * index at the end of this page gives it as
 * `void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)`.
 */
149 template <typename InputScalarType, typename OutputScalarType>
151 {
152  const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
// broadcast_value is not dequantized here; the caller in this file passes it already dequantized.
153  const auto in2 = svcreate4(
154  svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
155 
// reorder swaps the operand order so the broadcast value becomes the first operand.
156  const auto &af = args.reorder ? in2 : in1;
157  const auto &bf = args.reorder ? in1 : in2;
158 
159  const auto result = svcreate4(
160  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op),
161  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op),
162  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op),
163  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op));
164 
165  store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
166 }
167 
/**
 * One vector-length step of a quantized comparison on two tensors: dequantize
 * both inputs, compare in float, then merge the four partial results with
 * svzip1 and store them directly (the result is not requantized).
 *
 * NOTE(review): the signature line is missing from this listing; the reference
 * index at the end of this page gives it as
 * `void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)`.
 */
168 template <typename InputScalarType, typename OutputScalarType>
170 {
171  const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
172  const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
173 
174  using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
175 
176  const auto result = svcreate4(
177  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
178  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
179  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
180  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
181 
// Two levels of svzip1 (0 with 1, 2 with 3, then the two halves) combine the four parts into one vector.
// NOTE(review): presumably this restores source element order given how
// elementwise_comparison_op lays out its output -- that helper is not visible here; confirm.
182  const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
183  const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
184  const auto zipped = svzip1(zipped_bottom, zipped_top);
185  svst1(pg, args.output_ptr, zipped);
186 }
187 
/**
 * One vector-length step of a quantized comparison where one operand is a
 * single broadcast value: dequantize the loaded input, pair it with the
 * splatted broadcast value, compare in float, then merge the four partial
 * results with svzip1 and store them directly (the result is not requantized).
 *
 * NOTE(review): the signature line is missing from this listing; the reference
 * index at the end of this page gives it as
 * `void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)`.
 */
188 template <typename InputScalarType, typename OutputScalarType>
190 {
191  const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
// broadcast_value is not dequantized here; the caller in this file passes it already dequantized.
192  const auto in2 = svcreate4(
193  svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
194 
// reorder swaps the operand order so the broadcast value becomes the first operand.
195  const auto &af = args.reorder ? in2 : in1;
196  const auto &bf = args.reorder ? in1 : in2;
197 
198  using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
199 
200  const auto result = svcreate4(
201  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op),
202  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op),
203  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op),
204  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op));
205 
// Two levels of svzip1 (0 with 1, 2 with 3, then the two halves) combine the four parts into one vector.
// NOTE(review): presumably this restores source element order given how
// elementwise_comparison_op lays out its output -- that helper is not visible here; confirm.
206  const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
207  const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
208  const auto zipped = svzip1(zipped_bottom, zipped_top);
209  svst1(pg, args.output_ptr, zipped);
210 }
211 
// Function-pointer alias for the non-broadcast loop body.
// NOTE(review): the alias line itself is missing from this listing; the reference
// index at the end of this page describes LoopQuantizedFuncType as
// void(*)(svbool_t, const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &).
212 template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
214 
// Function-pointer alias for the broadcast loop body.
// NOTE(review): the alias line itself is missing from this listing; the reference
// index at the end of this page describes BroadcastQuantizedLoopFuncType as
// void(*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &).
215 template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
217 
/**
 * Run a quantized elementwise operation over an execution window.
 *
 * The X dimension is collapsed on the execution window and walked manually
 * inside the per-row lambda, one SVE vector of InputScalarType at a time,
 * under a svwhilelt predicate. If the two inputs differ in X extent, the
 * broadcast path is taken (broadcast_func); otherwise the plain path (func).
 *
 * @param in1    First input tensor.
 * @param in2    Second input tensor.
 * @param out    Output tensor.
 * @param window Execution window.
 * @param op     Operator value forwarded to the loop body through the argument bundle.
 *
 * NOTE(review): the remaining parameters are missing from this listing; the
 * reference index at the end of this page gives them as
 * `LoopQuantizedFuncType<...> func, BroadcastQuantizedLoopFuncType<...> broadcast_func`.
 */
218 template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
219  typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type,
220  typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type>
221 void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
222  OperatorType op,
225 {
// All-true predicate, used only as the governing predicate of the loop-exit test.
226  const auto all_true_pg = wrapper::svptrue<InputScalarType>();
227 
228  // Create input windows
229  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
230  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
231 
232  // Clear X Dimension on execution window as we handle manually
233  Window win = window;
234  win.set(Window::DimX, Window::Dimension(0, 1, 1));
235 
236  const auto window_start_x = static_cast<int>(window.x().start());
237  const auto window_end_x = static_cast<int>(window.x().end());
238  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
239 
// The output scale is stored as its reciprocal so store_quantized can multiply instead of divide.
240  const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
241  const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
242 
243  if(is_broadcast_across_x)
244  {
// A zero X step on input 2's window marks input 2 as the broadcast operand; otherwise input 1 is.
245  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
246  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
247  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
248  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
249  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
250 
251  const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
252  const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
253 
254  const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
255  const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
256 
257  // Clear X Dimension on execution window as we handle manually
258  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
259 
260  Iterator broadcast_input(broadcast_tensor, broadcast_win);
261  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
262  Iterator output(out, win);
263 
264  execute_window_loop(win, [&](const Coordinates &)
265  {
266  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
267  const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
// Only the element at the iterator position is read from the broadcast tensor for this row.
268  const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
269 
270  int x = window_start_x;
271 
272  svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
// do/while: the body runs at least once, even if pg starts with no active lane.
273  do
274  {
// NOTE(review): the declarator line of this brace-initialised object (`args`, passed to
// broadcast_func below) is missing from this listing.
276  {
277  op,
278  non_broadcast_input_ptr + x,
// The broadcast scalar is dequantized here, once per step, with the broadcast tensor's own quantization info.
279  Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo),
280  output_ptr + x,
// reorder: when input 1 is the broadcast operand, the broadcast value must stay the first operand.
281  !is_broadcast_input_2,
282  non_broadcast_voffset, output_voffset,
283  non_broadcast_vscale, output_vscale
284  };
285  broadcast_func(pg, args);
286  x += wrapper::svcnt<InputScalarType>();
287  pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
288  }
289  while(svptest_any(all_true_pg, pg));
290  },
291  broadcast_input, non_broadcast_input, output);
292  }
293  else
294  {
295  // Clear X Dimension on execution window as we handle manually
296  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
297  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
298 
299  Iterator input1(in1, input1_win);
300  Iterator input2(in2, input2_win);
301  Iterator output(out, win);
302 
// Input quantization parameters are splatted once, outside the window loop.
303  const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
304  const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
305 
306  const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
307  const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
308 
309  execute_window_loop(win, [&](const Coordinates &)
310  {
311  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
312  const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
313  const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
314 
315  int x = window_start_x;
316 
317  svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
// do/while: the body runs at least once, even if pg starts with no active lane.
318  do
319  {
// NOTE(review): the declarator line of this brace-initialised object (`args`, passed to
// func below) is missing from this listing.
321  {
322  op,
323  input1_ptr + x,
324  input2_ptr + x,
325  output_ptr + x,
326  in1_voffset, in2_voffset, output_voffset,
327  in1_vscale, in2_vscale, output_vscale
328  };
329  func(pg, args);
330  x += wrapper::svcnt<InputScalarType>();
331  pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
332  }
333  while(svptest_any(all_true_pg, pg));
334  },
335  input1, input2, output);
336  }
337 }
338 
/**
 * Entry point for quantized arithmetic operations.
 *
 * Input and output share the same scalar type. The compile-time operation `op`
 * is passed on as a runtime argument, together with the arithmetic loop bodies
 * for the plain and the broadcast case.
 *
 * @tparam op         Arithmetic operation to apply.
 * @tparam ScalarType Quantized element type of both inputs and the output.
 *
 * @param in1    First input tensor.
 * @param in2    Second input tensor.
 * @param out    Output tensor.
 * @param window Execution window.
 */
339 template <ArithmeticOperation op, typename ScalarType>
340 void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
341 {
342  using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type;
343  elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
344  &arithmetic_op_quantized_loop<ScalarType, ScalarType>,
345  &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>);
346 }
347 
/**
 * Entry point for quantized comparison operations.
 *
 * The compile-time operation `op` is passed on as a runtime argument, together
 * with the comparison loop bodies for the plain and the broadcast case.
 *
 * @tparam op               Comparison operation to apply.
 * @tparam InputScalarType  Quantized element type of the inputs.
 * @tparam OutputScalarType Element type of the output (uint8_t by default);
 *                          must not be wider than InputScalarType.
 *
 * @param in1    First input tensor.
 * @param in2    Second input tensor.
 * @param out    Output tensor.
 * @param window Execution window.
 */
348 template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
349 void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
350 {
// Compile-time guard: the output element must not be wider than the input element.
351  static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
352  using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type;
353  using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
354  elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
355  &comparison_op_quantized_loop<InputScalarType, OutputScalarType>,
356  &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>);
357 }
358 } // namespace cpu
359 } // namespace arm_compute
360 
361 #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1083
float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
Definition: impl.h:482
const InputScalarType * input2_ptr
Definition: impl.h:39
constexpr int step() const
Return the step of the dimension.
Definition: Window.h:106
void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
Definition: impl.h:340
void(*)(svbool_t, const BroadcastQuantizedLoopArguments< InputScalarType, OutputScalarType, OperatorType > &) BroadcastQuantizedLoopFuncType
Definition: impl.h:216
void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
Definition: impl.h:349
const InputScalarType * input1_ptr
Definition: impl.h:38
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:79
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2022 Arm Limited.
const svfloat32_t & in2_scale
Definition: impl.h:46
T x() const
Alias to access the size of the first dimension.
Definition: Dimensions.h:87
void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments< InputScalarType, OutputScalarType, ComparisonOperation > &args)
Definition: impl.h:189
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OperatorType op, LoopQuantizedFuncType< InputScalarType, OutputScalarType, OperatorType > func, BroadcastQuantizedLoopFuncType< InputScalarType, OutputScalarType, OperatorType > broadcast_func)
Definition: impl.h:221
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments< InputScalarType, OutputScalarType, ComparisonOperation > &args)
Definition: impl.h:169
void(*)(svbool_t, const QuantizedLoopArguments< InputScalarType, OutputScalarType, OperatorType > &) LoopQuantizedFuncType
Definition: impl.h:213
Coordinates of an item.
Definition: Coordinates.h:37
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
const svfloat32_t & in1_scale
Definition: impl.h:45
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where shape is less equal to 1.
Definition: Window.inl:120
const svfloat32_t & out_scale
Definition: impl.h:47
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments< InputScalarType, OutputScalarType, ArithmeticOperation > &args)
Definition: impl.h:150
void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments< InputScalarType, OutputScalarType, ArithmeticOperation > &args)
Definition: impl.h:135
static float dequantize(QUANTIZED_TYPE value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 8-bit asymmetric quantization scheme.
OutputScalarType * output_ptr
Definition: impl.h:40
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
opencl::ClGemm OperatorType
Definition: CLGEMM.cpp:40
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:101
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:96
Describe a multidimensional execution window.
Definition: Window.h:39
void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
Definition: impl.h:512
Basic function to execute GEMM on OpenCL.
Definition: ClGemm.h:54
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:158