Compute Library 21.02
NEDequantizationLayerKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
28 #include "arm_compute/core/Utils.h"
32 #include "src/core/CPP/Validate.h"
33 #include "src/core/NEON/NEAsymm.h"
34 #include "src/core/NEON/NESymm.h"
38 
39 #include <arm_neon.h>
40 
41 namespace arm_compute
42 {
43 namespace
44 {
45 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
46 {
49 
50  if(output->tensor_shape().total_size() > 0)
51  {
55  }
56 
57  return Status{};
58 }
59 
60 std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
61 {
62  // Configure kernel window
63  Window win = calculate_max_window(*input, Steps());
64 
65  // Output tensor auto initialization if not yet initialized
66  auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
67 
68  // NEDequantizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
69  Coordinates coord;
70  coord.set_num_dimensions(output->num_dimensions());
71  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
72 
73  return std::make_tuple(Status{}, win);
74 }
75 
// Fallback overload for output types without a dedicated specialization:
// deliberately a no-op (only the float / float16_t specializations below
// actually store data).
template <typename T>
inline void store_result(T *ptr, const float32x4x4_t &v)
{
    ARM_COMPUTE_UNUSED(ptr, v);
}
81 
// Store 16 dequantized float values (four quad-float lanes) contiguously.
template <>
inline void store_result<float>(float *ptr, const float32x4x4_t &v)
{
    for(unsigned int lane = 0; lane < 4; ++lane)
    {
        wrapper::vstore(ptr + 4 * lane, v.val[lane]);
    }
}
90 
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Narrow 16 f32 values to f16 (two octets) and store them contiguously.
template <>
inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
{
    const float16x8_t lo_half = vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]));
    const float16x8_t hi_half = vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]));
    wrapper::vstore(ptr, lo_half);
    wrapper::vstore(ptr + 8, hi_half);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
99 
// Fallback overload (8-element variant) for output types without a dedicated
// specialization: deliberately a no-op.
template <typename T>
inline void store_result(T *ptr, const float32x4x2_t &v)
{
    ARM_COMPUTE_UNUSED(ptr, v);
}
105 
// Store 8 dequantized float values (two quad-float lanes) contiguously.
template <>
inline void store_result<float>(float *ptr, const float32x4x2_t &v)
{
    for(unsigned int lane = 0; lane < 2; ++lane)
    {
        wrapper::vstore(ptr + 4 * lane, v.val[lane]);
    }
}
112 
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Narrow 8 f32 values down to a single vector of 8 f16 values and store it.
template <>
inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
{
    const float16x8_t half = vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]));
    wrapper::vstore(ptr, half);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
120 
/** Dequantize an asymmetric 8-bit quantized tensor (uniform, per-layer scale/offset).
 *
 * @tparam TOut Output float type (float or float16_t).
 * @tparam TIn  Input quantized type (uint8_t for QASYMM8, int8_t for QASYMM8_SIGNED).
 */
template <typename TOut, typename TIn>
void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
{
    // Per-layer quantization parameters of the input tensor
    const UniformQuantizationInfo &qinfo  = input->info()->quantization_info().uniform();
    const float                    scale  = qinfo.scale;
    const int32_t                  offset = qinfo.offset;

    const int  step    = 16; // elements processed per vector iteration
    const auto x_begin = static_cast<int>(window.x().start());
    const auto x_end   = static_cast<int>(window.x().end());

    // Collapse window and reset first dimension to handle tail calculations manually
    Window collapsed = window.collapse_if_possible(window, Window::DimZ);
    collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator src_it(input, collapsed);
    Iterator dst_it(output, collapsed);

    execute_window_loop(collapsed, [&](const Coordinates &)
    {
        const auto src = reinterpret_cast<const TIn *>(src_it.ptr());
        const auto dst = reinterpret_cast<TOut *>(dst_it.ptr());

        int i = x_begin;
        // Vectorized body: 16 elements per iteration
        for(; i <= (x_end - step); i += step)
        {
            store_result(reinterpret_cast<TOut *>(dst + i), vdequantize(wrapper::vloadq(src + i), scale, offset));
        }
        // Scalar tail
        for(; i < x_end; ++i)
        {
            dst[i] = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(src[i], qinfo));
        }
    },
    src_it, dst_it);
}
163 
/** Dequantize a symmetric per-channel 8-bit quantized tensor in NCHW layout.
 *
 * One scale per feature map, selected by the window's Z coordinate.
 *
 * @tparam T Output float type (float or float16_t).
 */
template <typename T>
void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
{
    const auto scale = input->info()->quantization_info().scale();

    const int  step    = 16;
    const auto x_begin = static_cast<int>(window.x().start());
    const auto x_end   = static_cast<int>(window.x().end());

    // Reset first dimension to handle tail calculations manually
    Window win(window);
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator src_it(input, win);
    Iterator dst_it(output, win);

    execute_window_loop(win, [&](const Coordinates &id)
    {
        const auto src = reinterpret_cast<const int8_t *>(src_it.ptr());
        const auto dst = reinterpret_cast<T *>(dst_it.ptr());
        // In NCHW the channel index is the Z coordinate; constant for this row
        const float channel_scale = scale[id.z()];

        int i = x_begin;
        for(; i <= (x_end - step); i += step)
        {
            store_result<T>(reinterpret_cast<T *>(dst + i), vdequantize(wrapper::vloadq(src + i), channel_scale));
        }
        // Scalar tail
        for(; i < x_end; ++i)
        {
            dst[i] = static_cast<T>(dequantize(src[i], channel_scale));
        }
    },
    src_it, dst_it);
}
204 
/** Dequantize a symmetric per-channel 8-bit quantized tensor in NHWC layout.
 *
 * Channels run along the X dimension, so every element in the vector needs
 * its own scale.
 *
 * @tparam T Output float type (float or float16_t).
 */
template <typename T>
void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
{
    const auto scale = input->info()->quantization_info().scale();

    const int  step    = 16;
    const auto x_begin = static_cast<int>(window.x().start());
    const auto x_end   = static_cast<int>(window.x().end());

    // Reset first dimension to handle tail calculations manually
    Window win(window);
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator src_it(input, win);
    Iterator dst_it(output, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto src = reinterpret_cast<const int8_t *>(src_it.ptr());
        const auto dst = reinterpret_cast<T *>(dst_it.ptr());

        int i = x_begin;
        for(; i <= (x_end - step); i += step)
        {
            // Gather the 16 per-channel scales matching this vector of inputs
            const float32x4x4_t vscale =
            {
                {
                    scale[i + 0], scale[i + 1], scale[i + 2], scale[i + 3],
                    scale[i + 4], scale[i + 5], scale[i + 6], scale[i + 7],
                    scale[i + 8], scale[i + 9], scale[i + 10], scale[i + 11],
                    scale[i + 12], scale[i + 13], scale[i + 14], scale[i + 15]
                }
            };
            store_result<T>(reinterpret_cast<T *>(dst + i), vdequantize(wrapper::vloadq(src + i), vscale));
        }
        // Scalar tail
        for(; i < x_end; ++i)
        {
            dst[i] = static_cast<T>(dequantize(src[i], scale[i]));
        }
    },
    src_it, dst_it);
}
254 
/** Dequantize a symmetric 8-bit quantized tensor with a single per-layer scale.
 *
 * @tparam T Output float type (float or float16_t).
 */
template <typename T>
void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
{
    // Symmetric quantization: only a scale, no offset
    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
    const float                    scale = qinfo.scale;

    const int  step    = 16;
    const auto x_begin = static_cast<int>(window.x().start());
    const auto x_end   = static_cast<int>(window.x().end());

    // Collapse window and reset first dimension to handle tail calculations manually
    Window collapsed = window.collapse_if_possible(window, Window::DimZ);
    collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator src_it(input, collapsed);
    Iterator dst_it(output, collapsed);

    execute_window_loop(collapsed, [&](const Coordinates &)
    {
        const auto src = reinterpret_cast<const int8_t *>(src_it.ptr());
        const auto dst = reinterpret_cast<T *>(dst_it.ptr());

        int i = x_begin;
        for(; i <= (x_end - step); i += step)
        {
            store_result<T>(reinterpret_cast<T *>(dst + i), vdequantize(wrapper::vloadq(src + i), scale));
        }
        // Scalar tail
        for(; i < x_end; ++i)
        {
            dst[i] = static_cast<T>(dequantize(src[i], scale));
        }
    },
    src_it, dst_it);
}
296 
/** Dequantize a symmetric 16-bit quantized tensor with a single per-layer scale.
 *
 * @tparam T Output float type (float or float16_t).
 */
template <typename T>
void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
{
    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
    const float                    scale = qinfo.scale;

    const int  step    = 8; // 8 x int16 per 128-bit vector
    const auto x_begin = static_cast<int>(window.x().start());
    const auto x_end   = static_cast<int>(window.x().end());

    // Collapse window and reset first dimension to handle tail calculations manually
    Window collapsed = window.collapse_if_possible(window, Window::DimZ);
    collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator src_it(input, collapsed);
    Iterator dst_it(output, collapsed);

    execute_window_loop(collapsed, [&](const Coordinates &)
    {
        const auto src = reinterpret_cast<const int16_t *>(src_it.ptr());
        const auto dst = reinterpret_cast<T *>(dst_it.ptr());

        int i = x_begin;
        for(; i <= (x_end - step); i += step)
        {
            store_result<T>(reinterpret_cast<T *>(dst + i), vdequantize_int16(wrapper::vloadq(src + i), scale));
        }
        // Scalar tail
        for(; i < x_end; ++i)
        {
            dst[i] = static_cast<T>(dequantize_qsymm16(src[i], scale));
        }
    },
    src_it, dst_it);
}
338 
339 template <typename T>
340 void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
341 {
342  switch(input->info()->data_type())
343  {
344  case DataType::QASYMM8:
345  run_dequantization_qasymm8<T, uint8_t>(input, output, window);
346  break;
348  run_dequantization_qasymm8<T, int8_t>(input, output, window);
349  break;
351  input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
352  break;
353  case DataType::QSYMM8:
354  run_dequantization_qsymm8<T>(input, output, window);
355  break;
356  case DataType::QSYMM16:
357  run_dequantization_qsymm16<T>(input, output, window);
358  break;
359  default:
360  ARM_COMPUTE_ERROR("Unsupported data type.");
361  }
362 }
363 } // namespace
364 
366  : _input(nullptr), _output(nullptr)
367 {
368 }
369 
371 {
372  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
374 
375  _input = input;
376  _output = output;
377 
378  // Configure kernel window
379  auto win_config = validate_and_configure_window(input->info(), output->info());
380 
381  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
382 
383  INEKernel::configure(std::get<1>(win_config));
384 }
385 
387 {
389  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
390  return Status{};
391 }
392 
394 {
395  ARM_COMPUTE_UNUSED(info);
398 
399  switch(_output->info()->data_type())
400  {
401  case DataType::F32:
402  run_dequantization_core<float>(_input, _output, window);
403  break;
404 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
405  case DataType::F16:
406  run_dequantization_core<float16_t>(_input, _output, window);
407  break;
408 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
409  default:
410  ARM_COMPUTE_ERROR("Unsupported data type.");
411  }
412 }
413 } // namespace arm_compute
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:846
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
void configure(const ITensor *input, ITensor *output)
Set input, output tensors.
quantized, symmetric fixed-point 16-bit number
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
Dequantize a neon vector holding 8 quantized values.
Definition: NEAsymm.h:415
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
Dequantize a neon vector holding 8 16-bit quantized values.
Definition: NESymm.h:135
1 channel, 1 F32 per channel
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Status class.
Definition: Error.h:52
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
static Status validate(const ITensorInfo *input, const ITensorInfo *output)
Static function to check if given info will lead to a valid configuration of NEDequantizationLayerKer...
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
float dequantize_qsymm16(int16_t value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 16-bit symmetric quantization scheme.
quantized, asymmetric fixed-point 8-bit number unsigned
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
quantized, symmetric fixed-point 8-bit number
quantized, symmetric per channel fixed-point 8-bit number
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
const QuantizationInfo qinfo
Definition: Im2Col.cpp:155
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
static float dequantize(QUANTIZED_TYPE value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
float dequantize(uint8_t value, float scale, int32_t offset)
Dequantize a value given an 8-bit asymmetric quantization scheme.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205