Compute Library 21.02
NEGEMMLowpReductionKernel.cpp
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

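// This file implements the two reduction kernels used by the GEMMLowp (quantized GEMM)
// pipeline: NEGEMMLowpMatrixAReductionKernel sums every row of matrix A and
// NEGEMMLowpMatrixBReductionKernel sums every column of matrix B, producing S32
// vectors that the offset-contribution stage later scales by the opposite
// operand's quantization offset.
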
namespace arm_compute
{
namespace
{
Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
    }
    return Status{};
}
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
    }
    return Status{};
}
} // namespace

INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
    : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
{
}

void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
{
    // Perform validate step
    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
    _input         = mtx_a;
    _output        = vector_sum_row;
    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);

    Window win = calculate_max_window(*_output->info(), Steps(1));
    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));

    INEKernel::configure(win);
}

Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
    return Status{};
}

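// Matrix A reduction: for each row of the (optionally batched) input, run_internal
// accumulates the _k row elements into one S32 value, optionally multiplies it by
// _scalar, and writes it to the matching element of the output vector. The main
// loop consumes 16 elements per iteration with widening NEON adds
// (8-bit -> 16-bit -> 32-bit); a scalar tail loop handles the remainder.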
template <typename T>
void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

    Window win_input(collapsed_window);
    win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator in(_input, win_input);
    Iterator out(_output, collapsed_window);

    execute_window_loop(collapsed_window, [&](const Coordinates & id)
    {
        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
        TAcc sum_row  = 0;

        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */

        int i = 0;
        // This for loop performs 16 accumulations
        for(; i <= (_k - 16); i += 16)
        {
            const auto a0_d8 = wrapper::vloadq(matrix_a + i);

            // Partial accumulations in U16
            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));

            // Accumulate to U32
            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            sum_row += static_cast<TAcc>(matrix_a[i]);
        }

#if defined(__aarch64__)
        // Reduction operation available on 64 bit architectures only
        sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
        tmp      = wrapper::vpadd(tmp, tmp);

        sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_row *= _scalar;
        }

        *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
    },
    in, out);
}

void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_input->info()->data_type())
    {
        case DataType::QASYMM8:
            run_internal<uint8_t>(window);
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            run_internal<int8_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }
}

void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));

    _input         = mtx_b;
    _output        = vector_sum_col;
    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 16;

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);

    // Configure kernel window
    Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
    INEKernel::configure(win);
}

Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));

    return Status{};
}

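// Matrix B reduction: for each 16-column strip of the input, run_internal
// accumulates the _k rows into four S32x4 accumulators, optionally multiplies them
// by _scalar, and stores the 16 column sums. The strips are distributed across
// threads along the x dimension; a strip that runs past the matrix width falls back
// to the element-by-element store in the leftover path.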
template <typename T>
void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window     collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
    const auto vec_scalar       = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

    const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
    const auto in_b_stride    = static_cast<int>(_input->info()->strides_in_bytes()[1]);

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
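    // Illustrative split (assumed values, not from the library): with
    // width_matrix_b = 100 and num_threads = 2, thread 0 gets start 0, step 32,
    // end 128 (x = 0, 32, 64, 96) and thread 1 gets start 16, step 32, end 112
    // (x = 16, 48, 80). Every 16-column strip below the matrix width is visited
    // exactly once; the strip at x = 96 has only 4 valid columns and takes the
    // element-by-element store path below.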

    Window win_out(collapsed_window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

    Window win_in(win_out);
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator inb(_input, win_in);
    Iterator out(_output, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
        typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
        {
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
        };

        const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */

        int i = 0;
        // This for loop performs 4 accumulations
        for(; i <= (_k - 4); i += 4)
        {
            const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
            const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
            const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
            const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);

#if __arm__
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */

            // Partial accumulation in 16bit
            typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
            {
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
            };

            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));

            // Accumulate to 32bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));

            matrix_b += 4 * in_b_stride;
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

            // Convert S8 to S16
            const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
            {
                wrapper::vmovl(wrapper::vgetlow(b0_b8)),
                wrapper::vmovl(wrapper::vgethigh(b0_b8))
            };

            // Accumulate to 32bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

            matrix_b += in_b_stride;
        }

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
            sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
            sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
            sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
        }

        auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() + 16 < width_matrix_b)
        {
            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
        }
        else
        {
            auto left_over = width_matrix_b - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vector_sum_col + k * 4 + j) = sum_col[k][j];
                }
            }
        }
    },
    inb, out);
}

void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_input->info()->data_type())
    {
        case DataType::QASYMM8:
            run_internal<uint8_t>(window, info);
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            run_internal<int8_t>(window, info);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }
}
} // namespace arm_compute
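
For orientation, a minimal sketch of how the matrix A reduction kernel could be configured and scheduled follows. The tensor shapes, the scalar value of 2, the helper name reduce_rows_example, and the assumption that the internal header src/core/NEON/kernels/NEGEMMLowpReductionKernel.h is reachable on the include path are all illustrative choices; only GEMMLowpReductionKernelInfo, the configure()/run() interface shown above, and the standard NEScheduler call are taken from the library.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"

using namespace arm_compute;

// Hypothetical helper: reduce the rows of a 32x64 QASYMM8 matrix into an S32 vector.
void reduce_rows_example()
{
    // Source matrix: k = 64 columns (dimension 0), 32 rows (dimension 1)
    Tensor mtx_a;
    mtx_a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8));

    // Destination vector; configure() auto-initializes it to 32 S32 elements
    Tensor vector_sum_row;

    GEMMLowpReductionKernelInfo info;
    info.k             = 64;    // elements accumulated per row
    info.is_reshaped   = false; // reshaped inputs are rejected by configure()
    info.scalar        = 2;     // illustrative multiplier applied to each row sum
    info.mul_by_scalar = true;

    NEGEMMLowpMatrixAReductionKernel kernel;
    kernel.configure(&mtx_a, &vector_sum_row, info);

    mtx_a.allocator()->allocate();
    vector_sum_row.allocator()->allocate();

    // The window has one element per output row, so split the work along Y
    NEScheduler::get().schedule(&kernel, Window::DimY);
}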