CpuGemmLowpMatrixReductionKernel.cpp (Compute Library 21.11)
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
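// These two kernels compute the auxiliary reduction vectors required by the GEMMLowp
// offset-contribution stage of a quantized GEMM:
//  - CpuGemmLowpMatrixAReductionKernel sums matrix A along its rows, producing one
//    int32 value per row of A.
//  - CpuGemmLowpMatrixBReductionKernel sums matrix B along its columns, producing one
//    int32 value per column of B.
// Each reduction can optionally be multiplied by GEMMLowpReductionKernelInfo::scalar
// (typically the quantization offset of the other operand) when mul_by_scalar is set.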
namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
    }
    return Status{};
}
Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);

    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
    }
    return Status{};
}
} // namespace

void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    // Perform validate step
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info));
    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    switch(src->data_type())
    {
        case DataType::QASYMM8:
            _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32);

    Window win = calculate_max_window(*dst, Steps(1));
    ICpuKernel::configure(win);
}

Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
    return Status{};
}

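// run_internal sums the _k elements of one row of A per output element:
//  - 16 8-bit values are loaded per iteration, widened and added pairwise (vaddl),
//    then pairwise-widened again (vpaddl) into a 4 x 32-bit accumulator;
//  - the tail (_k % 16 elements) is accumulated with a scalar loop;
//  - the vector accumulator is reduced horizontally (vaddv on AArch64, two vpadd
//    folds on 32-bit Arm) and added to the scalar tail sum.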
template <typename T>
void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);

    Window win_input(collapsed_window);
    win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator in(src, win_input);
    Iterator out(dst, collapsed_window);

    execute_window_loop(collapsed_window, [&](const Coordinates & id)
    {
        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
        TAcc sum_row  = 0;

        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */

        int i = 0;
        // This for loop performs 16 accumulations per iteration
        for(; i <= (_k - 16); i += 16)
        {
            const auto a0_d8 = wrapper::vloadq(matrix_a + i);

            // Partial accumulations in 16 bit
            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));

            // Accumulate to 32 bit
            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            sum_row += static_cast<TAcc>(matrix_a[i]);
        }

#if defined(__aarch64__)
        // Reduction operation available on 64-bit architectures only
        sum_row += wrapper::vaddv(vsum_row);
#else  // __aarch64__
        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
        tmp      = wrapper::vpadd(tmp, tmp);

        sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_row *= _scalar;
        }

        *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
    },
    in, out);
}

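// run_op dispatches to the data-type-specific run_internal instantiation selected in
// configure(); the scheduler provides the window slice this thread has to process.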
void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto dst = tensors.get_tensor(TensorType::ACL_DST);

    (this->*_func)(src, dst, window);
}

const char *CpuGemmLowpMatrixAReductionKernel::name() const
{
    return "CpuGemmLowpMatrixAReductionKernel";
}

void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));

    _k             = info.k;
    _scalar        = info.scalar;
    _mul_by_scalar = info.mul_by_scalar;

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 16;

    switch(src->data_type())
    {
        case DataType::QASYMM8:
            _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
            break;
        case DataType::QASYMM8_SIGNED:
        case DataType::QSYMM8:
        case DataType::QSYMM8_PER_CHANNEL:
            _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32);

    // Configure kernel window
    Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration));
    ICpuKernel::configure(win);
}

Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
    return Status{};
}

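// run_internal sums the _k rows of a block of 16 columns of B per window iteration:
//  - the main loop reads 4 rows at a time, accumulates them into two 8 x 16-bit partial
//    sums (safe, since at most four 8-bit values are added per lane), then widens the
//    partial sums into four 4 x 32-bit column accumulators;
//  - a leftover loop handles the remaining rows one at a time via vmovl widening;
//  - unlike the A reduction, the X dimension is partitioned across threads inside the
//    kernel itself, using the ThreadInfo passed to run_op.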
template <typename T>
void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
{
    // Intermediate and final accumulator types
    using TIAcc = wrapper::traits::promote_t<T>;
    using TAcc  = wrapper::traits::promote_t<TIAcc>;

    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
    const auto vec_scalar   = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});

    const auto width_matrix_b = static_cast<int>(src->info()->dimension(0));
    const auto in_b_stride    = static_cast<int>(src->info()->strides_in_bytes()[1]);

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

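    // Example of the per-thread partitioning above: with width_matrix_b = 40 and two
    // threads, thread 0 gets start = 0, step = 32, end = 64 and visits x = 0 and x = 32,
    // while thread 1 gets start = 16, step = 32, end = 48 and visits x = 16. The block at
    // x = 32 only holds 8 valid columns, which the partial store at the end handles.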
    Window win_out(collapsed_window);
    win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

    Window win_in(win_out);
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Iterator inb(src, win_in);
    Iterator out(dst, win_out);

    execute_window_loop(win_out, [&](const Coordinates & id)
    {
        if(id.x() > width_matrix_b)
        {
            return;
        }

        // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
        typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
        {
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
        };

        const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);

#if __arm__
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */

        int i = 0;
        // This for loop performs 4 accumulations per iteration
        for(; i <= (_k - 4); i += 4)
        {
            const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
            const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
            const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
            const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);

#if __arm__
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */

            // Partial accumulation in 16 bit
            typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
            {
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
            };

            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));

            // Accumulate to 32 bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));

            matrix_b += 4 * in_b_stride;
        }

        // This for loop performs the leftover accumulations
        for(; i < _k; ++i)
        {
            const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);

            // Convert the 8-bit values to 16 bit
            const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
            {
                wrapper::vmovl(wrapper::vgetlow(b0_b8)),
                wrapper::vmovl(wrapper::vgethigh(b0_b8))
            };

            // Accumulate to 32 bit
            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));

            matrix_b += in_b_stride;
        }

        // Multiply by scalar if necessary
        if(_mul_by_scalar)
        {
            sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
            sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
            sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
            sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
        }

        auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
        if(id.x() + 16 < width_matrix_b)
        {
            // Full block of 16 columns: store all four 4 x 32-bit accumulators
            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
        }
        else
        {
            // Partial block at the right edge: store the remaining columns lane by lane
            auto left_over = width_matrix_b - id.x();
            for(auto k = 0; k < 4 && left_over; ++k)
            {
                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
                {
                    *(vector_sum_col + k * 4 + j) = sum_col[k][j];
                }
            }
        }
    },
    inb, out);
}

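// Unlike the A reduction, run_op forwards the ThreadInfo to run_internal because the
// B reduction partitions the X dimension across threads itself rather than relying on
// the window slice alone.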
void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto dst = tensors.get_tensor(TensorType::ACL_DST);

    (this->*_func)(src, dst, window, info);
}

const char *CpuGemmLowpMatrixBReductionKernel::name() const
{
    return "CpuGemmLowpMatrixBReductionKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
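A minimal usage sketch for the A-reduction kernel, outside of its normal use inside the
quantized GEMM operator. The tensor shapes, the reduction parameters and the
single-threaded ThreadInfo below are illustrative assumptions, not taken from this file:

    // Row sums of a 32x64 QASYMM8 matrix (64 columns = K, 32 rows = M)
    TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::QASYMM8);
    TensorInfo sum_info(TensorShape(32U), 1, DataType::S32);

    GEMMLowpReductionKernelInfo red_info{};
    red_info.k             = 64;    // elements to accumulate per row
    red_info.scalar        = 0;
    red_info.mul_by_scalar = false;
    red_info.is_reshaped   = false;

    cpu::kernels::CpuGemmLowpMatrixAReductionKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(&a_info, &sum_info, red_info));
    kernel.configure(&a_info, &sum_info, red_info);

    // At run time the backing tensors are passed through an ITensorPack
    Tensor a{}, sums{};
    a.allocator()->init(a_info);
    sums.allocator()->init(sum_info);
    a.allocator()->allocate();
    sums.allocator()->allocate();

    ITensorPack pack{};
    pack.add_const_tensor(TensorType::ACL_SRC, &a);
    pack.add_tensor(TensorType::ACL_DST, &sums);

    ThreadInfo ti{};
    kernel.run_op(pack, kernel.window(), ti);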