Compute Library
 23.05
GEMMLowp.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
#include "GEMMLowp.h"

#include "arm_compute/core/Types.h"
#include "tests/validation/reference/UtilsQuantizedAsymm.h"

#include "support/ToolchainSupport.h"

#include <limits>
33 namespace arm_compute
34 {
35 namespace test
36 {
37 namespace validation
38 {
39 namespace reference
40 {
41 namespace
42 {
43 template <typename T>
44 struct DataTypeExtractor
45 {
46  static DataType data_type()
47  {
49  if(std::is_same<T, int8_t>::value)
50  {
51  data_type = DataType::QASYMM8_SIGNED;
52  }
53  else if(std::is_same<T, uint8_t>::value)
54  {
55  data_type = DataType::QASYMM8;
56  }
57  else if(std::is_same<T, int16_t>::value)
58  {
59  data_type = DataType::QSYMM16;
60  }
61  return data_type;
62  }
63 };
64 
65 template <typename TIn, typename TOut>
66 void quantize_down_scale(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
67  std::vector<int32_t> result_shift, int32_t min, int32_t max)
68 {
69  const int cols_in = in->shape().x();
70  const bool is_per_channel = result_mult_int.size() > 1;
71 
72 #if defined(_OPENMP)
73  #pragma omp parallel for
74 #endif /* _OPENMP */
75  for(int i = 0; i < in->num_elements(); ++i)
76  {
77  int32_t result = ((*in)[i] + result_offset);
78 
79  if(bias != nullptr)
80  {
81  result += (*bias)[i % cols_in];
82  }
83 
84  result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];
85 
86  result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
87 
88  // Bounded ReLu
89  if(min != max)
90  {
91  result = std::max(min, std::min(max, result));
92  }
93 
94  (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
95  std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
96  }
97 }
98 
/** Quantize down an int32 accumulator tensor using fixed-point arithmetic:
 *  out = saturate(clamp(fixedpoint_mul(in [+ bias], multiplier, shift) + offset)).
 *
 *  A single multiplier/shift applies uniformly; more than one entry selects
 *  per-channel parameters indexed by column. min == max disables the clamp.
 */
template <typename TIn, typename TOut>
void quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
                                       std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
{
    // Row width: bias and per-channel parameters are indexed modulo this.
    const int  cols_in        = in->shape().x();
    const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;

#if defined(_OPENMP)
    #pragma omp parallel for
#endif /* _OPENMP */
    for(int i = 0; i < in->num_elements(); ++i)
    {
        TIn result = (*in)[i];

        // Optional per-column bias added before the fixed-point rescale
        if(bias != nullptr)
        {
            result += (*bias)[i % cols_in];
        }

        // Fixed point multiplication
        const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
        const int32_t shift      = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];

        if(shift < 0)
        {
            // Negative shift: scale up by 2^(-shift) BEFORE the doubling
            // high-mul — order matters, do not fold into one expression.
            result = asymm_int_mult(result * (1 << (-shift)), multiplier);
        }
        else
        {
            // Non-negative shift: high-mul first, then round-to-nearest
            // division by 2^shift.
            result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift);
        }
        // Output zero-point is applied after the shift.
        result += result_offset_after_shift;

        // Bounded ReLu
        if(min != max)
        {
            result = std::max(min, std::min(max, result));
        }

        // Saturate to the representable range of the output type.
        (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
                                                    std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
    }
}
142 
143 template <typename TIn, typename TOut>
144 void quantize_down_scale_by_float(const SimpleTensor<TIn> *in, const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<float_t> result_real_multiplier,
145  int32_t result_offset, int32_t min, int32_t max)
146 {
147  const int cols_in = in->shape().x();
148  const bool is_per_channel = result_real_multiplier.size() > 1;
149 
150 #if defined(_OPENMP)
151  #pragma omp parallel for
152 #endif /* _OPENMP */
153  for(int i = 0; i < in->num_elements(); ++i)
154  {
155  TIn result = (*in)[i];
156 
157  if(bias != nullptr)
158  {
159  result += (*bias)[i % cols_in];
160  }
161 
162  // Float multiplication
163  const float_t multiplier = (is_per_channel) ? result_real_multiplier[i % cols_in] : result_real_multiplier[0];
164 
165  float_t result_f = static_cast<float_t>(result) * multiplier + static_cast<float_t>(result_offset);
166  result = static_cast<TIn>(support::cpp11::round(result_f));
167 
168  // Bounded ReLu
169  if(min != max)
170  {
171  result = std::max(min, std::min(max, result));
172  }
173 
174  (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
175  std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
176  }
177 }
178 } // namespace
179 
180 template <typename T_out, typename T_in, typename T_in_1>
181 SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
182 {
183  static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
184 
185  DataType dt = std::is_same<T_out, int32_t>::value ? DataType::S32 : DataType::U32;
186  SimpleTensor<T_out> c(shape_c, dt);
187 
188  const int K = a.shape().x();
189  const int M = a.shape().y();
190  const int N = b.shape().x();
191  const int D = a.shape().z(); // Number of matrices in a batch
192 
193  const int a_stride_z = K * M;
194  // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
195  const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;
196  const int c_stride_z = N * M;
197 
198  std::vector<T_out> acc;
199  acc.resize(N);
200 
201  for(int depth = 0; depth < D; ++depth)
202  {
203  const int base_addr_a = depth * a_stride_z;
204  const int base_addr_b = depth * b_stride_z;
205  const int base_addr_c = depth * c_stride_z;
206 
207  for(int i = 0; i < M; ++i)
208  {
209  for(int j = 0; j < N; ++j)
210  {
211  acc[j] = 0;
212  }
213  for(int k = 0; k < K; ++k)
214  {
215  const T_out tmp_a = a_offset + static_cast<T_out>(a[base_addr_a + k + i * K]);
216  for(int j = 0; j < N; ++j)
217  {
218  const T_out tmp_b = b_offset + static_cast<T_out>(b[base_addr_b + j + k * N]);
219  const T_out mult_as_int = tmp_a * tmp_b;
220  acc[j] += mult_as_int;
221  }
222  }
223  for(int j = 0; j < N; ++j)
224  {
225  c[base_addr_c + j + i * N] = acc[j];
226  }
227  }
228  }
229 
230  return c;
231 }
232 
233 // used to validate assembly kernels which don't know anything about offsets
234 template <typename T1, typename T2, typename T3>
236 {
237  return gemmlowp_matrix_multiply_core<T1, T2, T3>(a, b, shape_c, 0, 0);
238 }
239 
240 template <typename TIn, typename TOut>
241 SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
242  int32_t min, int32_t max)
243 {
245 
246  quantize_down_scale<TIn, TOut>(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max);
247 
248  return dst;
249 }
250 
251 template <typename TIn, typename TOut>
252 SimpleTensor<TOut> gemmlowp_quantize_down_scale(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
253  std::vector<int32_t> result_shift, int32_t min, int32_t max)
254 {
256 
257  quantize_down_scale<TIn, TOut>(&in, &bias, &dst, result_offset, result_mult_int, result_shift, min, max);
258 
259  return dst;
260 }
261 
262 template <typename TIn, typename TOut>
263 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
264  int32_t result_offset_after_shift, int32_t min, int32_t max)
265 {
267 
268  quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, nullptr, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
269 
270  return dst;
271 }
272 
273 template <typename TIn, typename TOut>
274 SimpleTensor<TOut> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> &in, const SimpleTensor<TIn> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
275  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
276 {
278 
279  quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, &bias, &dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
280 
281  return dst;
282 }
283 
284 template <typename TIn, typename TOut>
286  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
287 {
289 
290  quantize_down_scale_by_float<TIn, TOut>(&in, &bias, &dst, result_real_multiplier, result_offset, min, max);
291 
292  return dst;
293 }
294 
295 template <typename TIn, typename TOut>
297  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
298 {
300 
301  quantize_down_scale_by_float<TIn, TOut>(&in, nullptr, &dst, result_real_multiplier, result_offset, min, max);
302 
303  return dst;
304 }
305 
307  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
309  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
311  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
313  std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
314 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
315  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
317  std::vector<int32_t> result_fixedpoint_multiplier,
318  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
319 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
320  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
322  std::vector<int32_t> result_fixedpoint_multiplier,
323  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
324 template SimpleTensor<int16_t> gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
325  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
327  std::vector<int32_t> result_fixedpoint_multiplier,
328  std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
329 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
330  std::vector<int32_t> result_shift, int32_t min, int32_t max);
331 template SimpleTensor<uint8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
332  std::vector<int32_t> result_shift, int32_t min, int32_t max);
333 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
334  std::vector<int32_t> result_shift, int32_t min, int32_t max);
335 template SimpleTensor<int8_t> gemmlowp_quantize_down_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
336  std::vector<int32_t> result_shift, int32_t min, int32_t max);
337 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
338 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
342 } // namespace reference
343 } // namespace validation
344 } // namespace test
345 } // namespace arm_compute
int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
Rounded to nearest division by a power-of-two.
SimpleTensor< T1 > gemmlowp(const SimpleTensor< T2 > &a, const SimpleTensor< T3 > &b, TensorShape shape_c)
Definition: GEMMLowp.cpp:235
Shape of a tensor.
Definition: TensorShape.h:39
quantized, symmetric fixed-point 16-bit number
template SimpleTensor< int32_t > gemmlowp< int32_t, int8_t, int8_t >(const SimpleTensor< int8_t > &a, const SimpleTensor< int8_t > &b, TensorShape shape_c)
template SimpleTensor< int32_t > gemmlowp< int32_t, uint8_t, int8_t >(const SimpleTensor< uint8_t > &a, const SimpleTensor< int8_t > &b, TensorShape shape_c)
SimpleTensor< float > b
Definition: DFT.cpp:157
unsigned int M
TensorShape shape() const override
Shape of the tensor.
Definition: SimpleTensor.h:329
decltype(strategy::transforms) typedef type
Copyright (c) 2017-2023 Arm Limited.
1 channel, 1 S32 per channel
int32_t asymm_int_mult(int32_t a, int32_t b)
Multiplication of two integers.
1 channel, 1 U32 per channel
quantized, asymmetric fixed-point 8-bit number unsigned
template SimpleTensor< int32_t > gemmlowp< int32_t, uint8_t, uint8_t >(const SimpleTensor< uint8_t > &a, const SimpleTensor< uint8_t > &b, TensorShape shape_c)
unsigned int N
int32_t quantize_down_scale_by_fixedpoint(int32_t val, int32_t result_mult_int, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
Quantize down the input value in range [min, max].
Simple tensor object that stores elements in a consecutive chunk of memory.
Definition: SimpleTensor.h:58
T round(T value)
Round floating-point value with half value rounding away from zero.
SimpleTensor< TOut > gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor< TIn > &in, std::vector< int32_t > result_fixedpoint_multiplier, std::vector< int32_t > result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
Definition: GEMMLowp.cpp:263
quantized, asymmetric fixed-point 8-bit number signed
DataType
Available data types.
Definition: Types.h:79
SimpleTensor< TOut > gemmlowp_quantize_down_scale_by_float(const SimpleTensor< TIn > &in, const SimpleTensor< TIn > &bias, std::vector< float_t > result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
Definition: GEMMLowp.cpp:285
SimpleTensor< T_out > gemmlowp_matrix_multiply_core(const SimpleTensor< T_in > &a, const SimpleTensor< T_in_1 > &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
Definition: GEMMLowp.cpp:181
SimpleTensor< TOut > gemmlowp_quantize_down_scale(const SimpleTensor< TIn > &in, int32_t result_offset, std::vector< int32_t > result_mult_int, std::vector< int32_t > result_shift, int32_t min, int32_t max)
Definition: GEMMLowp.cpp:241
const int32_t * bias
unsigned int K