Compute Library 22.11 - ClGemmLowpMatrixMultiplyCore.cpp

/*
 * Copyright (c) 2017-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClCastKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"

#include "utils/TypePrinter.h"

namespace arm_compute
{
namespace opencl
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;
using namespace arm_compute::opencl::kernels;
using namespace arm_compute::experimental;

namespace
{
inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
{
    switch(kernel_type)
    {
        case CLGEMMKernelType::NATIVE:
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
        {
            return true;
        }
        default:
        {
            return false;
        }
    }
}

// Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
{
    auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
    if(bool(gemm_kernel))
    {
        if(validate_gemm_kernel(gemm_kernel.gemm_type))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
            return gemm_kernel.gemm_type;
        }
    }
    gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
    return gemm_kernel.gemm_type;
}
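
// A minimal illustrative sketch of the fallback above (not part of the original
// file; the query values are hypothetical):
//
//   const auto_heuristics::CommonQuery query{ CLScheduler::get().target(), DataType::QASYMM8,
//                                             /* m */ 128, /* n */ 128, /* k */ 64, /* batch_size */ 1 };
//   const CLGEMMKernelType type = auto_select_gemm_kernel(query, /* reshape_b_only_on_first_run */ true);
//
// If the mlgo heuristics return no kernel type, or one that validate_gemm_kernel()
// rejects, the default heuristics decide instead, so 'type' is always one of
// NATIVE, RESHAPED_ONLY_RHS or RESHAPED_ONLY_RHS_MMUL.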

// Validate lhs_info and rhs_info for native kernel
inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
{
    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel
    TensorInfo mm_result_s32_info{};
    // Output tensor auto initialization if not yet initialized
    auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
    // Validate mm kernel
    // NOTE: Ignore all other parameters (e.g. output stage etc.) and only validate lhs and rhs info
    // NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
    // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
    if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
    {
        return false;
    }
    return true;
}

// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
{
    auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
    if(config)
    {
        if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
            return { config.lhs_info, config.rhs_info };
        }
    }
    config = auto_heuristics::select_default_gemm_config_native(query);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
    return { config.lhs_info, config.rhs_info };
}

// Validate lhs_info and rhs_info for reshaped only rhs kernel
inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
                                                    unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
{
    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the reshaped only rhs kernel
    TensorInfo tmp_b_info{};
    // Validate reshape RHS kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
    {
        return false;
    }
    // Validate mm kernel
    // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
    // NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_arguments).
    // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_and_configure_window).
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    // Since we ignore the output stage, the output data type has to be S32 to pass the validation
    TensorInfo output_info_copy(*output);
    output_info_copy.set_data_type(DataType::S32);
    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
    {
        return false;
    }
    return true;
}

// Validate lhs_info and rhs_info for reshaped only rhs MMUL kernel
inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
                                                         unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
{
    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the reshaped only rhs MMUL kernel
    TensorInfo tmp_b_info{};
    // Validate reshape RHS kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
    {
        return false;
    }
    // Validate mm kernel
    // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
    // NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_arguments).
    // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_and_configure_window).
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    // Since we ignore the output stage, the output data type has to be S32 to pass the validation
    TensorInfo output_info_copy(*output);
    output_info_copy.set_data_type(DataType::S32);
    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
    {
        return false;
    }
    return true;
}

// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
                                                                                          const ITensorInfo *a,
                                                                                          const ITensorInfo *b, const ITensorInfo *output)
{
    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
    if(config)
    {
        if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
            return { config.lhs_info, config.rhs_info };
        }
    }
    config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
    return { config.lhs_info, config.rhs_info };
}

// Select the default heuristics for reshaped only rhs MMUL kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
                                                                                               const ITensorInfo *a,
                                                                                               const ITensorInfo *b, const ITensorInfo *output)
{
    ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
    auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
                                              to_string(config.rhs_info).c_str());
    return { config.lhs_info, config.rhs_info };
}

inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
{
    switch(kernel_type)
    {
        case CLGEMMKernelType::NATIVE:
            return false;
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
            return true;
        default:
            ARM_COMPUTE_ERROR("Unsupported gemmlowp kernel!");
    }
}
} // namespace

ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
    : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
      _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
      _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
      _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
      _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
      _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
      _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
      _aux_mem(AuxTensorIdx::Count)
{
}

ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;

void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
                                             ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
                                             const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, gemm_info));
    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);

    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->quantization_info().uniform().offset;
    _convert_to_qasymm8          = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                                   && a->data_type() == DataType::QASYMM8;
    _b_offset  = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
    _gemm_info = gemm_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // in order to know how the matrices have been reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        _qasymm8_weights = *b;
        _qasymm8_weights.set_data_type(DataType::QASYMM8);
        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
    }

    ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
    {
        matrix_b = &_tmp_b;

        // Pick up the GEMM configuration
        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
                                                                                 depth_output_gemm3d,
                                                                                 a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }
    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
    {
        matrix_b = &_tmp_b;

        // Pick up the GEMM configuration
        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
                                                                                      depth_output_gemm3d,
                                                                                      a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info{};

    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        // Configure matrix B reduction kernel
        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

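    // Why the reductions above are needed: the kernels multiply the stored quantized
    // values, while the result must correspond to the offset-corrected product
    //
    //   dst(i, j) = sum_k (a(i, k) - a_offset) * (b(k, j) - b_offset)
    //             = sum_k a(i, k) * b(k, j)     // the raw S32 matrix multiply
    //             - b_offset * sum_k a(i, k)    // row sums of A (only needed when b_offset != 0)
    //             - a_offset * sum_k b(k, j)    // column sums of B (only needed when a_offset != 0)
    //             + k * a_offset * b_offset     // constant term
    //
    // The offset contribution kernels configured below add the last three terms to the
    // raw product, either standalone or fused with the output stage.
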
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
        _gemm_output_stage_shifts      = TensorInfo(TensorShape(num_filters), 1, DataType::S32);

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();
        if(num_filters == 1)
        {
            // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
            // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
            gemmlowp_output_stage.is_quantized_per_channel = false;
        }

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                         _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
            {
                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            // Note: not 'else if': when the kernel type is RESHAPED_ONLY_RHS, the else branch
            // below also runs and configures the offset contribution/output stage kernel needed
            // at run time (the native kernel it configures is never enqueued in that case)
            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
            {
                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                              a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);

                // Configure matrix multiply kernel
                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);

                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                                                    c, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
                                                                    &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
            }
        }
    }
    else
    {
        _run_offset_contribution = true;
        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                          a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);

            // Configure matrix multiply kernel
            _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                               c, a->dimension(0), _a_offset, _b_offset);
    }

    // Request memory
    _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        // If the GEMM is reshaped, the RHS goes through a two-step transformation (cast, then
        // reshape), so the QASYMM8 RHS buffer is only needed at prepare time: overwrite its lifetime
        _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
        _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    }
    if(_a_offset != 0)
    {
        _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
    }
    if(_b_offset != 0)
    {
        _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    }
    _aux_mem[ResultS32]   = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
    _aux_mem[Shifts]      = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
}
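
// A minimal usage sketch (not part of the original file; shapes, quantization
// parameters and the compile context are hypothetical): configure the operator
// once, then query the auxiliary memory it needs at run time.
//
//   ClGemmLowpMatrixMultiplyCore gemm;
//   TensorInfo a(TensorShape(64U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));  // K=64, M=32
//   TensorInfo b(TensorShape(16U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5));  // N=16, K=64
//   TensorInfo dst(TensorShape(16U, 32U), 1, DataType::S32);
//   gemm.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, nullptr, &dst, GEMMInfo());
//   for(const auto &mem : gemm.workspace())
//   {
//       // Allocate mem.size bytes with the requested lifetime and bind the buffer
//       // to slot mem.slot in the ITensorPack passed to run()/prepare().
//   }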

Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    TensorInfo weights_info(*b);
    const bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                                    && is_data_type_quantized_asymmetric(a->data_type());
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
        const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
        lhs_info       = res.lhs_info;
        rhs_info       = res.rhs_info;

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
                const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
                lhs_info       = res.lhs_info;
                rhs_info       = res.rhs_info;

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
            const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
            lhs_info       = res.lhs_info;
            rhs_info       = res.rhs_info;

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}
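
// A minimal usage sketch (not part of the original file; it reuses the hypothetical
// TensorInfo objects from the sketch after configure()): validate() mirrors
// configure() without side effects, so unsupported setups can be rejected up front.
//
//   const Status st = ClGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst, GEMMInfo());
//   if(!bool(st))
//   {
//       std::cerr << st.error_description() << std::endl;
//   }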

void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    const ITensor *a   = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *b   = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *c   = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);

    CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
    CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
    CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
    CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
    CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
    CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
    CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);

    // Prepare the consts if needed
    prepare(tensors);

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;

    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        matrix_b = tmp_b.get();
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            ITensorPack mtx_b_reshape_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        ITensorPack mtx_b_red_pack =
        {
            { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
            { TensorType::ACL_DST, vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        ITensorPack mtx_a_red_pack =
        {
            { TensorType::ACL_SRC, matrix_a },
            { TensorType::ACL_DST, vec_sum_row.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
    }

    // Run matrix multiply
    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        ITensorPack gemm_reshaped_pack;
        if(_run_offset_contribution)
        {
            gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
            });
        }
        else
        {
            gemm_reshaped_pack = ITensorPack(
            {
                { TensorType::ACL_SRC_0, matrix_a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_BIAS, c },
                { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
                { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
                { TensorType::ACL_SHIFTS, shifts.get() },
                { TensorType::ACL_MULTIPLIERS, multipliers.get() },
                { TensorType::ACL_DST, dst },
            });
        }
        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
        {
            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
        {
            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
        }
        else
        {
            ARM_COMPUTE_ERROR("Invalid reshaped kernel");
        }
    }
    else
    {
        ITensorPack gemm_native_pack =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b },
            { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
        };
        CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        ITensorPack output_stage_pack =
        {
            { TensorType::ACL_SRC, res32.get() },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
            { TensorType::ACL_SHIFTS, shifts.get() },
            { TensorType::ACL_MULTIPLIERS, multipliers.get() },
            { TensorType::ACL_DST, dst },
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        ITensorPack offset_contrib_pack =
        {
            { TensorType::ACL_SRC_DST, dst },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
    }
}
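
// A minimal usage sketch (not part of the original file; 'a_tensor', 'b_tensor'
// and 'dst_tensor' are hypothetical CLTensor objects allocated to match the infos
// given to configure(), and the auxiliary buffers from workspace() are assumed to
// be in the pack under their offset_int_vec() ids):
//
//   ITensorPack pack{ { ACL_SRC_0, &a_tensor }, { ACL_SRC_1, &b_tensor }, { ACL_DST, &dst_tensor } };
//   gemm.run(pack); // the first run() also triggers prepare(), e.g. the one-shot RHS reshape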

void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
        auto               b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
        CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
        CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);

        ARM_COMPUTE_ERROR_ON_NULLPTR(b);

        if(_convert_to_qasymm8)
        {
            ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
            CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
            b->mark_as_unused();
        }

        if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensorPack mtx_b_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
            b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensorPack mtx_b_red_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, vec_sum_col.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
        }

        // Compute GEMM output multipliers and shifts for output stage
        {
            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

            CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
            CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);

            ICLTensor *multiplier_tensor = multipliers.get();
            if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
            {
                multiplier_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
                multiplier_tensor->unmap(CLScheduler::get().queue());
            }

            ICLTensor *shifts_tensor = shifts.get();
            if(shifts_tensor != nullptr && shifts_tensor->info()->total_size() > 0)
            {
                shifts_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
                shifts_tensor->unmap(CLScheduler::get().queue());
            }
        }
        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}

experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
} // namespace opencl
} // namespace arm_compute