ClGemmLowpMatrixMultiplyCore.cpp (Compute Library 22.05)
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Types.h"

#include "src/common/utils/Log.h"
#include "utils/TypePrinter.h"

namespace arm_compute
{
namespace opencl
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;
using namespace arm_compute::opencl::kernels;
using namespace arm_compute::experimental;

namespace
{
inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
{
    switch(kernel_type)
    {
        case CLGEMMKernelType::NATIVE:
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
        {
            return true;
        }
        default:
        {
            return false;
        }
    }
}

// Automatically select between mlgo (prioritized) and default heuristics for the gemm kernel type
inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
{
    auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
    if(bool(gemm_kernel))
    {
        if(validate_gemm_kernel(gemm_kernel.gemm_type))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
            return gemm_kernel.gemm_type;
        }
    }
    gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
    return gemm_kernel.gemm_type;
}
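
// The function above follows the mlgo-first selection pattern used throughout this file.
// A minimal sketch of the pattern (illustrative only; select_mlgo_xxx / select_default_xxx
// are placeholders, not library names):
//
//     auto candidate = select_mlgo_xxx(query);   // trained (mlgo) heuristics
//     if(candidate && validate(candidate))       // the mlgo result may be invalid for this shape
//     {
//         return candidate;
//     }
//     return select_default_xxx(query);          // static defaults, assumed always valid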

// Validate lhs_info and rhs_info for the native kernel
inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
{
    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel
    TensorInfo mm_result_s32_info{};
    // Output tensor auto initialization if not yet initialized
    auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
    // Validate mm kernel
    // NOTE: Ignore all other parameters (e.g. output stage etc.) and only validate lhs and rhs info
    // NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
    // 2. lhs and rhs info do not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
    if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
    {
        return false;
    }
    return true;
}

// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
{
    auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
    if(config)
    {
        if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
            return { config.lhs_info, config.rhs_info };
        }
    }
    config = auto_heuristics::select_default_gemm_config_native(query);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
    return { config.lhs_info, config.rhs_info };
}

// Validate lhs_info and rhs_info for the reshaped-only-rhs kernel
inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
                                                    unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
{
    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the reshaped-only-rhs kernel
    TensorInfo tmp_b_info{};
    // Validate reshape RHS kernel
    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
    {
        return false;
    }
    // Validate mm kernel
    // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
    // NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
    // 2. lhs and rhs info do not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    // Since we ignore the output stage, the output data type has to be S32 to pass the validation
    TensorInfo output_info_copy(*output);
    output_info_copy.set_data_type(DataType::S32);
    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
    {
        return false;
    }
    return true;
}

// Automatically select between mlgo (prioritized) and default heuristics for reshaped-only-rhs kernel configs
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
                                                                                          const ITensorInfo *a,
                                                                                          const ITensorInfo *b, const ITensorInfo *output)
{
    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
    if(config)
    {
        if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
        {
            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
            return { config.lhs_info, config.rhs_info };
        }
    }
    config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
    return { config.lhs_info, config.rhs_info };
}

inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
{
    switch(kernel_type)
    {
        case CLGEMMKernelType::NATIVE:
            return false;
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
            return true;
        default:
            ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
    }
}
} // namespace
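
// NOTE: this operator only ever dispatches one of two matrix multiply kernels:
// ClGemmLowpMatrixMultiplyNativeKernel (operands used as-is) or
// ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel (RHS first reshaped by ClGemmReshapeRhsMatrixKernel).
// The helpers above therefore only need to handle these two kernel types.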

ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
    : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
      _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
      _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
      _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
      _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
      _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
      _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
      _aux_mem(AuxTensorIdx::Count)
{
}

ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;

void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
                                             ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
                                             const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);

    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->quantization_info().uniform().offset;
    _convert_to_qasymm8          = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                                   && a->data_type() == DataType::QASYMM8;
    _b_offset  = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
    _gemm_info = gemm_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // in order to know how the matrices have been reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
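
    // Worked example (illustrative): for an input a with dimensions (d0, d1, d2, d3) = (16, 8, 4, 2),
    // reinterpret_input_as_3d == true gives m = d1 * d2 = 32 and batch_size = d3 = 2, while
    // reinterpret_input_as_3d == false gives m = d1 = 8 and batch_size = d2 = 4; k = d0 = 16 either way.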

    // Check if we need to reshape the matrix A and matrix B
    _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        _qasymm8_weights = *b;
        _qasymm8_weights.set_data_type(DataType::QASYMM8);
        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
    }
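
    // NOTE: per-channel symmetric weights (QSYMM8 / QSYMM8_PER_CHANNEL) are cast to QASYMM8 so the
    // lowp kernels, which consume asymmetric 8-bit data, can use them; the fixed _b_offset of -128
    // set above accounts for the signed-to-unsigned shift introduced by the cast.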

    ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        // Pick up the GEMM configuration
        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
                                                                                 depth_output_gemm3d,
                                                                                 a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info{};

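    // Background (illustrative): with zero points za and zb, the quantized product expands as
    //   sum_k (qa - za) * (qb - zb) = sum_k qa * qb - za * sum_k qb - zb * sum_k qa + k * za * zb
    // so correcting the raw S32 product needs only per-column sums of B (scaled by the A offset)
    // and per-row sums of A (scaled by the B offset). This is why the B reduction below is built
    // only when _a_offset != 0 and the A reduction only when _b_offset != 0.
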
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        // Configure matrix B reduction kernel
        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
        _gemm_output_stage_shifts      = TensorInfo(TensorShape(num_filters), 1, DataType::S32);

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();
        if(num_filters == 1)
        {
            // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
            // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
            gemmlowp_output_stage.is_quantized_per_channel = false;
        }

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            if(_is_gemm_reshaped)
            {
                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                              a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);

                // Configure matrix multiply kernel
                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);

                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                                                    c, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
                                                                    &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
            }
        }
    }
    else
    {
        _run_offset_contribution = true;
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                          a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);

            // Configure matrix multiply kernel
            _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                               c, a->dimension(0), _a_offset, _b_offset);
    }

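    // Memory lifetimes used below (summary): Persistent buffers survive across run() calls,
    // Temporary buffers only live for a single run(), and Prepare buffers are only needed while
    // prepare() executes (e.g. the cast weights when a reshape follows).
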
    // Request memory
    _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
    if(_is_gemm_reshaped)
    {
        // If the GEMM is reshaped there is a two-step transformation (cast, then reshape), so the
        // converted weights are only needed during prepare(): overwrite their lifetime accordingly
        _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
        _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    }
    if(_a_offset != 0)
    {
        _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
    }
    if(_b_offset != 0)
    {
        _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    }
    _aux_mem[ResultS32]   = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
    _aux_mem[Shifts]      = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
}
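
// The configuration above results in the following run()-time kernel chain (each step optional,
// depending on offsets, output stage and the selected kernel type):
//   weight cast (in prepare) -> RHS reshape -> matrix A/B reductions ->
//   matrix multiply (native or reshaped-only-RHS, optionally with fused offset contribution and
//   output stage) -> standalone offset contribution, or offset contribution plus output stage.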

Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    TensorInfo weights_info(*b);
    const bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                                    && is_data_type_quantized_asymmetric(a->data_type());
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
        const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
        lhs_info       = res.lhs_info;
        rhs_info       = res.rhs_info;

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
                const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
                lhs_info       = res.lhs_info;
                rhs_info       = res.rhs_info;

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
            const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
            lhs_info       = res.lhs_info;
            rhs_info       = res.rhs_info;

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}

void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    const ITensor *a   = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *b   = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *c   = tensors.get_const_tensor(ACL_SRC_2);
    ITensor       *dst = tensors.get_tensor(ACL_DST);

    CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
    CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
    CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
    CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
    CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
    CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
    CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);
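
    // Each CLAuxTensorHandler above resolves one auxiliary workspace requested in configure()
    // (identified by its offset_int_vec id in _aux_mem) from the memory carried in the tensor pack.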

    // Prepare the consts if needed
    prepare(tensors);

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;

    if(_is_gemm_reshaped)
    {
        matrix_b = tmp_b.get();
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            ITensorPack mtx_b_reshape_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        ITensorPack mtx_b_red_pack =
        {
            { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
            { TensorType::ACL_DST, vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
    }
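
    // NOTE: when _reshape_b_only_on_first_run is true, the RHS reshape and the matrix B reduction
    // are instead executed once in prepare(), since their results depend only on the weights.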

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        ITensorPack mtx_a_red_pack =
        {
            { TensorType::ACL_SRC, matrix_a },
            { TensorType::ACL_DST, vec_sum_row.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
    }

    // Run matrix multiply
    if(_is_gemm_reshaped)
    {
        ITensorPack gemm_reshaped_pack;
        if(_run_offset_contribution)
        {
            gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
                                               { TensorType::ACL_SRC_1, matrix_b },
                                               { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
                                             });
        }
        else
        {
            gemm_reshaped_pack = ITensorPack(
            {
                { TensorType::ACL_SRC, matrix_a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_BIAS, c },
                { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
                { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
                { TensorType::ACL_SHIFTS, shifts.get() },
                { TensorType::ACL_MULTIPLIERS, multipliers.get() },
                { TensorType::ACL_DST, dst },
            });
        }
        CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
    }
    else
    {
        ITensorPack gemm_native_pack =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b },
            { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
        };
        CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        ITensorPack output_stage_pack =
        {
            { TensorType::ACL_SRC, res32.get() },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
            { TensorType::ACL_SHIFTS, shifts.get() },
            { TensorType::ACL_MULTIPLIERS, multipliers.get() },
            { TensorType::ACL_DST, dst },
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        ITensorPack offset_contrib_pack =
        {
            { TensorType::ACL_SRC_DST, dst },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
    }
}

void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
        auto               b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
        CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
        CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);

        if(_convert_to_qasymm8)
        {
            ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
            CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
            b->mark_as_unused();
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensorPack mtx_b_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
            b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensorPack mtx_b_red_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, vec_sum_col.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
        }

        // Compute GEMM output multipliers and shifts for output stage
        {
            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

            CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
            CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);

            ICLTensor *multiplier_tensor = multipliers.get();
            if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
            {
                multiplier_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
                multiplier_tensor->unmap(CLScheduler::get().queue());
            }

            ICLTensor *shifts_tensor = shifts.get();
            if(shifts_tensor != nullptr && shifts_tensor->info()->total_size() > 0)
            {
                shifts_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
                shifts_tensor->unmap(CLScheduler::get().queue());
            }
        }
        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}

experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
} // namespace opencl
} // namespace arm_compute
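
// Usage sketch (illustrative only, not part of this file): driving the operator directly with
// caller-managed CL tensors. The tensor names `a`, `b`, `dst` and the default GEMMInfo are
// assumptions; real callers must also provide the workspace tensors reported by workspace().
//
//     opencl::ClGemmLowpMatrixMultiplyCore gemmlowp;
//     GEMMInfo gemm_info{};
//     ARM_COMPUTE_ERROR_THROW_ON(opencl::ClGemmLowpMatrixMultiplyCore::validate(a.info(), b.info(), nullptr, dst.info(), gemm_info));
//     gemmlowp.configure(CLKernelLibrary::get().get_compile_context(), a.info(), b.info(), nullptr, dst.info(), gemm_info);
//
//     ITensorPack pack{ { ACL_SRC_0, &a }, { ACL_SRC_1, &b }, { ACL_DST, &dst } };
//     gemmlowp.prepare(pack); // one-off weight transformations (cast / reshape / reductions)
//     gemmlowp.run(pack);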