Compute Library
 21.02
CLGEMM.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 #include "arm_compute/core/Error.h"
32 #include "arm_compute/core/Log.h"
34 #include "arm_compute/core/Types.h"
35 #include "arm_compute/core/Utils.h"
49 #include "support/Cast.h"
50 #include "utils/TypePrinter.h"
51 
52 namespace arm_compute
53 {
55 using namespace arm_compute::cl_gemm;
56 using namespace arm_compute::utils::cast;
57 
58 namespace weights_transformations
59 {
// Managed wrapper around CLGEMMReshapeRHSMatrixKernel: lets an IWeightsManager own
// and reuse the reshaped RHS matrix (_output) across runs instead of recomputing it.
60 CLGEMMReshapeRHSMatrixKernelManaged::CLGEMMReshapeRHSMatrixKernelManaged()
61  : _kernel(std::make_unique<CLGEMMReshapeRHSMatrixKernel>())
62 {
63 }
64 
66 
// NOTE(review): the extraction dropped the signature line here — presumably
// void CLGEMMReshapeRHSMatrixKernelManaged::run(); confirm against the original file.
// Allocates the managed output then enqueues the reshape kernel without blocking.
68 {
69  _output.allocator()->allocate();
70  CLScheduler::get().enqueue(*_kernel, false);
71  _reshape_run = true;
72 }
73 
// NOTE(review): signature dropped by extraction — presumably release():
// frees the managed output buffer.
75 {
76  _output.allocator()->free();
77 }
78 
// NOTE(review): signature dropped by extraction — presumably get_weights():
// exposes the reshaped RHS tensor to the weights manager.
80 {
81  return &_output;
82 }
83 
// NOTE(review): signature dropped by extraction — presumably uid():
// identifier for this weights transformation.
85 {
86  return _uid;
87 }
88 
// NOTE(review): signature dropped by extraction — presumably configure(input, info):
// forwards to the compile-context overload using the library's default compile context.
90 {
91  configure(CLKernelLibrary::get().get_compile_context(), input, info);
92 }
93 
// NOTE(review): signature dropped by extraction — presumably
// configure(compile_context, input, info): sets up the underlying reshape kernel
// to write into the managed _output tensor.
95 {
96  _kernel->configure(compile_context, input, &_output, info);
97 }
98 } // namespace weights_transformations
99 
100 namespace
101 {
// Returns true when kernel_type is a GEMM kernel type that CLGEMM can dispatch;
// used to vet kernel types suggested by the MLGO heuristics before trusting them.
102 inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
103 {
104  switch(kernel_type)
105  {
// NOTE(review): the case labels (original lines 106-109) were lost in extraction —
// they should enumerate the supported CLGEMMKernelType values (the four types
// dispatched by CLGEMM::configure below). Confirm against the original file.
110  {
111  return true;
112  }
113  default:
114  {
115  return false;
116  }
117  }
118 }
119 //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
120 inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
121 {
122  auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
123  if(bool(gemm_kernel))
124  {
125  if(validate_gemm_kernel(gemm_kernel.gemm_type))
126  {
127  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
128  return gemm_kernel.gemm_type;
129  }
130  }
131  gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
132  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
133  return gemm_kernel.gemm_type;
134 }
135 // Validate lhs_info and rhs_info for reshaped only rhs kernel
136 inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
137  const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
138 {
139  // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
140  TensorInfo tmp_b_info{};
141  // Validate reshape RHS kernel
142  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
143  if(!bool(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
144  {
145  return false;
146  }
147  // Validate mm kernel
148  gemm_kernel_info.lhs_info = lhs_info;
149  gemm_kernel_info.rhs_info = rhs_info;
150  gemm_kernel_info.has_pad_y = false;
151  if(!bool(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
152  {
153  return false;
154  }
155  gemm_kernel_info.has_pad_y = true;
156  if(!bool(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
157  {
158  return false;
159  }
160  return true;
161 }
162 
163 //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
// Pick the (LHS, RHS) reshape configuration for the reshaped-only-RHS kernel:
// the MLGO-suggested config is used when it validates, otherwise the default
// heuristics config is returned.
164 inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
165  const ITensorInfo *b,
166  const ITensorInfo *c, const ITensorInfo *output)
167 {
// NOTE(review): original line 168 was dropped by extraction — presumably the
// declaration of `config` from auto_heuristics (MLGO reshaped-only-RHS config
// selection). Confirm against the original file.
169  if(config)
170  {
171  if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
172  {
173  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
174  return { config.lhs_info, config.rhs_info };
175  }
176  }
// NOTE(review): original line 177 was dropped by extraction — presumably the
// reassignment of `config` from the default heuristics, which the log line and
// return below rely on. Confirm against the original file.
178  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
179  return { config.lhs_info, config.rhs_info };
180 }
181 
182 // Validate lhs_info and rhs_info for reshaped kernel
183 inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
184  const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
185 {
186  // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
187  TensorInfo tmp_a_info{};
188  TensorInfo tmp_b_info{};
189 
190  // Validate reshape LHS kernel
191  auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
192  if(!bool(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
193  {
194  return false;
195  }
196 
197  // Validate reshape RHS kernel
198  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
199  if(!bool(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
200  {
201  return false;
202  }
203  // Validate mm kernel
204  gemm_kernel_info.lhs_info = lhs_info;
205  gemm_kernel_info.rhs_info = rhs_info;
206  if(!bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
207  {
208  return false;
209  }
210  return true;
211 }
212 
213 //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
// Pick the (LHS, RHS) reshape configuration for the reshaped kernel: the
// MLGO-suggested config is used when it validates, otherwise the default
// heuristics config is returned.
214 inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
215  const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
216 {
// NOTE(review): original line 217 was dropped by extraction — presumably the
// declaration of `config` from the MLGO reshaped-config heuristics. Confirm
// against the original file.
218  if(config)
219  {
220  if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
221  {
222  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
223  return { config.lhs_info, config.rhs_info };
224  }
225  }
// NOTE(review): original line 226 was dropped by extraction — presumably the
// reassignment of `config` from the default heuristics, which the log line and
// return below rely on. Confirm against the original file.
227  ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
228  return { config.lhs_info, config.rhs_info };
229 }
230 
231 } // namespace
232 
// Constructor: pre-creates one instance of every kernel variant CLGEMM may
// dispatch; the actual kernel type is chosen later in configure() and stored
// in _gemm_kernel_type (defaults to NATIVE_V1 until then).
233 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
234  : _memory_group(std::move(memory_manager)),
235  _weights_manager(weights_manager),
236  _mm_kernel(std::make_unique<CLGEMMMatrixMultiplyKernel>()),
237  _reshape_lhs_kernel(std::make_unique<CLGEMMReshapeLHSMatrixKernel>()),
238  _reshape_rhs_kernel(std::make_unique<CLGEMMReshapeRHSMatrixKernel>()),
239  _reshape_rhs_kernel_managed(std::make_unique<weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged>()),
240  _mm_reshaped_kernel(std::make_unique<CLGEMMMatrixMultiplyReshapedKernel>()),
241  _mm_reshaped_only_rhs_kernel(std::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()),
242  _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>()),
243  _tmp_a(),
244  _tmp_b(),
245  _original_b(nullptr),
246  _lhs(nullptr),
247  _dst(nullptr),
248  _reshape_b_only_on_first_run(false),
249  _is_prepared(false),
250  _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1)
251 {
252 }
253 
// Defaulted out-of-line destructor (the unique_ptr kernel members are destroyed here).
254 CLGEMM::~CLGEMM() = default;
255 
256 void CLGEMM::configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
257  const GEMMInfo &gemm_info)
258 {
259  const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
260  const unsigned int n = b->info()->dimension(0);
261  const unsigned int k = a->info()->dimension(0);
262  const GPUTarget gpu_target = CLScheduler::get().target();
263 
264  // Set the target for the kernels
265  _mm_kernel->set_target(gpu_target);
266 
267  GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias());
268 
269  // Configure and tune matrix multiply kernel
270  _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
271 
272  // Tune kernel statically
273  CLScheduler::get().tune_kernel_static(*_mm_kernel);
274 }
275 
// Configure the RESHAPED_V1 path: both operands are reshaped (LHS interleaved,
// RHS transposed) before a legacy CLGEMMMatrixMultiplyKernel runs on the
// reshaped buffers. When an IWeightsManager owns b, the RHS reshape is delegated
// to the managed kernel and _tmp_b is bypassed.
276 void CLGEMM::configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
277  const GEMMInfo &gemm_info)
278 {
279  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
280  const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
281  const unsigned int n = b->info()->dimension(0);
282  const unsigned int k = a->info()->dimension(0);
283  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
284  const GPUTarget gpu_target = CLScheduler::get().target();
285  int mult_transpose1xW_width = 1;
286  int mult_interleave4x4_height = 1;
287 
288  // Set the target for the kernels
289  _reshape_lhs_kernel->set_target(gpu_target);
290  _mm_kernel->set_target(gpu_target);
291 
// Bifrost GPUs use wider transpose/interleave multipliers
292  if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
293  {
294  mult_transpose1xW_width = 4;
295  mult_interleave4x4_height = 2;
296  }
297 
// Fixed v1 reshape geometry: n0 scaled by element size, no interleave/transpose on RHS
298  GEMMRHSMatrixInfo rhs_info;
299  rhs_info.n0 = 16 / b->info()->element_size();
300  rhs_info.k0 = 1;
301  rhs_info.h0 = mult_transpose1xW_width;
302  rhs_info.interleave = false;
303  rhs_info.transpose = false;
304 
305  GEMMLHSMatrixInfo lhs_info;
306  lhs_info.m0 = 4;
307  lhs_info.k0 = 4;
308  lhs_info.v0 = mult_interleave4x4_height;
309  lhs_info.interleave = true;
310  lhs_info.transpose = true;
311 
312  GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
313 
// _tmp_b is only needed as a managed/allocated buffer when b is reshaped every
// run and not owned by the weights manager
314  const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
315 
316  // Manage intermediate buffers
317  _memory_group.manage(&_tmp_a);
318 
319  if(!_reshape_b_only_on_first_run && use_mm_b)
320  {
321  _memory_group.manage(&_tmp_b);
322  }
323 
324  // Configure interleave kernel
325  _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
326 
327  // Configure transpose kernel
328  ICLTensor *reshaped_rhs = &_tmp_b;
329  if(_weights_manager && _weights_manager->are_weights_managed(b))
330  {
// Weights manager owns the reshaped RHS; acquire its buffer instead of _tmp_b
331  _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
332  reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
333  }
334  else
335  {
336  _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
337  }
338 
339  // Configure and tune matrix multiply kernel
340  _mm_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info());
341 
342  CLScheduler::get().tune_kernel_static(*_mm_kernel);
343 
344  // Allocate intermediate tensors
345  _tmp_a.allocator()->allocate();
346 
347  if(!_reshape_b_only_on_first_run && use_mm_b)
348  {
349  _tmp_b.allocator()->allocate();
350  }
351 }
352 
// Configure the RESHAPED path: both operands reshaped, then
// CLGEMMMatrixMultiplyReshapedKernel runs on the reshaped buffers. The
// (lhs_info, rhs_info) geometry comes from auto_select_gemm_config_reshaped
// (MLGO first, default heuristics as fallback).
353 void CLGEMM::configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
354  const GEMMInfo &gemm_info)
355 {
356  DataType data_type = a->info()->data_type();
357  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
358  const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
359  const unsigned int n = b->info()->dimension(0);
360  const unsigned int k = a->info()->dimension(0);
361  const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
362  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
363  const GPUTarget gpu_target = CLScheduler::get().target();
364  bool broadcast_bias = gemm_info.broadcast_bias();
365 
366  GEMMKernelInfo kernel_info;
367  kernel_info.m = m;
368  kernel_info.n = n;
369  kernel_info.k = k;
370  kernel_info.depth_output_gemm3d = depth_output_gemm3d;
// The 3D reinterpretation is consumed by the LHS reshape, so the mm kernel sees 2D input
371  kernel_info.reinterpret_input_as_3d = false;
372  kernel_info.broadcast_bias = broadcast_bias;
373  kernel_info.activation_info = gemm_info.activation_info();
374 
375  // Set the target for the kernels
376  _reshape_lhs_kernel->set_target(gpu_target);
377  _mm_kernel->set_target(gpu_target);
378 
// _tmp_b is only needed when b is reshaped every run and not owned by the weights manager
379  const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
380 
381  // Manage intermediate buffers
382  _memory_group.manage(&_tmp_a);
383 
384  if(!_reshape_b_only_on_first_run && use_mm_b)
385  {
386  _memory_group.manage(&_tmp_b);
387  }
388 
389  // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
390 
391  GEMMLHSMatrixInfo lhs_info{};
392  GEMMRHSMatrixInfo rhs_info{};
393 
394  // Pick up the GEMM configuration
395  std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a->info(), b->info(),
396  c == nullptr ? nullptr : c->info(), output->info(), gemm_info.reinterpret_input_as_3d());
397 
398  _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
399 
400  ICLTensor *reshaped_rhs = &_tmp_b;
401  if(_weights_manager && _weights_manager->are_weights_managed(b))
402  {
// Weights manager owns the reshaped RHS; acquire its buffer instead of _tmp_b
403  _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
404  reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
405  }
406  else
407  {
408  _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
409  }
410 
411  // Configure and tune matrix multiply kernel
412  _mm_reshaped_kernel->configure(compile_context, &_tmp_a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
413 
414  // Allocate intermediate tensors
415  _tmp_a.allocator()->allocate();
416 
417  if(!_reshape_b_only_on_first_run && use_mm_b)
418  {
419  _tmp_b.allocator()->allocate();
420  }
421 }
422 
// Configure the RESHAPED_ONLY_RHS path: only b is reshaped; two mm kernel
// variants are configured (has_pad_y = false/true) and run() picks one at
// execution time based on the actual lhs/dst padding.
423 void CLGEMM::configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
424  const GEMMInfo &gemm_info)
425 {
426  DataType data_type = a->info()->data_type();
427  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
428  const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
429  const unsigned int n = b->info()->dimension(0);
430  const unsigned int k = a->info()->dimension(0);
431  const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
432  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
433  const GPUTarget gpu_target = CLScheduler::get().target();
434  bool broadcast_bias = gemm_info.broadcast_bias();
435 
436  GEMMKernelInfo kernel_info;
437  kernel_info.m = m;
438  kernel_info.n = n;
439  kernel_info.k = k;
440  kernel_info.depth_output_gemm3d = depth_output_gemm3d;
// LHS is not reshaped here, so the mm kernel keeps the 3D reinterpretation flag
441  kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
442  kernel_info.broadcast_bias = broadcast_bias;
443  kernel_info.activation_info = gemm_info.activation_info();
444 
445  // Set the target for the kernels
446  _mm_kernel->set_target(gpu_target);
447 
// _tmp_b is only needed when b is reshaped every run and not owned by the weights manager
448  const bool use_mm_b = (!_weights_manager || !_weights_manager->are_weights_managed(b));
449 
450  // Manage intermediate buffers
451  if(!_reshape_b_only_on_first_run && use_mm_b)
452  {
453  _memory_group.manage(&_tmp_b);
454  }
455 
456  GEMMLHSMatrixInfo lhs_info{};
457  GEMMRHSMatrixInfo rhs_info{};
458 
459  // Pick up the GEMM configuration
460  std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a->info(), b->info(),
461  c == nullptr ? nullptr : c->info(), output->info());
462 
463  ICLTensor *reshaped_rhs = &_tmp_b;
464  if(_weights_manager && _weights_manager->are_weights_managed(b))
465  {
// Weights manager owns the reshaped RHS; acquire its buffer instead of _tmp_b
466  _reshape_rhs_kernel_managed->configure(compile_context, b, rhs_info);
467  reshaped_rhs = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->acquire(b, _reshape_rhs_kernel_managed.get()));
468  }
469  else
470  {
471  _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
472  }
473 
474  // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)
475  // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have
476  // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false
477 
478  // Configure matrix multiply kernel with no y padding support
479  kernel_info.has_pad_y = false;
480  _mm_reshaped_only_rhs_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
481 
482  // Configure matrix multiply kernel with y padding support
483  kernel_info.has_pad_y = true;
484  _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, reshaped_rhs, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
485 
486  if(!_reshape_b_only_on_first_run && use_mm_b)
487  {
488  _tmp_b.allocator()->allocate();
489  }
490 }
491 
// Static validation for the NATIVE_V1 path: mirrors configure_native_v1 using
// only ITensorInfo, without creating kernels.
492 Status CLGEMM::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
493 {
494  ARM_COMPUTE_UNUSED(alpha);
495  ARM_COMPUTE_UNUSED(output);
496 
497  // Get the GPU target
498  const GPUTarget gpu_target = CLScheduler::get().target();
499  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
500  const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
501  const unsigned int n = b->dimension(0);
502  const unsigned int k = a->dimension(0);
503  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
504 
505  const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias());
506 
507  // Validate matrix multiply
// NOTE(review): original line 508 was dropped by extraction — presumably the
// opening of ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, c,
// output, alpha, beta, ...) whose argument list continues on the next line.
// Confirm against the original file.
509  false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
510 
511  return Status{};
512 }
513 
// Static validation for the RESHAPED_V1 path: mirrors configure_reshaped_v1
// (same fixed lhs/rhs geometry, same Bifrost multipliers) using only ITensorInfo.
514 Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
515 {
516  ARM_COMPUTE_UNUSED(alpha);
517  ARM_COMPUTE_UNUSED(output);
518 
519  TensorInfo tmp_a_info{};
520  TensorInfo tmp_b_info{};
521 
522  // Get the GPU target
523  const GPUTarget gpu_target = CLScheduler::get().target();
524  const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
525  const unsigned int n = b->dimension(0);
526  const unsigned int k = a->dimension(0);
527  int mult_transpose1xW_width = 1;
528  int mult_interleave4x4_height = 1;
529  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
530 
// Bifrost GPUs use wider transpose/interleave multipliers (kept in sync with configure_reshaped_v1)
531  if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
532  {
533  mult_transpose1xW_width = 4;
534  mult_interleave4x4_height = 2;
535  }
536 
537  GEMMRHSMatrixInfo rhs_info;
538  rhs_info.n0 = 16 / b->element_size();
539  rhs_info.k0 = 1;
540  rhs_info.h0 = mult_transpose1xW_width;
541  rhs_info.interleave = false;
542  rhs_info.transpose = false;
543 
544  GEMMLHSMatrixInfo lhs_info;
545  lhs_info.m0 = 4;
546  lhs_info.k0 = 4;
547  lhs_info.v0 = mult_interleave4x4_height;
548  lhs_info.interleave = true;
549  lhs_info.transpose = true;
550 
551  const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias());
552 
553  // Validate interleave kernel
554  auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
// NOTE(review): original line 555 was dropped by extraction — presumably
// ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(...)).
// Confirm against the original file.
556 
557  // Validate transpose kernel
558  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
// NOTE(review): original line 559 was dropped by extraction — presumably
// ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(...)).
// Confirm against the original file.
560 
561  // Validate matrix multiply
562  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta,
563  true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info()));
564 
565  return Status{};
566 }
567 
// Static validation for the RESHAPED path. Only the default-heuristics config is
// validated: MLGO configs fall back to defaults automatically when invalid (see note below).
568 Status CLGEMM::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
569 {
570  ARM_COMPUTE_UNUSED(alpha);
571  ARM_COMPUTE_UNUSED(output);
572 
573  TensorInfo tmp_a_info{};
574  TensorInfo tmp_b_info{};
575 
576  // Get the GPU target
577  const GPUTarget gpu_target = CLScheduler::get().target();
// NOTE(review): original line 578 was dropped by extraction — presumably
// `const DataType data_type = a->data_type();` (data_type is used in the
// CommonQuery below and declared this way in validate_reshaped_only_rhs).
// Confirm against the original file.
579  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
580  const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
581  const unsigned int n = b->dimension(0);
582  const unsigned int k = a->dimension(0);
583  const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
584  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
585  const bool broadcast_bias = gemm_info.broadcast_bias();
586 
587  GEMMKernelInfo kernel_info;
588  kernel_info.m = m;
589  kernel_info.n = n;
590  kernel_info.k = k;
591  kernel_info.depth_output_gemm3d = depth_output_gemm3d;
592  kernel_info.reinterpret_input_as_3d = false;
593  kernel_info.broadcast_bias = broadcast_bias;
594  kernel_info.activation_info = gemm_info.activation_info();
595 
596  GEMMLHSMatrixInfo lhs_info;
597  GEMMRHSMatrixInfo rhs_info;
598 
599  // Pick up the GEMM configuration
600  // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
601  const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
602  lhs_info = gemm_config.lhs_info;
603  rhs_info = gemm_config.rhs_info;
604 
605  auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
// NOTE(review): original line 606 was dropped by extraction — presumably
// ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(...)).
// Confirm against the original file.
607 
608  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
// NOTE(review): original line 609 was dropped by extraction — presumably
// ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(...)).
// Confirm against the original file.
610 
611  // Validate matrix multiply
612  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
613 
614  return Status{};
615 }
616 
// Static validation for the RESHAPED_ONLY_RHS path: validates the mm kernel in
// both padding modes (has_pad_y = false/true), matching the two variants that
// configure_reshaped_only_rhs sets up.
617 Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
618 {
619  ARM_COMPUTE_UNUSED(alpha);
620  ARM_COMPUTE_UNUSED(output);
621 
622  TensorInfo tmp_b_info{};
623 
624  // Get the GPU target
625  const GPUTarget gpu_target = CLScheduler::get().target();
626  const DataType data_type = a->data_type();
627  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
628  const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
629  const unsigned int n = b->dimension(0);
630  const unsigned int k = a->dimension(0);
631  const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
632  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
633  const bool broadcast_bias = gemm_info.broadcast_bias();
634 
635  GEMMKernelInfo kernel_info;
636  kernel_info.m = m;
637  kernel_info.n = n;
638  kernel_info.k = k;
639  kernel_info.depth_output_gemm3d = depth_output_gemm3d;
640  kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
641  kernel_info.broadcast_bias = broadcast_bias;
642  kernel_info.activation_info = gemm_info.activation_info();
643 
644  GEMMLHSMatrixInfo lhs_info;
645  GEMMRHSMatrixInfo rhs_info;
646 
647  // Pick up the GEMM configuration
648  // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
649  const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
650  lhs_info = gemm_config.lhs_info;
651  rhs_info = gemm_config.rhs_info;
652 
653  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
// NOTE(review): original line 654 was dropped by extraction — presumably
// ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)).
// Confirm against the original file.
655 
656  // Validate matrix multiply
657  kernel_info.has_pad_y = false;
658  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
659 
660  kernel_info.has_pad_y = true;
661  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
662 
663  return Status{};
664 }
665 
// Convenience overload: forwards to the compile-context overload using the
// library's default compile context.
666 void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
667 {
668  configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
669 }
670 
// Main configure entry point: validates the arguments, records run-time state
// (_lhs/_dst/_original_b, reshape/prepare flags), auto-selects the kernel type
// via heuristics, and dispatches to the matching configure_* helper.
671 void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
672 {
673  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
674 
675  // Perform validation step
676  ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
677 
678  // Check if we need to reshape the matrix B only on the first run
679  _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
// Retained internal weights mean prepare() work was already done elsewhere
680  _is_prepared = gemm_info.retain_internal_weights();
681  _original_b = b;
682  _lhs = a;
683  _dst = output;
684 
685  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
686  const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
687  const unsigned int n = b->info()->dimension(0);
688  const unsigned int k = a->info()->dimension(0);
689  const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
690 
691  // Select GEMMType
692  _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->info()->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
693 
// Bias addition is only fused when beta is non-zero and c was provided
694  const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
695 
696  const ICLTensor *c_to_use = fuse_add_c ? c : nullptr;
697 
698  switch(_gemm_kernel_type)
699  {
// NOTE(review): the case labels (original lines 700, 705, 710, 715) were lost in
// extraction — from the dispatched helpers they should be the CLGEMMKernelType
// values NATIVE_V1, RESHAPED_V1, RESHAPED, RESHAPED_ONLY_RHS respectively.
// Confirm against the original file.
701  {
702  configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
703  break;
704  }
706  {
707  configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
708  break;
709  }
711  {
712  configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
713  break;
714  }
716  {
717  configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
718  break;
719  }
720  default:
721  {
722  ARM_COMPUTE_ERROR("GEMMType not supported");
723  }
724  }
725 }
726 
// Static validation entry point: selects the kernel type with the same
// heuristics as configure() and forwards to the matching validate_* helper.
727 Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
728 {
729  // Get the GPU target
730  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
731  const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
732  const unsigned int n = b->dimension(0);
733  const unsigned int k = a->dimension(0);
734  const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
735 
736  // Select GEMMType
737  CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
738  {
739  CLScheduler::get().target(), a->data_type(), m, n, k, batch_size,
740  },
741  gemm_info.reshape_b_only_on_first_run());
742 
// Bias addition is only fused when beta is non-zero and c was provided
743  const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
744 
745  const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
746 
747  switch(gemm_kernel_type)
748  {
// NOTE(review): the case labels (original lines 749, 754, 759, 764) were lost in
// extraction — from the dispatched helpers they should be NATIVE_V1, RESHAPED_V1,
// RESHAPED, RESHAPED_ONLY_RHS respectively. Confirm against the original file.
750  {
751  ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
752  break;
753  }
755  {
756  ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info));
757  break;
758  }
760  {
761  ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
762  break;
763  }
765  {
766  ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
767  break;
768  }
769  default:
770  {
771  ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
772  }
773  }
774 
775  return Status{};
776 }
777 
// NOTE(review): the signature line (original 778) was dropped by extraction —
// this is CLGEMM::run() per the Doxygen cross-reference at the end of the page.
// Executes the configured kernel chain: prepare() first, then the reshape and
// matrix-multiply kernels for the selected _gemm_kernel_type.
779 {
780  prepare();
781  MemoryGroupResourceScope scope_mg(_memory_group);
782 
783  // Run matrix multiply kernel
784  switch(_gemm_kernel_type)
785  {
// NOTE(review): the case labels (original lines 786, 791, 812, 833) were lost in
// extraction — from the kernels enqueued in each branch they should be NATIVE_V1,
// RESHAPED_V1, RESHAPED, RESHAPED_ONLY_RHS respectively. Confirm against the
// original file.
787  {
788  CLScheduler::get().enqueue(*_mm_kernel, true);
789  break;
790  }
792  {
793  // Run interleave kernel
794  CLScheduler::get().enqueue(*_reshape_lhs_kernel, false);
795 
// When b is reshaped only once, prepare() already did it; skip the per-run reshape
796  if(!_reshape_b_only_on_first_run)
797  {
798  // Run transpose kernel
799  if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
800  {
801  _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get())
802  }
803  else
804  {
805  CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
806  }
807  }
808 
809  CLScheduler::get().enqueue(*_mm_kernel, true);
810  break;
811  }
813  {
814  // Run interleave kernel
815  CLScheduler::get().enqueue(*_reshape_lhs_kernel, false);
816 
817  if(!_reshape_b_only_on_first_run)
818  {
819  // Run transpose kernel
820  if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
821  {
822  _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
823  }
824  else
825  {
826  CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
827  }
828  }
829 
830  CLScheduler::get().enqueue(*_mm_reshaped_kernel, true);
831  break;
832  }
834  {
835  if(!_reshape_b_only_on_first_run)
836  {
837  // Run transpose kernel
838  if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
839  {
840  _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
841  }
842  else
843  {
844  CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
845  }
846  }
847  // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
848  // Check if the lhs or dst tensors have padding
849  const unsigned int cross_plane_pad_lhs = _lhs->info()->padding().top + _lhs->info()->padding().bottom;
850  const unsigned int cross_plane_pad_dst = _dst->info()->padding().top + _dst->info()->padding().bottom;
851 
// Choose between the no-pad variant and the has_pad_y fallback configured earlier
852  bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
853  if(has_pad_y)
854  {
855  CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_fallback_kernel, true);
856  }
857  else
858  {
859  CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, true);
860  }
861  break;
862  }
863  default:
864  {
865  ARM_COMPUTE_ERROR("GEMMType not supported");
866  }
867  }
868 }
869 
// NOTE(review): the signature line (original 870) was dropped by extraction —
// this is CLGEMM::prepare() per the Doxygen cross-reference at the end of the page.
// One-time preparation: when b is reshaped only on the first run, performs the
// RHS reshape now (via the weights manager when it owns b) and marks the
// original weights tensor unused so its memory can be reclaimed.
871 {
872  if(!_is_prepared)
873  {
// NATIVE_V1 has no RHS reshape, so nothing to precompute for it
874  if(_gemm_kernel_type != CLGEMMKernelType::NATIVE_V1 && _reshape_b_only_on_first_run)
875  {
876  if(_weights_manager && _weights_manager->are_weights_managed(_original_b))
877  {
878  _weights_manager->run(_original_b, _reshape_rhs_kernel_managed.get());
879  }
880  else
881  {
882  // Run transpose kernel and mark original weights tensor as unused
883  _tmp_b.allocator()->allocate();
884  CLScheduler::get().enqueue(*_reshape_rhs_kernel, false);
885  _original_b->mark_as_unused();
886  }
887  }
// Block until the reshape has finished before declaring the function prepared
888  CLScheduler::get().queue().finish();
889  _is_prepared = true;
890  }
891 }
892 } // namespace arm_compute
unsigned int top
top of the border
Definition: Types.h:375
bool broadcast_bias
Flag used to broadcast the bias addition.
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration of CLGEMMMatrixMultiplyResh...
void prepare() override
Prepare the function for executing.
Definition: CLGEMM.cpp:870
~CLGEMM()
Default destructor.
GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
Select gemm config based on default heuristics.
Descriptor used by the GEMM kernels.
void run() override
Run the kernels contained in the function.
Definition: CLGEMM.cpp:778
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
SimpleTensor< float > b
Definition: DFT.cpp:157
CLGEMM(std::shared_ptr< IMemoryManager > memory_manager=nullptr, IWeightsManager *weights_manager=nullptr)
Default constructor.
Definition: CLGEMM.cpp:233
static CLScheduler & get()
Access the scheduler singleton.
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration of CLGEMMMatrixMultiplyResh...
unsigned int v0
Number of vertical blocks of size (m0xk0) stored on the same output row.
Definition: Types.h:1977
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
GPUTarget target() const
Get the target GPU.
Definition: CLScheduler.cpp:47
unsigned int depth_output_gemm3d
Depth of the output tensor in case is reinterpreted as 3D.
OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication In particular...
GEMM reshape information class.
Definition: Types.h:1831
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
Static function to check if given info will lead to a valid configuration of CLGEMMReshapeLHSMatrixKe...
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
A collection of adaptor functions that enable the auto selection between mlgo-based heuristics and de...
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
Definition: Types.h:1992
bool fp_mixed_precision() const
Flag which specifies if a wider accumulator should be used.
Definition: Types.h:2122
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
GEMM LHS (Left Hand Side) matrix information.
Definition: Types.h:1968
Store the tensor's metadata.
Definition: ITensorInfo.h:40
CLTensorAllocator * allocator()
Return a pointer to the tensor's allocator.
Definition: CLTensor.cpp:61
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
unsigned int bottom
bottom of the border
Definition: Types.h:377
Reshaped GEMM kernel where only the rhs matrix is reshaped.
int depth_output_gemm3d() const
Depth of the output when GEMM output is reinterpreted as 3D tensor.
Definition: Types.h:2082
Status class.
Definition: Error.h:52
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
Definition: GPUTarget.cpp:189
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
bool retain_internal_weights() const
Flag which specifies if the weights tensor has to be retained from previous run.
Definition: Types.h:2098
CLGEMMKernelType
OpenCL GEMM kernel types.
Definition: CLTypes.h:31
Reshaped GEMM kernel where both lhs and rhs matrices are reshaped.
GEMMLHSMatrixInfo lhs_info
LHS matrix information used to retrieve the number of rows processed by each thread.
bool transpose
True if the (k0xn0) block has to be transposed before been stored.
Definition: Types.h:1993
bool interleave
True if the v0 (m0xk0) blocks have to be interleaved in the output row.
Definition: Types.h:1979
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
Static function to check if given info will lead to a valid configuration of CLGEMMMatrixMultiplyKern...
Copyright (c) 2017-2021 Arm Limited.
void run() override
Run the transformation function.
Definition: CLGEMM.cpp:67
void mark_as_unused() const
Marks a tensor as unused.
Definition: ITensor.cpp:168
void manage(IMemoryManageable *obj) override
Sets a object to be managed by the given memory group.
Definition: MemoryGroup.h:79
bool transpose
True if the (m0xk0) block has to be transposed before been stored.
Definition: Types.h:1978
GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
Native GEMM kernel with fixed block size.
bool are_weights_managed(const ITensor *weights)
Check if the weights are managed.
const DataType data_type
Definition: Im2Col.cpp:150
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
Definition: Types.h:1991
unsigned int m
Number of LHS rows.
#define ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(fmt,...)
Log information level formatted message to the core system logger.
Definition: Log.h:99
unsigned int n
Number of RHS columns.
OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided.
void release() override
Release transformed weights memory.
Definition: CLGEMM.cpp:74
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d=false)
Calculate the Left Hand Side matrix reshaped shape.
GEMM RHS (Right Hand Side) matrix information.
Definition: Types.h:1983
unsigned int n0
Number of columns processed by the matrix multiplication.
Definition: Types.h:1990
OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped...
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on default heuristics.
virtual PaddingSize padding() const =0
Padding of tensor.
cl::CommandQueue & queue()
Accessor for the associated CL command queue.
Definition: CLScheduler.cpp:41
bool reinterpret_input_as_3d() const
Flag which specifies if the input tensor has to be reinterpreted as 3D.
Definition: Types.h:2090
Weights manager interface to handle weights transformations.
ICLTensor * get_weights() override
Get a pointer to the transformed weights.
Definition: CLGEMM.cpp:79
bool broadcast_bias() const
Flag which specifies whether to broadcast the shape of the bias tensor.
Definition: Types.h:2130
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
uint32_t uid() override
Function that returns a unique id of the reshape function.
Definition: CLGEMM.cpp:84
CLCompileContext class.
void configure(const ICLTensor *input, GEMMRHSMatrixInfo info)
Configures the CLGEMMReshapeRHSMatrixKernel kernel.
Definition: CLGEMM.cpp:89
bool has_pad_y
Flag used to indicate if the input/output tensors have internal pad on the y direction.
void allocate() override
Allocate size specified by TensorInfo of OpenCL memory.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Definition: Error.h:194
Memory group resources scope handling class.
Definition: IMemoryGroup.h:82
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
GEMMRHSMatrixInfo rhs_info
RHS matrix information used for reshaping the RHS matrix.
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34
void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info=GEMMInfo())
Initialise the kernel's inputs and output.
Definition: CLGEMM.cpp:666
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
std::string to_string(const ICLTensor &arg)
void free() override
Free allocated OpenCL memory.
GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on mlgo heuristics.
unsigned int k
Number of LHS columns or RHS rows.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
Definition: Types.h:1994
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f checking if the difference is within a range define...
Definition: float_ops.h:109
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info=GEMMInfo())
Static function to check if given info will lead to a valid configuration of CLGEMM.
Definition: CLGEMM.cpp:727
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Store the tensor's metadata.
Definition: TensorInfo.h:45
bool reshape_b_only_on_first_run() const
Flag which specifies if the reshape of matrix B should executed only for the first.
Definition: Types.h:2074
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
Definition: Types.h:1976
GEMM information class.
Definition: Types.h:2003
unsigned int m0
Number of rows processed by the matrix multiplication.
Definition: Types.h:1975
ITensor * run(const ITensor *weights, ITransformWeights *weights_transform)
Run the reshape function.
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have be...
void tune_kernel_static(ICLKernel &kernel)
Tunes OpenCL kernel.
Definition: CLScheduler.cpp:84
DataType
Available data types.
Definition: Types.h:77
ActivationLayerInfo activation_info() const
Activation layer to apply after the matrix multiplication.
Definition: Types.h:2154
Reshaped GEMM kernel where both lhs and rhs matrices are reshaped.
GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on default heuristics.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info)
Static function to check if given info will lead to a valid configuration of CLGEMMReshapeRHSMatrixKe...
ITensor * acquire(const ITensor *weights, ITransformWeights *weights_transform)
Acquire the requested reshape tensor of the selected weights.