Compute Library
 23.08
CpuGemmAssemblyDispatch.cpp
1 /*
2  * Copyright (c) 2018-2023 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
25 
26 #include "arm_compute/runtime/NEON/NEScheduler.h"
27 #include "src/core/CPP/Validate.h"
28 #include "src/core/NEON/kernels/arm_gemm/utils.hpp"
29 #include "src/core/helpers/MemoryHelpers.h"
30 #include "src/core/utils/AssemblyUtils.h"
31 #include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
32 #include "src/cpu/kernels/assembly/arm_gemm.hpp"
33 #include "src/cpu/utils/CpuAuxTensorHandler.h"
34 
35 #include <arm_neon.h>
36 
37 namespace arm_compute
38 {
39 namespace cpu
40 {
41 namespace
42 {
43 /** Run pretranspose_B_array in parallel (1D static scheduling)
44  *
45  * @tparam TypeInput  Data type of the input (A/B) operands
46  * @tparam TypeOutput Data type of the output operand
47  *
48  * @param[in]  gemm_asm GemmCommon kernel to run
49  * @param[out] dst Pretransposed B array
50  * @param[in] src B array to be pretransposed
51  * @param[in] src_ld Stride in y
52  * @param[in] src_multi_stride Stride in z ("multi")
53  * @param[in] num_threads Number of threads to run this method. Must be >= 1
54  */
55 template <typename TypeInput, typename TypeOutput>
56 void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
57 {
58  ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
59  ARM_COMPUTE_ERROR_ON(num_threads == 0);
60  // The window size is also the total workload size
61  const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
62 
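 // Static 1-D split: thread t is assigned the half-open range
 // [t * wsize / num_threads, (t + 1) * wsize / num_threads).
 // For example, with wsize = 10 and num_threads = 4 the per-thread ranges are
 // [0,2), [2,5), [5,7) and [7,10); a thread whose range is empty simply skips the call.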
63  std::vector<IScheduler::Workload> workloads(num_threads);
64  for(unsigned int t = 0; t < num_threads; ++t)
65  {
66  workloads[t] = [=](const ThreadInfo &info)
67  {
68  const unsigned int start = (info.thread_id * wsize) / num_threads;
69  const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads;
70 
71  if(start < end)
72  {
73  gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
74  }
75  };
76  }
77  NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array");
78 }
79 } // namespace
80 
81 using namespace arm_compute::experimental;
82 
83 namespace
84 {
85 struct free_delete
86 {
87  void operator()(void *x)
88  {
89  free(x);
90  }
91 };
92 
93 struct Params
94 {
95  unsigned int M;
96  unsigned int N;
97  unsigned int K;
98  unsigned int batches;
99  unsigned int multis;
100  unsigned int sections;
101  bool indirect;
102 };
103 
104 Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
105 {
106  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
107  Params p;
108  p.M = d->tensor_shape().y();
109  p.K = a->tensor_shape().x();
110  p.N = d->tensor_shape().x();
111  p.batches = 1;
112  p.multis = 1;
113  p.sections = 1;
114  p.indirect = false;
115 
116  if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
117  {
118  p.indirect = true;
119  p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
120  }
121  else
122  {
123  p.multis = b->tensor_shape().z();
124  p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
125  }
126 
127  // Update M in case of GEMM3D for output
128  if(info.depth_output_gemm3d != 0)
129  {
130  p.M = d->tensor_shape().y() * d->tensor_shape().z();
131  p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
132  }
133 
134  return p;
135 }
136 
137 IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
138 {
139  // Schedule assembly kernel
140  const int granule_threshold = 200;
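 // The granule threshold is passed to IScheduler::Hints and is used by the scheduler
 // as a granularity limit when subdividing the window; the default hint below is a
 // plain split over the X dimension.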
141  IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
142  if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
143  {
144  scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
145  }
146  else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
147  {
148  //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
149  scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
150  }
151  else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
152  {
153  //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
154  scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
155  }
156 
157  return scheduling_hint;
158 }
159 
160 /** Fallback in case ACL doesn't have a function */
161 template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
162 class Fallback : public CpuGemmAssemblyDispatch::IFallback
163 {
164 public:
165  /** Destructor */
166  ~Fallback() = default;
167 
168  /** Initialise the function's input and output.
169  *
170  * @param[in] a Input tensor containing the Matrix A.
171  * @param[in] b Input tensor containing the Matrix B.
172  * @param[in] c Input tensor containing the Matrix C.
173  * @param[out] d Output tensor to store the result of matrix multiplication.
174  * @param[in] args Matrix multiplication information.
175  * @param[in] gemm_info GEMM meta-data
176  * @param[in] os Output stage meta-data.
177  */
178  void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
179  arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
180  const OutputStage &os = {});
181 
188  /** Set requantization data to be used
189  *
190  * @param[in] shifts Requantization shifts
191  * @param[in] multipliers Requantization multipliers
192  *
193  * @return A tuple with a flag indicating whether any left shift is needed and the pointers
194  * to the left-shift, right-shift and multiplier data respectively
195  */
196  std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
197  const std::vector<int32_t> &multipliers);
198 
199  // Inherited methods overridden:
200  void run(ITensorPack &tensors) override;
201  void prepare(ITensorPack &tensors) override;
202  bool is_configured() const override;
203  experimental::MemoryRequirements workspace() const override;
204  bool isVarWeightsKernel() const override
205  {
206  if(!_gemm_kernel_asm)
207  return false;
208  const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
209  return wf != arm_compute::WeightFormat::ANY && wf != arm_compute::WeightFormat::UNSPECIFIED;
210  }
211 
212 private:
213  enum AuxTensorIdx
214  {
215  AsmGemmWorkspace = 0,
216  Pretranspose,
217  Count
218  };
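 // Two auxiliary tensors are exposed via workspace(): AsmGemmWorkspace is the temporary
 // scratch buffer used while run() executes, Pretranspose holds the persistent
 // pre-transposed copy of B.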
219 
220  /** Configure the indirect buffer
221  *
222  * @param[in] a Input tensor containing the Matrix A.
223  * @param[in] b Input tensor containing the Matrix B.
224  * @param[out] d Output tensor to store the result of matrix multiplication.
225  * @param[in] info GEMM meta-data
226  */
227  void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
228  /** Prepare the indirect buffer */
229  void prepare_indirect_buffer(ITensorPack &tensors);
230 
231  /** Assembly Gemm kernel */
232  std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
233  /** Optimised Arm® Neon™ kernel */
234  std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
235  /** Assembly GEMM workspace tensor info */
236  TensorInfo _workspace_info{};
237  /** Pre-transpose tensor info */
238  TensorInfo _pretranspose_info{};
239  /** Prepared flag */
240  bool _is_prepared{ false };
241  /** GEMM meta-data */
242  AsmGemmInfo _gemm_info{};
243  /** GEMM kernel description */
244  arm_gemm::KernelDescription _kernel_info{};
245  /** Per channel quantization shifts */
246  std::vector<int32_t> _shifts{};
247  std::vector<int32_t> right_shifts{};
248  std::vector<int32_t> left_shifts{};
249  /** Per channel quantization multipliers */
250  std::vector<int32_t> _multipliers{};
251  /** Indirect buffer */
252  std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
253  std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
254  std::vector<TypeInput> _indirect_pad{};
255  arm_gemm::ConvolutionParameters _cp{};
256  experimental::MemoryRequirements _aux_mem{ Count };
257  bool _B_pretranspose_required{ false };
258  bool _is_b_constant{ true };
259  bool _is_c_constant{ true };
260 };
261 
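 // Note on set_requantize_data() below: arm_gemm expects separate per-channel left-shift
 // and right-shift vectors, so a negative entry in `shifts` becomes a left shift
 // (max(-s, 0)) and a positive one a right shift (min(-s, 0)), with the returned flag
 // indicating whether any left shift is present at all.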
262 template <typename TypeInput, typename TypeOutput, class OutputStage>
263 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
264 Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
265 {
266  _multipliers = multipliers;
267  _shifts = shifts;
268  bool need_left = false;
269  for(const auto s : _shifts)
270  {
271  left_shifts.push_back(std::max(-s, int32_t(0)));
272  right_shifts.push_back(std::min(-s, int32_t(0)));
273  if(s < 0 && !need_left)
274  {
275  need_left = true;
276  }
277  }
278  return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
279 }
280 
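 // prepare_indirect_buffer() fills the indirection table used by the indirect-GEMM
 // convolution path: for each (multi, batch, kernel position, output position) it stores
 // a pointer either to the matching input row or, when the receptive field falls outside
 // the input, to the shared padding vector _indirect_pad.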
281 template <typename TypeInput, typename TypeOutput, class OutputStage>
282 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
283 {
284  auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
285  const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
286  const int multis = 1;
287  const int batches = a->info()->tensor_shape().total_size_upper(3);
288  const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
289  const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
290  const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
291 
292  const size_t output_hw = _cp.output_height * _cp.output_width;
293  const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
294  const size_t batch_stride = batch_size / sizeof(TypeInput);
295  const int multi_size = batch_size * batches;
296  const size_t multi_stride = multi_size / sizeof(TypeInput);
297 
298  for(int64_t m = 0; m < multis; m++)
299  {
300  for(int64_t b = 0; b < batches; b++)
301  {
302  for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
303  {
304  for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
305  {
306  int64_t output_xy = (output_y * _cp.output_width) + output_x;
307 
308  for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
309  {
310  for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
311  {
312  int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
313  int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
314  int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
315  int64_t input_xy = (input_y * _cp.input_width) + input_x;
316 
317  if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
318  {
319  _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
320  }
321  else
322  {
323  _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
324  A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
325  }
326  }
327  }
328  }
329  }
330  }
331  }
332 }
333 
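 // configure_indirect() translates the tensor shapes into arm_gemm::ConvolutionParameters
 // and, for the Indirect method, allocates the pointer tables (_indirect_buf / _indirect_arg)
 // that prepare_indirect_buffer() later populates.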
334 template <typename TypeInput, typename TypeOutput, class OutputStage>
335 void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
336 {
337  ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
338 
339  float zeropad = 0.f;
340  if(is_data_type_quantized(a->data_type()))
341  {
342  zeropad = a->quantization_info().uniform().offset;
343  }
344 
345  const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
346  const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
347  const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
348  const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
349  const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
350  const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
351  const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
352 
353  _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
354  info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
355  };
356 
357  if(info.method == AsmConvMethod::Conv)
358  {
359  _gemm_kernel_asm->set_convolution_parameters(_cp);
360  }
361 
362  if(info.method == AsmConvMethod::Indirect)
363  {
364  const unsigned int multis = 1;
365  const unsigned int batches = a->tensor_shape().total_size_upper(3);
366  const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
367  const unsigned int output_hw = _cp.output_width * _cp.output_height;
368 
369  using TypeInputPtr = TypeInput *;
370  const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr);
371  const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
372  const int multi_size = batch_size * batches;
373  const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
374 
375  _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
376  _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
377  _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
378 
379  // Set indirect argument
380  int64_t pos = 0;
381  for(int64_t m = 0; m < multis; m++)
382  {
383  for(int64_t b = 0; b < batches; b++)
384  {
385  for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
386  {
387  (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
388  }
389  }
390  }
391 
392  _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
393  }
394 }
395 
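 // Fallback::configure() instantiates the arm_gemm kernel, wraps it in a
 // CpuGemmAssemblyWrapperKernel, records the workspace and pretranspose memory
 // requirements, and sets up the indirect buffers for convolution methods.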
396 template <typename TypeInput, typename TypeOutput, class OutputStage>
397 void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
398  arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
399  const OutputStage &os)
400 {
401  ARM_COMPUTE_UNUSED(c);
402 
403  _is_b_constant = b->are_values_constant();
404  _is_c_constant = c ? c->are_values_constant() : true;
405 
406  _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
407  if(_gemm_kernel_asm == nullptr)
408  {
409  //configuration not supported: Leave function unconfigured:
410  return;
411  }
412 
413  arm_gemm::GemmConfig gemm_cfg = _gemm_kernel_asm->get_config();
414 
415  // arm_compute wrapper for the Gemm object (see above)
416  auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
417  ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
418  acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
419  const size_t workspace_size = _gemm_kernel_asm->get_working_size();
420  const unsigned int alignment = 4096;
421  _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
422  _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
423 
424  //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
425  //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
426  {
427  const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
428  if(window_size < static_cast<unsigned int>(args._maxthreads))
429  {
430  _gemm_kernel_asm->set_nthreads(window_size);
431  }
432  }
433 
434  _optimised_kernel = std::move(acl_gemm_wrapper);
435  _gemm_info = gemm_info;
436  // Check for pre-transposed support
437  if(_gemm_kernel_asm->B_pretranspose_required())
438  {
439  // Forcing 128-byte alignment (required by 32-bit kernels)
440  const unsigned int alignment = 128;
441  const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
442  _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
443  _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
444  _B_pretranspose_required = true;
445  }
446 
447  // Handle indirect GEMM convolution
448  if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
449  {
450  configure_indirect(a, b, d, gemm_info);
451  }
452 }
453 
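 // prepare() does the one-off work: it hands the quantized bias to the kernel,
 // pre-transposes B in parallel when the kernel requires it, and builds the indirect
 // buffer for the Indirect convolution method.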
454 template <typename TypeInput, typename TypeOutput, class OutputStage>
455 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
456 {
457  if(!_is_prepared)
458  {
459  auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
460  auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
461 
462  // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
463  if(c && c->info()->data_type() == DataType::S32)
464  {
465  _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
466  }
467 
468  // Pretranspose B if required
469  if(_gemm_kernel_asm->B_pretranspose_required())
470  {
471  // Fixed format kernels need no pretranspose.
472  ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
473  const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
474  const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
475  const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
476 
477  CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
478  ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
479  run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
480 
481  b->mark_as_unused();
482  }
483 
484  if(_gemm_info.method == AsmConvMethod::Indirect)
485  {
486  prepare_indirect_buffer(tensors);
487  }
488 
489  _is_prepared = true;
490  }
491 }
492 
493 template <typename TypeInput, typename TypeOutput, class OutputStage>
494 bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
495 {
496  return _optimised_kernel != nullptr;
497 }
498 
499 template <typename TypeInput, typename TypeOutput, class OutputStage>
500 experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
501 {
502  return _aux_mem;
503 }
504 
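 // run() derives the leading dimensions and batch/multi strides from the tensor strides,
 // re-runs the B pretranspose when B or the bias is not constant, sizes the workspace and
 // thread count, and finally hands the wrapped kernel to the scheduler.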
505 template <typename TypeInput, typename TypeOutput, class OutputStage>
506 void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
507 {
508  auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
509  auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
510  auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
511  auto d = tensors.get_tensor(TensorType::ACL_DST);
512 
513  int lda = a->info()->strides_in_bytes().y() / a->info()->element_size();
514  int ldb = 0;
515  const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();
516 
517  const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
518  const size_t a_multi_idx = a_batch_idx + 1;
519  const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
520  const size_t d_multi_idx = d_batch_idx + 1;
521 
522  int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / a->info()->element_size();
523  const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / d->info()->element_size();
524 
525  int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / a->info()->element_size();
526  int multi_stride_b = 0;
527  const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
528 
529  auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
530  const TypeInput *in1_ptr = nullptr;
531  auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
532 
533  // Check if B is pre-transposed and de-reference it if not
534  if(!_gemm_kernel_asm->B_is_pretransposed())
535  {
536  ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
537  multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
538  in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
539  }
540 
541  // Re-run the bias setup and pretranspose on every call if either the weights or the bias is non-constant
542  if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
543  {
544  if(c && c->info()->data_type() == DataType::S32)
545  {
546  _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
547  }
548 
549  // Pretranspose B if required
550  if(_B_pretranspose_required)
551  {
552  const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
553  const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
554  const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
555 
556  CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
557  ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
558 
559  if(_is_b_constant)
560  {
561  _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
562  }
563  else
564  {
565  run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
566  }
567  }
568  }
569 
570  const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
571 
572  // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
573  CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
574  if(workspace.get()->buffer() != nullptr)
575  {
576  _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
577  const unsigned int split_dim = scheduling_hint.split_dimension();
578  const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
579  unsigned int num_threads = NEScheduler::get().num_threads();
580  if(window_size < num_threads)
581  {
582  num_threads = window_size;
583  }
584  if(split_dim != IScheduler::split_dimensions_all)
585  {
586  // Make sure the kernel does not expect more threads than we can actually spawn
587  const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
588  num_threads = std::min(num_iterations, num_threads);
589  }
590  _gemm_kernel_asm->set_nthreads(num_threads);
591  }
592 
593  // Prepare assembly kernel
594  prepare(tensors);
595 
596  // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
597  TypeOutput *bias = nullptr;
598  if(c && c->info()->data_type() != DataType::S32)
599  {
600  bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
601  }
602 
603  if(_gemm_info.method == AsmConvMethod::Indirect)
604  {
605  in0_ptr = nullptr;
606  lda = 0;
607  batch_stride_a = 0;
608  multi_stride_a = 0;
609  }
610 
611  // Set gemm parameters
612  _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
613  in1_ptr, ldb, multi_stride_b,
614  out_ptr, ldd, batch_stride_d, multi_stride_d,
615  bias, 0);
616  // Schedule
617  NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
618 }
619 
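 // create_arm_gemm() builds the GemmArgs from the extracted parameters and configures
 // a non-quantized Fallback for the given input/output type pair.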
620 template <typename TypeInput, typename TypeOutput>
621 void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
622  const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
623  arm_gemm::Activation activation, const AsmGemmInfo &info)
624 {
625  Params p = extract_parameters(a, b, d, info);
626  const CPUInfo &ci = NEScheduler::get().cpu_info();
627  unsigned int num_threads = NEScheduler::get().num_threads();
628 
629  arm_gemm::GemmConfig cfg;
630  cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
631  arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
632 
633  // Create arm_gemm fallback
634  auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
635  fallback->configure(a, b, c, d, args, info);
636  arm_gemm = std::move(fallback);
637 }
638 
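 // create_arm_gemm_quant() additionally derives the Requantize32 output stage: the a/b
 // offsets are negated according to info.negated_offsets, and either per-channel or
 // uniform multipliers/shifts are used depending on how many shifts are provided.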
639 template <typename TypeInput, typename TypeOutput>
640 void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
641  const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
642  arm_gemm::Activation activation, const AsmGemmInfo &info)
643 {
644  ARM_COMPUTE_UNUSED(activation);
645  Params p = extract_parameters(a, b, d, info);
646  const CPUInfo &ci = NEScheduler::get().cpu_info();
647  const unsigned int num_threads = NEScheduler::get().num_threads();
648 
649  arm_gemm::GemmConfig cfg;
650  cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
651  arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
652 
653  // Create arm_gemm fallback
654  auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
655 
656  // Configure requantization info
657  const int32_t negation = info.negated_offsets ? 1 : -1;
658  const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
659  const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
660  const GEMMLowpOutputStageInfo os_info = info.output_stage;
661 
662  arm_gemm::Requantize32 gemm_requant_info{};
663  if(os_info.gemmlowp_shifts.size() > 1)
664  {
665  const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
666  gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
667  a_offset, b_offset, os_info.gemmlowp_offset,
668  (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
669  std::get<2>(requantize_data),
670  std::get<3>(requantize_data),
671  os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
672  }
673  else
674  {
675  gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
676  a_offset, b_offset, os_info.gemmlowp_offset,
677  -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
678  os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
679  }
680 
681  // Configure fallback
682  fallback->configure(a, b, c, d, args, info, gemm_requant_info);
683  arm_gemm = std::move(fallback);
684 }
685 } //namespace
686 
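 // Public CpuGemmAssemblyDispatch interface: validates the data-type combination,
 // instantiates the matching Fallback in configure(), and forwards prepare(), run()
 // and workspace() to it.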
687 CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
688  : _arm_gemm(nullptr)
689 {
690 }
691 
692 Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
693  const AsmGemmInfo &info)
694 {
695  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
696  ARM_COMPUTE_UNUSED(c);
697  arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
698  Params p = extract_parameters(a, b, d, info);
699  const CPUInfo &ci = NEScheduler::get().cpu_info();
700  unsigned int num_threads = NEScheduler::get().num_threads();
701  arm_gemm::GemmConfig cfg;
702  cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
703  arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
704  arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
705  switch(a->data_type())
706  {
707  case DataType::F32:
708  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
709  "We could not find an optimized kernel for F32 input");
710  break;
711 #ifdef __aarch64__
712  case DataType::U8:
713  case DataType::QASYMM8:
714  if(d->data_type() == DataType::S32)
715  {
716  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
717  "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
718  }
719  else
720  {
721  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
722  "We could not find an optimized kernel for U8 input and U8 output");
723  }
724  break;
725  case DataType::S8:
726  case DataType::QASYMM8_SIGNED:
727  if(d->data_type() == DataType::S32)
728  {
729  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
730  "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
731  }
732  else
733  {
734  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
735  "We could not find an optimized kernel for S8 input and S8 output");
736  }
737  break;
738 #endif /* __aarch64__ */
739 #if defined(ARM_COMPUTE_ENABLE_BF16)
740  case DataType::BFLOAT16:
741  {
742  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
743  "We could not find an optimized kernel for BFLOAT16 input and F32 output");
744  break;
745  }
746 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
747 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
748  case DataType::F16:
749  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
750  "We could not find an optimized kernel for F16 input and F16 output");
751  break;
752 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
753  default:
754  ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Unsupported type. Could not find a kernel");
755  break;
756  }
757  expected_weight_format = assembly_utils::map_to_arm_compute_weight_format(arm_gemm_expected_wf);
758 
759  return Status{};
760 }
761 
762 Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
763 {
764  ARM_COMPUTE_UNUSED(c, info);
765  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
766  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
767  ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
768  ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
769 
770 #ifndef __aarch64__
771  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
772 #endif /* __aarch64__ */
777  if(is_data_type_quantized_per_channel(b->data_type()))
778  {
779  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
780  }
781  else if(is_fixed_format_fast_math(info.weight_format))
782  {
783  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
784  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
785  }
786  else
787  {
788  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
789  }
790  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
791  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
792  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
793  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
794  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
795  ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
796  "Only QASYMM8/S32 output supported for QASYMM8 input");
797  arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::ANY;
798  const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
799  if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
800  {
801  // Correctness check: if the format expected by the kernel is
802  // not "any", make sure that the one found matches the format
803  // intended by the caller.
804  ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
805  "The format expected by the kernel does not correspond with the one requested by the user.");
806  }
807  return ret;
808 }
809 
810 bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
811 {
812  arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
813  return act.type != arm_gemm::Activation::Type::None;
814 }
815 
816 void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
817 {
818  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
819  arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
820 
821  //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
822  if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
823  {
824  return;
825  }
826 
827  switch(a->data_type())
828  {
829  case DataType::F32:
830  create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
831  break;
832 #ifdef __aarch64__
833  case DataType::U8:
834  case DataType::QASYMM8:
835  if(d->data_type() == DataType::S32)
836  {
837  create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
838  }
839  else
840  {
841  create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
842  }
843  break;
844  case DataType::S8:
845  case DataType::QASYMM8_SIGNED:
846  if(d->data_type() == DataType::S32)
847  {
848  create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
849  }
850  else
851  {
852  create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
853  }
854  break;
855 #endif /* __aarch64__ */
856 #if defined(ARM_COMPUTE_ENABLE_BF16)
857  case DataType::BFLOAT16:
858  create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
859  break;
860 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
861 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
862  case DataType::F16:
863  create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
864  break;
865 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
866  default:
867  break;
868  }
869 }
870 
871 void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
872 {
873  ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
874  _arm_gemm->prepare(tensors);
875 }
876 
877 bool CpuGemmAssemblyDispatch::is_configured() const
878 {
879  return _arm_gemm && _arm_gemm->is_configured();
880 }
881 
882 void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
883 {
884  ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
885  _arm_gemm->run(tensors);
886 }
887 
888 experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
889 {
890  ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
891  return _arm_gemm->workspace();
892 }
893 } // namespace cpu
894 } // namespace arm_compute