Compute Library 20.08
NEGEMMLowpMatrixMultiplyCore.cpp
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
25 
26 #include "arm_compute/core/Error.h"
31 #include "arm_compute/core/Types.h"
36 #include "support/MemorySupport.h"
37 
38 namespace arm_compute
39 {
40 using namespace arm_compute::misc::shape_calculator;
41 
42 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
43  : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
44  _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
45  _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0),
46  _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
47  _run_activation(false), _flip_signedness(false)
48 {
49 }
50 
51 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
52 {
53  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
54  ARM_COMPUTE_UNUSED(c);
55  ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
56 
57  const ITensor *matrix_a = a;
58  const ITensor *matrix_b = b;
59  GEMMInfo info = gemm_info;
60 
61  // Set internal variables
62  _a_offset = a->info()->quantization_info().uniform().offset;
63  _b_offset = b->info()->quantization_info().uniform().offset;
64  _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
65  _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
66  _is_prepared = false;
67  _fused_assembly_path = false;
68  _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
69  _original_b = b;
70 
71  const ITensor *a_to_use = a;
72 
73  // Convert to QASYMM8 -> QASYMM8_SIGNED and back
74  if(_flip_signedness)
75  {
76  const int32_t offset_correction = 128;
77  const DataType dt = DataType::QASYMM8_SIGNED;
78  const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
79 
80  _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
81  _memory_group.manage(&_signed_a);
82  _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
83  a_to_use = &_signed_a;
84  _a_offset = _signed_a.info()->quantization_info().uniform().offset;
85 
86  const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
87  _memory_group.manage(&_signed_output);
88  _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
89 
90  // Output stage correction
91  GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
92  output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
93  output_stage_corr.gemmlowp_min_bound -= offset_correction;
94  output_stage_corr.gemmlowp_max_bound -= offset_correction;
95  info.set_gemmlowp_output_stage(output_stage_corr);
96 
97  // Update matrix a
98  matrix_a = &_signed_a;
99  }
100 
101  // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
102  if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
103  {
104  _fuse_output_stage = true;
105  _memory_group.manage(&_mm_result_s32);
106  TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
107  _mm_result_s32.allocator()->init(info_mm_result_s32);
108  }
109 
110 #ifdef __aarch64__
111  switch(a->info()->data_type())
112  {
113  case DataType::QASYMM8:
114  case DataType::QASYMM8_SIGNED:
115  case DataType::U8:
116  case DataType::S8:
117  {
118  if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
119  {
120  // Result shifts < 0 are not supported by asm kernels
121  const std::vector<int32_t> &shifts = info.gemmlowp_output_stage().gemmlowp_shifts;
122  const bool is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
123  && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
124  {
125  return val >= 0;
126  });
127  if(is_asm_supported)
128  {
129  _asm_glue.configure(a_to_use, b, c, output, gemm_info);
130  _fused_assembly_path = _asm_glue.is_configured();
131  }
132  }
133  else
134  {
135  _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
136  }
137  _assembly_path = _asm_glue.is_configured();
138  break;
139  }
140  default:
141  {
142  ARM_COMPUTE_ERROR("Datatype not supported");
143  break;
144  }
145  }
146 #endif /* __aarch64__ */
147  if(!(_assembly_path || _run_vector_matrix_multiplication))
148  {
149  matrix_a = &_tmp_a;
150  matrix_b = &_tmp_b;
151 
152  // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
153  TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
154  // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
155  TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
156  _tmp_a.allocator()->init(a_info);
157  _tmp_b.allocator()->init(b_info);
158  _memory_group.manage(&_tmp_a);
159  if(!_reshape_b_only_on_first_run)
160  {
161  _memory_group.manage(&_tmp_b);
162  }
163 
164  // Configure interleave kernel
165  _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);
166 
167  // Configure transpose kernel
168  _mtx_b_reshape_kernel.configure(b, &_tmp_b);
169  }
170 
171  if(!_fused_assembly_path)
172  {
173  // Build reduction info
174  const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);
175 
176  // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
177  if(_a_offset != 0)
178  {
179  TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
180 
181  _vector_sum_col.allocator()->init(info_vector_sum_col);
182  if(!_reshape_b_only_on_first_run)
183  {
184  _memory_group.manage(&_vector_sum_col);
185  }
186 
187  // Configure Matrix B reduction kernel
188  _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
189  }
190 
191  // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
192  if(_b_offset != 0)
193  {
194  TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
195 
196  _vector_sum_row.allocator()->init(info_vector_sum_row);
197  _memory_group.manage(&_vector_sum_row);
198 
199  // Configure matrix A reduction kernel
200  _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
201  }
202 
203  if(_fuse_output_stage)
204  {
205  // Configure matrix multiply kernel
206  if(!_assembly_path)
207  {
208  _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
209  }
210 
211  _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
212  _a_offset == 0 ? nullptr : &_vector_sum_col,
213  _b_offset == 0 ? nullptr : &_vector_sum_row, c,
214  _flip_signedness ? &_signed_output : output,
215  a->info()->dimension(0),
216  _a_offset, _b_offset, info.gemmlowp_output_stage());
217 
218  if(_flip_signedness)
219  {
220  _convert_from_signed_asymm.configure(&_signed_output, output);
221  }
222  }
223  else
224  {
225  // Configure matrix multiply kernel
226  if(!_assembly_path)
227  {
228  _mm_kernel.configure(matrix_a, matrix_b, output);
229  }
230  // Configure offset contribution kernel
231  _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
232  }
233 
234  // Configure activation
235  const ActivationLayerInfo &activation = gemm_info.activation_info();
236  _run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));
237  if(_run_activation)
238  {
239  _activation_func.configure(output, nullptr, activation);
240  }
241  }
242 
243  // Allocate tensors
244  if(!_assembly_path && !_run_vector_matrix_multiplication)
245  {
246  _tmp_a.allocator()->allocate();
247  if(!_reshape_b_only_on_first_run)
248  {
249  _tmp_b.allocator()->allocate();
250  }
251  }
252 
253  if(!_fused_assembly_path)
254  {
255  if(_a_offset != 0 && !_reshape_b_only_on_first_run)
256  {
257  _vector_sum_col.allocator()->allocate();
258  }
259 
260  if(_b_offset != 0)
261  {
262  _vector_sum_row.allocator()->allocate();
263  }
264  }
265 
266  if(_fuse_output_stage)
267  {
268  _mm_result_s32.allocator()->allocate();
269  }
270 
271  if(_flip_signedness)
272  {
273  _signed_a.allocator()->allocate();
274  _signed_output.allocator()->allocate();
275  }
276 }
277 
278 Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
279 {
280  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
281  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
283  ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
284  ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
285  "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
286  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
287  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
288 
289  GEMMInfo info = gemm_info;
290  const ITensorInfo *matrix_a_info = a;
291  const ITensorInfo *matrix_b_info = b;
292 
293  const ITensorInfo *a_to_use = a;
294 
295  TensorInfo tmp_a_info{};
296  TensorInfo tmp_b_info{};
297  TensorInfo mm_result_s32_info{};
298 
299  int32_t a_offset = a->quantization_info().uniform().offset;
300  int32_t b_offset = b->quantization_info().uniform().offset;
301 
302  bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
303  if(fuse_output_stage)
304  {
305  auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
306  }
307 
308  // Convert QASYMM8->QASYMM8_SIGNED
309  TensorInfo signed_a{};
310  TensorInfo signed_output{};
311  bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
312  if(flip_signedness)
313  {
314  const int32_t offset_correction = 128;
315  const DataType dt = DataType::QASYMM8_SIGNED;
316  const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
317 
318  signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
320  a_to_use = &signed_a;
321  a_offset = signed_a.quantization_info().uniform().offset;
322 
323  const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
324  signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
325 
326  // Output stage correction
327  GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
328  output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
329  output_stage_corr.gemmlowp_min_bound -= offset_correction;
330  output_stage_corr.gemmlowp_max_bound -= offset_correction;
331  info.set_gemmlowp_output_stage(output_stage_corr);
332 
333  // Update matrix a
334  matrix_a_info = &signed_a;
335  }
336 
337  // Check if we need to run the optimized assembly kernel
338  bool run_optimised = false;
339  bool run_optimised_requantized = false;
340  if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
341  {
342  // Result shifts < 0 are not supported by asm kernels
343  const std::vector<int32_t> &shifts = info.gemmlowp_output_stage().gemmlowp_shifts;
344  const bool is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
345  && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
346  {
347  return val >= 0;
348  });
349 
350  if(is_asm_supported)
351  {
352  run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
353  run_optimised_requantized = run_optimised;
354  }
355  }
356  else
357  {
358  run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
359  }
360 
361  if(run_optimised)
362  {
363  ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
364  if(info.depth_output_gemm3d() != 0)
365  {
366  if(info.reinterpret_input_as_3d())
367  {
368  ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
369  ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
370  }
371  else
372  {
373  ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
374  }
375  }
376  else
377  {
378  ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
379  }
380  }
381  else
382  {
383  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
384  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
385 
386  const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
387  if(!run_vector_matrix_multiplication)
388  {
389  matrix_a_info = &tmp_a_info;
390  matrix_b_info = &tmp_b_info;
391 
392  // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
393  TensorShape shape_tmp_a = a->tensor_shape();
394  shape_tmp_a.set(0, a->dimension(0) * 4);
395  shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
396 
397  // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
398  TensorShape shape_tmp_b = b->tensor_shape();
399  shape_tmp_b.set(0, b->dimension(1) * 16);
400  shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
401 
402  // Validate interleave kernel
403  auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
404  auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
405 
406  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
407  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
408  }
409  }
410 
411  if(!run_optimised_requantized)
412  {
413  TensorInfo info_vector_sum_col{};
414  TensorInfo info_vector_sum_row{};
415 
416  const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
417 
418  // Validate matrix B reduction kernel only if _a_offset is not equal to 0
419  if(a_offset != 0)
420  {
421  info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
422 
423  // Validate Matrix B reduction kernel
424  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
425  }
426 
427  // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
428  if(b_offset != 0)
429  {
430  info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
431 
432  // Validate matrix A reduction kernel
433  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
434  }
435 
436  if(fuse_output_stage)
437  {
438  if(!run_optimised)
439  {
440  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
441  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
442 
443  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
444  }
445 
446  // Validate offset contribution kernel
447  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
448  a_offset == 0 ? nullptr : &info_vector_sum_col,
449  b_offset == 0 ? nullptr : &info_vector_sum_row,
450  c,
451  flip_signedness ? &signed_output : output,
452  a_offset, b_offset,
453  info.gemmlowp_output_stage()));
454  }
455  else
456  {
457  if(!run_optimised)
458  {
459  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
460  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
461 
462  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
463  }
464  // Validate offset contribution kernel
465  ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
466  a_offset == 0 ? nullptr : &info_vector_sum_col,
467  b_offset == 0 ? nullptr : &info_vector_sum_row,
468  a_offset, b_offset));
469  }
470  }
471 
472  // Validate activation
473  const ActivationLayerInfo &activation = gemm_info.activation_info();
474  if(activation.enabled())
475  {
476  ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
477  }
478 
479  return Status{};
480 }
481 
482 void NEGEMMLowpMatrixMultiplyCore::run()
483 {
484  prepare();
485 
486  MemoryGroupResourceScope scope_mg(_memory_group);
487 
488  // Convert QASYMM8->QASYMM8_SIGNED
489  if(_flip_signedness)
490  {
491  NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
492  }
493 
494  // Run GEMM
495  if(_asm_glue.is_configured())
496  {
497  _asm_glue.run();
498  }
499  else
500  {
501  if(!_run_vector_matrix_multiplication)
502  {
503  // Run interleave kernel
504  NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);
505 
506  if(!_reshape_b_only_on_first_run)
507  {
508  // Run transpose kernel
509  NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
510  }
511  }
512  NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
513  }
514 
515  if(!_fused_assembly_path)
516  {
517  // Run matrix A reduction kernel only if _b_offset is not equal to 0
518  if(_b_offset != 0)
519  {
520  NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
521  }
522 
523  // Run matrix B reduction kernel only if _a_offset is not equal to 0
524  if(_a_offset != 0 && !_reshape_b_only_on_first_run)
525  {
526  NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
527  }
528 
529  if(_fuse_output_stage)
530  {
531  // Run offset contribution kernel
532  NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
533  }
534  else
535  {
536  // Run offset contribution kernel
537  NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
538  }
539  }
540 
541  // Convert QASYMM8_SIGNED->QASYMM8
542  if(_flip_signedness)
543  {
544  NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
545  }
546 
547  // Run fused activation unless already run in the fused assembly
548  if(_run_activation && !_fused_assembly_path)
549  {
550  _activation_func.run();
551  }
552 }
553 
554 void NEGEMMLowpMatrixMultiplyCore::prepare()
555 {
556  if(!_is_prepared)
557  {
558  const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
559  // Run assembly reshape
560  if(_asm_glue.is_configured())
561  {
562  if(!original_b_managed_by_weights_manager)
563  {
564  ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
565  }
566 
567  _asm_glue.prepare();
568  if(!original_b_managed_by_weights_manager)
569  {
570  _original_b->mark_as_unused();
571  }
572  }
573  // Run non-assembly reshape
574  else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
575  {
576  if(!original_b_managed_by_weights_manager)
577  {
578  ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
579  }
580 
581  // Run reshape kernel and mark original weights tensor as unused
582  _tmp_b.allocator()->allocate();
583  NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
584  if(!original_b_managed_by_weights_manager)
585  {
586  _original_b->mark_as_unused();
587  }
588  }
589 
590  // Run matrix B reduction kernel only if _a_offset is not equal to 0
591  if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
592  {
593  _vector_sum_col.allocator()->allocate();
594  NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
595  }
596 
597  _is_prepared = true;
598  }
599 }
600 } // namespace arm_compute
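
For context, a minimal usage sketch (not part of the file above): it exercises the simplest configuration this function accepts, a per-layer QASYMM8 LHS and RHS multiplied into an S32 destination with the default GEMMInfo (output stage NONE, no bias). The shapes, scales and offsets are illustrative placeholders, not values taken from the library.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
    using namespace arm_compute;

    constexpr unsigned int M = 4, N = 8, K = 16;

    // ACL tensor shapes are [width, height], i.e. [columns, rows]
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
    dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));

    // Bias addition is not supported when the result stays S32, so c is nullptr;
    // the default GEMMInfo leaves the GEMMLowp output stage as NONE.
    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &dst, GEMMInfo());

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized input data here ...

    gemmlowp.run(); // prepare() runs the one-off B reshape/reduction on first use
    return 0;
}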