Compute Library 20.05
NEGEMMLowpMatrixMultiplyCore.cpp
/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/MemorySupport.h"

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;

NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
      _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0),
      _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
      _run_activation(false), _flip_signedness(false)
{
}

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = b;
    GEMMInfo info = gemm_info;

    // Set internal variables
    _a_offset = a->info()->quantization_info().uniform().offset;
    _b_offset = b->info()->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
    _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
    _is_prepared = false;
    _fused_assembly_path = false;
    _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _original_b = b;

    const ITensor *a_to_use = a;

    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
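    // When B is quantized per channel and A is QASYMM8, A is shifted into the signed domain
    // (its offset corrected by +128) so that the QASYMM8_SIGNED kernels can be used; the
    // output is shifted back after the output stage.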
    if(_flip_signedness)
    {
        const int32_t offset_correction = 128;
        const DataType dt = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();

        _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
        _memory_group.manage(&_signed_a);
        _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.info()->quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
        _memory_group.manage(&_signed_output);
        _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _memory_group.manage(&_mm_result_s32);
        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
        _mm_result_s32.allocator()->init(info_mm_result_s32);
    }

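    // On AArch64, try to dispatch the multiplication to the assembly (arm_gemm) kernels first.
    // When the fixed-point requantization can be fused into the assembly kernel, the bias and
    // final output are passed straight through; otherwise the assembly result lands in
    // _mm_result_s32 (or output) and the offset contribution / output stage run as separate kernels.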
#ifdef __aarch64__
    switch(a->info()->data_type())
    {
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::U8:
        case DataType::S8:
        {
            if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
            {
                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
                _fused_assembly_path = _asm_glue.is_configured();
            }
            else
            {
                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
            }
            _assembly_path = _asm_glue.is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

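        // The non-assembly multiply kernel consumes A interleaved in 4x4 blocks and B transposed
        // 1xW, so both operands are reshaped into the temporary tensors below.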
        // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
        TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
        _tmp_a.allocator()->init(a_info);
        _tmp_b.allocator()->init(b_info);
        _memory_group.manage(&_tmp_a);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Configure interleave kernel
        _mtx_a_reshape_kernel.configure(a_to_use, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
    }

    if(!_fused_assembly_path)
    {
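        // For quantized GEMM, (A - a_offset) * (B - b_offset) expands to
        //   A*B - b_offset * rowsum(A) - a_offset * colsum(B) + k * a_offset * b_offset,
        // so the column sums of B are only needed when a_offset != 0 and the row sums of A
        // only when b_offset != 0; they feed the offset contribution kernel below.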
        // Build reduction info
        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);

        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

            _vector_sum_col.allocator()->init(info_vector_sum_col);
            if(!_reshape_b_only_on_first_run)
            {
                _memory_group.manage(&_vector_sum_col);
            }

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, reduction_info);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

            _vector_sum_row.allocator()->init(info_vector_sum_row);
            _memory_group.manage(&_vector_sum_row);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, reduction_info);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32);
            }

            _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
                                                               _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                               _flip_signedness ? &_signed_output : output,
                                                               a->info()->dimension(0),
                                                               _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm.configure(&_signed_output, output);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel.configure(matrix_a, matrix_b, output);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
        }

        // Configure activation
        const ActivationLayerInfo &activation = gemm_info.activation_info();
        _run_activation = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));
        if(_run_activation)
        {
            _activation_func.configure(output, nullptr, activation);
        }
    }

    // Allocate tensors
    if(!_assembly_path && !_run_vector_matrix_multiplication)
    {
        _tmp_a.allocator()->allocate();
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(!_fused_assembly_path)
    {
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
        }

        if(_b_offset != 0)
        {
            _vector_sum_row.allocator()->allocate();
        }
    }

    if(_fuse_output_stage)
    {
        _mm_result_s32.allocator()->allocate();
    }

    if(_flip_signedness)
    {
        _signed_a.allocator()->allocate();
        _signed_output.allocator()->allocate();
    }
}

Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    GEMMInfo info = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t offset_correction = 128;
        const DataType dt = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Check if we need to run the optimized assembly kernel
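    // The fused requantized assembly path is only taken when every effective multiplier
    // (a_scale * b_scale / output_scale) is <= 1; otherwise requantization is handled by the
    // separate output stage instead.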
    bool run_optimised = false;
    bool run_optimised_requantized = false;
    if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
    {
        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
        run_optimised_requantized = run_optimised;

        const UniformQuantizationInfo a_qinfo = a_to_use->quantization_info().uniform();
        const QuantizationInfo b_qinfo = b->quantization_info();
        const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
        for(auto const s : b_qinfo.scale())
        {
            const float fmultiplier = a_qinfo.scale * s / output_qinfo.scale;
            if(fmultiplier > 1.f)
            {
                run_optimised_requantized = false;
                break;
            }
        }
    }
    else
    {
        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave and transpose kernels
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Validate Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Validate matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                flip_signedness ? &signed_output : output,
                                                                                                a_offset, b_offset,
                                                                                                info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
    }

    return Status{};
}

void NEGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

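    // Acquire the memory managed by _memory_group for the duration of this run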
    MemoryGroupResourceScope scope_mg(_memory_group);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
    }

    // Run GEMM
    if(_asm_glue.is_configured())
    {
        _asm_glue.run();
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            // Run interleave kernel
            NEScheduler::get().schedule(&_mtx_a_reshape_kernel, Window::DimY);

            if(!_reshape_b_only_on_first_run)
            {
                // Run transpose kernel
                NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
            }
        }
        NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
    }

    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
        }

        if(_fuse_output_stage)
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
        }
        else
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(_flip_signedness)
    {
        NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
    }

    // Run fused activation unless already run in the fused assembly
    if(_run_activation && !_fused_assembly_path)
    {
        _activation_func.run();
    }
}

void NEGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
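        // B is reshaped (or pre-transposed by the assembly path) only once here, and its column
        // sums are computed if needed; the original weights tensor is then marked as unused
        // unless it is handled by the weights manager.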
        const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
        // Run assembly reshape
        if(_asm_glue.is_configured())
        {
            if(!original_b_managed_by_weights_manager)
            {
                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
            }

            _asm_glue.prepare();
            if(!original_b_managed_by_weights_manager)
            {
                _original_b->mark_as_unused();
            }
        }
        // Run non-assembly reshape
        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
        {
            if(!original_b_managed_by_weights_manager)
            {
                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
            }

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            NEScheduler::get().schedule(&_mtx_b_reshape_kernel, Window::DimY);
            if(!original_b_managed_by_weights_manager)
            {
                _original_b->mark_as_unused();
            }
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
        }

        _is_prepared = true;
    }
}
} // namespace arm_compute
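
A minimal usage sketch (not part of this file) showing how the function is typically driven. The shapes, quantization parameters and the S32 destination below are illustrative assumptions; with no GEMMLowp output stage the result stays S32 and no bias tensor may be passed.

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // A: M x K (QASYMM8), B: K x N (QASYMM8), DST: M x N (S32); TensorShape is (width, height)
        const unsigned int M = 4, N = 8, K = 12;
        Tensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3)));
        dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));

        // Configure with the default GEMMInfo (output stage NONE), then allocate backing memory
        NEGEMMLowpMatrixMultiplyCore gemm;
        gemm.configure(&a, &b, nullptr, &dst);

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill a and b with quantized data ...
        gemm.run();
        return 0;
    }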