Compute Library 20.02.1
NEGEMMLowpMatrixMultiplyCore.cpp
/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = b;
    GEMMInfo       info     = gemm_info;

    // Clear state
    _mtx_a_reshape_kernel = nullptr;
    _mtx_b_reshape_kernel = nullptr;

    // Set internal variables
    _a_offset                         = a->info()->quantization_info().uniform().offset;
    _b_offset                         = b->info()->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
    _is_prepared                      = false;
    _fused_assembly_path              = false;
    _flip_signedness                  = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _original_b                       = b;

    const ITensor *a_to_use = a;

    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
    if(_flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->info()->quantization_info().uniform();

        _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
        _memory_group.manage(&_signed_a);
        _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.info()->quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
        _memory_group.manage(&_signed_output);
        _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = _signed_output.info()->quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }
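
    // Note on the flipped-signedness path: the GEMM below then consumes _signed_a (QASYMM8_SIGNED)
    // and writes into _signed_output; the output-stage offset and clamping bounds were shifted by the
    // same 128 correction above, and run() converts _signed_output back into the QASYMM8 output tensor.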

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _memory_group.manage(&_mm_result_s32);
        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
        _mm_result_s32.allocator()->init(info_mm_result_s32);
    }

#ifdef __aarch64__
    switch(a->info()->data_type())
    {
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::U8:
        case DataType::S8:
        {
            if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
            {
                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
                _fused_assembly_path = _asm_glue.is_configured();
            }
            else
            {
                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
            }
            _assembly_path = _asm_glue.is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
        _tmp_a.allocator()->init(a_info);
        _tmp_b.allocator()->init(b_info);
        _memory_group.manage(&_tmp_a);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Configure interleave kernel
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
            k->configure(a_to_use, &_tmp_a);
            _mtx_a_reshape_kernel = std::move(k);
        }

        // Configure transpose kernel
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
            k->configure(b, &_tmp_b);
            _mtx_b_reshape_kernel = std::move(k);
        }
    }

    if(!_fused_assembly_path)
    {
        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

            _vector_sum_col.allocator()->init(info_vector_sum_col);
            if(!_reshape_b_only_on_first_run)
            {
                _memory_group.manage(&_vector_sum_col);
            }

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

            _vector_sum_row.allocator()->init(info_vector_sum_row);
            _memory_group.manage(&_vector_sum_row);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
                k->configure(matrix_a, matrix_b, &_mm_result_s32);
                _mm_kernel = std::move(k);
            }

            _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
                                                               _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                               _flip_signedness ? &_signed_output : output,
                                                               a->info()->dimension(0),
                                                               _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm.configure(&_signed_output, output);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
                k->configure(matrix_a, matrix_b, output);
                _mm_kernel = std::move(k);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
        }
    }

    // Configure activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    _run_activation                       = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));
    if(_run_activation)
    {
        _activation_func.configure(output, nullptr, activation);
    }

    // Allocate tensors
    if(!_assembly_path && !_run_vector_matrix_multiplication)
    {
        _tmp_a.allocator()->allocate();
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(!_fused_assembly_path)
    {
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
        }

        if(_b_offset != 0)
        {
            _vector_sum_row.allocator()->allocate();
        }
    }

    if(_fuse_output_stage)
    {
        _mm_result_s32.allocator()->allocate();
    }

    if(_flip_signedness)
    {
        _signed_a.allocator()->allocate();
        _signed_output.allocator()->allocate();
    }
}
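
For orientation, here is a minimal usage sketch of the path configured above; it is not part of the original file. It multiplies a QASYMM8 matrix A by a QASYMM8 matrix B into an S32 destination (no fused output stage), so the bias argument stays nullptr, as enforced by validate() below. The function name, shapes, scales and zero-points are illustrative placeholders.

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void example_gemmlowp_s32()
    {
        Tensor a, b, dst;
        // A is M x K (TensorShape{K, M}), B is K x N (TensorShape{N, K}), dst is M x N (TensorShape{N, M})
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.03f, 3)));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32));

        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, nullptr, &dst, GEMMInfo()); // no bias: S32 output has no output stage

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill a and b with quantized data ...
        gemmlowp.run();
    }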

Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    GEMMInfo           info          = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;
    if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
    {
        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
        run_optimised_requantized = run_optimised;

        const UniformQuantizationInfo a_qinfo      = a_to_use->quantization_info().uniform();
        const QuantizationInfo        b_qinfo      = b->quantization_info();
        const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
        for(auto const s : b_qinfo.scale())
        {
            const float fmultipler = a_qinfo.scale * s / output_qinfo.scale;
            if(fmultipler > 1.f)
            {
                run_optimised_requantized = false;
                break;
            }
        }
    }
    else
    {
        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                flip_signedness ? &signed_output : output,
                                                                                                a_offset, b_offset,
                                                                                                info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
    }

    return Status{};
}
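
validate() is intended to be called with ITensorInfo descriptors before any memory is allocated. A hedged sketch of that pattern follows; the helper name and error handling are illustrative, and the headers and namespace from the first sketch are assumed:

    #include <iostream>

    bool can_run_gemmlowp(const ITensorInfo &a, const ITensorInfo &b, const ITensorInfo &dst, const GEMMInfo &info)
    {
        const Status st = NEGEMMLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst, info);
        if(!bool(st))
        {
            std::cerr << st.error_description() << std::endl;
            return false;
        }
        return true;
    }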

void NEGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
    }

    // Reshape inputs
    if(_mtx_a_reshape_kernel)
    {
        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
    }
    if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
    {
        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
    }

    // Run GEMM
    if(_asm_glue.is_configured())
    {
        _asm_glue.run();
    }
    else
    {
        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
    }

    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
        }

        if(_fuse_output_stage)
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
        }
        else
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(_flip_signedness)
    {
        NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
    }

    // Run fused activation
    if(_run_activation)
    {
        _activation_func.run();
    }
}
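
The fused offset-contribution/output-stage branch scheduled above is selected by attaching a GEMMLowpOutputStageInfo to the GEMMInfo passed to configure(). A hedged sketch follows, assuming tensors set up as in the first sketch but with a QASYMM8 destination and an optional S32 bias; the multiplier, shift and zero-point values are placeholders (in practice they encode a_scale * b_scale / output_scale as a fixed-point multiplier):

    void example_gemmlowp_fused_requantize(ITensor *a, ITensor *b, ITensor *bias_s32, ITensor *dst_qasymm8)
    {
        GEMMLowpOutputStageInfo os{};
        os.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        os.gemmlowp_offset     = 5;          // destination zero-point (placeholder)
        os.gemmlowp_multiplier = 1395864371; // fixed-point multiplier (placeholder)
        os.gemmlowp_shift      = 10;         // result shift (placeholder)
        os.gemmlowp_min_bound  = 0;
        os.gemmlowp_max_bound  = 255;

        GEMMInfo info{};
        info.set_gemmlowp_output_stage(os); // this is what makes configure() take the fused path

        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(a, b, bias_s32, dst_qasymm8, info);
        gemmlowp.run();
    }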

void NEGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        // Run assembly reshape
        if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            _asm_glue.prepare();
            _original_b->mark_as_unused();
        }
        // Run non-assembly reshape
        else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
        }

        _is_prepared = true;
    }
}
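
Finally, a hedged sketch of the repeated-inference pattern prepare() is written for: with reshape_b_only_on_first_run set (assumed to be the third GEMMInfo constructor argument, and true by default), B is reshaped or pre-transposed once and the original weights are marked unused, so subsequent run() calls reuse the reshaped copy:

    void example_gemmlowp_repeated_runs(ITensor *a, ITensor *b, ITensor *dst, int num_batches)
    {
        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        // is_a_reshaped = false, is_b_reshaped = false, reshape_b_only_on_first_run = true (assumed argument order)
        gemmlowp.configure(a, b, nullptr, dst, GEMMInfo(false, false, true));

        gemmlowp.prepare(); // optional: run() also triggers the one-off preparation on first use
        for(int i = 0; i < num_batches; ++i)
        {
            // ... refresh the contents of a with the next batch ...
            gemmlowp.run();
        }
    }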