Arm Compute Library 22.05 — CpuGemm.cpp (source listing; see the online documentation for this file).
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
30 #include "src/common/utils/Log.h"
31 #include "src/core/CPP/Validate.h"
35 
36 using namespace arm_compute::experimental;
38 
39 namespace arm_compute
40 {
41 namespace cpu
42 {
43 namespace
44 {
45 cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
46 {
47  cpu::AsmGemmInfo asm_info;
48  asm_info.method = cpu::AsmConvMethod::Im2Col;
49  asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
50  asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
51  asm_info.activation_info = info.activation_info();
52  asm_info.fast_mode = info.fast_math();
53 
54  return asm_info;
55 }
56 } // namespace
57 
58 void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
59 {
61  ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
62  ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
63 
64  const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
65  const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
66  bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info));
67 
68  // Check if we need to reshape the matrix B only on the first run
69  _is_prepared = false;
70  _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
71  _run_vector_matrix_multiplication = a->dimension(1) < 2;
72  _run_alpha_scale = alpha != 1.f;
73  _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
74  _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
75  _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised
77 
78  if(run_optimised)
79  {
80  const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
81  _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
82  _asm_glue->configure(a, b, c_to_use, d, asm_info);
83  ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
84 
85  auto asm_mem_req = _asm_glue->workspace();
86  _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
87  _aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
88 
89  // Scale product by alpha
90  if(_run_alpha_scale)
91  {
92  _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
93  _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
94  }
95  }
96  else
97  {
98  // Pick output tensor in case bias addition should be performed
99  ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
100 
101  _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
102 
103  // Select between GEMV and GEMM
104  if(_run_vector_matrix_multiplication)
105  {
106  // Configure the matrix multiply kernel
107  _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
108  }
109  else
110  {
111  const int m = a->dimension(1);
112  const int n = b->dimension(0);
113  const int k = a->dimension(0);
114 
115  // Configure interleave kernel
116  _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
117  _interleave_kernel->configure(a, &_tmp_a);
118  _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
119 
120  // Configure transpose kernel
121  _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
122  _transpose_kernel->configure(b, &_tmp_b);
123  _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
124 
125  // Configure matrix multiplication kernel
126  _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
127  }
128 
129  if(_run_bias_addition)
130  {
131  _add_bias = std::make_unique<cpu::CpuAdd>();
132  _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
133  _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
134  }
135  }
136 
137  // Configure matrix addition kernel
138  if(_run_addition)
139  {
140  _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
141  _ma_kernel->configure(c, d, beta);
142  }
143 
144  // Configure activation
145  if(_run_activation)
146  {
147  _activation_func = std::make_unique<cpu::CpuActivation>();
148  _activation_func->configure(d, nullptr, gemm_info.activation_info());
149  }
150 }
151 
// Static validation: checks whether the given tensor infos and GEMM metadata form a
// valid CpuGemm configuration, without configuring any state. Returns an error Status
// on the first violated constraint, or an empty Status on success.
Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
 ARM_COMPUTE_UNUSED(alpha);
 // When B is only reshaped on the first run, c (if present) is treated as a bias vector.
 const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();

 ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 if(a->data_type() != DataType::BFLOAT16)
 {
 // NOTE(review): a data-type consistency check between a, b and d appears to have been
 // dropped from this listing (original line 166) — restore from the repository.
 }

 // When c is a full addend matrix (not a bias) it must be shape-compatible with A x B.
 if(c != nullptr && !is_c_bias)
 {
 // NOTE(review): additional checks (original lines 171-173) were dropped from this listing.
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
 }

 // If the destination is already initialised, its shape must match the GEMM output.
 if(d->total_size() != 0)
 {
 // NOTE(review): the destination-shape checks inside these branches (original lines
 // 180-196) were dropped from this listing — restore from the repository.
 if(gemm_info.depth_output_gemm3d() != 0)
 {
 if(gemm_info.reinterpret_input_as_3d())
 {
 }
 else
 {
 }
 }
 else
 {
 }
 }

 // Check if we need to run the optimized assembly kernel
 cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
 const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info));

 // Fallback path: the reference kernels do not support 3D reinterpretation.
 if(!run_optimised)
 {
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
 ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");

 // Check if the first input tensor is a vector.
 const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
 // Check if we need to reshape the matrix A and matrix B
 const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());

 // Arguments used by GEMMReshapeInfo
 // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
 // in order to know how the matrices have been reshaped
 const int m = a->dimension(1);
 const int n = b->dimension(0);
 const int k = a->dimension(0);
 int mult_transpose1xW_width = 1;
 int mult_interleave4x4_height = 1;

 const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());

 const ITensorInfo *matrix_a_info = a;
 const ITensorInfo *matrix_b_info = b;

 TensorInfo tmp_a_info{};
 TensorInfo tmp_b_info{};
 TensorInfo tmp_output_info = *d->clone();

 if(run_interleave_transpose)
 {
 matrix_a_info = &tmp_a_info;
 matrix_b_info = &tmp_b_info;

 // Validate interleave kernel
 // NOTE(review): the kernel-validate call (original line 238) was dropped from this listing.
 auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));

 // Validate transpose kernel
 // NOTE(review): the kernel-validate call (original line 242) was dropped from this listing.
 auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
 }

 // Validate matrix multiply
 auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
 ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));

 if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
 {
 // NOTE(review): bias-addition validation (original line 251) was dropped from this listing.
 }
 }

 // Validate matrix addition kernel
 if(beta != 0 && c != nullptr && !is_c_bias)
 {
 // NOTE(review): the matrix-addition kernel validate call (original line 258) was dropped from this listing.
 }

 // Validate activation
 const ActivationLayerInfo &activation = gemm_info.activation_info();
 if(activation.enabled())
 {
 // NOTE(review): the activation validate call (original line 265) was dropped from this listing.
 }

 return Status{};
}
270 
271 void CpuGemm::run(ITensorPack &tensors)
272 {
273  prepare(tensors);
274 
275  auto a = tensors.get_const_tensor(ACL_SRC_0);
276  auto b = tensors.get_const_tensor(ACL_SRC_1);
277  auto c = tensors.get_const_tensor(ACL_SRC_2);
278  auto d = tensors.get_tensor(ACL_DST);
279 
280  if(_asm_glue->is_configured())
281  {
282  // Pass c to asm dispatch only if it's the bias tensor
283  ITensorPack asm_pack = tensors;
284  asm_pack.add_const_tensor(ACL_SRC_2, (_reshape_b_only_on_first_run) ? c : nullptr);
285  _asm_glue->run(asm_pack);
286  if(_run_alpha_scale)
287  {
288  ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
289  _alpha_scale_func->run(pack);
290  }
291  }
292  else
293  {
294  CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
295  CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
296  CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
297 
298  ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
299  if(!_run_vector_matrix_multiplication)
300  {
301  // Run interleave kernel
302  ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
303  NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
304 
305  if(!_reshape_b_only_on_first_run)
306  {
307  // Run transpose kernel
308  ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
309  NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
310  }
311 
312  // Use reshaped matrices
313  mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
314  mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
315  }
316 
317  NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
318 
319  // Run bias addition kernel
320  if(_run_bias_addition)
321  {
322  ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
323  _add_bias->run(pack);
324  }
325  }
326 
327  // Run matrix addition kernel
328  if(_run_addition)
329  {
330  ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
331  NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
332  }
333 
334  // Run activation function
335  if(_run_activation)
336  {
337  ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
338  _activation_func->run(pack);
339  }
340 }
341 
342 void CpuGemm::prepare(ITensorPack &tensors)
343 {
344  if(!_is_prepared)
345  {
346  if(_asm_glue->is_configured())
347  {
348  _asm_glue->prepare(tensors);
349  }
350  else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
351  {
352  const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
353  ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
355 
356  CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
357  ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
358  NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
359  }
360  _is_prepared = true;
361  }
362 }
363 
// Report the auxiliary memory requirements collected during configure()
// (assembly workspace/pretranspose slots, or interleave/transpose/temp buffers).
experimental::MemoryRequirements CpuGemm::workspace() const
{
 return _aux_mem;
}
368 } // namespace cpu
369 } // namespace arm_compute
Status validate(const OperatorGraph &op_graph)
Return the validity of op_graph, usually after performing an operation (e.g.
TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width=1)
Calculate the transposed 1xW width element shape.
std::unique_ptr< ITensorInfo > clone() const override
Provide a clone of the current object of class T.
Definition: TensorInfo.cpp:282
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
bool enabled() const
Check if initialised.
Definition: Types.h:1675
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor)
Definition: Validate.h:121
void add_const_tensor(int id, const ITensor *tensor)
Add const tensor to the pack.
Definition: ITensorPack.cpp:49
SimpleTensor< float > b
Definition: DFT.cpp:157
static bool is_activation_supported(const ActivationLayerInfo &activation)
Checks if activation is supported by the gemm assembly dispatcher.
GEMM reshape information class.
Definition: Types.h:1910
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
virtual void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors)=0
Runs the kernel in the same thread as the caller synchronously.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
Static function to check if given info will lead to a valid configuration of CpuGemmTranspose1xWKerne...
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
int depth_output_gemm3d() const
Depth of the output when GEMM output is reinterpreted as 3D tensor.
Definition: Types.h:2177
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
Static function to check if given info will lead to a valid configuration of CpuGemmMatrixMultiplyKer...
Activation Layer Information class.
Definition: Types.h:1625
Interface for CPU tensor.
Definition: ITensor.h:36
TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height=1, bool reinterpret_input_as_3d=false)
Calculate the interleaved shape of an input tensor.
Copyright (c) 2017-2022 Arm Limited.
bool is_b_reshaped() const
Flag which specifies if the matrix B has been reshaped.
Definition: Types.h:2159
std::vector< MemoryInfo > MemoryRequirements
Definition: Types.h:134
1 channel, 1 F16 per channel
16-bit brain floating-point number
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
Indicates whether or not this function can be used to process the given parameters.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
bool reinterpret_input_as_3d() const
Flag which specifies if the input tensor has to be reinterpreted as 3D.
Definition: Types.h:2185
bool is_a_reshaped() const
Flag which specifies if the matrix A has been reshaped.
Definition: Types.h:2151
static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
Static function to check if given info will lead to a valid configuration of CpuGemmInterleave4x4Kern...
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info=ActivationLayerInfo())
Static function to check if given info will lead to a valid configuration.
Definition: CpuAdd.cpp:43
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pac.
Definition: ITensorPack.cpp:64
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
Static function to check if given info will lead to a valid configuration.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
Target polymorphic_cast(Source *v)
Polymorphic cast between two types.
Definition: Cast.h:47
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_LOG_PARAMS(...)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
Store the tensor's metadata.
Definition: TensorInfo.h:43
bool reshape_b_only_on_first_run() const
Flag which specifies if the reshape of matrix B should executed only for the first.
Definition: Types.h:2169
int offset_int_vec(int offset)
Definition: MemoryHelpers.h:38
GEMM information class.
Definition: Types.h:2090
im2col_func configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias)
ActivationLayerInfo activation_info() const
Activation layer to apply after the matrix multiplication.
Definition: Types.h:2281
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta)
Static function to check if given info will lead to a valid configuration of CpuGemmMatrixAdditionKer...
static IScheduler & get()
Access the scheduler singleton.
Definition: Scheduler.cpp:94