// Build the metadata struct consumed by the assembly GEMM dispatcher from a
// high-level GEMMInfo descriptor. Each field is a direct copy of the matching
// GEMMInfo accessor.
// NOTE(review): this chunk is a mangled extract — original file line numbers
// are fused into the text and statements are split across lines; the code is
// left byte-identical. The function's return statement and braces fall outside
// the visible lines.
46 cpu::AsmGemmInfo init_assembly_metadata(
const GEMMInfo &
info)
48 cpu::AsmGemmInfo asm_info;
// 3D reinterpretation flags for input/output tensors.
50 asm_info.reinterpret_input_as_3d =
info.reinterpret_input_as_3d();
51 asm_info.depth_output_gemm3d =
info.depth_output_gemm3d();
// Fused activation to be applied by the assembly kernel.
52 asm_info.activation_info =
info.activation_info();
// fast_mode mirrors GEMMInfo::fast_math() — presumably enables
// reduced-precision fast kernels; confirm against GEMMInfo docs.
53 asm_info.fast_mode =
info.fast_math();
// Fixed-format / weight-format selection for variable-weights kernels.
54 asm_info.fixed_format =
info.fixed_format();
55 asm_info.weight_format =
info.weight_format();
// transpose_b is driven by the pretranspose_B() request.
56 asm_info.transpose_b =
57 info.pretranspose_B();
// CpuGemm::configure fragment (original lines ~76-208): chooses between the
// optimised assembly path and the reference kernel path, then configures the
// selected kernels/operators and registers auxiliary workspace requirements.
// NOTE(review): mangled extract with interior lines missing; code kept
// byte-identical, comments added only on their own lines.
//
// C acts as a plain bias only when beta == 1 (so D = alpha*A*B + C).
76 const bool is_c_bias = beta == 1 && c !=
nullptr;
// The assembly path is viable only when beta is 0 or 1 (or C is absent),
// and not when B is non-constant with a batched (z > 1) shape.
77 const bool run_optimised =
79 (c ==
nullptr || beta == 0.f || beta == 1.f) &&
80 !(!
b->are_values_constant() &&
81 b->tensor_shape().z() > 1);
// B can be reshaped once and cached only if its values never change.
85 _reshape_b_only_on_first_run =
b->are_values_constant();
// A with fewer than 2 rows degenerates to a vector-matrix product.
86 _run_vector_matrix_multiplication = a->
dimension(1) < 2;
// Post-scale by alpha only when it is not the identity.
87 _run_alpha_scale = alpha != 1.f;
88 _run_bias_addition = is_c_bias;
// Separate beta*C matrix-addition stage when beta is neither 0 nor 1.
89 _run_addition = beta != 0 && beta != 1 && c !=
nullptr;
// --- Assembly (optimised) path ---
97 _run_interleave_transpose =
false;
// Pass C to the assembly dispatch only when it is a pure bias.
98 const ITensorInfo *c_to_use = is_c_bias ? c :
nullptr;
99 _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
100 _asm_glue->configure(a,
b, c_to_use, d, asm_info);
// Forward the assembly dispatcher's workspace needs into our aux-memory
// slots so the caller can allocate them.
103 const auto asm_mem_req = _asm_glue->workspace();
104 for (
unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
106 _aux_mem[slot] = asm_mem_req[slot];
// alpha scaling is realised as a LINEAR activation: f(x) = alpha*x + 0.
110 if (_run_alpha_scale)
112 _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
113 _alpha_scale_func->configure(
114 d,
nullptr,
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
// --- Reference kernel path ---
// Interleave A / transpose B except in the vector-matrix case.
119 _run_interleave_transpose = !_run_vector_matrix_multiplication;
// When a bias add follows, the matrix multiply writes to a temporary.
121 ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
125 _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
// Optional explicit pre-transpose of B via CpuTranspose.
130 _pretranspose_b_func = std::make_unique<CpuTranspose>();
131 _pretranspose_b_func->configure(b_to_use, &_pretransposed_b);
// Lifetime of the pre-transposed B buffer depends on whether B is
// reshaped once (persistent) or every run (temporary) — slot set below.
133 if (_reshape_b_only_on_first_run)
135 if (_run_interleave_transpose)
153 _aux_mem[PreTransposedRHS] =
// Downstream stages consume the pre-transposed copy of B.
155 b_to_use = &_pretransposed_b;
// Vector-matrix case: multiply directly, no interleave/transpose.
159 if (_run_vector_matrix_multiplication)
162 _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha,
false);
// General case: interleave A 4x4 and transpose B 1xW into temporaries.
168 _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
169 _interleave_kernel->configure(a, &_tmp_a);
170 _aux_mem[InterleavedLHS] =
174 _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
175 _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b);
176 _aux_mem[Transposed1xWRHS] =
// Multiply the reshaped operands.
185 _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose,
// Bias addition (C with beta == 1) as an element-wise add into D.
189 if (_run_bias_addition)
191 _add_bias = std::make_unique<cpu::CpuAdd>();
193 _aux_mem[TempResult] =
// beta*C accumulation when beta is neither 0 nor 1.
201 _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
202 _ma_kernel->configure(c, d, beta);
// Optional fused activation applied to the final output.
208 _activation_func = std::make_unique<cpu::CpuActivation>();
// CpuGemm::validate fragment (original lines ~222-379): static argument
// checking mirroring the decisions made in configure().
// NOTE(review): mangled extract — the ARM_COMPUTE_RETURN_ERROR_ON_MSG macro
// heads for several messages below are outside the visible lines; code kept
// byte-identical.
//
// Same path-selection predicates as configure(): C is a bias iff beta == 1.
222 const bool is_c_bias = beta == 1 && c !=
nullptr;
223 const bool run_addition = c !=
nullptr && beta != 0 && beta != 1;
// Fixed-format/padded-B case: recover the kernel area from the padded
// leading dimension and verify it is consistent with B's row count.
262 const size_t kernel_area = (dim0_sz - b_to_use->
dimension(1)) / input_pad_right;
264 (dim0_sz - kernel_area * input_pad_right) != b_to_use->
dimension(1),
265 "The product AB is defined only if A number of columns and B number of rows are related");
// Standard GEMM shape rule: cols(A) == rows(B).
271 "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
// C must be shape-compatible with the product for the bias/addition paths.
287 "The C matrix must have the same number of rows as the matrix A");
289 "The C matrix must have the same number of columns as the matrix B");
// Same assembly-path eligibility predicate as in configure().
320 const bool run_optimised =
322 (c ==
nullptr || beta == 0.f || beta == 1.f) &&
323 !(!
b->are_values_constant() &&
324 b->tensor_shape().z() > 1);
// The reference path does not support 3D reinterpretation of in/out.
329 "CpuGemm cannot reinterpret the input tensor as 3D");
331 "CpuGemm cannot reinterpret the output tensor as 3D");
// Mirror of configure(): vector-matrix case skips interleave/transpose.
334 const bool run_vector_matrix_multiplication = a->
dimension(1) < 2;
336 const bool run_interleave_transpose = !run_vector_matrix_multiplication;
// Reshape multipliers for the 1xW transpose / 4x4 interleave layouts.
344 int mult_transpose1xW_width = 1;
345 int mult_interleave4x4_height = 1;
// Build the GEMMReshapeInfo used to derive reshaped tensor shapes
// (constructor head is outside the visible lines).
348 m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.
depth_output_gemm3d());
// Validate against the reshaped A/B infos when reshaping is enabled.
357 if (run_interleave_transpose)
359 matrix_a_info = &tmp_a_info;
360 matrix_b_info = &tmp_b_info;
// Expected transposed-B shape computed on a clone of B's info.
369 b_to_use->
clone()->set_tensor_shape(
// Delegate to the matrix-multiply kernel's validator.
377 *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
379 matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
// CpuGemm::run fragment (original lines ~412-491): executes the configured
// path. Assembly path: dispatch + optional alpha scale. Reference path:
// optional pretranspose/interleave/transpose, matrix multiply, optional bias
// add, beta*C add, and activation.
// NOTE(review): mangled extract; tensor-pack construction lines are missing
// between the visible statements; code kept byte-identical.
//
// --- Assembly path ---
412 if (_asm_glue && _asm_glue->is_configured())
417 _asm_glue->run(asm_pack);
// In-place alpha scaling of the output when alpha != 1.
418 if (_run_alpha_scale)
421 _alpha_scale_func->run(
pack);
// --- Reference path ---
// Feed the interleaved copy of A to the multiply kernel.
433 if (_run_interleave_transpose)
440 mm_pack.add_const_tensor(
ACL_SRC_0, interleaved_a.
get());
// Re-run the B pretranspose every invocation only when B may change;
// otherwise prepare() produced it once.
444 if (_pretranspose_b_func)
446 if (!_reshape_b_only_on_first_run)
450 _pretranspose_b_func->run(pretranspose_pack);
452 b_to_use = pretransposed_b.
get();
// Likewise for the 1xW transpose of B.
454 if (_run_interleave_transpose)
456 if (!_reshape_b_only_on_first_run)
461 _transpose1xW_b_kernel->window(), transpose_pack);
463 b_to_use = transposed1xw_b.
get();
// Whatever form B ended up in becomes SRC_1 of the multiply.
466 mm_pack.add_const_tensor(
ACL_SRC_1, b_to_use);
470 _mm_kernel->window(), mm_pack);
// Element-wise bias add (C with beta == 1).
473 if (_run_bias_addition)
476 _add_bias->run(
pack);
// Final fused activation on the output.
491 _activation_func->run(
pack);
// CpuGemm::prepare fragment (original lines ~499-527): one-time preparation.
// Assembly path delegates to the dispatcher; reference path performs the
// cached B reshape when B is constant (_reshape_b_only_on_first_run).
// NOTE(review): mangled extract; L141-L142 below is a truncated expression
// whose surrounding statement is outside the visible lines; code kept
// byte-identical.
499 if (_asm_glue && _asm_glue->is_configured())
501 _asm_glue->prepare(tensors);
// Reference path: reshape B once, up front, since its values are constant.
503 else if (_reshape_b_only_on_first_run)
510 _pretranspose_b_func ==
513 false , !_run_interleave_transpose );
// Cache the pre-transposed B.
515 if (_pretranspose_b_func)
519 _pretranspose_b_func->run(pretranspose_pack);
520 b_to_use = pretransposed_b.
get();
// Cache the 1xW-transposed B for the multiply kernel.
522 if (_run_interleave_transpose)
527 _transpose1xW_b_kernel->window(), transpose_pack);
// Tail of CpuGemm::has_opt_impl (function head outside the visible lines):
// defers the "is there an optimised assembly implementation?" query to the
// assembly dispatcher, reporting the weight format it would expect.
548 return CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a,
b, c, d, asm_info);
// True iff the configured assembly kernel is a variable-weights
// (fixed-format) kernel; false when no assembly glue was configured.
// NOTE(review): braces fall outside the visible lines; code kept
// byte-identical.
551 bool CpuGemm::isVarWeightsKernel()
const
553 return _asm_glue && _asm_glue->isVarWeightsKernel();