// NOTE(review): this SOURCE is a non-contiguous, line-wrapped extraction of
// arm_compute's CpuGemmLowpMatrixMultiplyCore.cpp. The leading integer on
// many lines is the ORIGINAL file's line number fused into the text, and
// interior lines are missing, so only comments are added here — the code
// text itself is left byte-identical.
//
// init_assembly_metadata(): copies the fields of a GEMMInfo descriptor that
// the assembly backend cares about into a cpu::AsmGemmInfo.
59 cpu::AsmGemmInfo init_assembly_metadata(
const GEMMInfo &
info)
61 cpu::AsmGemmInfo asm_info;
// Forward the 3D-reinterpretation flags for the input and output tensors.
63 asm_info.reinterpret_input_as_3d =
info.reinterpret_input_as_3d();
64 asm_info.depth_output_gemm3d =
info.depth_output_gemm3d();
// Fused activation and the quantized (gemmlowp) output-stage parameters.
65 asm_info.activation_info =
info.activation_info();
66 asm_info.output_stage =
info.gemmlowp_output_stage();
// fast_mode mirrors GEMMInfo::fast_math() — presumably selects faster
// (possibly lower-precision) assembly kernels; confirm against ACL docs.
67 asm_info.fast_mode =
info.fast_math();
// (the return of asm_info is not visible in this extract)
// Default constructor: value-initializes the kernel unique_ptrs and resets
// all path-selection flags. (Several initializer-list entries from the
// original file — e.g. lines 74–75, 82, 85–93, 98 — are missing from this
// extract.)
73 CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
// Reshape/reduction/offset-contribution kernels, created lazily in configure().
76 _mtx_a_reshape_kernel(),
77 _mtx_b_reshape_kernel(),
78 _mtx_a_reduction_kernel(),
79 _mtx_b_reduction_kernel(),
80 _offset_contribution_kernel(),
81 _offset_contribution_output_stage_kernel(),
// Signedness-conversion kernels used when the input must be flipped between
// asymmetric unsigned and signed quantization.
83 _convert_to_signed_asymm(),
84 _convert_from_signed_asymm(),
// Execution-path flags; all start false and are decided in configure().
94 _run_vector_matrix_multiplication(false),
95 _assembly_path(false),
96 _fused_assembly_path(false),
97 _reshape_b_only_on_first_run(false),
99 _fuse_output_stage(false),
100 _run_activation(false),
101 _flip_signedness(false),
// configure() fragment (original lines 121–322): decides which of several
// execution paths will run — fused assembly, plain assembly, or the native
// reshape + multiply + offset-contribution kernel chain — and configures the
// kernels for the chosen path. Many original lines are missing from this
// extract; comments below describe only what is visible.
//
// Cache quantization offsets and path-selection flags from the tensor infos.
121 _b_offset =
b->quantization_info().uniform().offset;
// A vector-matrix multiply is used when 'a' has fewer than 2 rows.
122 _run_vector_matrix_multiplication = a->
dimension(1) < 2;
// B can be reshaped once and reused only if its values are constant.
123 _reshape_b_only_on_first_run =
b->are_values_constant();
124 _is_prepared =
false;
125 _fused_assembly_path =
false;
127 _reshape_b_only_on_first_run;
128 _gemm_info = gemm_info;
// Assembly dispatcher is always created; whether it is used is decided below.
130 _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
// Signedness flip: convert the asymmetric-quantized input to the opposite
// signedness, shifting the zero point by 128, before the matrix multiply.
135 if (_flip_signedness)
137 const int32_t offset_correction = 128;
141 _signed_a = a_to_use->
clone()->set_data_type(
dt).set_quantization_info(
143 _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
144 _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
145 a_to_use = &_signed_a;
// A matching signed output tensor info and a corrected output stage are set
// up so results can be converted back after the GEMM.
149 _signed_output =
dst->clone()->set_data_type(
dt).set_quantization_info(
157 info.set_gemmlowp_output_stage(output_stage_corr);
160 matrix_a = &_signed_a;
166 _fuse_output_stage =
true;
// Assembly path is attempted unless B is non-constant AND batched (z > 1).
173 if (!(!
b->are_values_constant() &&
174 b->tensor_shape().z() > 1))
// Fused-assembly attempt: the assembly kernel also performs the output
// stage (requantization), writing directly to dst.
186 auto c_info_to_use = c ==
nullptr ? nullptr : c;
187 _asm_glue->configure(a_to_use,
b, c_info_to_use,
dst, asm_info);
188 _fused_assembly_path = _asm_glue->is_configured();
// Plain assembly attempt: GEMM only; the S32 result goes either to an
// intermediate buffer (when the output stage is fused later) or to dst.
192 auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 :
dst);
193 _asm_glue->configure(a_to_use,
b,
nullptr, output_to_use, asm_info);
195 _assembly_path = _asm_glue->is_configured();
// Native fallback: interleave A (4x4) and transpose B (1xW) before the
// matrix multiply, unless the vector-matrix path is taken.
206 if (!(_assembly_path || _run_vector_matrix_multiplication))
218 _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
219 _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
222 _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
223 _mtx_b_reshape_kernel->configure(
b, &_tmp_b);
// Offset contributions are only needed when the assembly path did not fuse
// them: column sums of B (for a_offset) and row sums of A (for b_offset).
226 if (!_fused_assembly_path)
237 _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
238 _mtx_b_reduction_kernel->configure(
b, &_vector_sum_col, reduction_info);
247 _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
248 _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
// Fused output stage: GEMM into an S32 buffer, then a single kernel adds
// the offset contributions AND requantizes to the final output type.
251 if (_fuse_output_stage)
256 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
257 _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
260 _offset_contribution_output_stage_kernel =
261 std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
// Sum vectors are passed only when the corresponding offset is non-zero;
// when the signedness was flipped the kernel writes to _signed_output.
262 _offset_contribution_output_stage_kernel->configure(
263 &_mm_result_s32, _a_offset == 0 ?
nullptr : &_vector_sum_col,
264 _b_offset == 0 ?
nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output :
dst,
265 a->
dimension(0), _a_offset, _b_offset,
info.gemmlowp_output_stage());
// Convert the signed result back to the destination's signedness.
267 if (_flip_signedness)
269 _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
270 _convert_from_signed_asymm->configure(&_signed_output,
dst);
// Unfused path: GEMM straight to dst, then add offset contributions
// in place (no requantization stage here).
278 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
279 _mm_kernel->configure(matrix_a, matrix_b,
dst);
282 _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
283 _offset_contribution_kernel->configure(
dst, _a_offset == 0 ?
nullptr : &_vector_sum_col,
284 _b_offset == 0 ?
nullptr : &_vector_sum_row, a_to_use->
dimension(0),
285 _a_offset, _b_offset);
// Optional stand-alone activation applied to dst in place.
294 _activation_func = std::make_unique<CpuActivation>();
295 _activation_func->configure(
dst,
nullptr, activation);
// Auxiliary memory bookkeeping: adopt the assembly workspace requirements,
// then register this operator's own intermediate tensors.
300 const auto asm_mem_req = _asm_glue->workspace();
301 for (
unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
303 _aux_mem[slot] = asm_mem_req[slot];
// B-derived sums persist across runs when B is reshaped only once;
// everything else is temporary per-run scratch.
308 _aux_mem[VectorSumCol] =
310 !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
311 : MemoryLifetime::Temporary,
313 _aux_mem[VectorSumRow] =
317 _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
319 _aux_mem[MMResultS32] =
322 _aux_mem[SignedOutput] =
// validate() fragment (original lines 339–538): static validation mirroring
// configure() — checks argument compatibility and validates each kernel that
// the chosen execution path would use, without allocating anything. Many
// original lines (including the error-macro invocations that these message
// strings belong to) are missing from this extract.
//
// Early argument checks: no bias with S32 output; inner dimensions of A and
// B must agree for the product AB to exist.
339 "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
341 (a)->dimension(0) != (
b)->dimension(1),
342 "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
357 int32_t b_offset =
b->quantization_info().uniform().offset;
360 if (fuse_output_stage)
// Signedness flip (mirror of configure()): clone A/output infos with the
// opposite signedness, shifting the zero point by 128.
373 const int32_t offset_correction = 128;
377 signed_a = a_to_use->
clone()->set_data_type(
dt).set_quantization_info(
380 a_to_use = &signed_a;
384 signed_output = output->
clone()->set_data_type(
dt).set_quantization_info(
// The output stage's offset must match the signed output's zero point.
389 output_stage_corr.
gemmlowp_offset = signed_output.quantization_info().uniform().offset;
392 info.set_gemmlowp_output_stage(output_stage_corr);
395 matrix_a_info = &signed_a;
// Probe whether the assembly path would accept this configuration (same
// B-constant / batched gating as configure()).
402 bool run_optimised =
false;
403 bool run_optimised_requantized =
false;
405 if (!(!
b->are_values_constant() &&
406 b->tensor_shape().z() > 1))
412 run_optimised_requantized = run_optimised;
417 a_to_use,
b,
nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
// 3D reinterpretation is only supported on the assembly path.
424 if (
info.depth_output_gemm3d() != 0)
426 if (
info.reinterpret_input_as_3d())
444 "NEGEMM cannot reinterpret the input tensor as 3D");
446 "NEGEMM cannot reinterpret the output tensor as 3D");
// Native fallback: compute the reshaped (interleaved/transposed) tensor
// shapes. The 1xW transpose of B packs 16 elements per row.
448 const bool run_vector_matrix_multiplication = a->
dimension(1) < 2;
449 if (!run_vector_matrix_multiplication)
451 matrix_a_info = &tmp_a_info;
452 matrix_b_info = &tmp_b_info;
461 shape_tmp_b.
set(0,
b->dimension(1) * 16);
462 shape_tmp_b.
set(1, std::ceil(
b->dimension(0) / 16.f));
// Offset-contribution validation only applies when requantization was not
// already fused into the assembly kernel.
473 if (!run_optimised_requantized)
// Fused output stage: validate GEMM-to-S32 plus the combined
// offset-contribution + output-stage kernel.
500 if (fuse_output_stage)
505 info.reinterpret_input_as_3d(),
506 "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
508 info.depth_output_gemm3d() != 0,
509 "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
512 matrix_a_info, matrix_b_info, &mm_result_s32_info));
517 &mm_result_s32_info, a_offset == 0 ?
nullptr : &info_vector_sum_col,
518 b_offset == 0 ?
nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
519 b_offset,
info.gemmlowp_output_stage()));
// Unfused path: validate GEMM straight to output plus the plain
// offset-contribution kernel.
526 info.reinterpret_input_as_3d(),
527 "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
529 info.depth_output_gemm3d() != 0,
530 "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
537 output, a_offset == 0 ?
nullptr : &info_vector_sum_col, b_offset == 0 ?
nullptr : &info_vector_sum_row,
538 a_offset, b_offset));
// run() fragment (original lines 573–690): executes whichever path was set
// up in configure(), resolving auxiliary tensors (signed_a, tmp_a/b,
// mm_result_s32, …) from the workspace. The tensor-pack construction lines
// are missing from this extract.
//
// If the signedness was flipped, subsequent kernels read the converted A.
573 if (_flip_signedness)
578 a_to_use = signed_a.
get();
579 matrix_a = signed_a.
get();
// Assembly path: run the dispatcher; its output is either the S32
// intermediate (output stage fused later) or dst directly.
583 if (_asm_glue->is_configured())
586 auto output_to_use = (_fuse_output_stage ? mm_result_s32.
get() :
dst);
601 _asm_glue->run(asm_glue_tensors);
// Native path: use the interleaved/transposed copies of A and B.
605 if (!_run_vector_matrix_multiplication)
607 matrix_a = tmp_a.
get();
608 matrix_b = tmp_b.
get();
// Reshape B every run only when it is not cached by prepare().
614 if (!_reshape_b_only_on_first_run)
619 _mtx_b_reshape_kernel->window(), pack_b);
623 if (_fuse_output_stage)
// Offset contributions (skipped when fused into assembly): row sums of A,
// and column sums of B unless they were cached in prepare().
634 if (!_fused_assembly_path)
641 _mtx_a_reduction_kernel->window(),
pack);
645 if (_a_offset != 0 && !_reshape_b_only_on_first_run)
649 _mtx_b_reduction_kernel->window(),
pack);
// Apply either the combined offset-contribution + output-stage kernel or
// the plain offset-contribution kernel.
652 if (_fuse_output_stage)
663 _offset_contribution_output_stage_kernel->window(),
pack);
674 _offset_contribution_kernel->window(),
pack);
// Convert the signed intermediate result back to dst's signedness.
679 if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
683 _convert_from_signed_asymm->window(),
pack);
// Optional stand-alone activation on dst.
690 _activation_func->run(
pack);
// prepare() fragment (original lines 700–723): one-time preparation of
// constant data — delegates to the assembly dispatcher when configured,
// otherwise pre-reshapes constant B (branch body not visible here) and
// pre-computes B's column sums when they are needed and persistent.
700 if (_asm_glue->is_configured())
702 _asm_glue->prepare(tensors);
// Native path: reshape constant B once up front (only when B is constant
// and the matrix-matrix kernels will be used).
705 else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
// Column sums of B are cached here only if a_offset != 0 requires them and
// B is constant (matches the Persistent lifetime chosen in configure()).
716 if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
723 _mtx_b_reduction_kernel->window(),
pack);