Compute Library 22.11
ClGemmLowpMatrixMultiplyCore Class Reference

Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. More...

#include <ClGemmLowpMatrixMultiplyCore.h>

Collaboration diagram for ClGemmLowpMatrixMultiplyCore (diagram omitted)

Public Member Functions

 ClGemmLowpMatrixMultiplyCore ()
 
 ~ClGemmLowpMatrixMultiplyCore ()
 
void configure (const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info=GEMMInfo())
 Initialise the kernel's inputs and output. More...
 
void run (ITensorPack &tensors) override
 Run the kernels contained in the function. More...
 
void prepare (ITensorPack &constants) override
 Prepare the function for executing. More...
 
experimental::MemoryRequirements workspace () const override
 Return the memory requirements required by the workspace. More...
 
- Public Member Functions inherited from ICLOperator
 ICLOperator (IRuntimeContext *ctx=nullptr)
 Constructor. More...
 
 ICLOperator (const ICLOperator &)=delete
 Prevent instances of this class from being copied (As this class contains pointers) More...
 
 ICLOperator (ICLOperator &&)=default
 Default move constructor. More...
 
ICLOperator & operator= (const ICLOperator &)=delete
 Prevent instances of this class from being copied (As this class contains pointers) More...
 
ICLOperator & operator= (ICLOperator &&)=default
 Default move assignment operator. More...
 
- Public Member Functions inherited from IOperator
virtual ~IOperator ()=default
 Destructor. More...
 

Static Public Member Functions

static Status validate (const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info=GEMMInfo())
 Static function to check if given info will lead to a valid configuration. More...
 

Detailed Description

Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL.

Definition at line 52 of file ClGemmLowpMatrixMultiplyCore.h.

Constructor & Destructor Documentation

◆ ClGemmLowpMatrixMultiplyCore()

Definition at line 240 of file ClGemmLowpMatrixMultiplyCore.cpp.

    : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
      _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
      _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
      _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
      _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
      _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
      _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
      _aux_mem(AuxTensorIdx::Count)
{
}

◆ ~ClGemmLowpMatrixMultiplyCore()

Member Function Documentation

◆ configure()

void configure ( const CLCompileContext & compile_context,
                 ITensorInfo *            a,
                 ITensorInfo *            b,
                 ITensorInfo *            c,
                 ITensorInfo *            output,
                 const GEMMInfo &         gemm_info = GEMMInfo()
               )

Initialise the kernel's inputs and output.

Valid data layouts:

  • NHWC
  • NCHW

Valid data type configurations:

src0            src1                src2  dst
QASYMM8         QASYMM8             S32   QASYMM8
QASYMM8         QSYMM8_PER_CHANNEL  S32   QASYMM8
QASYMM8         QSYMM8              S32   QASYMM8
QASYMM8         QASYMM8             S32   S32
QASYMM8         QSYMM8_PER_CHANNEL  S32   S32
QASYMM8         QSYMM8              S32   S32
QASYMM8_SIGNED  QASYMM8_SIGNED      S32   QASYMM8_SIGNED
QASYMM8_SIGNED  QSYMM8_PER_CHANNEL  S32   QASYMM8_SIGNED
QASYMM8_SIGNED  QSYMM8              S32   QASYMM8_SIGNED
QASYMM8_SIGNED  QASYMM8_SIGNED      S32   S32
QASYMM8_SIGNED  QSYMM8_PER_CHANNEL  S32   S32
QASYMM8_SIGNED  QSYMM8              S32   S32
Note
GEMMLowp: low precision GEMM kernel. [A * B + C] This kernel performs the following computations (see the scalar sketch below):
  1. Convert the values of matrix A from 8-bit quantized to int32 and add a_offset to each of them.
  2. Convert the values of matrix B from 8-bit quantized to int32 and add b_offset to each of them.
  3. Compute the matrix product of the resulting A * B in int32.
  4. Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
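
For intuition, here is a minimal scalar sketch of steps 1-3 above. This is a hypothetical reference helper, not part of the library; it assumes uniform quantization and row-major storage, and leaves step 4 (requantization) to the output stage:

    #include <cstdint>
    #include <vector>

    // Hypothetical reference implementation of the GEMMLowp core computation:
    // dst[i][j] = sum_x (a[i][x] + a_offset) * (b[x][j] + b_offset), accumulated in int32.
    std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                            int m, int n, int k, int32_t a_offset, int32_t b_offset)
    {
        std::vector<int32_t> dst(m * n, 0);
        for(int i = 0; i < m; ++i)
        {
            for(int j = 0; j < n; ++j)
            {
                int32_t acc = 0;
                for(int x = 0; x < k; ++x)
                {
                    // Steps 1-3: widen to int32, add the offsets, accumulate the product
                    acc += (static_cast<int32_t>(a[i * k + x]) + a_offset) * (static_cast<int32_t>(b[x * n + j]) + b_offset);
                }
                dst[i * n + j] = acc; // Step 4 (requantization) is applied only if an output stage is set
            }
        }
        return dst;
    }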
Parameters
  [in]  compile_context  The compile context to be used.
  [in]  a                First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
  [in]  b                Second input tensor (Matrix B). Data type supported: same as a
  [in]  c                Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
  [out] output           Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
  [in]  gemm_info        (Optional) Specifies if the matrix A and/or matrix B have been reshaped and if the reshape of matrix B should be executed only for the first run
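
A minimal configuration sketch under stated assumptions: shapes and quantization parameters are illustrative only (M=16, N=32, K=64), following the dimension(0)=K convention visible in the definition below:

    // Hypothetical example; shapes and quantization parameters are illustrative only.
    TensorInfo a(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));  // K=64, M=16
    TensorInfo b(TensorShape(32U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3));  // N=32, K=64
    TensorInfo dst(TensorShape(32U, 16U), 1, DataType::S32);                                // N=32, M=16

    ClGemmLowpMatrixMultiplyCore gemm;
    if(bool(ClGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst)))
    {
        gemm.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, nullptr, &dst);
    }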

Definition at line 256 of file ClGemmLowpMatrixMultiplyCore.cpp.

References GEMMKernelInfo::a_offset, ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_ERROR_THROW_ON, ARM_COMPUTE_LOG_PARAMS, arm_compute::test::validation::b, GEMMKernelInfo::b_offset, arm_compute::misc::shape_calculator::compute_reductionA_shape(), arm_compute::misc::shape_calculator::compute_reductionB_shape(), ITensorInfo::data_type(), GEMMKernelInfo::depth_output_gemm3d, GEMMInfo::depth_output_gemm3d(), ITensorInfo::dimension(), arm_compute::test::validation::gemm_info, GEMMLowpOutputStageInfo::gemmlowp_multipliers, GEMMInfo::gemmlowp_output_stage(), CLScheduler::get(), arm_compute::is_data_type_quantized_per_channel(), arm_compute::is_data_type_quantized_symmetric(), GEMMLowpOutputStageInfo::is_quantized_per_channel, GEMMKernelInfo::k, arm_compute::test::validation::k, GEMMKernelInfo::lhs_info, arm_compute::test::validation::lhs_info, GEMMKernelInfo::m, arm_compute::test::validation::m, GEMMKernelInfo::n, arm_compute::test::validation::n, arm_compute::NONE, UniformQuantizationInfo::offset, arm_compute::offset_int_vec(), GEMMLowpOutputStageInfo::output_data_type, GEMMKernelInfo::output_stage, arm_compute::experimental::Prepare, arm_compute::QASYMM8, ITensorInfo::quantization_info(), arm_compute::QUANTIZE_DOWN_FIXEDPOINT, GEMMKernelInfo::reinterpret_input_as_3d, GEMMInfo::reinterpret_input_as_3d(), GEMMInfo::reshape_b_only_on_first_run(), arm_compute::RESHAPED_ONLY_RHS, arm_compute::RESHAPED_ONLY_RHS_MMUL, GEMMKernelInfo::rhs_info, arm_compute::test::validation::rhs_info, arm_compute::S32, TensorInfo::set_data_type(), CLScheduler::target(), TensorInfo::total_size(), GEMMLowpOutputStageInfo::type, QuantizationInfo::uniform(), ClGemmLowpMatrixMultiplyCore::validate(), and arm_compute::WRAP.

{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);

    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset = a->quantization_info().uniform().offset;
    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                          && a->data_type() == DataType::QASYMM8;
    _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
    _gemm_info = gemm_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // in order to know how the matrices have been reshaped
    bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n = b->dimension(0);
    const unsigned int k = a->dimension(0);
    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

    const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        _qasymm8_weights = *b;
        _qasymm8_weights.set_data_type(DataType::QASYMM8);
        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
    }

    ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
    {
        matrix_b = &_tmp_b;

        // Pick up the GEMM configuration
        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
                                                                                 depth_output_gemm3d,
                                                                                 a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }
    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
    {
        matrix_b = &_tmp_b;

        // Pick up the GEMM configuration
        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
                                                                                      depth_output_gemm3d,
                                                                                      a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info {};

    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m = m;
    gemm_kernel_info.n = n;
    gemm_kernel_info.k = k;
    gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info = lhs_info;
    gemm_kernel_info.rhs_info = rhs_info;
    gemm_kernel_info.a_offset = _a_offset;
    gemm_kernel_info.b_offset = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
        _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type = a->data_type();
        if(num_filters == 1)
        {
            // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
            // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
            gemmlowp_output_stage.is_quantized_per_channel = false;
        }

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                         _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
            {
                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
            {
                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                              a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);

                // Configure matrix multiply kernel
                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);

                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                                                    c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
                                                                    &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
            }
        }
    }
    else
    {
        _run_offset_contribution = true;
        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
                                                                          a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);

            // Configure matrix multiply kernel
            _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
                                               c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);
    }

    // Request memory
    _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
        _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
        _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    }
    if(_a_offset != 0)
    {
        _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
    }
    if(_b_offset != 0)
    {
        _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    }
    _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
    _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
}

◆ prepare()

void prepare ( ITensorPack & constants )
override, virtual

Prepare the function for executing.

Any one-off pre-processing step required by the function is handled here

Parameters
[in]  constants  Vector that contains the constant tensors.
Note
Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
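
A hedged sketch of calling prepare() explicitly so the one-off weight transformations run before the steady-state loop. Tensor and operator names are hypothetical, and the auxiliary workspace tensors reported by workspace() are assumed to already be imported into the pack under their slot ids:

    // Hypothetical: hoist the one-off B conversion/reshape/reduction out of the inference loop.
    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &b_tensor } };
    // ... workspace tensors assumed added to prep_pack here (see workspace() below) ...
    gemm.prepare(prep_pack);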

Reimplemented from ICLOperator.

Definition at line 792 of file ClGemmLowpMatrixMultiplyCore.cpp.

References arm_compute::ACL_DST, arm_compute::ACL_SRC, arm_compute::ACL_SRC_1, ARM_COMPUTE_ERROR_ON_NULLPTR, CLScheduler::enqueue_op(), GEMMLowpOutputStageInfo::gemmlowp_multipliers, GEMMInfo::gemmlowp_output_stage(), GEMMLowpOutputStageInfo::gemmlowp_shifts, CLScheduler::get(), CLAuxTensorHandler::get(), ITensorPack::get_const_tensor(), ITensor::info(), GEMMLowpOutputStageInfo::is_quantized_per_channel, ICLTensor::map(), arm_compute::offset_int_vec(), ITensor::ptr_to_element(), CLScheduler::queue(), ITensorInfo::total_size(), and ICLTensor::unmap().

Referenced by ClGemmLowpMatrixMultiplyCore::run().

{
    if(!_is_prepared)
    {
        auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
        CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
        CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);

        ARM_COMPUTE_ERROR_ON_NULLPTR(b);

        if(_convert_to_qasymm8)
        {
            ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
            CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
            b->mark_as_unused();
        }

        if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensorPack mtx_b_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
            b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensorPack mtx_b_red_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, vec_sum_col.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
        }

        // Compute GEMM output multipliers and shifts for output stage
        {
            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

            CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
            CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);

            ICLTensor *multiplier_tensor = multipliers.get();
            if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
            {
                multiplier_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
                multiplier_tensor->unmap(CLScheduler::get().queue());
            }

            ICLTensor *shifts_tensor = shifts.get();
            if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
            {
                shifts_tensor->map(CLScheduler::get().queue(), true);
                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
                shifts_tensor->unmap(CLScheduler::get().queue());
            }
        }
        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}

◆ run()

void run ( ITensorPack & tensors )
override, virtual

Run the kernels contained in the function.

Parameters
[in]  tensors  Vector that contains the tensors to operate on.
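
A minimal run sketch, assuming the operator was configured as in configure() above and that a_tensor, b_tensor and dst_tensor are allocated CLTensors matching the configured ITensorInfo objects (names hypothetical):

    // Hypothetical: execute the configured operator on allocated CL tensors.
    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &a_tensor },
                          { TensorType::ACL_SRC_1, &b_tensor },
                          { TensorType::ACL_DST, &dst_tensor } };
    gemm.run(run_pack); // run() itself calls prepare(run_pack) on the first invocation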

Reimplemented from ICLOperator.

Definition at line 655 of file ClGemmLowpMatrixMultiplyCore.cpp.

References arm_compute::ACL_BIAS, arm_compute::ACL_DST, arm_compute::ACL_MULTIPLIERS, arm_compute::ACL_SHIFTS, arm_compute::ACL_SRC, arm_compute::ACL_SRC_0, arm_compute::ACL_SRC_1, arm_compute::ACL_SRC_2, arm_compute::ACL_SRC_DST, arm_compute::ACL_VEC_COL_SUM, arm_compute::ACL_VEC_ROW_SUM, ARM_COMPUTE_ERROR, ARM_COMPUTE_ERROR_ON_NULLPTR, arm_compute::test::validation::b, arm_compute::test::validation::dst, CLScheduler::enqueue_op(), CLScheduler::get(), CLAuxTensorHandler::get(), ITensorPack::get_const_tensor(), ITensorPack::get_tensor(), arm_compute::offset_int_vec(), ClGemmLowpMatrixMultiplyCore::prepare(), arm_compute::RESHAPED_ONLY_RHS, and arm_compute::RESHAPED_ONLY_RHS_MMUL.

{
    const ITensor *a = tensors.get_const_tensor(ACL_SRC_0);
    const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
    const ITensor *c = tensors.get_const_tensor(ACL_SRC_2);
    ITensor *dst = tensors.get_tensor(ACL_DST);

    ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);

    CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
    CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
    CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
    CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
    CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
    CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
    CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);

    // Prepare the consts if needed
    prepare(tensors);

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;

    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        matrix_b = tmp_b.get();
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            ITensorPack mtx_b_reshape_pack =
            {
                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        ITensorPack mtx_b_red_pack =
        {
            { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
            { TensorType::ACL_DST, vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        ITensorPack mtx_a_red_pack =
        {
            { TensorType::ACL_SRC, matrix_a },
            { TensorType::ACL_DST, vec_sum_row.get() }
        };
        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
    }

    // Run matrix multiply
    if(is_gemm_reshaped(_gemm_kernel_type))
    {
        ITensorPack gemm_reshaped_pack;
        if(_run_offset_contribution)
        {
            gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
                                               { TensorType::ACL_SRC_1, matrix_b },
                                               { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
                                             });
        }
        else
        {
            gemm_reshaped_pack = ITensorPack(
            {
                { TensorType::ACL_SRC, matrix_a },
                { TensorType::ACL_SRC_1, matrix_b },
                { TensorType::ACL_BIAS, c },
                { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
                { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
                { TensorType::ACL_SHIFTS, shifts.get() },
                { TensorType::ACL_MULTIPLIERS, multipliers.get() },
                { TensorType::ACL_DST, dst },
            });
        }
        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
        {
            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
        }
        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
        {
            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
        }
        else
        {
            ARM_COMPUTE_ERROR("Invalid reshaped kernel");
        }
    }
    else
    {
        ITensorPack gemm_native_pack =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b },
            { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
        };
        CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        ITensorPack output_stage_pack =
        {
            { TensorType::ACL_SRC, res32.get() },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
            { TensorType::ACL_SHIFTS, shifts.get() },
            { TensorType::ACL_MULTIPLIERS, multipliers.get() },
            { TensorType::ACL_DST, dst },
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        ITensorPack offset_contrib_pack =
        {
            { TensorType::ACL_SRC_DST, dst },
            { TensorType::ACL_BIAS, c },
            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
        };
        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
    }
}

◆ validate()

Status validate ( const ITensorInfo * a,
                  const ITensorInfo * b,
                  const ITensorInfo * c,
                  const ITensorInfo * output,
                  const GEMMInfo &    gemm_info = GEMMInfo()
                )
static

Static function to check if given info will lead to a valid configuration.

Similar to ClGemmLowpMatrixMultiplyCore::configure()

Returns
a status
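
Since the call is static and has no side effects, it can screen a configuration before the operator is built; a small sketch (tensor infos assumed set up as in the configure() example above):

    #include <iostream>

    const Status st = ClGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst);
    if(!bool(st))
    {
        std::cerr << st.error_description() << std::endl; // human-readable reason for the rejection
    }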

Definition at line 472 of file ClGemmLowpMatrixMultiplyCore.cpp.

References GEMMKernelInfo::a_offset, ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_RETURN_ERROR_ON, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN, ARM_COMPUTE_RETURN_ERROR_ON_MSG, ARM_COMPUTE_RETURN_ON_ERROR, arm_compute::auto_init_if_empty(), GEMMKernelInfo::b_offset, ICloneable< T >::clone(), arm_compute::misc::shape_calculator::compute_mm_shape(), arm_compute::misc::shape_calculator::compute_reductionA_shape(), arm_compute::misc::shape_calculator::compute_reductionB_shape(), arm_compute::misc::shape_calculator::compute_rhs_reshaped_shape(), ITensorInfo::data_type(), GEMMKernelInfo::depth_output_gemm3d, GEMMInfo::depth_output_gemm3d(), ITensorInfo::dimension(), GEMMLowpOutputStageInfo::gemmlowp_multipliers, GEMMInfo::gemmlowp_output_stage(), CLScheduler::get(), GEMMInfo::is_a_reshaped(), GEMMInfo::is_b_reshaped(), arm_compute::is_data_type_quantized_asymmetric(), arm_compute::is_data_type_quantized_per_channel(), arm_compute::is_data_type_quantized_symmetric(), GEMMLowpOutputStageInfo::is_quantized_per_channel, GEMMKernelInfo::k, arm_compute::test::validation::k, GEMMKernelInfo::lhs_info, arm_compute::test::validation::lhs_info, GEMMKernelInfo::m, arm_compute::test::validation::m, GEMMKernelInfo::n, arm_compute::test::validation::n, arm_compute::NONE, UniformQuantizationInfo::offset, GEMMLowpOutputStageInfo::output_data_type, GEMMKernelInfo::output_stage, arm_compute::QASYMM8, arm_compute::QASYMM8_SIGNED, arm_compute::QSYMM8, arm_compute::QSYMM8_PER_CHANNEL, ITensorInfo::quantization_info(), arm_compute::QUANTIZE_DOWN_FIXEDPOINT, GEMMKernelInfo::reinterpret_input_as_3d, GEMMInfo::reinterpret_input_as_3d(), GEMMInfo::reshape_b_only_on_first_run(), GEMMKernelInfo::rhs_info, arm_compute::test::validation::rhs_info, arm_compute::S32, arm_compute::cl_gemm::auto_heuristics::select_default_gemm_config_native(), arm_compute::cl_gemm::auto_heuristics::select_default_gemm_config_reshaped_only_rhs(), CLScheduler::target(), ITensorInfo::total_size(), GEMMLowpOutputStageInfo::type, QuantizationInfo::uniform(), ClGemmLowpMatrixMultiplyNativeKernel::validate(), ClCastKernel::validate(), ClGemmReshapeRhsMatrixKernel::validate(), ClGemmLowpOffsetContributionKernel::validate(), ClGemmLowpOffsetContributionOutputStageKernel::validate(), ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(), ClGemmLowpMatrixAReductionKernel::validate(), ClGemmLowpMatrixBReductionKernel::validate(), arm_compute::test::validation::weights_info, and arm_compute::WRAP.

Referenced by ClGemmLowpMatrixMultiplyCore::configure().

{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n = b->dimension(0);
    const unsigned int k = a->dimension(0);
    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
        const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
        lhs_info = res.lhs_info;
        rhs_info = res.rhs_info;

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Configure Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m = m;
    gemm_kernel_info.n = n;
    gemm_kernel_info.k = k;
    gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info = lhs_info;
    gemm_kernel_info.rhs_info = rhs_info;
    gemm_kernel_info.a_offset = a_offset;
    gemm_kernel_info.b_offset = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto inizialitation if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto inizialitation if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
                const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
                lhs_info = res.lhs_info;
                rhs_info = res.rhs_info;

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
            const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
            lhs_info = res.lhs_info;
            rhs_info = res.rhs_info;

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}

◆ workspace()

experimental::MemoryRequirements workspace ( ) const
override, virtual

Return the memory requirements required by the workspace.

Reimplemented from ICLOperator.

Definition at line 861 of file ClGemmLowpMatrixMultiplyCore.cpp.

{
    return _aux_mem;
}
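
Each entry in the returned list pairs an auxiliary tensor slot with a lifetime and a byte size. Below is a hedged sketch of honouring these requirements manually; this is essentially what CLAuxTensorHandler automates, and names such as run_pack come from the earlier hypothetical examples:

    // Hypothetical: back every requested auxiliary tensor with U8 memory of the reported
    // size and expose it to run()/prepare() under the reported slot id.
    std::vector<std::unique_ptr<CLTensor>> aux;
    for(const auto &req : gemm.workspace())
    {
        // (a production version would skip zero-sized requests and honour req.lifetime)
        auto t = std::make_unique<CLTensor>();
        t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        t->allocator()->allocate();
        run_pack.add_tensor(req.slot, t.get());
        aux.emplace_back(std::move(t));
    }
    gemm.run(run_pack);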

The documentation for this class was generated from the following files:

  • ClGemmLowpMatrixMultiplyCore.h
  • ClGemmLowpMatrixMultiplyCore.cpp