85 if(validate_gemm_kernel(gemm_kernel.gemm_type))
88 return gemm_kernel.gemm_type;
93 return gemm_kernel.gemm_type;
97 inline bool validate_lhs_rhs_info_native(
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const ITensorInfo *a,
const ITensorInfo *
b,
const GEMMReshapeInfo &reshape_info)
100 TensorInfo mm_result_s32_info{};
116 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(
auto_heuristics::CommonQuery query,
const ITensorInfo *a,
const ITensorInfo *b,
const GEMMReshapeInfo &reshape_info)
121 if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
124 return { config.lhs_info, config.rhs_info };
129 return { config.lhs_info, config.rhs_info };
133 inline bool validate_lhs_rhs_info_reshaped_only_rhs(
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const ITensorInfo *a,
const ITensorInfo *b,
const ITensorInfo *output,
134 unsigned int m,
unsigned int n,
unsigned int k,
bool reinterpret_input_as_3d,
int depth_output_gemm3d)
137 TensorInfo tmp_b_info{};
149 GEMMKernelInfo gemm_kernel_info;
150 gemm_kernel_info.m = m;
151 gemm_kernel_info.n = n;
152 gemm_kernel_info.k = k;
153 gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
154 gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
155 gemm_kernel_info.lhs_info = lhs_info;
156 gemm_kernel_info.rhs_info = rhs_info;
158 TensorInfo output_info_copy(*output);
168 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(
auto_heuristics::CommonQuery query,
bool reinterpret_input_as_3d,
int depth_output_gemm3d,
169 const ITensorInfo *a,
170 const ITensorInfo *b,
const ITensorInfo *output)
175 if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.
m, query.
n, query.
k, reinterpret_input_as_3d, depth_output_gemm3d))
178 return { config.lhs_info, config.rhs_info };
183 return { config.lhs_info, config.rhs_info };
209 _aux_mem(AuxTensorIdx::Count)
227 _gemm_info = gemm_info;
233 _mm_native_kernel->set_target(gpu_target);
234 _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
246 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
249 const auto reshape_info =
GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
254 if(_convert_to_qasymm8)
257 _qasymm8_weights = *
b;
259 _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights,
ConvertPolicy::WRAP);
262 ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights :
b;
263 if(_is_gemm_reshaped)
271 a, _convert_to_qasymm8 ? &_qasymm8_weights :
b, output);
274 _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
286 _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
295 _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
299 gemm_kernel_info.
m = m;
300 gemm_kernel_info.
n = n;
301 gemm_kernel_info.
k = k;
304 gemm_kernel_info.
lhs_info = lhs_info;
305 gemm_kernel_info.
rhs_info = rhs_info;
306 gemm_kernel_info.
a_offset = _a_offset;
307 gemm_kernel_info.
b_offset = _b_offset;
331 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ?
nullptr : &_vector_sum_col,
332 _b_offset == 0 ?
nullptr : &_vector_sum_row, c !=
nullptr ? c :
nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
336 _run_output_stage =
true;
338 if(_is_gemm_reshaped)
340 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
347 a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
350 _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
352 _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ?
nullptr : &_vector_sum_col, _b_offset == 0 ?
nullptr : &_vector_sum_row,
353 c !=
nullptr ? c :
nullptr, output, a->
dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
354 &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
360 _run_offset_contribution =
true;
361 if(_is_gemm_reshaped)
364 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
371 a, _convert_to_qasymm8 ? &_qasymm8_weights :
b, reshape_info);
374 _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
378 _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ?
nullptr : &_vector_sum_col, _b_offset == 0 ?
nullptr : &_vector_sum_row,
379 c !=
nullptr ? c :
nullptr, a->
dimension(0), _a_offset, _b_offset);
383 _aux_mem[RhsQAsymm8] =
MemoryInfo(
offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.
total_size());
384 if(_is_gemm_reshaped)
387 _aux_mem[RhsQAsymm8] =
MemoryInfo(
offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.
total_size());
388 _aux_mem[RhsReshape] =
MemoryInfo(
offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.
total_size());
392 _aux_mem[VecSumCol] =
MemoryInfo(
offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.
total_size());
429 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
439 if(convert_to_qasymm8)
448 matrix_b_info = &tmp_b_info;
454 lhs_info = res.lhs_info;
455 rhs_info = res.rhs_info;
485 gemm_kernel_info.
m = m;
486 gemm_kernel_info.
n = n;
487 gemm_kernel_info.
k = k;
490 gemm_kernel_info.
lhs_info = lhs_info;
491 gemm_kernel_info.
rhs_info = rhs_info;
492 gemm_kernel_info.
a_offset = a_offset;
493 gemm_kernel_info.
b_offset = b_offset;
507 a_offset == 0 ?
nullptr : &info_vector_sum_col,
508 b_offset == 0 ?
nullptr : &info_vector_sum_row,
510 &gemm_output_stage_multipliers_shifts_info,
511 &gemm_output_stage_multipliers_shifts_info));
534 lhs_info = res.lhs_info;
535 rhs_info = res.rhs_info;
543 a_offset == 0 ?
nullptr : &info_vector_sum_col,
544 b_offset == 0 ?
nullptr : &info_vector_sum_row,
548 gemmlowp_output_stage,
549 &gemm_output_stage_multipliers_shifts_info,
550 &gemm_output_stage_multipliers_shifts_info));
565 lhs_info = res.lhs_info;
566 rhs_info = res.rhs_info;
576 a_offset == 0 ?
nullptr : &info_vector_sum_col,
577 b_offset == 0 ?
nullptr : &info_vector_sum_row,
579 a_offset, b_offset));
607 const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.
get() :
b;
609 if(_is_gemm_reshaped)
611 matrix_b = tmp_b.
get();
612 if(!_reshape_b_only_on_first_run)
625 if(_a_offset != 0 && !_reshape_b_only_on_first_run)
647 if(_is_gemm_reshaped)
650 if(_run_offset_contribution)
683 if(_run_output_stage)
698 if(_run_offset_contribution)
723 if(_convert_to_qasymm8)
730 if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
743 if(_a_offset != 0 && _reshape_b_only_on_first_run)
761 if(multiplier_tensor !=
nullptr && multiplier_tensor->
info()->
total_size() > 0)
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
void map(cl::CommandQueue &q, bool blocking=true)
Enqueue a map operation of the allocated buffer on the given queue.
experimental::MemoryRequirements workspace() const override
Return the memory requirements required by the workspace.
Quantize using a fixed point multiplication.
Descriptor used by the GEMM kernels.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
static CLScheduler & get()
Access the scheduler singleton.
OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
GPUTarget target() const
Get the target GPU.
void prepare(ITensorPack &constants) override
Prepare the function for executing.
unsigned int depth_output_gemm3d
Depth of the output tensor in case is reinterpreted as 3D.
GEMM reshape information class.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
GEMMLowpOutputStageInfo gemmlowp_output_stage() const
GEMMLowp output stage.
TensorShape compute_reductionA_shape(const ITensorInfo &b)
Calculate the reductionA shape used in GEMMLowp.
~ClGemmLowpMatrixMultiplyCore()
A collection of adaptor functions that enable the auto selection between mlgo-based heuristics and default heuristics.
ITensorInfo & set_data_type(DataType data_type) override
Set the data type to the specified value.
GEMM LHS (Left Hand Side) matrix information.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Reshaped GEMM kernel where only the rhs matrix is reshaped.
int depth_output_gemm3d() const
Depth of the output when GEMM output is reinterpreted as 3D tensor.
GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
CLGEMMKernelType
OpenCL GEMM kernel types.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col=nullptr, const ITensorInfo *vector_sum_row=nullptr, const ITensorInfo *bias=nullptr, const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if given info will lead to a valid configuration.
bool is_data_type_quantized_symmetric(DataType dt)
Check if a given data type is of symmetric quantized type.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
GEMMLowpOutputStageType type
GEMMLowp output stage type.
Interface for CPU tensor.
GEMMLHSMatrixInfo lhs_info
LHS matrix information used to retrieve the number of rows processed by each thread.
Copyright (c) 2017-2021 Arm Limited.
bool is_b_reshaped() const
Flag which specifies if the matrix B has been reshaped.
std::vector< MemoryInfo > MemoryRequirements
static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset)
Static function to check if given info will lead to a valid configuration.
static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration.
bool is_quantized_per_channel
GEMMLowp quantized per-channel flag.
std::vector< int32_t > gemmlowp_shifts
GEMMLowp output stage shift used for quantizing to QASYMM8.
1 channel, 1 S32 per channel
static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration.
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info=GEMMInfo())
Static function to check if given info will lead to a valid configuration.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
unsigned int m
Number of LHS rows.
GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
Select gemm config based on default heuristics.
std::string to_string(const ROIPoolingLayerInfo &pool_info)
Formatted output of the ROIPoolingInfo type.
#define ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(fmt,...)
Log information level formatted message to the core system logger.
unsigned int n
Number of RHS columns.
void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info=GEMMInfo())
Initialise the kernel's inputs, output.
void run(ITensorPack &tensors) override
Run the kernels contained in the function.
Casts a given tensor to a new type.
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
GEMM RHS (Right Hand Side) matrix information.
int32_t b_offset
Offset to be added to each element of the matrix B.
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
Static function to check if given info will lead to a valid configuration.
OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B...
quantized, asymmetric fixed-point 8-bit number unsigned
void unmap(cl::CommandQueue &q)
Enqueue an unmap operation of the allocated and mapped buffer on the given queue. ...
size_t total_size() const override
Returns the total size of the tensor in bytes.
void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush=true)
Schedule the execution of the passed kernel if possible.
std::vector< int32_t > gemmlowp_multipliers
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
GEMMLowpOutputStageInfo output_stage
GEMMLowp output stage information.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type.
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
GEMMLowp output stage info.
OpenCL kernel used to add the offset contribution after the matrix multiplication.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on default heuristics.
cl::CommandQueue & queue()
Accessor for the associated CL command queue.
bool reinterpret_input_as_3d() const
Flag which specifies if the input tensor has to be reinterpreted as 3D.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has b...
quantized, symmetric fixed-point 8-bit number
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
bool is_a_reshaped() const
Flag which specifies if the matrix A has been reshaped.
quantized, symmetric per channel fixed-point 8-bit number
TensorShape compute_reductionB_shape(const ITensorInfo &a)
Calculate the reductionB shape used in GEMMLowp.
int32_t a_offset
Offset to be added to each element of the matrix A.
unsigned int k
Number of rows for the rhs matrix.
ITensor * get_tensor(int id)
Get tensor of a given id from the pac.
Interface for OpenCL tensor.
GEMMRHSMatrixInfo rhs_info
RHS matrix information used for reshaping the RHS matrix.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication In particular...
GPUTarget
Available GPU Targets.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
Static function to check if given info will lead to a valid configuration.
Native GEMM kernel with configurable block size.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on mlgo heuristics.
unsigned int k
Number of LHS columns or RHS rows.
unsigned int m
Number of rows for the lhs matrix.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
bool reshape_b_only_on_first_run() const
Flag which specifies if the reshape of matrix B should executed only for the first.
OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A...
int offset_int_vec(int offset)
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
quantized, asymmetric fixed-point 8-bit number signed
unsigned int n
Number of columns for the rhs matrix.
DataType output_data_type
Output tensor data type to use if the output is not initialized.
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
Static function to check if given info will lead to a valid configuration.
ClGemmLowpMatrixMultiplyCore()
GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on default heuristics.