93 if(validate_gemm_kernel(gemm_kernel.gemm_type))
96 return gemm_kernel.gemm_type;
101 return gemm_kernel.gemm_type;
104 inline bool validate_lhs_rhs_info_reshaped_only_rhs(
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const ITensorInfo *a,
const ITensorInfo *
b,
const ITensorInfo *c,
105 const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
108 TensorInfo tmp_b_info{};
116 gemm_kernel_info.lhs_info = lhs_info;
117 gemm_kernel_info.rhs_info = rhs_info;
118 gemm_kernel_info.has_pad_y =
false;
123 gemm_kernel_info.has_pad_y =
true;
132 inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(
auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info,
const ITensorInfo *a,
133 const ITensorInfo *b,
134 const ITensorInfo *c,
const ITensorInfo *output)
139 if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
142 return { config.lhs_info, config.rhs_info };
147 return { config.lhs_info, config.rhs_info };
151 inline bool validate_lhs_rhs_info_reshaped(
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const ITensorInfo *a,
const ITensorInfo *b,
const ITensorInfo *c,
152 const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info,
bool reinterpret_input_as_3d)
155 TensorInfo tmp_a_info{};
156 TensorInfo tmp_b_info{};
172 gemm_kernel_info.lhs_info = lhs_info;
173 gemm_kernel_info.rhs_info = rhs_info;
182 inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(
auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info,
const ITensorInfo *a,
const ITensorInfo *b,
183 const ITensorInfo *c,
const ITensorInfo *output,
bool reinterpret_input_as_3d)
188 if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
191 return { config.lhs_info, config.rhs_info };
196 return { config.lhs_info, config.rhs_info };
209 _reshape_b_only_on_first_run(false),
212 _aux_mem(AuxTensorIdx::Count)
225 _mm_kernel->set_target(gpu_target);
230 _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta,
false, reshape_info, gemm_info.
fp_mixed_precision(), gemm_info.
activation_info());
245 int mult_transpose1xW_width = 1;
246 int mult_interleave4x4_height = 1;
249 _reshape_lhs_kernel->set_target(gpu_target);
250 _mm_kernel->set_target(gpu_target);
254 mult_transpose1xW_width = 4;
255 mult_interleave4x4_height = 2;
261 rhs_info.
h0 = mult_transpose1xW_width;
268 lhs_info.
v0 = mult_interleave4x4_height;
272 GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d,
false, gemm_info.
broadcast_bias());
275 _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
278 _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
281 _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta,
true, reshape_info, gemm_info.
fp_mixed_precision(), gemm_info.
activation_info());
287 _aux_mem[RhsReshape] =
MemoryInfo(
offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.
total_size());
298 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
313 _reshape_lhs_kernel->set_target(gpu_target);
314 _mm_kernel->set_target(gpu_target);
320 std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(
auto_heuristics::CommonQuery{ gpu_target,
data_type, m, n, k, batch_size }, kernel_info, a,
b,
324 _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
327 _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
331 _aux_mem[RhsReshape] =
MemoryInfo(
offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.
total_size());
342 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
357 _mm_kernel->set_target(gpu_target);
363 std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
auto_heuristics::CommonQuery{ gpu_target,
data_type, m, n, k, batch_size }, kernel_info, a,
b, c, output);
366 _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
373 kernel_info.has_pad_y =
false;
374 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
377 kernel_info.has_pad_y =
true;
378 _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
381 _aux_mem[RhsReshape] =
MemoryInfo(
offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.
total_size());
419 int mult_transpose1xW_width = 1;
420 int mult_interleave4x4_height = 1;
425 mult_transpose1xW_width = 4;
426 mult_interleave4x4_height = 2;
432 rhs_info.
h0 = mult_transpose1xW_width;
439 lhs_info.
v0 = mult_interleave4x4_height;
475 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
494 lhs_info = gemm_config.lhs_info;
495 rhs_info = gemm_config.rhs_info;
523 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
542 lhs_info = gemm_config.lhs_info;
543 rhs_info = gemm_config.rhs_info;
573 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
583 switch(_gemm_kernel_type)
587 configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
592 configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
597 configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
602 configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
619 const unsigned int batch_size = reinterpret_input_as_3d ? a->
dimension(3) : a->
dimension(2);
630 const ITensorInfo *c_to_use = fuse_add_c ? c :
nullptr;
632 switch(gemm_kernel_type)
679 switch(_gemm_kernel_type)
693 if(!_reshape_b_only_on_first_run)
714 if(!_reshape_b_only_on_first_run)
724 bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
752 if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 !=
nullptr && rhs_aux !=
nullptr) && rhs_aux)
unsigned int top
top of the border
bool broadcast_bias
Flag used to broadcast the bias addition.
bool constant_weights() const
Flag which specifies if the values of the weights tensor are constant throughout multiple executions ...
GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
Select gemm config based on default heuristics.
Descriptor used by the GEMM kernels.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
static CLScheduler & get()
Access the scheduler singleton.
#define ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(msg)
Log an information message to the logger with function name before the message.
unsigned int v0
Number of vertical blocks of size (m0xk0) stored on the same output row.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
GPUTarget target() const
Get the target GPU.
unsigned int depth_output_gemm3d
Depth of the output tensor in case is reinterpreted as 3D.
GEMM reshape information class.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
Initialise the kernel's inputs and output.
OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped...
A collection of adaptor functions that enable the auto selection between mlgo-based heuristics and de...
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
bool fp_mixed_precision() const
Flag which specifies if a wider accumulator should be used.
GEMM LHS (Left Hand Side) matrix information.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
unsigned int bottom
bottom of the border
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
Reshaped GEMM kernel where only the rhs matrix is reshaped.
int depth_output_gemm3d() const
Depth of the output when GEMM output is reinterpreted as 3D tensor.
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
bool retain_internal_weights() const
Flag which specifies if the weights tensor has to be retained from previous run.
CLGEMMKernelType
OpenCL GEMM kernel types.
Reshaped GEMM kernel where both lhs and rhs matrices are reshaped.
Interface for CPU tensor.
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
bool interleave
True if the v0 (m0xk0) blocks have to be interleaved in the output row.
Copyright (c) 2017-2021 Arm Limited.
std::vector< MemoryInfo > MemoryRequirements
bool transpose
True if the (m0xk0) block has to be transposed before being stored.
GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
Native GEMM kernel with fixed block size.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
unsigned int m
Number of LHS rows.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
std::string to_string(const ROIPoolingLayerInfo &pool_info)
Formatted output of the ROIPoolingInfo type.
#define ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(fmt,...)
Log information level formatted message to the core system logger.
unsigned int n
Number of RHS columns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d=false)
Calculate the Left Hand Side matrix reshaped shape.
GEMM RHS (Right Hand Side) matrix information.
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
Static function to check if given info will lead to a valid configuration.
unsigned int n0
Number of columns processed by the matrix multiplication.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
Static function to check if given info will lead to a valid configuration.
size_t total_size() const override
Returns the total size of the tensor in bytes.
void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush=true)
Schedule the execution of the passed kernel if possible.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d)
Static function to check if given info will lead to a valid configuration.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
GEMMTypeResult select_default_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on default heuristics.
virtual PaddingSize padding() const =0
Padding of tensor.
bool reinterpret_input_as_3d() const
Flag which specifies if the input tensor has to be reinterpreted as 3D.
bool broadcast_bias() const
Flag which specifies whether to broadcast the shape of the bias tensor.
bool has_pad_y
Flag used to indicate if the input/output tensors have internal pad on the y direction.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Interface for OpenCL tensor.
OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication In particular...
GPUTarget
Available GPU Targets.
GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_only_on_first_run)
Select gemm type based on mlgo heuristics.
unsigned int k
Number of LHS columns or RHS rows.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f checking if the difference is within a range define...
void run(ITensorPack &tensors) override
Run the kernels contained in the function.
OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided.
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
bool reshape_b_only_on_first_run() const
Flag which specifies if the reshape of matrix B should be executed only for the first run.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
void prepare(ITensorPack &constants) override
Prepare the function for executing.
int offset_int_vec(int offset)
unsigned int m0
Number of rows processed by the matrix multiplication.
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on mlgo heuristics.
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
void tune_kernel_static(ICLKernel &kernel)
Tunes OpenCL kernel.
DataType
Available data types.
ActivationLayerInfo activation_info() const
Activation layer to apply after the matrix multiplication.
Reshaped GEMM kernel where both lhs and rhs matrices are reshaped.
OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been r...
GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery &query)
Select gemm config based on default heuristics.
experimental::MemoryRequirements workspace() const override
Return the memory requirements required by the workspace.