OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. More...

#include <CLGEMMMatrixMultiplyKernel.h>

Collaboration diagram for CLGEMMMatrixMultiplyKernel:

Public Member Functions
	CLGEMMMatrixMultiplyKernel ()
	Default constructor. More...

	CLGEMMMatrixMultiplyKernel (const CLGEMMMatrixMultiplyKernel &)=delete
	Prevent instances of this class from being copied (As this class contains pointers) More...

CLGEMMMatrixMultiplyKernel &	operator= (const CLGEMMMatrixMultiplyKernel &)=delete
	Prevent instances of this class from being copied (As this class contains pointers) More...

	CLGEMMMatrixMultiplyKernel (CLGEMMMatrixMultiplyKernel &&)=default
	Allow instances of this class to be moved. More...

CLGEMMMatrixMultiplyKernel &	operator= (CLGEMMMatrixMultiplyKernel &&)=default
	Allow instances of this class to be moved. More...

void	configure (const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta=0.f, bool is_interleaved_transposed=true, const GEMMReshapeInfo &reshape_info=GEMMReshapeInfo(), bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
	Initialise the kernel's input, output and alpha. More...

void	configure (const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta=0.f, bool is_interleaved_transposed=true, const GEMMReshapeInfo &reshape_info=GEMMReshapeInfo(), bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
	Initialise the kernel's input, output and alpha. More...

void	run (const Window &window, cl::CommandQueue &queue) override
	Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. More...

Public Member Functions inherited from ICLKernel
	ICLKernel ()
	Constructor. More...

cl::Kernel &	kernel ()
	Returns a reference to the OpenCL kernel of this object. More...

template<typename T >
void	add_1D_array_argument (unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
	Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx. More...

void	add_1D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx. More...

void	add_1D_tensor_argument_if (bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true. More...

void	add_2D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx. More...

void	add_2D_tensor_argument_if (bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true. More...

void	add_3D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx. More...

void	add_4D_tensor_argument (unsigned int &idx, const ICLTensor *tensor, const Window &window)
	Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx. More...

virtual void	run_op (ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
	Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. More...

template<typename T >
void	add_argument (unsigned int &idx, T value)
	Add the passed parameters to the object's kernel's arguments starting from the index idx. More...

void	set_lws_hint (const cl::NDRange &lws_hint)
	Set the Local-Workgroup-Size hint. More...

cl::NDRange	lws_hint () const
	Return the Local-Workgroup-Size hint. More...

void	set_wbsm_hint (const cl_int &wbsm_hint)
	Set the workgroup batch size modifier hint. More...

cl_int	wbsm_hint () const
	Return the workgroup batch size modifier hint. More...

const std::string &	config_id () const
	Get the configuration ID. More...

void	set_target (GPUTarget target)
	Set the targeted GPU architecture. More...

void	set_target (cl::Device &device)
	Set the targeted GPU architecture according to the CL device. More...

GPUTarget	get_target () const
	Get the targeted GPU architecture. More...

size_t	get_max_workgroup_size ()
	Get the maximum workgroup size for the device the CLKernelLibrary uses. More...

template<unsigned int dimension_size>
void	add_tensor_argument (unsigned &idx, const ICLTensor *tensor, const Window &window)

template<typename T , unsigned int dimension_size>
void	add_array_argument (unsigned &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
	Add the passed array's parameters to the object's kernel's arguments starting from the index idx. More...

Public Member Functions inherited from IKernel
	IKernel ()
	Constructor. More...

virtual	~IKernel ()=default
	Destructor. More...

virtual bool	is_parallelisable () const
	Indicates whether or not the kernel is parallelisable. More...

virtual BorderSize	border_size () const
	The size of the border for that kernel. More...

const Window &	window () const
	The maximum window the kernel can be executed on. More...

Static Public Member Functions
static Status	validate (const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision=false, const ActivationLayerInfo &activation_info=ActivationLayerInfo())
	Static function to check if given info will lead to a valid configuration of CLGEMMMatrixMultiplyKernel. More...

Static Public Member Functions inherited from ICLKernel
static constexpr unsigned int	num_arguments_per_1D_array ()
	Returns the number of arguments enqueued per 1D array object. More...

static constexpr unsigned int	num_arguments_per_1D_tensor ()
	Returns the number of arguments enqueued per 1D tensor object. More...

static constexpr unsigned int	num_arguments_per_2D_tensor ()
	Returns the number of arguments enqueued per 2D tensor object. More...

static constexpr unsigned int	num_arguments_per_3D_tensor ()
	Returns the number of arguments enqueued per 3D tensor object. More...

static constexpr unsigned int	num_arguments_per_4D_tensor ()
	Returns the number of arguments enqueued per 4D tensor object. More...

static cl::NDRange	gws_from_window (const Window &window)
	Get the global work size given an execution window. More...

Data Fields
const ICLTensor *	_input0

const ICLTensor *	_input1

const ICLTensor *	_input2

ICLTensor *	_output

bool	_slide_matrix_b

bool	_reinterpret_input_as_3d

bool	_reinterpret_output_as_3d

bool	_add_bias

bool	_broadcast_bias

Detailed Description

OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided.

All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result. For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object

Note: If the input tensors input0 and input1 have been reshaped respectively with CLGEMMReshapeLHSMatrixKernel and CLGEMMReshapeRHSMatrixKernel, the flag is_interleaved_transposed must be set to true

Attention: input1 tensor must have at least 2 dimensions (matrix)

Definition at line 42 of file CLGEMMMatrixMultiplyKernel.h.

Constructor & Destructor Documentation

◆ CLGEMMMatrixMultiplyKernel() [1/3]

CLGEMMMatrixMultiplyKernel ( )

Default constructor.

Definition at line 269 of file CLGEMMMatrixMultiplyKernel.cpp.

     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),
       _broadcast_bias(false)
 {
 }

◆ CLGEMMMatrixMultiplyKernel() [2/3]

CLGEMMMatrixMultiplyKernel ( const CLGEMMMatrixMultiplyKernel & )

delete

Prevent instances of this class from being copied (As this class contains pointers)

◆ CLGEMMMatrixMultiplyKernel() [3/3]

CLGEMMMatrixMultiplyKernel ( CLGEMMMatrixMultiplyKernel && )

default

Allow instances of this class to be moved.

Member Function Documentation

◆ configure() [1/2]

void configure	(	const ICLTensor *	input0,
		const ICLTensor *	input1,
		const ICLTensor *	input2,
		ICLTensor *	output,
		float	alpha,
		float	beta = `0.f`,
		bool	is_interleaved_transposed = `true`,
		const GEMMReshapeInfo &	reshape_info = `GEMMReshapeInfo()`,
		bool	fp_mixed_precision = `false`,
		const ActivationLayerInfo &	activation_info = `ActivationLayerInfo()`
	)

Initialise the kernel's input, output and alpha.

Parameters

[in]	input0	Input tensor containing the Matrix A. Data types supported: F16/F32
[in]	input1	Input tensor containing the Matrix B. Data type supported: same as `input0`
[in]	input2	Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as `input0`
[out]	output	Output tensor to store the result of matrix multiplication. Data type supported: same as `input0`
[in]	alpha	Weight of the matrix product
[in]	beta	(Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
[in]	is_interleaved_transposed	(Optional) True if input0 and input1 have been reshaped respectively using CLGEMMReshapeLHSMatrixKernel and CLGEMMReshapeRHSMatrixKernel
[in]	reshape_info	(Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
[in]	fp_mixed_precision	(Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
[in]	activation_info	(Optional) Activation to apply after the matrix multiplication

Definition at line 275 of file CLGEMMMatrixMultiplyKernel.cpp.

References CLKernelLibrary::get().

 {
     configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
 }

◆ configure() [2/2]

void configure	(	const CLCompileContext &	compile_context,
		const ICLTensor *	input0,
		const ICLTensor *	input1,
		const ICLTensor *	input2,
		ICLTensor *	output,
		float	alpha,
		float	beta = `0.f`,
		bool	is_interleaved_transposed = `true`,
		const GEMMReshapeInfo &	reshape_info = `GEMMReshapeInfo()`,
		bool	fp_mixed_precision = `false`,
		const ActivationLayerInfo &	activation_info = `ActivationLayerInfo()`
	)

Initialise the kernel's input, output and alpha.

Parameters

[in]	compile_context	The compile context to be used.
[in]	input0	Input tensor containing the Matrix A. Data types supported: F16/F32
[in]	input1	Input tensor containing the Matrix B. Data type supported: same as `input0`
[in]	input2	Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as `input0`
[out]	output	Output tensor to store the result of matrix multiplication. Data type supported: same as `input0`
[in]	alpha	Weight of the matrix product
[in]	beta	(Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
[in]	is_interleaved_transposed	(Optional) True if input0 and input1 have been reshaped respectively using CLGEMMReshapeLHSMatrixKernel and CLGEMMReshapeRHSMatrixKernel
[in]	reshape_info	(Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
[in]	fp_mixed_precision	(Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
[in]	activation_info	(Optional) Activation to apply after the matrix multiplication

Definition at line 281 of file CLGEMMMatrixMultiplyKernel.cpp.

 {
     // Overview of this function, in order:
     //   1. validate the arguments and remember the tensors and reshape flags,
     //   2. configure the execution window,
     //   3. assemble the OpenCL build options and choose the kernel name,
     //   4. create the kernel and compose the config id used for LWS tuning.
     // input2 is optional and is therefore not part of the null-pointer check below.
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
                                                   is_interleaved_transposed, reshape_info, fp_mixed_precision));
 
     // Padding information captured before configuration; it is checked again with has_padding_changed() at the end of this function.
     // input1 is only included when the inputs have been reshaped.
     auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output });
 
     _input0                   = input0;
     _input1                   = input1;
     // The bias is ignored when beta is zero, even if a tensor was passed
     _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
     _output                   = output;
     _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
     _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = reshape_info.broadcast_bias();
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
     // (The test is an equality, so it also fires when both flags are already false, where it changes nothing.)
     if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
     }
 
     // Check if we need to slide the matrix B
     // When input0 is reinterpreted as 3D, one of its dimensions is not counted for this comparison.
     const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
 
     _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
 
     const DataType data_type = input0->info()->data_type();
 
     // Get target architecture
     GPUTarget gpu_target = get_target();
 
     // Filled in by validate_and_configure_window() below; read afterwards as m0 (y) and n0 (x)
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
                                                     gpu_target, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
     // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
     // This means that the actual m used by the kernel is given by output->info()->dimension(1)
     const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1);
     const unsigned int n          = output->info()->dimension(0);
 
     // Height and depth passed to the kernel for the 3D reinterpretation: taken from the output when the output is reinterpreted, otherwise from input0
     const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
     const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
 
     const unsigned int m0 = num_elements_processed.y();
     const unsigned int n0 = num_elements_processed.x();
 
     // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
     const unsigned int partial_store_m0 = internal_m % m0;
     const unsigned int partial_store_n0 = n % n0;
 
     // Create build options
     CLBuildOptions build_opts;
 
     // ALPHA is only defined when alpha differs from one; BETA only when the bias is actually used
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
     build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
     build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     // The depth of matrix B is only passed when matrix B does not slide along z
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     // Activation type and its two parameters are only defined when an activation is enabled
     build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
     build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
     build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
 
     const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
 
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
         const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
         const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
 
         // K is derived from the width of the reshaped matrix B divided by n0 and the transpose multiplier
         build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
         build_opts.add_option("-DN=" + support::cpp11::to_string(n));
         build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width)));
         build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
         build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
         build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
         build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
 
         if(is_data_type_float(data_type) && is_bifrost)
         {
             kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
         }
         else
         {
             kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
             if(fp_mixed_precision && data_type == DataType::F16)
             {
                 // currently wider accumulator is only supported for fp16 kernels.
                 kernel_name += "_acc32";
             }
         }
     }
     else // The input tensors have not been reshaped
     {
         // Here K is read directly from the width of input0
         build_opts.add_option("-DN=" + support::cpp11::to_string(n));
         build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0)));
         build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
         build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
         build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
         build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
         build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
 
         // Create kernels according to the architecture, data type and input size.
         if(is_data_type_float(data_type) && is_bifrost)
         {
             // Base name; a suffix is appended below except for a 1D input0 that does not take the "_bifrost_1000" branch
             kernel_name = "gemm_mm_floating_point";
 
             if(input0->info()->num_dimensions() != 1)
             {
                 kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
                 if(fp_mixed_precision && data_type == DataType::F16)
                 {
                     // currently wider accumulator is only supported for fp16 kernels.
                     kernel_name += "_acc32";
                 }
             }
             else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
             {
                 // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
                 // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
                 // FC6 and FC7 of AlexNet and VGG-16).
                 kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
             }
 
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
             // via exhaustive autotuning over a range of representative layer configurations.
             set_lws_hint(cl::NDRange(4));
         }
         else // (MIDGARD and F32) or (F16)
         {
             kernel_name = "gemm_mm_floating_point";
         }
     }
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set config_id for enabling LWS tuning
     // The id concatenates the enabled options, the data type, the four output dimensions (in the order 1, 0, 2, 3)
     // and one dimension of input1 (dimension 0 when reshaped, dimension 1 otherwise).
     _config_id = "gemm_";
     _config_id += (is_interleaved_transposed ? "reshaped_" : "");
     _config_id += (_add_bias ? "add_bias_" : "");
     _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
     _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
     _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
     _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
     _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(2));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(3));
     _config_id += "_";
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 
     // Configuration must not have altered the padding captured at the top of this function
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }

◆ operator=() [1/2]

CLGEMMMatrixMultiplyKernel& operator= ( const CLGEMMMatrixMultiplyKernel & )

delete

Prevent instances of this class from being copied (As this class contains pointers)

◆ operator=() [2/2]

CLGEMMMatrixMultiplyKernel& operator= ( CLGEMMMatrixMultiplyKernel && )

default

Allow instances of this class to be moved.

◆ run()

void run	(	const Window &	window,
		cl::CommandQueue &	queue
	)

overridevirtual

Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.

Note: The queue is not flushed by this method, and therefore the kernel will not have been executed by the time this method returns.

Parameters

[in]	window	Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
[in,out]	queue	Command queue on which to enqueue the kernel.

Reimplemented from ICLKernel.

Definition at line 480 of file CLGEMMMatrixMultiplyKernel.cpp.

References CLGEMMMatrixMultiplyKernel::_add_bias, CLGEMMMatrixMultiplyKernel::_input0, CLGEMMMatrixMultiplyKernel::_input1, CLGEMMMatrixMultiplyKernel::_input2, CLGEMMMatrixMultiplyKernel::_output, CLGEMMMatrixMultiplyKernel::_reinterpret_input_as_3d, CLGEMMMatrixMultiplyKernel::_reinterpret_output_as_3d, CLGEMMMatrixMultiplyKernel::_slide_matrix_b, ICLKernel::add_2D_tensor_argument(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW, ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL, BorderSize::bottom, Window::DimX, Window::DimY, arm_compute::enqueue(), Window::first_slice_window_3D(), ITensor::info(), ICLKernel::lws_hint(), ICLKernel::num_arguments_per_2D_tensor(), ITensorInfo::num_dimensions(), ITensorInfo::padding(), Window::set(), arm_compute::test::validation::reference::slice(), Window::slide_window_slice_3D(), ITensorInfo::strides_in_bytes(), BorderSize::top, and IKernel::window().

 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
     if(_input1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
     }
 
     Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
     const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;
 
     if(_reinterpret_input_as_3d)
     {
         // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
         const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
     if(_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
         const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
     do
     {
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
         if(!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
 
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         if(_add_bias)
         {
             add_2D_tensor_argument(idx, _input2, slice);
         }
         add_2D_tensor_argument(idx, _output, slice);
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
         if(_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
         }
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window.slide_window_slice_3D(slice));
 }

◆ validate()

Status validate	(	const ITensorInfo *	input0,
		const ITensorInfo *	input1,
		const ITensorInfo *	input2,
		const ITensorInfo *	output,
		float	alpha,
		float	beta,
		bool	is_interleaved_transposed,
		const GEMMReshapeInfo &	reshape_info,
		GPUTarget	gpu_target,
		bool	fp_mixed_precision = `false`,
		const ActivationLayerInfo &	activation_info = `ActivationLayerInfo()`
	)

static

Static function to check if given info will lead to a valid configuration of CLGEMMMatrixMultiplyKernel.

Parameters

[in]	input0	Input tensor containing the Matrix A info. Data types supported: F16/F32
[in]	input1	Input tensor containing the Matrix B info. Data type supported: same as `input0`
[in]	input2	Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as `input0`
[in]	output	Output tensor to store the result of matrix multiplication. Data type supported: same as `input0`
[in]	alpha	Weight of the matrix product
[in]	beta	Weight of vector C. Default value is 0. Only beta = 1 is currently supported.
[in]	is_interleaved_transposed	True if input0 and input1 have been reshaped respectively using CLGEMMReshapeLHSMatrixKernel and CLGEMMReshapeRHSMatrixKernel
[in]	reshape_info	GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
[in]	gpu_target	GPU Target
[in]	fp_mixed_precision	(Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
[in]	activation_info	(Optional) Activation to apply after the matrix multiplication

Returns: a status

Definition at line 458 of file CLGEMMMatrixMultiplyKernel.cpp.

References ARM_COMPUTE_RETURN_ON_ERROR, ARM_COMPUTE_UNUSED, ICloneable< T >::clone(), and arm_compute::validate_arguments().

 {
     // Note: num_elements_processed will be set in validate_and_configure_window()
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(activation_info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                               input1->clone().get(),
                                                               (input2 != nullptr) ? input2->clone().get() : nullptr,
                                                               output->clone().get(),
                                                               beta,
                                                               is_interleaved_transposed,
                                                               reshape_info,
                                                               gpu_target,
                                                               num_elements_processed)
                                 .first);
 
     return Status{};
 }