24.02.1
|
Go to the documentation of this file.
52 using ElementsProcessed = Steps;
55 constexpr
int mmul_m0 = 4;
56 constexpr
int mmul_n0 = 4;
57 constexpr
int mmul_k0 = 4;
60 const ITensorInfo *src1,
61 const ITensorInfo *src2,
62 const ITensorInfo *
dst,
65 const GEMMLHSMatrixInfo &lhs_info,
66 const GEMMRHSMatrixInfo &rhs_info,
67 const GEMMKernelInfo &gemm_info)
72 "The extension cl_arm_matrix_multiply is not supported on the target platform");
76 "The number of dimensions for the LHS matrix must be <= 4");
78 "The number of dimensions for the RHS matrix must be <= 3");
81 rhs_info.n0 != 8 && rhs_info.n0 != 16,
82 "Only 1,2,3,4,8, and 16 are supported for n0");
86 "Only true is supported for interleave with mmul extension enabled");
88 "Only false is supported for transpose with mmul extension enabled");
92 const unsigned int m = gemm_info.m;
93 const unsigned int n = gemm_info.n;
94 const unsigned int k = gemm_info.k;
103 if (gemm_info.reinterpret_input_as_3d)
113 if (src1->num_dimensions() > 2)
115 if (gemm_info.depth_output_gemm3d != 0)
127 const unsigned int src2_dim0 = src2->dimension(0);
128 const unsigned int src2_dim1 = src2->dimension(1);
131 if (gemm_info.broadcast_bias)
134 "Incorrect dimension of bias matrix which is to be broadcasted");
142 TensorShape tensor_shape1{src1->tensor_shape()};
143 tensor_shape1.set(0, n);
144 tensor_shape1.set(1, k);
146 const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
147 const TensorInfo tensor_info_reshaped1 =
152 if (
dst->total_size() != 0)
154 const TensorInfo tensor_info_dst =
167 const GEMMLHSMatrixInfo &lhs_info,
168 const GEMMRHSMatrixInfo &rhs_info,
169 const GEMMKernelInfo &gemm_info)
172 bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
178 TensorInfo tmp_info(*
dst);
180 if (reinterpret_output_as_3d)
184 TensorShape tmp_shape(
dst->tensor_shape());
185 tmp_shape.collapse(2U, 1U);
186 tmp_info.set_tensor_shape(tmp_shape);
193 const unsigned int dimension_to_collapse = std::min(
static_cast<unsigned int>(
dst->num_dimensions()), 2u);
194 Window collapsed = win.collapse(win, dimension_to_collapse);
197 Window::Dimension x_dimension = collapsed.x();
198 Window::Dimension y_dimension = collapsed.y();
201 const unsigned int ceil_to_multiple_n_n0 =
ceil_to_multiple(x_dimension.end(), rhs_info.n0);
202 const unsigned int ceil_to_multiple_m_m0 =
ceil_to_multiple(y_dimension.end(), lhs_info.m0);
205 const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / rhs_info.n0;
206 const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / lhs_info.m0;
209 const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 =
ceil_to_multiple(n_div_n0, mmul_n0);
210 const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 =
ceil_to_multiple(m_div_m0, mmul_k0);
213 x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_k0);
214 y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0);
219 return std::make_pair(Status{}, collapsed);
248 _add_bias = src2 !=
nullptr;
255 IClKernel::configure_internal(win_config.second);
261 const unsigned int m0_leftover = _m % lhs_info.
m0;
262 const unsigned int n0_leftover = _n % rhs_info.
n0;
290 std::string
kernel_name(
"gemm_mm_reshaped_only_rhs_nt_mmul");
291 kernel_name += _export_to_cl_image ?
"_texture" :
"";
302 _config_id += (_add_bias ?
"add_bias_" :
"");
303 _config_id += (gemm_info.
broadcast_bias ?
"broadcast_bias_" :
"");
332 src2 !=
nullptr ? src2->
clone().get() :
nullptr,
333 dst->clone().get(), lhs_info, rhs_info, gemm_info)
341 cl::CommandQueue &queue)
357 if (src1->info()->num_dimensions() < 3)
363 cl::Image2D src1_image2d;
365 if (_export_to_cl_image)
367 const TensorShape shape2d(src1->info()->dimension(0) / 4,
368 src1->info()->dimension(1) * src1->info()->dimension(2));
369 const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
379 unsigned int idx = 0;
382 if (_export_to_cl_image)
384 _kernel.setArg(idx++, src1_image2d);
397 _kernel.setArg<cl_int>(idx++, _m);
398 _kernel.setArg<cl_int>(idx++, _n);
399 _kernel.setArg<cl_int>(idx++, _k);
403 enqueue(queue, *
this,
slice, cl::NDRange(32, 2),
false);
std::string to_string(T &&value)
Convert integer and float values to string.
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
unsigned int m
Number of LHS rows.
unsigned int n0
Number of columns processed by the matrix multiplication.
const StringSet & options() const
Gets the current options list set.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
std::string lower_string(const std::string &val)
Lower a given string.
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix.
bool arm_matrix_multiply_supported(const cl::Device &device)
Helper function to check whether the cl_arm_matrix_multiply extension is supported.
bool export_to_cl_image
True if the reshaped rhs has to be exported to cl_image.
ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel()
void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor)
Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides,...
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
ActivationFunction activation() const
Get the type of activation function.
std::string upper_string(const std::string &val)
Raise a given string to upper case.
Descriptor used by the GEMM kernels.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f by checking if the difference is within a range defined by epsilon.
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
bool enabled() const
Check if initialised.
unsigned int n
Number of RHS columns.
void add_option(std::string option)
Adds option to the existing build option list.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
unsigned int m0
Number of rows processed by the matrix multiplication.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
bool broadcast_bias
Flag used to broadcast the bias addition.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
std::pair< Status, Window > validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst)
Wrapper to configure the Khronos OpenCL C++ header.
void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Initialize the kernel's input and dst.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
unsigned int k
Number of LHS columns or RHS rows.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger than or equal to value that is a multiple of divisor.
Window first_slice_window_3D() const
First 3D slice of the window.
const Window & window() const
The maximum window the kernel can be executed on.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
float a() const
Get the alpha value.
GEMM LHS (Left Hand Side) matrix information.
Describe a multidimensional execution window.
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type)
Create a cl::Image2D object from an OpenCL buffer.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Copyright (c) 2017-2024 Arm Limited.
@ F16
16-bit floating-point number
bool has_padding_changed(const std::unordered_map< const ITensorInfo *, PaddingSize > &padding_map)
Check if the previously stored padding info has changed after configuring a kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
const std::string & string_from_activation_func(const ActivationFunction &act)
Translates a given activation function to a string.
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
Store the tensor's metadata.
@ F32
32-bit floating-point number
unsigned int depth_output_gemm3d
Depth of the output tensor in case it is reinterpreted as 3D.
GEMM RHS (Right Hand Side) matrix information.
float b() const
Get the beta value.
@ GEMM
GEMM CL kernel type.
std::unordered_map< const ITensorInfo *, PaddingSize > get_padding_info(std::initializer_list< const ITensorInfo * > infos)
Stores padding information before configuring a kernel.
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
bool is_one(float a, float epsilon=0.00001f)
Checks if the input floating point number is 1.0f by checking if the difference is within a range defined by epsilon.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.