24.02.1
|
Go to the documentation of this file.
48 using ElementsProcessed = Steps;
51 const ITensorInfo *src1,
52 const ITensorInfo *src2,
53 const ITensorInfo *
dst,
56 const GEMMLHSMatrixInfo &lhs_info,
57 const GEMMRHSMatrixInfo &rhs_info,
58 const GEMMKernelInfo &gemm_info)
66 "The number of dimensions for the LHS matrix must be <= 4");
68 "The number of dimensions for the RHS matrix must be <= 3");
72 "Only 2,3,4,8,16 are supported for k0");
75 "Only 2,3,4,8,16 are supported for n0");
77 (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 !=
nullptr) &&
78 (!gemm_info.broadcast_bias),
79 "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
83 const unsigned int m = gemm_info.m;
84 const unsigned int n = gemm_info.n;
85 const unsigned int k = gemm_info.k;
87 TensorShape tensor_shape1{src1->tensor_shape()};
88 tensor_shape1.set(0, n);
89 tensor_shape1.set(1, k);
93 const unsigned int src2_dim0 = src2->dimension(0);
94 const unsigned int src2_dim1 = src2->dimension(1);
97 if (gemm_info.broadcast_bias)
100 "Incorrect dimension of bias matrix which is to be broadcasted");
108 const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
110 const TensorInfo tensor_info_reshaped1 =
114 if (gemm_info.reinterpret_input_as_3d)
124 if (
dst->total_size() != 0)
126 const TensorInfo tensor_info_dst =
139 const GEMMLHSMatrixInfo &lhs_info,
140 const GEMMRHSMatrixInfo &rhs_info,
141 const GEMMKernelInfo &gemm_info,
142 ElementsProcessed &num_elements_processed)
145 unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
146 unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
147 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
148 bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
153 if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
155 reinterpret_output_as_3d =
false;
158 TensorInfo tmp_info(*
dst);
160 if (reinterpret_output_as_3d)
164 TensorShape tmp_shape(
dst->tensor_shape());
165 tmp_shape.collapse(2U, 1U);
166 tmp_info.set_tensor_shape(tmp_shape);
170 num_elems_processed_per_iteration_x = rhs_info.n0;
171 num_elems_processed_per_iteration_y = lhs_info.m0;
174 calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
178 const unsigned int dimension_to_collapse = std::min(
static_cast<unsigned int>(
dst->num_dimensions()), 2u);
179 Window collapsed = win.collapse(win, dimension_to_collapse);
212 _add_bias = src2 !=
nullptr;
220 if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
222 _reinterpret_input_as_3d =
false;
223 _reinterpret_output_as_3d =
false;
228 _slide_matrix_b = (src1->
num_dimensions() >= num_dimensions_src0);
230 ElementsProcessed num_elements_processed{};
234 (src2 !=
nullptr) ? src2->
clone().get() :
nullptr,
dst->clone().get(),
235 lhs_info, rhs_info, gemm_info, num_elements_processed);
236 ICLKernel::configure_internal(win);
241 const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.
m :
dst->dimension(1);
244 const unsigned int h_gemm_3d = _reinterpret_output_as_3d ?
dst->dimension(1) : src0->
dimension(1);
245 const unsigned int d_gemm_3d = _reinterpret_output_as_3d ?
dst->dimension(2) : src0->
dimension(2);
249 const unsigned int internal_m0 = std::min(internal_m, lhs_info.
m0);
252 const unsigned int partial_store_m0 = internal_m % internal_m0;
253 const unsigned int partial_store_n0 = gemm_info.
n % rhs_info.
n0;
267 build_opts.
add_option_if(_use_dummy_work_items,
"-DDUMMY_WORK_ITEMS");
278 build_opts.
add_option_if(_reinterpret_input_as_3d,
"-DREINTERPRET_INPUT_AS_3D");
279 build_opts.
add_option_if(_reinterpret_output_as_3d,
"-DREINTERPRET_OUTPUT_AS_3D");
280 build_opts.
add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
282 build_opts.
add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
287 "-DACTIVATION_TYPE=" +
294 std::string
kernel_name(
"gemm_mm_reshaped_only_rhs_");
307 _config_id += (_has_pad_y ?
"" :
"no_pad_y_");
308 _config_id += (_add_bias ?
"add_bias_" :
"");
309 _config_id += (gemm_info.
broadcast_bias ?
"broadcast_bias_" :
"");
310 _config_id += (_reinterpret_input_as_3d ?
"3di_" :
"");
311 _config_id += (_reinterpret_output_as_3d ?
"3do_" :
"");
352 cl::CommandQueue &queue)
368 if (src1->info()->num_dimensions() < 3)
374 const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
375 const size_t rhs_idx_batch_size = 2u;
376 const size_t bia_idx_batch_size = 2u;
377 const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
386 const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
387 const unsigned int total_cross_plane_pad_out =
dst->info()->padding().top +
dst->info()->padding().bottom;
390 ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
392 cl::Image2D src1_image2d;
394 if (_export_to_cl_image)
396 const TensorShape shape2d(src1->info()->dimension(0) / 4,
397 src1->info()->dimension(1) * src1->info()->dimension(2));
398 const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
409 if (!_slide_matrix_b)
411 slice_b = slice_matrix_b;
414 unsigned int idx = 0;
420 if (_export_to_cl_image)
422 _kernel.setArg(idx++, src1_image2d);
436 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
439 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
444 _kernel.setArg<cl_uint>(idx++,
445 static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
449 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(
dst->info()->strides_in_bytes()[out_idx_batch_size]));
452 if (_reinterpret_input_as_3d && _has_pad_y)
454 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(total_cross_plane_pad_lhs));
458 if (_reinterpret_output_as_3d && _has_pad_y)
460 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(total_cross_plane_pad_out));
464 _kernel.setArg<cl_int>(idx++, _m);
465 _kernel.setArg<cl_int>(idx++, _n);
466 _kernel.setArg<cl_int>(idx++, _k);
std::string to_string(T &&value)
Convert integer and float values to string.
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
unsigned int m
Number of LHS rows.
unsigned int n0
Number of columns processed by the matrix multiplication.
const StringSet & options() const
Gets the current options list set.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
bool preferred_dummy_work_items_support(const cl::Device &device)
Helper function to check if "dummy work-items" are preferred to have a power of two NDRange In case d...
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
std::string lower_string(const std::string &val)
Lower a given string.
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix.
bool export_to_cl_image
True if the reshaped rhs has to be exported to cl_image.
void configure(const ClCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Initialise the kernel's input and output.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
ClGemmMatrixMultiplyReshapedOnlyRhsKernel()
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
ActivationFunction activation() const
Get the type of activation function.
std::string upper_string(const std::string &val)
Raise a given string to upper case.
Descriptor used by the GEMM kernels.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
#define ARM_COMPUTE_ERROR_THROW_ON(status)
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f checking if the difference is within a range define...
bool enabled() const
Check if initialised.
unsigned int n
Number of RHS columns.
void add_option(std::string option)
Adds option to the existing build option list.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
unsigned int m0
Number of rows processed by the matrix multiplication.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
bool broadcast_bias
Flag used to broadcast the bias addition.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
std::pair< Status, Window > validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst)
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Describe one of the image's dimensions with a start, end and step.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
unsigned int k
Number of LHS columns or RHS rows.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Window first_slice_window_3D() const
First 3D slice of the window.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
const Window & window() const
The maximum window the kernel can be executed on.
bool has_pad_y
Flag used to indicate whether the input/output tensors have internal padding in the y direction.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
float a() const
Get the alpha value.
GEMM LHS (Left Hand Side) matrix information.
Describe a multidimensional execution window.
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type)
Create a cl::Image2D object from an OpenCL buffer.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Copyright (c) 2017-2024 Arm Limited.
@ F16
16-bit floating-point number
bool has_padding_changed(const std::unordered_map< const ITensorInfo *, PaddingSize > &padding_map)
Check if the previously stored padding info has changed after configuring a kernel.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
const std::string & string_from_activation_func(const ActivationFunction &act)
Translates a given activation function to a string.
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
Store the tensor's metadata.
@ F32
32-bit floating-point number
unsigned int depth_output_gemm3d
Depth of the output tensor in case it is reinterpreted as 3D.
GEMM RHS (Right Hand Side) matrix information.
float b() const
Get the beta value.
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
@ GEMM
GEMM CL kernel type.
std::unordered_map< const ITensorInfo *, PaddingSize > get_padding_info(std::initializer_list< const ITensorInfo * > infos)
Stores padding information before configuring a kernel.
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
bool is_one(float a, float epsilon=0.00001f)
Checks if the input floating point number is 1.0f checking if the difference is within a range define...
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.