49 using ElementsProcessed = Steps;
51 Status validate_arguments(
const ITensorInfo *src0,
const ITensorInfo *src1,
const ITensorInfo *src2,
const ITensorInfo *
dst,
float alpha,
float beta,
52 const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
67 && (!gemm_info.broadcast_bias),
68 "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
72 const unsigned int m = gemm_info.m;
73 const unsigned int n = gemm_info.n;
74 const unsigned int k = gemm_info.k;
76 TensorShape tensor_shape1{ src1->tensor_shape() };
77 tensor_shape1.set(0, n);
78 tensor_shape1.set(1, k);
82 const unsigned int src2_dim0 = src2->dimension(0);
83 const unsigned int src2_dim1 = src2->dimension(1);
86 if(gemm_info.broadcast_bias)
96 const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
101 if(gemm_info.reinterpret_input_as_3d)
111 if(dst->total_size() != 0)
121 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
const GEMMLHSMatrixInfo &lhs_info,
122 const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
124 unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
125 unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
126 bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
127 bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
131 bool window_changed =
false;
136 if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
138 reinterpret_output_as_3d =
false;
144 TensorInfo tmp_info(*dst);
146 if(reinterpret_output_as_3d)
150 TensorShape tmp_shape(dst->tensor_shape());
151 tmp_shape.collapse(2U, 1U);
152 tmp_info.set_tensor_shape(tmp_shape);
156 num_elems_processed_per_iteration_x = rhs_info.n0;
157 num_elems_processed_per_iteration_y = lhs_info.m0;
159 win =
calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
160 win_out =
calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
164 const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
166 AccessWindowStatic src2_access(src2, 0, 0,
175 Window collapsed = win;
176 const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
177 collapsed = win.collapse(win, dimension_to_collapse);
180 return std::make_pair(err, collapsed);
200 _add_bias = src2 !=
nullptr;
208 if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
210 _reinterpret_input_as_3d =
false;
211 _reinterpret_output_as_3d =
false;
216 _slide_matrix_b = (src1->
num_dimensions() >= num_dimensions_src0);
218 ElementsProcessed num_elements_processed{};
221 auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
223 ICLKernel::configure_internal(win_config.second);
228 const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.
m : dst->
dimension(1);
231 const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->
dimension(1) : src0->
dimension(1);
232 const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->
dimension(2) : src0->
dimension(2);
236 const unsigned int internal_m0 = std::min(internal_m, lhs_info.
m0);
239 const unsigned int partial_store_m0 = internal_m % internal_m0;
240 const unsigned int partial_store_n0 = gemm_info.
n % rhs_info.
n0;
251 build_opts.
add_option_if(_use_dummy_work_items,
"-DDUMMY_WORK_ITEMS");
268 build_opts.
add_option_if(_reinterpret_input_as_3d,
"-DREINTERPRET_INPUT_AS_3D");
269 build_opts.
add_option_if(_reinterpret_output_as_3d,
"-DREINTERPRET_OUTPUT_AS_3D");
274 std::string
kernel_name(
"gemm_mm_reshaped_only_rhs_");
275 kernel_name += rhs_info.
transpose ?
"t" :
"nt";
284 _config_id += (_has_pad_y ?
"" :
"no_pad_y_");
285 _config_id += (_add_bias ?
"add_bias_" :
"");
286 _config_id += (gemm_info.
broadcast_bias ?
"broadcast_bias_" :
"");
287 _config_id += (_reinterpret_input_as_3d ?
"3di_" :
"");
288 _config_id += (_reinterpret_output_as_3d ?
"3do_" :
"");
317 ElementsProcessed num_elements_processed{};
321 src2 !=
nullptr ? src2->
clone().get() :
nullptr,
326 num_elements_processed)
345 if(src1->info()->num_dimensions() < 3)
351 const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
352 const size_t rhs_idx_batch_size = 2u;
353 const size_t bia_idx_batch_size = 2u;
354 const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
363 const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
364 const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom;
367 ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
369 cl::Image2D src1_image2d;
371 if(_export_to_cl_image)
373 const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
374 const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
386 slice_b = slice_matrix_b;
389 unsigned int idx = 0;
395 if(_export_to_cl_image)
397 _kernel.setArg(idx++, src1_image2d);
411 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
414 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
419 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
423 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
426 if(_reinterpret_input_as_3d && _has_pad_y)
428 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(total_cross_plane_pad_lhs));
432 if(_reinterpret_output_as_3d && _has_pad_y)
434 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(total_cross_plane_pad_out));
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
bool is_one(float a, float epsilon=0.00001f)
Checks if the input floating point number is 1.0f checking if the difference is within a range define...
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
bool broadcast_bias
Flag used to broadcast the bias addition.
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
const Window & window() const
The maximum window the kernel can be executed on.
Descriptor used by the GEMM kernels.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
bool enabled() const
Check if initialised.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
const StringSet & options() const
Gets the current options list set.
unsigned int depth_output_gemm3d
Depth of the output tensor in case it is reinterpreted as 3D.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
bool preferred_dummy_work_items_support(const cl::Device &device)
Helper function to check if "dummy work-items" are preferred to have a power of two NDRange In case d...
float a() const
Get the alpha value.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
1 channel, 1 F32 per channel
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const std::string & string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
Translates a given activation function to a string.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
GEMM LHS (Left Hand Side) matrix information.
Store the tensor's metadata.
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
ClGemmMatrixMultiplyReshapedOnlyRhsKernel()
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
std::string lower_string(const std::string &val)
Lower a given string.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
bool export_to_cl_image
True if the reshaped rhs has to be exported to cl_image.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
void add_option(std::string option)
Adds option to the existing build option list.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
unsigned int m
Number of LHS rows.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Initialise the kernel's input and output.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
unsigned int n
Number of RHS columns.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix...
GEMM RHS (Right Hand Side) matrix information.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger than or equal to value that is a multiple of divisor.
unsigned int n0
Number of columns processed by the matrix multiplication.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
bool reinterpret_input_as_3d
Flag used to reinterpret the input as 3D.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
bool has_padding_changed(const std::unordered_map< const ITensorInfo *, PaddingSize > &padding_map)
Check if the previously stored padding info has changed after configuring a kernel.
bool has_pad_y
Flag used to indicate if the input/output tensors have internal pad on the y direction.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx...
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
std::unordered_map< const ITensorInfo *, PaddingSize > get_padding_info(std::initializer_list< const ITensorInfo *> infos)
Stores padding information before configuring a kernel.
unsigned int k
Number of LHS columns or RHS rows.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f checking if the difference is within a range define...
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
Create a cl::Image2D object from an OpenCL buffer.
unsigned int m0
Number of rows processed by the matrix multiplication.
ActivationFunction activation() const
Get the type of activation function.
float b() const
Get the beta value.
Window first_slice_window_3D() const
First 3D slice of the window.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)