57 using ElementsProcessed = Steps;
59 Status validate_arguments(
const ITensorInfo *src0,
const ITensorInfo *src1,
const ITensorInfo *src2,
const ITensorInfo *
dst,
float alpha,
float beta,
const GEMMLHSMatrixInfo &lhs_info,
60 const GEMMRHSMatrixInfo &rhs_info,
61 const GEMMKernelInfo &gemm_info)
75 ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3),
"Only 2,3,4,8,16 are supported for m0");
76 ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
"Only 2,3,4,8,16 are supported for n0");
78 && (!gemm_info.broadcast_bias),
79 "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
83 const unsigned int m = gemm_info.m;
84 const unsigned int n = gemm_info.n;
85 const unsigned int k = gemm_info.k;
87 TensorShape tensor_shape0{ src0->tensor_shape() };
88 tensor_shape0.set(0, k);
89 tensor_shape0.set(1, m);
91 TensorShape tensor_shape1{ src1->tensor_shape() };
92 tensor_shape1.set(0, n);
93 tensor_shape1.set(1, k);
97 const unsigned int src2_dim0 = src2->dimension(0);
98 const unsigned int src2_dim1 = src2->dimension(1);
101 if(gemm_info.broadcast_bias)
111 const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
112 const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
120 if(dst->total_size() != 0)
130 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
const GEMMLHSMatrixInfo &lhs_info,
131 const GEMMRHSMatrixInfo &rhs_info,
132 const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
134 unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
135 unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
136 bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
140 bool window_changed =
false;
145 TensorInfo tmp_info(*dst);
147 if(reinterpret_output_as_3d)
151 TensorShape tmp_shape(dst->tensor_shape());
152 tmp_shape.collapse(2U, 1U);
153 tmp_info.set_tensor_shape(tmp_shape);
157 num_elems_processed_per_iteration_x = rhs_info.n0;
158 num_elems_processed_per_iteration_y = lhs_info.m0;
160 win =
calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
161 win_out =
calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
165 const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
167 const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
169 AccessWindowStatic src2_access(src2, 0, 0,
178 Window collapsed = win;
179 const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
180 collapsed = win.collapse(win, dimension_to_collapse);
183 return std::make_pair(err, collapsed);
203 _add_bias = src2 !=
nullptr;
209 _slide_matrix_b = (src1->
num_dimensions() >= num_dimensions_src0);
211 ElementsProcessed num_elements_processed{};
214 auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
216 ICLKernel::configure_internal(win_config.second);
222 const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.
m : dst->
dimension(1);
224 const unsigned int partial_store_m0 = internal_m % lhs_info.
m0;
225 const unsigned int partial_store_n0 = gemm_info.
n % rhs_info.
n0;
232 build_opts.
add_option_if(_reinterpret_output_as_3d,
"-DREINTERPRET_OUTPUT_AS_3D");
240 build_opts.
add_option_if(_use_dummy_work_items,
"-DDUMMY_WORK_ITEMS");
244 build_opts.
add_option_if(enable_mixed_precision,
"-DMIXED_PRECISION");
261 kernel_name += lhs_info.
transpose ?
"lhs_t_" :
"lhs_nt_";
262 kernel_name += rhs_info.
transpose ?
"rhs_t" :
"rhs_nt";
271 _config_id += (_add_bias ?
"add_bias_" :
"");
272 _config_id += (gemm_info.
broadcast_bias ?
"broadcast_bias_" :
"");
273 _config_id += (_reinterpret_output_as_3d ?
"3do_" :
"");
277 _config_id += (enable_mixed_precision ?
"mixed_precision_" :
"");
307 ElementsProcessed num_elements_processed{};
311 src2 !=
nullptr ? src2->
clone().get() :
nullptr,
316 num_elements_processed)
335 if(src1->info()->num_dimensions() < 3)
347 const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
349 cl::Image2D src1_image2d;
351 if(_export_to_cl_image)
353 const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
354 const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
366 slice_b = slice_matrix_b;
369 unsigned int idx = 0;
375 if(_export_to_cl_image)
377 _kernel.setArg(idx++, src1_image2d);
391 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(_k));
394 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
397 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
402 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
406 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
409 if(_reinterpret_output_as_3d)
411 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(total_cross_plane_pad));
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
bool is_one(float a, float epsilon=0.00001f)
Checks if the input floating point number is 1.0f checking if the difference is within a range define...
void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Initialise the kernel's input and output.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
bool broadcast_bias
Flag used to broadcast the bias addition.
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
const Window & window() const
The maximum window the kernel can be executed on.
ClGemmMatrixMultiplyReshapedKernel()
bool fp_mixed_precision
Flag used to indicate wider accumulators (32 bit instead of 16 for FP16).
Descriptor used by the GEMM kernels.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
bool enabled() const
Check if initialised.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
const StringSet & options() const
Gets the current options list set.
unsigned int v0
Number of vertical blocks of size (m0xk0) stored on the same output row.
unsigned int depth_output_gemm3d
Depth of the output tensor in case it is reinterpreted as 3D.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
bool preferred_dummy_work_items_support(const cl::Device &device)
Helper function to check if "dummy work-items" are preferred to have a power of two NDRange In case d...
float a() const
Get the alpha value.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
1 channel, 1 F32 per channel
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const std::string & string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
Translates a given activation function to a string.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
GEMM LHS (Left Hand Side) matrix information.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
ActivationLayerInfo activation_info
Activation function to perform after the matrix multiplication.
std::string lower_string(const std::string &val)
Lower a given string.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
bool interleave
True if the v0 (m0xk0) blocks have to be interleaved in the output row.
bool export_to_cl_image
True if the reshaped rhs has to be exported to cl_image.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
void add_option(std::string option)
Adds option to the existing build option list.
bool transpose
True if the (m0xk0) block has to be transposed before being stored.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
unsigned int m
Number of LHS rows.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
unsigned int n
Number of RHS columns.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d=false)
Calculate the Left Hand Side matrix reshaped shape.
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix...
GEMM RHS (Right Hand Side) matrix information.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger than or equal to value that is a multiple of divisor.
unsigned int n0
Number of columns processed by the matrix multiplication.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
bool has_padding_changed(const std::unordered_map< const ITensorInfo *, PaddingSize > &padding_map)
Check if the previously stored padding info has changed after configuring a kernel.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx...
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
std::unordered_map< const ITensorInfo *, PaddingSize > get_padding_info(std::initializer_list< const ITensorInfo *> infos)
Stores padding information before configuring a kernel.
Wrapper to configure the Khronos OpenCL C++ header.
unsigned int k
Number of LHS columns or RHS rows.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
bool is_zero(float a, float epsilon=0.00001f)
Checks if the input floating point number is 0.0f checking if the difference is within a range define...
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
Create a cl::Image2D object from an OpenCL buffer.
unsigned int m0
Number of rows processed by the matrix multiplication.
ActivationFunction activation() const
Get the type of activation function.
float b() const
Get the beta value.
Window first_slice_window_3D() const
First 3D slice of the window.
DataType
Available data types.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)