48 using namespace misc::shape_calculator;
52 using ElementsProcessed = Steps;
54 Status validate_arguments(
const ITensorInfo *src0,
const ITensorInfo *src1,
const ITensorInfo *
dst,
55 const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMReshapeInfo &gemm_info)
71 const int m = gemm_info.m();
72 const int n = gemm_info.n();
73 const int k = gemm_info.k();
75 TensorShape tensor_shape0{ src0->tensor_shape() };
76 tensor_shape0.set(0, k);
77 tensor_shape0.set(1, m);
79 TensorShape tensor_shape1{ src1->tensor_shape() };
80 tensor_shape1.set(0, n);
81 tensor_shape1.set(1, k);
83 const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
84 const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
92 if(dst->total_size() != 0)
94 const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(
compute_mm_shape(*src0, *src1, gemm_info));
102 std::pair<Status, Window> validate_and_configure_window(
const ITensorInfo *src0,
const ITensorInfo *src1, ITensorInfo *dst,
103 const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMReshapeInfo &gemm_info,
104 ElementsProcessed &num_elements_processed)
106 unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
107 unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
108 bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
113 TensorInfo tmp_info(*dst);
114 if(reinterpret_output_as_3d)
118 TensorShape tmp_shape(dst->tensor_shape());
119 tmp_shape.collapse(2U, 1U);
120 tmp_info.set_tensor_shape(tmp_shape);
124 num_elems_processed_per_iteration_x = rhs_info.n0;
125 num_elems_processed_per_iteration_y = lhs_info.m0;
126 Window win =
calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
130 Window collapsed = win;
131 const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
132 collapsed = win.collapse(win, dimension_to_collapse);
134 return std::make_pair(Status{}, collapsed);
158 ElementsProcessed num_elements_processed{};
161 auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
163 ICLKernel::configure_internal(win_config.second);
166 const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.
m() : dst->
dimension(1);
168 const unsigned int partial_store_m0 = internal_m % lhs_info.
m0;
169 const unsigned int partial_store_n0 = gemm_info.
n() % rhs_info.
n0;
173 build_opts.
add_option_if(_reinterpret_output_as_3d,
"-DREINTERPRET_OUTPUT_AS_3D");
179 build_opts.
add_option_if(_use_dummy_work_items,
"-DDUMMY_WORK_ITEMS");
193 kernel_name += lhs_info.
transpose ?
"lhs_t_" :
"lhs_nt_";
194 kernel_name += rhs_info.
transpose ?
"rhs_t" :
"rhs_nt";
204 _config_id += (_reinterpret_output_as_3d ?
"3do_" :
"");
233 ElementsProcessed num_elements_processed{};
241 num_elements_processed)
256 if(src1->info()->num_dimensions() < 3)
268 if(_reinterpret_output_as_3d)
271 const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
272 const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
273 _kernel.setArg<cl_uint>(idx0,
static_cast<unsigned int>(total_cross_plane_pad));
283 slice_b = slice_matrix_b;
286 unsigned int idx = 0;
287 add_2D_tensor_argument(idx, src0, slice);
288 add_2D_tensor_argument(idx, src1, slice_b);
289 add_2D_tensor_argument(idx, dst, slice);
290 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(_k));
291 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
292 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
293 _kernel.setArg<cl_uint>(idx++,
static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
294 enqueue(queue, *
this, slice, lws_hint(), _use_dummy_work_items);
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
bool dot8_supported(const cl::Device &device)
Helper function to check whether the cl_arm_integer_dot_product_int8 extension is supported...
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
const StringSet & options() const
Gets the current options list set.
unsigned int v0
Number of vertical blocks of size (m0xk0) stored on the same output row.
bool preferred_dummy_work_items_support(const cl::Device &device)
Helper function to check if "dummy work-items" are preferred to have a power of two NDRange In case d...
GEMM reshape information class.
std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL dot8 accumulator type.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
Calculate the matrix multiplication output shape of two tensors.
unsigned int h0
Number of horizontal blocks of size (k0xn0) stored on the same output row.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true, an error message is printed and an exception is thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
GEMM LHS (Left Hand Side) matrix information.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
Initialise the kernel's inputs (src0, src1) and destination (dst).
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
bool interleave
True if the v0 (m0xk0) blocks have to be interleaved in the output row.
Copyright (c) 2017-2021 Arm Limited.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
void add_option(std::string option)
Adds option to the existing build option list.
bool transpose
True if the (m0xk0) block has to be transposed before being stored.
int n() const
Number of matrix B columns.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
static constexpr size_t DimX
Alias for dimension 0, also known as the X dimension.
TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d=false)
Calculate the Left Hand Side matrix reshaped shape.
GEMM RHS (Right Hand Side) matrix information.
unsigned int n0
Number of columns processed by the matrix multiplication.
quantized, asymmetric fixed-point 8-bit number unsigned
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
int k() const
Number of matrix A columns or matrix B rows.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
ClGemmLowpMatrixMultiplyReshapedKernel()
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
bool has_padding_changed(const std::unordered_map< const ITensorInfo *, PaddingSize > &padding_map)
Check if the previously stored padding info has changed after configuring a kernel.
static constexpr size_t DimY
Alias for dimension 1, also known as the Y dimension.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
int m() const
Number of matrix A rows.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
std::unordered_map< const ITensorInfo *, PaddingSize > get_padding_info(std::initializer_list< const ITensorInfo *> infos)
Stores padding information before configuring a kernel.
Wrapper to configure the Khronos OpenCL C++ header.
int depth_output_gemm3d() const
Depth (third dimension) of the output tensor to be used with the GEMM3D kernel.
bool interleave
True if the h0 (k0xn0) blocks have to be interleaved in the output row.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
Static function to check if given info will lead to a valid configuration.
unsigned int m0
Number of rows processed by the matrix multiplication.
quantized, asymmetric fixed-point 8-bit number signed
Window first_slice_window_3D() const
First 3D slice of the window.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)