Compute Library
 23.11
ClGemmHelpers.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2023 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
30 
31 #include <limits>
32 #include <utility>
33 
34 namespace arm_compute
35 {
36 namespace opencl
37 {
38 namespace kernels
39 {
40 namespace gemm
41 {
42 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
43  unsigned int n,
44  unsigned int m0,
45  unsigned int n0,
46  unsigned int k0,
47  unsigned int v0,
48  unsigned int h0,
49  bool lhs_interleave,
50  bool rhs_interleave,
51  bool lhs_transpose,
52  bool rhs_transpose,
53  bool export_to_cl_image)
54 {
55  ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
56  ARM_COMPUTE_ERROR_ON(v0 == 0);
57  v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
58 
59  if (h0 == 0)
60  {
61  // When h0 is 0, we should take the maximum H0 possible
62  h0 = std::max(n / n0, 1U);
63  }
64  else
65  {
66  h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1));
67  }
68 
69  const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
70  const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
71 
72  return std::make_pair(lhs_info, rhs_info);
73 }
74 
75 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
76 select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
77  std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
78  unsigned int n,
79  unsigned int k,
80  unsigned int b,
82 {
83  ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true,
84  "The fallback GeMM configuration cannot have export_to_cl_image = true");
85 
86  const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
87  const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
88  const TensorInfo tensor_reshaped_info(shape, 1, data_type);
89 
90  if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
91  {
92  return info_img;
93  }
94  else
95  {
96  return info_buf;
97  }
98 }
99 
101 {
102  constexpr unsigned int num_floats_per_pixel = 4;
103 
104  const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
105  const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
106 
107  ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
108  if (pixel_alignment == 0)
109  {
110  return;
111  }
112 
113  const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
114  const unsigned int round_up_width =
115  ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
116  const unsigned int padding = round_up_width - stride_y_in_elements;
117 
118  tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
119 }
120 
121 Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
122 {
123  if (rhs_info.export_to_cl_image)
124  {
125  ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false,
126  "Export to cl_image only supported with n0 = 4, 8 or 16");
127  ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true,
128  "Export to cl_image only supported with k0 = 4, 8 or 16");
132  "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
134  "Impossible to retrieve the cl_image pitch alignment");
135 
136  // Check the width and height of the output tensor.
137  // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
138  const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
139  const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
140 
141  ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4,
142  "Not supported width for cl_image");
144  tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h,
145  "Not supported height for cl_image");
146  }
147 
148  return Status{};
149 }
150 
151 bool is_mmul_kernel_preferred(const unsigned int m,
152  const unsigned int n,
153  const unsigned int k,
154  const unsigned int b,
155  const DataType data_type,
156  unsigned int &best_m0,
157  unsigned int &best_n0)
158 {
160 
161  const unsigned int mmul_k0 = 4;
162  best_m0 = 4;
163  best_n0 = 4;
164 
165  const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, best_m0);
166  const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / best_m0;
167  const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
168  const unsigned int gws_y = ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0;
169 
170  return ((k % mmul_k0) == 0) && (gws_y > 4);
171 }
172 
173 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
174 find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
175 {
176  size_t min_acc = std::numeric_limits<size_t>::max();
177  size_t min_idx = 0;
178 
179  ARM_COMPUTE_ERROR_ON(configs.size() == 0);
180  const size_t num_rows = configs.size();
181  const size_t num_cols = configs[0].size();
182 
183  ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, "
184  "N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
185  ARM_COMPUTE_UNUSED(num_cols);
186 
187  // Find nearest GeMM workload
188  // Note: the workload does not depend on the K dimension
189  for (size_t y = 0; y < num_rows; ++y)
190  {
191  size_t mc0 = static_cast<size_t>(configs[y][0]);
192  size_t nc0 = static_cast<size_t>(configs[y][1]);
193  size_t kc0 = static_cast<size_t>(configs[y][2]);
194  size_t bc0 = static_cast<size_t>(configs[y][3]);
195 
196  size_t acc = 0;
197  acc += (m - mc0) * (m - mc0);
198  acc += (n - nc0) * (n - nc0);
199  acc += (k - kc0) * (k - kc0);
200  acc += (b - bc0) * (b - bc0);
201  acc = std::sqrt(acc);
202  if (acc < min_acc)
203  {
204  min_acc = acc;
205  min_idx = y;
206  }
207  }
208 
209  // Get the configuration from the nearest GeMM shape
210  const int m0 = configs[min_idx][4];
211  const int n0 = configs[min_idx][5];
212  const int k0 = configs[min_idx][6];
213  const int v0 = configs[min_idx][7];
214  const int h0 = configs[min_idx][8];
215  const int i_lhs = configs[min_idx][9];
216  const int i_rhs = configs[min_idx][10];
217  const int t_lhs = configs[min_idx][11];
218  const int t_rhs = configs[min_idx][12];
219  const int im_rhs = configs[min_idx][13];
220 
221  return configure_lhs_rhs_info(m, n, m0, n0, k0, v0, h0, i_lhs, i_rhs, t_lhs, t_rhs, im_rhs);
222 }
223 } // namespace gemm
224 } // namespace kernels
225 } // namespace opencl
226 } // namespace arm_compute
arm_compute::opencl::kernels::gemm::configure_lhs_rhs_info
std::pair< GEMMLHSMatrixInfo, GEMMRHSMatrixInfo > configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
Configure GEMMLHSMatrixInfo and GEMMRHSMatrixInfo.
Definition: ClGemmHelpers.cpp:42
arm_compute::export_to_cl_image
bool export_to_cl_image(const ITensorInfo *tensor)
Definition: CLHelpers.cpp:449
arm_compute::opencl::kernels::gemm::GeMMConfigsMatrix
std::vector< std::vector< int32_t > > GeMMConfigsMatrix
Definition: ClGemmHelpers.h:38
arm_compute::ITensorInfo::tensor_shape
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
arm_compute::GEMMRHSMatrixInfo::n0
unsigned int n0
Number of columns processed by the matrix multiplication.
Definition: Types.h:1918
arm_gemm::gemm
UniqueGemmCommon< Top, Tret > gemm(const GemmArgs &args, const OutputStage &os)
Definition: gemm_implementation.hpp:320
arm_compute::TensorShape
Shape of a tensor.
Definition: TensorShape.h:39
arm_compute::opencl::kernels::gemm::validate_image2d_support_on_rhs
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix.
Definition: ClGemmHelpers.cpp:121
arm_compute::GEMMRHSMatrixInfo::export_to_cl_image
bool export_to_cl_image
True if the reshaped rhs has to be exported to cl_image.
Definition: Types.h:1923
arm_compute::opencl::kernels::gemm::find_lhs_rhs_info
std::pair< GEMMLHSMatrixInfo, GEMMRHSMatrixInfo > find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by ...
Definition: ClGemmHelpers.cpp:174
arm_compute::image2d_from_buffer_supported
bool image2d_from_buffer_supported(const cl::Device &device)
Helper function to check whether the cl_khr_image2d_from_buffer extension is supported.
Definition: CLHelpers.cpp:377
ClGemmHelpers.h
arm_compute::CLKernelLibrary::get
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Definition: CLKernelLibrary.cpp:41
CLKernelLibrary.h
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
arm_compute::utils::cast::U
U
Definition: SaturateCast.h:65
arm_compute::test::validation::shape
shape
Definition: DFT.cpp:115
ARM_COMPUTE_ERROR_ON
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(t,...)
Definition: Validate.h:838
arm_compute::GEMMRHSMatrixInfo::k0
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
Definition: Types.h:1919
ARM_COMPUTE_ERROR_ON_MSG
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456
arm_compute::Status
Status class.
Definition: Error.h:52
OpenCL.h
Wrapper to configure the Khronos OpenCL C++ header.
ARM_COMPUTE_UNUSED
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:151
tensor
CLTensor * tensor
Pointer to the auxiliary tensor.
Definition: ClWorkloadRuntime.cpp:67
arm_compute::test::validation::data_type
data_type
Definition: Cast.cpp:222
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image
void update_padding_for_cl_image(ITensorInfo *tensor)
Update padding required to export the OpenCL buffer to OpenCL image2d.
Definition: ClGemmHelpers.cpp:100
arm_compute::ceil_to_multiple
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Definition: Math.h:50
arm_compute::misc::shape_calculator::compute_rhs_reshaped_shape
TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRHSMatrixInfo &rhs_info)
Calculate the Right Hand Side matrix reshaped shape.
Definition: ShapeCalculator.h:233
ShapeCalculator.h
arm_compute::GEMMLHSMatrixInfo
GEMM LHS (Left Hand Side) matrix information.
Definition: Types.h:1896
arm_compute::TensorInfo
Store the tensor's metadata.
Definition: TensorInfo.h:41
arm_compute::PaddingSize
BorderSize PaddingSize
Container for 2D padding size.
Definition: Types.h:346
arm_compute::GEMMRHSMatrixInfo::transpose
bool transpose
True if the (k0xn0) block has to be transposed before being stored.
Definition: Types.h:1921
arm_compute::test::validation::b
SimpleTensor< float > b
Definition: DFT.cpp:157
ARM_COMPUTE_RETURN_ERROR_ON_MSG
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:245
arm_compute
Copyright (c) 2017-2023 Arm Limited.
Definition: introduction.dox:24
arm_compute::DataType::F16
@ F16
16-bit floating-point number
arm_compute::get_cl_image_pitch_alignment
size_t get_cl_image_pitch_alignment(const cl::Device &device)
Helper function to get the cl_image pitch alignment in pixels.
Definition: CLHelpers.cpp:382
arm_compute::opencl::kernels::gemm::select_lhs_rhs_info
std::pair< GEMMLHSMatrixInfo, GEMMRHSMatrixInfo > select_lhs_rhs_info(std::pair< GEMMLHSMatrixInfo, GEMMRHSMatrixInfo > info_img, std::pair< GEMMLHSMatrixInfo, GEMMRHSMatrixInfo > info_buf, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
Select GEMMLHSMatrixInfo and GEMMRHSMatrixInfo.
Definition: ClGemmHelpers.cpp:76
arm_compute::ITensorInfo
Store the tensor's metadata.
Definition: ITensorInfo.h:44
arm_compute::DataType::F32
@ F32
32-bit floating-point number
arm_compute::GEMMRHSMatrixInfo
GEMM RHS (Right Hand Side) matrix information.
Definition: Types.h:1911
arm_compute::opencl::kernels::gemm::is_mmul_kernel_preferred
bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, const DataType data_type, unsigned int &best_m0, unsigned int &best_n0)
Determine if the MMUL kernels should be preferred.
Definition: ClGemmHelpers.cpp:151
arm_compute::DataType
DataType
Available data types.
Definition: CoreTypes.h:83
CLHelpers.h
arm_compute::CLKernelLibrary::get_device
const cl::Device & get_device()
Gets the CL device for which the programs are created.
Definition: CLKernelLibrary.cpp:73