Compute Library
 21.02
GCGEMMMatrixMultiplyKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
33 #include "arm_compute/core/Types.h"
34 #include "arm_compute/core/Utils.h"
42 #include "support/StringSupport.h"
43 
44 #include <set>
45 #include <string>
46 
47 using namespace arm_compute;
49 
50 namespace
51 {
52 using ElementsProcessed = Steps;
53 
54 inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
55 {
56  ARM_COMPUTE_UNUSED(reshape_info);
57  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
60  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
61 
62  if(!is_interleaved_transposed)
63  {
64  ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
65 
66  if(output->total_size() != 0)
67  {
68  ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
69  ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
71  }
72  }
73  else
74  {
75  const int m = reshape_info.m();
76  const int n = reshape_info.n();
77  const int k = reshape_info.k();
78  const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
79  const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
80 
81  TensorShape tensor_shape0{ input0->tensor_shape() };
82  tensor_shape0.set(0, k);
83  tensor_shape0.set(1, m);
84 
85  TensorShape tensor_shape1{ input1->tensor_shape() };
86  tensor_shape1.set(0, n);
87  tensor_shape1.set(1, k);
88 
89  const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
90  const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
91 
92  const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
93  const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
94 
95  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
96  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
97 
98  if(output->total_size() != 0)
99  {
100  ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
101  ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
103  }
104  }
105 
106  return Status{};
107 }
108 
109 inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
110  bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
111  GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
112 {
113  ARM_COMPUTE_UNUSED(gpu_target);
114 
115  // Output tensor auto inizialitation if not yet initialized
116  TensorShape tensor_shape{ input0->tensor_shape() };
117  tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));
118  tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));
119 
120  auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));
121 
122  bool window_changed = false;
123  Window win{};
124 
125  const DataType data_type = input0->data_type();
126  unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
127  unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
128 
129  if(is_interleaved_transposed)
130  {
131  // Configure window kernel
132  num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
133  num_elems_processed_per_iteration_y = 4;
134 
135  win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
136 
137  AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
138  AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
139  AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
140 
141  update_window_and_padding(win, input0_access, input1_access, output_access);
142 
143  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
144  }
145  else // The input tensors have not been reshaped
146  {
147  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
148  num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
149 
150  switch(data_type)
151  {
152  case DataType::F16:
153  num_elems_processed_per_iteration_x = 4;
154  break;
155 
156  case DataType::F32:
157  num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
158  break;
159 
160  default:
161  ARM_COMPUTE_ERROR("Current data type is not supported");
162  break;
163  }
164 
165  win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
166 
167  AccessWindowStatic input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
168  AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
169  AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
170 
171  update_window_and_padding(win, input0_access, input1_access, output_access);
172 
173  Coordinates coord;
174  coord.set_num_dimensions(output->num_dimensions());
175  output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
176  }
177 
178  Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
179  return std::make_pair(err, win);
180 }
181 } // namespace
182 
184  : _input0(nullptr), _input1(nullptr), _output(nullptr)
185 {
186 }
187 
188 void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
189 {
190  ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
191 
192  // Perform validate step
193  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
194 
195  _input0 = input0;
196  _input1 = input1;
197  _output = output;
198 
199  // Get target architecture
200  GPUTarget gpu_target = get_target();
201 
202  ElementsProcessed num_elements_processed{};
203 
204  // Configure kernel window
205  auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
206  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
207  IGCKernel::configure(win_config.second);
208 
209  // Create build options
210  std::set<std::string> build_opts;
211  std::string kernel_name;
212 
213  build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
214  build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
215  build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
216  build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0)));
217  build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0)));
218  build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha));
219 
220  // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
221  if(is_interleaved_transposed)
222  {
223  const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
224  const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
225 
226  build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));
227  build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));
228 
229  switch(input0->info()->data_type())
230  {
231  case DataType::F16:
232  build_opts.emplace("#define DATA_TYPE_FP16");
233  break;
234 
235  case DataType::F32:
236  build_opts.emplace("#define DATA_TYPE_FP32");
237  break;
238 
239  default:
240  ARM_COMPUTE_ERROR("Current data type is not supported");
241  break;
242  }
243 
244  build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
245 
246  kernel_name = "gemm_mm_interleaved_transposed";
247  }
248  else
249  {
250  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
251 
252  GPUTarget arch_target = get_arch_from_target(gpu_target);
253  switch(input0->info()->data_type())
254  {
255  case DataType::F16:
256  build_opts.emplace("#define DATA_TYPE_FP16");
257  build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
258  build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
259  break;
260 
261  case DataType::F32:
262  build_opts.emplace("#define DATA_TYPE_FP32");
263 
264  if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)
265  {
266  build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");
267  }
268  else
269  {
270  build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
271  }
272  break;
273 
274  default:
275  ARM_COMPUTE_ERROR("Current data type is not supported");
276  break;
277  }
278 
279  build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
280  build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));
281 
282  kernel_name = "gemm_mm_floating_point";
283  }
284 
285  // Create kernel
286  _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
287 }
288 
289 Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
290  const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
291 {
292  ARM_COMPUTE_UNUSED(alpha);
293  ElementsProcessed num_elements_processed{};
294  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
295  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
296  input1->clone().get(),
297  output->clone().get(),
298  is_interleaved_transposed,
299  reshape_info,
300  gpu_target,
301  num_elements_processed)
302  .first);
303  return Status{};
304 }
305 
307 {
310 
311  _kernel.use();
312 
314  Window slice_matrix_b = slice;
315 
316  slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
317  slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
318 
319  do
320  {
321  Window slice_b = slice;
322  // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
323  // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
324  if(_input1->info()->num_dimensions() < 3)
325  {
326  slice_b = slice_matrix_b;
327  }
328 
329  unsigned int idx = 0;
330 
331  add_2D_tensor_argument(idx, _input0, 1, slice);
332  add_2D_tensor_argument(idx, _input1, 2, slice_b);
333  add_2D_tensor_argument(idx, _output, 3, slice);
334  _kernel.update_shader_params();
335  enqueue(*this, slice);
336  }
337  while(window.slide_window_slice_2D(slice));
338 }
Window first_slice_window_2D() const
First 2D slice of the window.
Definition: Window.h:283
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
int mult_interleave4x4_height() const
Multiplication factor for the height of the 4x4 interleaved block.
Definition: Types.h:1893
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
void run(const Window &window) override
Enqueue the OpenGL ES shader to process the given window.
TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width=1)
Calculate the transposed 1xW width element shape.
void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws=gles::NDRange(1U, 1U, 1U))
Add the kernel to the command queue with the given window.
Definition: IGCKernel.cpp:41
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
int mult_transpose1xW_width() const
Multiplication factor for the width of the 1xW transposed block.
Definition: Types.h:1885
GEMM reshape information class.
Definition: Types.h:1831
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Store the tensor's metadata.
Definition: ITensorInfo.h:40
Interface for GLES Compute tensor.
Definition: IGCTensor.h:35
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
Status class.
Definition: Error.h:52
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
Definition: GPUTarget.cpp:189
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height=1, bool reinterpret_input_as_3d=false)
Calculate the interleaved shape of an input tensor.
bool slide_window_slice_2D(Window &slice) const
Slide the passed 2D window slice.
Definition: Window.h:323
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
Implementation of a static rectangular access pattern.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
int n() const
Number of matrix B columns.
Definition: Types.h:1869
const DataType data_type
Definition: Im2Col.cpp:150
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
Definition: Utils.h:1262
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Definition: Utils.h:71
Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context...
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
Static function to check if given info will lead to a valid configuration of GCGEMMMatrixMultiplyKern...
void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed=true, const GEMMReshapeInfo &reshape_info=GEMMReshapeInfo())
Initialise the kernel's input, output and alpha.
std::string kernel_name
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
size_t data_size_from_type(DataType data_type)
The size in bytes of the data type.
Definition: Utils.h:106
int k() const
Number of matrix A columns or matrix B rows.
Definition: Types.h:1877
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx...
Definition: IGCKernel.cpp:127
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
Wrapper to configure the Khronos EGL and OpenGL ES C header.
static GCKernelLibrary & get()
Get the static instance of GCKernelLibrary.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159
Implementation of a XY-transpose access pattern.
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34
int m() const
Number of matrix A rows.
Definition: Types.h:1861
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set={}) const
Creates a kernel from the kernel library.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Store the tensor's metadata.
Definition: TensorInfo.h:45
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Definition: Dimensions.h:149
GPUTarget get_target() const
Get the targeted GPU architecture.
Definition: IGCKernel.h:122
Container for valid region of a window.
Definition: Types.h:188
DataType
Available data types.
Definition: Types.h:77
Describe a multidimensional execution window.
Definition: Window.h:39
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.
Definition: TensorShape.h:79
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)