Compute Library
 22.05
ClDirectConvolutionKernelComponent.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
25 
27 
29 #include "src/core/CL/ICLKernel.h"
33 
35 namespace arm_compute
36 {
37 namespace experimental
38 {
39 namespace dynamic_fusion
40 {
// NOTE(review): extraction artifact — the signature and return statement of
// this small member function (presumably get_component_type(), judging by its
// position in the file) were dropped; only the braces remain. Recover the
// missing lines from the upstream source before attempting to build.
{
}
45 
// Returns the CL header files the generated kernel code depends on.
// NOTE(review): the signature line was dropped by the extraction; from the
// body this is a get_headers_list()-style accessor returning a string set.
// "tile_helpers.h" provides the TILE/T_LOAD/T_MMUL/LOOP_UNROLLING macros used
// by the code emitted in get_component_code().
{
    return std::set<std::string> { "helpers.h", "tile_helpers.h" };
}
50 
// Computes the execution window of the direct-convolution component.
// NOTE(review): the signature line was dropped by the extraction — presumably
// Window ClDirectConvolutionKernelComponent::get_window() const; confirm
// against the upstream source.
{
    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
    auto       dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());

    // Get dst shape
    PadStrideInfo pad_stride_info
    {
        static_cast<unsigned int>(_desc.conv2d.stride.x()),
        static_cast<unsigned int>(_desc.conv2d.stride.y()),
        static_cast<unsigned int>(_desc.conv2d.pad.left),
        static_cast<unsigned int>(_desc.conv2d.pad.right),
        static_cast<unsigned int>(_desc.conv2d.pad.top),
        static_cast<unsigned int>(_desc.conv2d.pad.bottom),
        DimensionRoundingType::FLOOR /*default rounding type*/
    };
    // NOTE(review): the line declaring 'output_shape' (used below; most likely
    // a compute_deep_convolution_shape(...) call, which the trailing Doxygen
    // index also references) was dropped by the extraction.

    // Output auto initialization if not yet initialized
    // NOTE(review): the trailing arguments of this call (data type, and
    // possibly quantization info) were also dropped by the extraction — the
    // call is incomplete as shown.
    auto_init_if_empty(*dst_info, output_shape,
                       1,

    // Vector width along channels: up to 4 elements per work-item, capped by
    // the number of output channels.
    const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
    // Rows per work-item (M0): 2 for F32, 4 otherwise, but only when there are
    // enough output channels (> 16); degenerate cases fall back to 1.
    const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
    // const unsigned int num_rows = 1;
    // const unsigned int vec_size = tile_info.tile_dims.x();
    // const unsigned int num_rows = tile_info.tile_dims.y();

    // Create and configure kernel window
    Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));

    // Collapse output W and H into a single Y dimension (the NHWC kernel walks
    // the spatial plane linearly), rounded up to a multiple of the row step.
    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));

    return win;
}
91 
// This component contributes no extra preprocessor macros to the kernel.
// NOTE(review): the signature line was dropped by the extraction; from the
// body this is a get_additional_macros()-style accessor returning a string.
{
    return R"_()_"; // no macros
}
96 
// Assembles the OpenCL C snippet implementing a direct NHWC convolution for
// this component. The {{placeholder}} tags embedded in the raw strings are
// substituted later via the table built in get_tag_lut().
// NOTE(review): the signature line was dropped by the extraction — presumably
// std::string ClDirectConvolutionKernelComponent::get_component_code() const.
{
    const auto src_info  = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);

    ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component.");

    // NOTE(review): the line declaring 'channel_idx' (used in the next two
    // statements; most likely a get_data_layout_dimension_index(...) call) was
    // dropped by the extraction.
    // K0: vector length along the input channels — 16 for quantized types,
    // 8 for float types, rounded down to fit the channel dimension.
    const auto k0            = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx));
    // A tail loop is needed whenever the channel count is not a multiple of K0.
    const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0;

    // NOTE(review): two suspected defects inside the generated-code strings
    // below (not fixed here, since a documentation pass must not alter string
    // literal bytes):
    //   1. 'int yk = i / _IWEI_HEIGHT;' — the linear filter index runs over
    //      _IWEI_WIDTH * _IWEI_HEIGHT, so the row index should plausibly be
    //      'i / _IWEI_WIDTH'; as written yk looks wrong for non-square
    //      kernels. Confirm against upstream before changing.
    //   2. The trailing '#undef _I_WEI_WIDTH' / '#undef _I_WEI_HEIGHT' do not
    //      match the '#define _IWEI_WIDTH' / '#define _IWEI_HEIGHT' names, so
    //      those two macros are never undefined — a redefinition hazard if
    //      this snippet is emitted more than once into the same program.
    std::string code = R"_(
 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
 // IN_0(src) {{src}}
 // IN_1(wei) {{weight}}
 )_";
    // Optional bias tensor: only document it in the generated header when present.
    if(bias_info != nullptr)
    {
        code += R"_(
 // IN_1(bia) {{bias}}
 )_";
    }
    code += R"_(
 // OUT(dst, accum) {{dst}}

 // Initialize the accumulators
 TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
 {
 // All the tensor dimensions are passed at compile time.
 // In case of dynamic tensor support, the following dimensions should be passed as function argument.
 #define _IWEI_WIDTH {{WEI_WIDTH}}
 #define _IWEI_HEIGHT {{WEI_HEIGHT}}
 #define _ISRC_WIDTH {{src}}_w
 #define _ISRC_HEIGHT {{src}}_h
 #define _ISRC_CHANNELS {{src}}_c
 #define _IDST_WIDTH {{arg_dst}}_w
 #define _IDST_HEIGHT {{arg_dst}}_h
 #define _IDST_CHANNELS {{arg_dst}}_c
 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)

 // .v = access the whole vector (OpenCL vector)
 // .s[x] = access the vector element at position x (scalar access)
 TILE(int, M0, 1, xi);
 TILE(int, M0, 1, yi);

 // Convert the linear index to coordinate
 LOOP_UNROLLING(int, i, 0, 1, M0,
 {
 xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}};
 yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}};
 xi[i].v -= {{PAD_LEFT}};
 yi[i].v -= {{PAD_TOP}};
 })

 LOOP_UNROLLING(int, i, 0, 1, M0,
 {
 {{dst}}[i].v = 0;
 })

 for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
 {
 int ck = 0;
 int xk = i % _IWEI_WIDTH;
 int yk = i / _IWEI_HEIGHT;

 int k = 0;
 for(; k <= (_ISRC_CHANNELS - K0); k += K0)
 {
 TILE({{SRC_DATA_TYPE}}, M0, K0, a);
 TILE({{WEI_DATA_TYPE}}, N0, K0, b);

 LOOP_UNROLLING(int, i, 0, 1, M0,
 {
 a[i].v = {{ZERO_VALUE}};
 })

 // Load tile from the src tensor
 T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

 // Load tile from the weights tensor
 T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

 // Compute the matrix multiplication between two tiles
 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});

 ck += K0;
 }

 // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
 // This #if directive should be removed in case of dynamic tensor support
 )_";

    // Tail loop over the channels that did not fit a full K0-wide step.
    if(leftover_loop)
    {
        code += R"_(
 // Left-over accumulations
 for(; k < _ISRC_CHANNELS; ++k)
 {
 TILE({{SRC_DATA_TYPE}}, M0, 1, a);
 TILE({{WEI_DATA_TYPE}}, N0, 1, b);

 LOOP_UNROLLING(int, i, 0, 1, M0,
 {
 a[i].v = {{ZERO_VALUE}};
 })

 // Load tile from the src tensor
 T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);

 // Load tile from the weights tensor
 // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
 T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);

 // Compute the matrix multiplication between two tiles
 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});

 ++ck;
 }
 )_";
    }

    // NOTE(review): see the defect note above — the first two #undef names in
    // this string do not match their #define counterparts.
    code += R"_(
 #undef _I_WEI_WIDTH
 #undef _I_WEI_HEIGHT
 #undef _ISRC_WIDTH
 #undef _ISRC_HEIGHT
 #undef _ISRC_CHANNELS
 #undef _IDST_WIDTH
 #undef _IDST_HEIGHT
 #undef _IDST_CHANNELS
 #undef _IY_MULTIPLIER

 }
 )_";

    // Optional bias: load one N0-wide row and broadcast-add it to the accumulators.
    if(bias_info != nullptr)
    {
        code += R"_(
 TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);

 T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);

 // c = c + bias[broadcasted]
 T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
 )_";
    }

    code += R"_(
 }
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
 )_";
    // NOTE(review): assuming the (dropped) return type is std::string,
    // 'return code.c_str();' forces a needless round-trip through a C string;
    // plain 'return code;' would move the string out. Confirm the signature
    // before changing.
    return code.c_str();
}
250 
// Decides whether the weights tensor can be imported as an OpenCL image2d
// (cl_khr_image2d_from_buffer) instead of a plain buffer.
// NOTE(review): the signature line was dropped by the extraction; per the
// trailing Doxygen index it is
//   bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
{
    // Image texels pack 4 elements, so the channel count must be a multiple of
    // 4, and only the NHWC layout is handled.
    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
    {
        return false;
    }

    // If not floating point
    if(!is_data_type_float(tensor->data_type()))
    {
        return false;
    }

    // Mali G71 and all Midgard-architecture targets are excluded from this path.
    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
    {
        return false;
    }

    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
    // NOTE(review): the if-condition here (an image2d_from_buffer_supported(...)
    // call per the Doxygen index) was dropped by the extraction; only the body
    // remains, so as shown the function would unconditionally return false.
    {
        return false;
    }

    // Check cl image pitch alignment
    // NOTE(review): this if-condition (a get_cl_image_pitch_alignment(...)
    // check per the Doxygen index) was likewise dropped by the extraction.
    {
        return false;
    }

    // Image width in 4-element texels and collapsed H*W*N height, validated
    // against the device's image2d size limits.
    const size_t image_w     = tensor->tensor_shape()[0] / 4;
    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
    const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
    const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();

    if(image_w > max_image_w || image_h > max_image_h)
    {
        return false;
    }

    return true;
}
293 
// Builds the OpenCL compile-time options (-DN0/-DM0/-DK0/...) for this component.
// NOTE(review): the signature line was dropped by the extraction — presumably
// CLBuildOptions ClDirectConvolutionKernelComponent::get_build_options() const.
{
    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
    auto       weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
    const auto dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
    // const auto tile_info = _blueprint->impl().get_tile_info();

    // NOTE(review): the lines declaring 'data_type' and 'channel_idx' (both
    // used in the k0 computation below) were dropped by the extraction.
    const GPUTarget gpu_target = CLScheduler::get().target();

    // N0/M0 are the X/Y steps of the execution window configured in get_window().
    const unsigned int n0               = _blueprint->impl().get_execution_window().x().step();
    const unsigned int m0               = _blueprint->impl().get_execution_window().y().step();
    const unsigned int k0               = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
    // Leftover output channels that don't fill a full N0-wide store.
    const unsigned int partial_store_n0 = dst_info->dimension(0) % n0;
    const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());

    // Update the padding for the weights tensor if we can export to cl_image
    if(export_to_cl_image)
    {
        // NOTE(review): the body of this branch (an
        // update_padding_for_cl_image(weight_info) call per the Doxygen index)
        // was dropped by the extraction.
    }

    CLBuildOptions build_opts{};
    build_opts.add_option("-cl-fast-relaxed-math");
    build_opts.add_option("-DIS_TILED");
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));

    return build_opts;
}
327 
329 {
330  const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
331  const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
332 
333  vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
334 
335  const GPUTarget gpu_target = CLScheduler::get().target();
336  const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
338  vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");
339 
340  if(!_bias.is_empty()) // optional bias
341  {
342  vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
343  }
344  vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
345 }
346 
348 {
349  TagLUT lut{};
350 
351  const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
352  const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
353  const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
354 
355  // Arguments and global shared variables
356  lut["src"] = vtable.get(_src);
357  lut["weight"] = vtable.get(_weight);
358 
359  if(!_bias.is_empty()) // optional bias
360  {
361  lut["bias"] = vtable.get(_bias);
362  lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
363  }
364  lut["dst"] = vtable.get(_dst);
365 
366  const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
367  lut["arg_dst"] = dst_argument.uniq_name;
368 
369  // Local build options
370  lut["meta_kernel_id"] = id();
371  lut["ACC_DATA_TYPE"] = src_info->data_type();
372  lut["SRC_DATA_TYPE"] = src_info->data_type();
373  lut["WEI_DATA_TYPE"] = weight_info->data_type();
374 
375  lut["SRC_TENSOR_TYPE"] = "BUFFER";
376  switch(vtable.get(_weight).desc.tensor_arg_type)
377  {
381  {
382  lut["WEI_TENSOR_TYPE"] = "IMAGE";
383  break;
384  }
385  default:
386  {
387  lut["WEI_TENSOR_TYPE"] = "BUFFER";
388  break;
389  }
390  }
393  lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
394  lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
395 
396  lut["STRIDE_X"] = _desc.conv2d.stride.x();
397  lut["STRIDE_Y"] = _desc.conv2d.stride.y();
398 
399  lut["PAD_LEFT"] = _desc.conv2d.pad.left;
400  lut["PAD_TOP"] = _desc.conv2d.pad.top;
401 
402  lut["ZERO_VALUE"] = 0;
403 
404  return lut;
405 }
406 } // namespace dynamic_fusion
407 } // namespace experimental
408 } // namespace arm_compute
409 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1030
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
size_t bottom
Padding across the height dimension on the bottom, in elements.
bool image2d_from_buffer_supported(const cl::Device &device)
Helper function to check whether the cl_khr_image2d_from_buffer extension is supported.
Definition: CLHelpers.cpp:370
Shape of a tensor.
Definition: TensorShape.h:39
virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override
Get the tag look-up table used to instantiate the component code.
SharedVar get(const SharedVarLink &var_link) const
Get the SharedVar associated with var_link.
Definition: Common.h:206
static CLScheduler & get()
Access the scheduler singleton.
bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
DataLayout data_layout() const override
Get the data layout of the tensor.
Definition: TensorInfo.h:291
GPUTarget target() const
Get the target GPU.
Definition: CLScheduler.cpp:49
std::string to_string(T &&value)
Convert integer and float values to string.
size_t dimension(size_t index) const override
Return the size of the requested dimension.
Definition: TensorInfo.h:205
virtual DataType data_type() const =0
Data type used for each element of the tensor.
QuantizationInfo quantization_info() const override
Get the quantization settings (scale and offset) of the tensor.
Definition: TensorInfo.h:287
1 channel, 1 F32 per channel
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
Definition: ITensorInfo.h:40
Describe one of the image&#39;s dimensions with a start, end and step.
Definition: Window.h:79
size_t x() const
Semantic accessor for width as x.
Definition: Size2D.h:75
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
Definition: GPUTarget.cpp:199
void update_padding_for_cl_image(ITensorInfo *tensor)
Update padding required to export the OpenCL buffer to OpenCL image2d.
Copyright (c) 2017-2022 Arm Limited.
ClKernelTensorArgType
Verbose and explicit way to enumerate all the tensor arguments variants used by all kernel implementa...
Definition: ClWorkload.h:46
Describes all the info required to add a kernel argument at run time.
Definition: ClWorkload.h:70
DataType data_type() const override
Data type used for each element of the tensor.
Definition: TensorInfo.h:242
void add_option(std::string option)
Adds option to the existing build option list.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
size_t right
Padding across the width dimension on the right, in elements.
ClKernelTensorArgType tensor_arg_type
tensor argument type
Definition: ClWorkload.h:83
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Definition: Utils.h:71
A table of all the variables used in the kernel / blueprint Because we limit the DependencyGraph in t...
Definition: Common.h:92
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
Definition: CLHelpers.cpp:39
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Padding and stride information class.
Definition: Types.h:669
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
size_t left
Padding across the width dimension on the left, in elements.
size_t y() const
Semantic accessor for height as y.
Definition: Size2D.h:84
TensorInfo src_info(src_shape, 1, data_type)
size_t get_cl_image_pitch_alignment(const cl::Device &device)
Helper function to get the cl_image pitch alignment in pixels.
Definition: CLHelpers.cpp:375
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34
size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193
Num samples, height, width, channels.
void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name="unnamed")
Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID).
Definition: Common.h:153
size_t top
Padding across the height dimension on the top, in elements.
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size.
Definition: Utils.h:1222
DataType
Available data types.
Definition: Types.h:79
DataLayout
[DataLayout enum definition]
Definition: Types.h:113
virtual void allocate_shared_vars(SharedVarTable &vtable) const override
Allocate all shared variables used by the component in the vtable.
Describe a multidimensional execution window.
Definition: Window.h:39
TensorShape compute_deep_convolution_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
Calculate the deep convolution shape output shape of a tensor.
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
Definition: Utils.h:1010
const cl::Device & get_device()
Gets the CL device for which the programs are created.