Compute Library
 22.11
ClCompositeKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
25 
27 
29 #include "src/core/CL/CLUtils.h"
32 
33 #include "support/Cast.h"
34 namespace arm_compute
35 {
36 namespace experimental
37 {
38 namespace dynamic_fusion
39 {
40 using namespace arm_compute::opencl;
41 
/** Build the OpenCL kernel described by @p cl_code and cache its launch metadata.
 *
 * Stores the created kernel in _kernel, configures the execution window, and copies
 * the config id and the argument descriptors out of @p cl_code.
 *
 * @param[in] compile_ctx Compile context whose create_kernel() builds the kernel.
 * @param[in] cl_code     Provides the kernel name, source string, build options,
 *                        execution window, config id and argument descriptors.
 */
42 void ClCompositeKernel::configure(const ClCompileContext &compile_ctx, const ClKernelCode &cl_code)
43 {
44  // Create kernel from kernel source string
// NOTE(review): `klib` is not declared anywhere in this listing (source line 45 is absent).
// Presumably it refers to the kernel library singleton - confirm against the original file.
46  _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(cl_code.name,
47  "" /* Program name: Used as part of a unique string for the built kernel cache. Not needed */,
48  cl_code.code,
49  klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
50  cl_code.build_options.options(),
51  false /* Is source binary */));
52 
53  // Configure execution window
54  IClKernel::configure_internal(cl_code.window);
55 
56  // Set config id for lws tuning
57  _config_id = cl_code.config_id;
58 
59  // Set kernel arguments
// Only the descriptors are stored here; the actual tensors are bound in run_composite_op().
60  _arguments = cl_code.arguments;
61 }
62 
/** Bind one tensor to the kernel, in the layout selected by arg.tensor_arg_type.
 *
 * NOTE(review): none of the `case` labels of the switch are present in this listing
 * (source lines 67, 73, 79, 84, 91, 101, 107, 118, 124, 129 and 134 are absent), so the
 * branch comments below describe what each branch does, not which enumerator selects it.
 *
 * @param[in,out] idx       Next kernel argument index. Post-incremented for every argument set
 *                          directly in this function; the add_*_tensor_argument helpers also receive
 *                          it and presumably advance it - confirm against their definitions.
 * @param[in]     arg       Argument descriptor; only tensor_arg_type is read here.
 * @param[in]     tensor    Tensor to bind. It is dereferenced without a null check in this function.
 * @param[in]     arg_slice Window forwarded to the 1D/2D/3D/4D helper calls.
 * @param[out]    cl_images Receives every cl::Image2D created here; the caller comments that these
 *                          need to be retained until enqueue.
 */
63 inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
64 {
65  switch(arg.tensor_arg_type)
66  {
// Branch: not implemented, always raises an error.
68  {
69  ARM_COMPUTE_ERROR("Unsupported yet");
70  break;
71  }
72 
// Branch: bind through the 1D tensor helper.
74  {
75  add_1D_tensor_argument(idx, tensor, arg_slice);
76  break;
77  }
78 
// Branch: bind through the 2D tensor helper.
80  {
81  add_2D_tensor_argument(idx, tensor, arg_slice);
82  break;
83  }
// Branch: 2D tensor helper followed by one extra cl_uint holding top + bottom padding.
85  {
86  add_2D_tensor_argument(idx, tensor, arg_slice);
87  const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
88  _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
89  break;
90  }
// Branch: bind the tensor buffer as a single cl::Image2D argument.
// Image shape is (dim0 / 4, dim1 * dim2 * dim3) with the byte stride of dimension 1 as row pitch.
// The division by 4 presumably reflects four elements per image pixel - TODO confirm.
92  {
93  const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
94  const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
95  cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
96  cl_images.push_back(tensor_image2d);
97  _kernel.setArg(idx++, tensor_image2d);
98  break;
99  }
100 
// Branch: 2D tensor helper followed by one extra cl_uint holding the byte stride of dimension 2.
102  {
103  add_2D_tensor_argument(idx, tensor, arg_slice);
104  _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
105  break;
106  }
// Branch: same image construction as the image branch above, followed by one extra cl_uint
// holding the byte stride of dimension 2.
108  {
109  const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
110  const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
111  cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
112  cl_images.push_back(tensor_image2d);
113  _kernel.setArg(idx++, tensor_image2d);
114  _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
115  break;
116  }
117 
// Branch: bind through the 3D tensor helper.
119  {
120  add_3D_tensor_argument(idx, tensor, arg_slice);
121  break;
122  }
123 
// Branch: bind through the 4D tensor helper.
125  {
126  add_4D_tensor_argument(idx, tensor, arg_slice);
127  break;
128  }
// Branch: bind through the 4D NHWC helper; note that arg_slice is not passed to it.
130  {
131  add_4d_tensor_nhwc_argument(idx, tensor);
132  break;
133  }
// Branch: bind a cl::Image2D view of the buffer first, then the 4D NHWC arguments.
// Image width is dim0 / 4 and image height is the product of dimension 1 and above.
135  {
136  const size_t image_w = tensor->info()->dimension(0) / 4;
137  const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
138  const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
139 
140  cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
141  TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
142  cl_images.push_back(tensor_image2d);
143 
144  _kernel.setArg(idx++, tensor_image2d);
145  add_4d_tensor_nhwc_argument(idx, tensor);
146  break;
147  }
// Any other argument type is rejected with an error.
148  default:
149  {
150  ARM_COMPUTE_ERROR("Unsupported");
151  }
152  }
153 }
154 
/** Bind all tensor arguments and enqueue the kernel once per window slice.
 *
 * For each slice, every descriptor in _arguments is resolved to a tensor from @p tensors,
 * bound via add_tensor_argument(), and the kernel is then enqueued on @p queue.
 *
 * @param[in,out] tensors   Tensor pack; tensors are looked up by each descriptor's arg_id.
 * @param[in]     window    Execution window that is walked slice by slice.
 * @param[in,out] queue     Command queue the kernel is enqueued on.
 * @param[in]     exec_desc Execution descriptor; skip_sliding_window stops the loop after the first slice.
 */
155 void ClCompositeKernel::run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc)
156 {
// NOTE(review): exec_desc is marked unused here but is read in the loop condition at the bottom.
157  ARM_COMPUTE_UNUSED(exec_desc);
// NOTE(review): source lines 158-159 and 161 are absent from this listing. `slice` is used below
// without a visible declaration; presumably it is the first 3D slice of `window` - confirm.
160 
162  // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2
163  // This scenario can happen when the matrix multiplication is used to perform a convolution operation
// slice_fixed_z is a copy of slice whose X and Y dimensions are replaced by Dimension(0, 1, 1).
164  Window slice_fixed_z = slice;
165  slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
166  slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
167 
// NOTE(review): idx is declared outside the do-loop and never reset, so a second slice
// continues numbering kernel arguments where the first one stopped - confirm this is intended.
168  unsigned int idx = 0;
169  do
170  {
171  // Set kernel arguments
// NOTE(review): arg_slice is shared by all arguments of this iteration; once it is switched to
// slice_fixed_z below, later arguments also receive the fixed slice - confirm this is intended.
172  Window arg_slice = slice;
173  // CLImages created from tensor arguments. Need to be retained until enqueue
174  std::vector<cl::Image2D> cl_images;
// Each element of _arguments is taken by value; only its .second (the descriptor) is used.
175  for(auto id_arg : _arguments)
176  {
177  const auto arg = id_arg.second;
178  auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.arg_id));
// NOTE(review): source lines 179-180 are absent from this listing; no null check on `tensor`
// is visible before it is dereferenced below.
181  if(!arg.slide_along_dimz)
182  {
183  // The stride_z for matrix must be zero if we do not slice
184  ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
185  arg_slice = slice_fixed_z;
186  }
187  add_tensor_argument(idx, arg, tensor, arg_slice, cl_images);
188  }
189 
190  // Dispatch kernel
191  bool use_dummy_work_items = false;
192  enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
193  }
// Advance to the next 3D slice unless the descriptor asks to skip sliding.
194  while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice));
195 }
196 
197 } // namespace dynamic_fusion
198 } // namespace experimental
199 } // namespace arm_compute
200 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
unsigned int top
top of the border
Definition: Types.h:390
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
ClKernelLibrary contains all the OpenCL kernels that are used throughout the library.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
Definition: ICLKernel.cpp:32
const StringSet & options() const
Gets the current options list set.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
void configure(const opencl::ClCompileContext &, const ClKernelCode &)
virtual DataType data_type() const =0
Data type used for each element of the tensor.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
size_t total_size_upper(size_t dimension) const
Collapses given dimension and above.
Definition: TensorShape.h:182
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:79
unsigned int bottom
bottom of the border
Definition: Types.h:392
Copyright (c) 2017-2022 Arm Limited.
Describes all the info required to add a kernel argument at run time.
Definition: ClWorkload.h:70
const std::string & kernel_path() const
Gets the path that the kernels reside in.
Contains kernel code to be compiled and run in a ClUnitWorkload.
Definition: ClWorkload.h:100
Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
Definition: CLLSTMLayer.h:50
Descriptor containing information required to run a single ClWorkload.
Definition: ClWorkload.h:91
static ClKernelLibrary & get()
Access the KernelLibrary singleton.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
ClKernelTensorArgType tensor_arg_type
tensor argument type
Definition: ClWorkload.h:83
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
virtual PaddingSize padding() const =0
Padding of tensor.
Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
Creates an OpenCL kernel.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
Definition: Window.h:349
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
std::string config_id
Generated from blueprint based on complex component.
Definition: ClWorkload.h:109
CLBuildOptions build_options
Kernel build options.
Definition: ClWorkload.h:110
CLCompileContext class.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
bool skip_sliding_window
Skip sliding window slices during execution loop.
Definition: ClWorkload.h:95
virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override
Run the composite kernel.
ClKernelArgList arguments
Kernel argument descriptors.
Definition: ClWorkload.h:112
Tensor packing service.
Definition: ITensorPack.h:39
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
CLTensor * tensor
Pointer to the auxiliary tensor.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
Create a cl::Image2D object from an OpenCL buffer.
Definition: CLUtils.cpp:35
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Window first_slice_window_3D() const
First 3D slice of the window.
Definition: Window.h:305
int arg_id
Arg ID in the blueprint, -1 means empty / uninitialized.
Definition: ClWorkload.h:82
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)