Compute Library
 20.11
ICLKernel.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_ICLKERNEL_H
25 #define ARM_COMPUTE_ICLKERNEL_H
26 
34 
35 #include <string>
36 
37 namespace arm_compute
38 {
39 template <typename T>
40 class ICLArray;
41 class ICLTensor;
42 class Window;
43 
44 /** Common interface for all the OpenCL kernels */
45 class ICLKernel : public IKernel
46 {
47 private:
48  /** Returns the number of arguments enqueued per array object.
49  *
50  * @return The number of arguments enqueued per array object.
51  */
52  template <unsigned int dimension_size>
53  constexpr static unsigned int num_arguments_per_array()
54  {
55  return num_arguments_per_tensor<dimension_size>();
56  }
57  /** Returns the number of arguments enqueued per tensor object.
58  *
59  * @return The number of arguments enqueued per tensor object.
60  */
61  template <unsigned int dimension_size>
62  constexpr static unsigned int num_arguments_per_tensor()
63  {
64  return 2 + 2 * dimension_size;
65  }
66  using IKernel::configure; //Prevent children from calling IKernel::configure() directly
67 protected:
68  /** Configure the kernel's window and local workgroup size hint.
69  *
70  * @param[in] window The maximum window which will be returned by window()
71  * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
72  */
73  void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
74  {
75  _lws_hint = lws_hint;
76  IKernel::configure(window);
77  }
78 
79 public:
80  /** Constructor */
82  : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
83  {
84  }
85  /** Returns a reference to the OpenCL kernel of this object.
86  *
87  * @return A reference to the OpenCL kernel of this object.
88  */
89  cl::Kernel &kernel()
90  {
91  return _kernel;
92  }
93  /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
94  *
95  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
96  * @param[in] array Array to set as an argument of the object's kernel.
97  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
98  * @param[in] num_dimensions Number of dimensions of the @p array.
99  * @param[in] window Window the kernel will be executed on.
100  */
101  template <typename T>
102  void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
103  {
104  add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
105  }
106  /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
107  *
108  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
109  * @param[in] tensor Tensor to set as an argument of the object's kernel.
110  * @param[in] window Window the kernel will be executed on.
111  */
112  void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
113  {
114  add_tensor_argument<1>(idx, tensor, window);
115  }
116  /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
117  *
118  * @param[in] cond Condition to check
119  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
120  * @param[in] tensor Tensor to set as an argument of the object's kernel.
121  * @param[in] window Window the kernel will be executed on.
122  */
123  void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
124  {
125  if(cond)
126  {
127  add_1D_tensor_argument(idx, tensor, window);
128  }
129  }
130  /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
131  *
132  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
133  * @param[in] tensor Tensor to set as an argument of the object's kernel.
134  * @param[in] window Window the kernel will be executed on.
135  */
136  void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
137  {
138  add_tensor_argument<2>(idx, tensor, window);
139  }
140  /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
141  *
142  * @param[in] cond Condition to check
143  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
144  * @param[in] tensor Tensor to set as an argument of the object's kernel.
145  * @param[in] window Window the kernel will be executed on.
146  */
147  void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
148  {
149  if(cond)
150  {
151  add_2D_tensor_argument(idx, tensor, window);
152  }
153  }
154  /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
155  *
156  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
157  * @param[in] tensor Tensor to set as an argument of the object's kernel.
158  * @param[in] window Window the kernel will be executed on.
159  */
160  void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
161  {
162  add_tensor_argument<3>(idx, tensor, window);
163  }
164  /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
165  *
166  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
167  * @param[in] tensor Tensor to set as an argument of the object's kernel.
168  * @param[in] window Window the kernel will be executed on.
169  */
170  void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
171  {
172  add_tensor_argument<4>(idx, tensor, window);
173  }
174  /** Returns the number of arguments enqueued per 1D array object.
175  *
176  * @return The number of arguments enqueues per 1D array object.
177  */
178  constexpr static unsigned int num_arguments_per_1D_array()
179  {
180  return num_arguments_per_array<1>();
181  }
182  /** Returns the number of arguments enqueued per 1D tensor object.
183  *
184  * @return The number of arguments enqueues per 1D tensor object.
185  */
186  constexpr static unsigned int num_arguments_per_1D_tensor()
187  {
188  return num_arguments_per_tensor<1>();
189  }
190  /** Returns the number of arguments enqueued per 2D tensor object.
191  *
192  * @return The number of arguments enqueues per 2D tensor object.
193  */
194  constexpr static unsigned int num_arguments_per_2D_tensor()
195  {
196  return num_arguments_per_tensor<2>();
197  }
198  /** Returns the number of arguments enqueued per 3D tensor object.
199  *
200  * @return The number of arguments enqueues per 3D tensor object.
201  */
202  constexpr static unsigned int num_arguments_per_3D_tensor()
203  {
204  return num_arguments_per_tensor<3>();
205  }
206  /** Returns the number of arguments enqueued per 4D tensor object.
207  *
208  * @return The number of arguments enqueues per 4D tensor object.
209  */
210  constexpr static unsigned int num_arguments_per_4D_tensor()
211  {
212  return num_arguments_per_tensor<4>();
213  }
214  /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
215  *
216  * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
217  *
218  * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
219  * @param[in,out] queue Command queue on which to enqueue the kernel.
220  */
221  virtual void run(const Window &window, cl::CommandQueue &queue)
222  {
223  ARM_COMPUTE_UNUSED(window, queue);
224  }
225  /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
226  *
227  * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
228  *
229  * @param[in] tensors A vector containing the tensors to operato on.
230  * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
231  * @param[in,out] queue Command queue on which to enqueue the kernel.
232  */
233  virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
234  {
235  ARM_COMPUTE_UNUSED(tensors, window, queue);
236  }
237  /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
238  *
239  * @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
240  * @param[in] value Value to set as an argument of the object's kernel.
241  */
242  template <typename T>
243  void add_argument(unsigned int &idx, T value)
244  {
245  _kernel.setArg(idx++, value);
246  }
247 
248  /** Set the Local-Workgroup-Size hint
249  *
250  * @note This method should be called after the configuration of the kernel
251  *
252  * @param[in] lws_hint Local-Workgroup-Size to use
253  */
254  void set_lws_hint(const cl::NDRange &lws_hint)
255  {
256  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
257  _lws_hint = lws_hint;
258  }
259 
260  /** Return the Local-Workgroup-Size hint
261  *
262  * @return Current lws hint
263  */
264  cl::NDRange lws_hint() const
265  {
266  return _lws_hint;
267  }
268 
269  /** Get the configuration ID
270  *
271  * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel
272  * In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
273  * The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
274  *
275  * @note This method should be called after the configuration of the kernel
276  *
277  * @return configuration id string
278  */
279  const std::string &config_id() const
280  {
281  return _config_id;
282  }
283 
284  /** Set the targeted GPU architecture
285  *
286  * @param[in] target The targeted GPU architecture
287  */
288  void set_target(GPUTarget target)
289  {
290  _target = target;
291  }
292 
293  /** Set the targeted GPU architecture according to the CL device
294  *
295  * @param[in] device A CL device
296  */
297  void set_target(cl::Device &device);
298 
299  /** Get the targeted GPU architecture
300  *
301  * @return The targeted GPU architecture.
302  */
304  {
305  return _target;
306  }
307 
308  /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
309  *
310  * @return The maximum workgroup size value.
311  */
312  size_t get_max_workgroup_size();
313  /** Get the global work size given an execution window
314  *
315  * @param[in] window Execution window
316  *
317  * @return Global work size of the given execution window
318  */
319  static cl::NDRange gws_from_window(const Window &window);
320 
321 private:
322  /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
323  *
324  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
325  * @param[in] array Array to set as an argument of the object's kernel.
326  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
327  * @param[in] num_dimensions Number of dimensions of the @p array.
328  * @param[in] window Window the kernel will be executed on.
329  */
330  template <typename T, unsigned int dimension_size>
331  void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
332  /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
333  *
334  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
335  * @param[in] tensor Tensor to set as an argument of the object's kernel.
336  * @param[in] window Window the kernel will be executed on.
337  */
338  template <unsigned int dimension_size>
339  void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
340 
341 protected:
342  cl::Kernel _kernel; /**< OpenCL kernel to run */
343  GPUTarget _target; /**< The targeted GPU */
344  std::string _config_id; /**< Configuration ID */
345  size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
346 private:
347  cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
348 };
349 
350 /** Add the kernel to the command queue with the given window.
351  *
352  * @note Depending on the size of the window, this might translate into several jobs being enqueued.
353  *
354  * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
355  *
356  * @param[in,out] queue OpenCL command queue.
357  * @param[in] kernel Kernel to enqueue
358  * @param[in] window Window the kernel has to process.
359  * @param[in] lws_hint (Optional) Local workgroup size requested. Default is based on the device target.
360  * @param[in] use_dummy_work_items (Optional) Use dummy work items in order to have two dimensional power of two NDRange. Default is false
361  * Note: it is kernel responsibility to check if the work-item is out-of-range
362  *
363  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
364  */
365 void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
366 
367 /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
368  *
369  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
370  * @param[in] array Array to set as an argument of the object's kernel.
371  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
372  * @param[in] num_dimensions Number of dimensions of the @p array.
373  * @param[in] window Window the kernel will be executed on.
374  */
375 template <typename T, unsigned int dimension_size>
376 void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
377 {
378  ARM_COMPUTE_ERROR_ON(array == nullptr);
379 
380  // Calculate offset to the start of the window
381  unsigned int offset_first_element = 0;
382 
383  for(unsigned int n = 0; n < num_dimensions; ++n)
384  {
385  offset_first_element += window[n].start() * strides[n];
386  }
387 
388  unsigned int idx_start = idx;
389  _kernel.setArg(idx++, array->cl_buffer());
390 
391  for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
392  {
393  _kernel.setArg<cl_uint>(idx++, strides[dimension]);
394  _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
395  }
396 
397  _kernel.setArg<cl_uint>(idx++, offset_first_element);
398 
399  ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
400  "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
401  ARM_COMPUTE_UNUSED(idx_start);
402 }
403 }
404 #endif /*ARM_COMPUTE_ICLKERNEL_H */
static constexpr unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
Definition: ICLKernel.h:186
static cl::NDRange gws_from_window(const Window &window)
Get the global work size given an execution window.
Definition: ICLKernel.cpp:136
Common information for all the kernels.
Definition: IKernel.h:33
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx ...
Definition: ICLKernel.h:123
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
static constexpr unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
Definition: ICLKernel.h:178
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
Definition: ICLKernel.h:221
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
Definition: ICLKernel.h:147
void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws=gles::NDRange(1U, 1U, 1U))
Add the kernel to the command queue with the given window.
Definition: IGCKernel.cpp:41
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
Definition: ICLKernel.h:89
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
Definition: ICLKernel.h:264
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
Definition: ICLKernel.h:254
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:243
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:102
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
Definition: Error.h:457
Common interface for all the OpenCL kernels.
Definition: ICLKernel.h:45
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:160
Copyright (c) 2017-2020 Arm Limited.
const std::string & config_id() const
Get the configuration ID.
Definition: ICLKernel.h:279
Interface for OpenCL Array.
Definition: ICLArray.h:35
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
Definition: ICLKernel.h:202
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
GPUTarget get_target() const
Get the targeted GPU architecture.
Definition: ICLKernel.h:303
static constexpr unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
Definition: ICLKernel.h:194
static constexpr unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
Definition: ICLKernel.h:210
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
Strides of an item in bytes.
Definition: Strides.h:37
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:136
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
ICLKernel()
Constructor.
Definition: ICLKernel.h:81
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
Wrapper to configure the Khronos OpenCL C++ header.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
Definition: ICLKernel.cpp:127
Tensor packing service.
Definition: ITensorPack.h:36
void set_target(GPUTarget target)
Set the targeted GPU architecture.
Definition: ICLKernel.h:288
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:112
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:170
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
Describe a multidimensional execution window.
Definition: Window.h:39
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
Definition: ICLKernel.h:233