Compute Library
 21.05
ICLKernel.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_ICLKERNEL_H
25 #define ARM_COMPUTE_ICLKERNEL_H
26 
35 
36 #include <string>
37 
38 namespace arm_compute
39 {
40 template <typename T>
41 class ICLArray;
42 class ICLTensor;
43 class Window;
44 
45 /** Common interface for all the OpenCL kernels */
46 class ICLKernel : public IKernel
47 {
48 private:
49  /** Returns the number of arguments enqueued per array object.
50  *
51  * @return The number of arguments enqueued per array object.
52  */
53  template <unsigned int dimension_size>
54  constexpr static unsigned int num_arguments_per_array()
55  {
56  return num_arguments_per_tensor<dimension_size>();
57  }
58  /** Returns the number of arguments enqueued per tensor object.
59  *
60  * @return The number of arguments enqueued per tensor object.
61  */
62  template <unsigned int dimension_size>
63  constexpr static unsigned int num_arguments_per_tensor()
64  {
65  return 2 + 2 * dimension_size;
66  }
67  using IKernel::configure; //Prevent children from calling IKernel::configure() directly
68 protected:
69  /** Configure the kernel's window and local workgroup size hint.
70  *
71  * @param[in] window The maximum window which will be returned by window()
72  * @param[in] lws_hint Local-Workgroup-Size to use.
73  * @param[in] wbsm_hint (Optional) Workgroup-Batch-Size-Modifier to use.
74  */
75  void configure_internal(const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0)
76  {
77  configure_internal(window, CLTuningParams(lws_hint, wbsm_hint));
78  }
79 
80  /** Configure the kernel's window and tuning parameters hints.
81  *
82  * @param[in] window The maximum window which will be returned by window()
83  * @param[in] tuning_params_hint (Optional) Tuning parameters to use.
84  */
85  void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
86  {
87  _tuning_params_hint = tuning_params_hint;
88  IKernel::configure(window);
89  }
90 
91 public:
92  /** Constructor */
94  : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
95  {
96  }
97  /** Returns a reference to the OpenCL kernel of this object.
98  *
99  * @return A reference to the OpenCL kernel of this object.
100  */
101  cl::Kernel &kernel()
102  {
103  return _kernel;
104  }
105  /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
106  *
107  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
108  * @param[in] array Array to set as an argument of the object's kernel.
109  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
110  * @param[in] num_dimensions Number of dimensions of the @p array.
111  * @param[in] window Window the kernel will be executed on.
112  */
113  template <typename T>
114  void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
115  {
116  add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
117  }
118  /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
119  *
120  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
121  * @param[in] tensor Tensor to set as an argument of the object's kernel.
122  * @param[in] window Window the kernel will be executed on.
123  */
124  void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
125  {
126  add_tensor_argument<1>(idx, tensor, window);
127  }
128  /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
129  *
130  * @param[in] cond Condition to check
131  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
132  * @param[in] tensor Tensor to set as an argument of the object's kernel.
133  * @param[in] window Window the kernel will be executed on.
134  */
135  void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
136  {
137  if(cond)
138  {
139  add_1D_tensor_argument(idx, tensor, window);
140  }
141  }
142  /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
143  *
144  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
145  * @param[in] tensor Tensor to set as an argument of the object's kernel.
146  * @param[in] window Window the kernel will be executed on.
147  */
148  void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
149  {
150  add_tensor_argument<2>(idx, tensor, window);
151  }
152  /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
153  *
154  * @param[in] cond Condition to check
155  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
156  * @param[in] tensor Tensor to set as an argument of the object's kernel.
157  * @param[in] window Window the kernel will be executed on.
158  */
159  void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
160  {
161  if(cond)
162  {
163  add_2D_tensor_argument(idx, tensor, window);
164  }
165  }
166  /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
167  *
168  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
169  * @param[in] tensor Tensor to set as an argument of the object's kernel.
170  * @param[in] window Window the kernel will be executed on.
171  */
172  void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
173  {
174  add_tensor_argument<3>(idx, tensor, window);
175  }
176  /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
177  *
178  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
179  * @param[in] tensor Tensor to set as an argument of the object's kernel.
180  * @param[in] window Window the kernel will be executed on.
181  */
182  void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
183  {
184  add_tensor_argument<4>(idx, tensor, window);
185  }
186  /** Returns the number of arguments enqueued per 1D array object.
187  *
188  * @return The number of arguments enqueues per 1D array object.
189  */
190  constexpr static unsigned int num_arguments_per_1D_array()
191  {
192  return num_arguments_per_array<1>();
193  }
194  /** Returns the number of arguments enqueued per 1D tensor object.
195  *
196  * @return The number of arguments enqueues per 1D tensor object.
197  */
198  constexpr static unsigned int num_arguments_per_1D_tensor()
199  {
200  return num_arguments_per_tensor<1>();
201  }
202  /** Returns the number of arguments enqueued per 2D tensor object.
203  *
204  * @return The number of arguments enqueues per 2D tensor object.
205  */
206  constexpr static unsigned int num_arguments_per_2D_tensor()
207  {
208  return num_arguments_per_tensor<2>();
209  }
210  /** Returns the number of arguments enqueued per 3D tensor object.
211  *
212  * @return The number of arguments enqueues per 3D tensor object.
213  */
214  constexpr static unsigned int num_arguments_per_3D_tensor()
215  {
216  return num_arguments_per_tensor<3>();
217  }
218  /** Returns the number of arguments enqueued per 4D tensor object.
219  *
220  * @return The number of arguments enqueues per 4D tensor object.
221  */
222  constexpr static unsigned int num_arguments_per_4D_tensor()
223  {
224  return num_arguments_per_tensor<4>();
225  }
226  /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
227  *
228  * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
229  *
230  * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
231  * @param[in,out] queue Command queue on which to enqueue the kernel.
232  */
233  virtual void run(const Window &window, cl::CommandQueue &queue)
234  {
235  ARM_COMPUTE_UNUSED(window, queue);
236  }
237  /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
238  *
239  * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
240  *
241  * @param[in] tensors A vector containing the tensors to operato on.
242  * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
243  * @param[in,out] queue Command queue on which to enqueue the kernel.
244  */
245  virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
246  {
247  ARM_COMPUTE_UNUSED(tensors, window, queue);
248  }
249  /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
250  *
251  * @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
252  * @param[in] value Value to set as an argument of the object's kernel.
253  */
254  template <typename T>
255  void add_argument(unsigned int &idx, T value)
256  {
257  _kernel.setArg(idx++, value);
258  }
259 
260  /** Set the Local-Workgroup-Size hint
261  *
262  * @note This method should be called after the configuration of the kernel
263  *
264  * @param[in] lws_hint Local-Workgroup-Size to use
265  */
266  void set_lws_hint(const cl::NDRange &lws_hint)
267  {
268  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
269  _tuning_params_hint.set_lws(lws_hint);
270  }
271 
272  /** Return the Local-Workgroup-Size hint
273  *
274  * @return Current lws hint
275  */
276  cl::NDRange lws_hint() const
277  {
278  return _tuning_params_hint.get_lws();
279  }
280 
281  /** Set the workgroup batch size modifier hint
282  *
283  * @note This method should be called after the configuration of the kernel
284  *
285  * @param[in] wbsm_hint workgroup batch size modifier value
286  */
287  void set_wbsm_hint(const cl_int &wbsm_hint)
288  {
289  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // wbsm_hint will be overwritten by configure()
290  _tuning_params_hint.set_wbsm(wbsm_hint);
291  }
292 
293  /** Return the workgroup batch size modifier hint
294  *
295  * @return Current wbsm hint
296  */
297  cl_int wbsm_hint() const
298  {
299  return _tuning_params_hint.get_wbsm();
300  }
301 
302  /** Get the configuration ID
303  *
304  * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel
305  * In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
306  * The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
307  *
308  * @note This method should be called after the configuration of the kernel
309  *
310  * @return configuration id string
311  */
312  const std::string &config_id() const
313  {
314  return _config_id;
315  }
316 
317  /** Set the targeted GPU architecture
318  *
319  * @param[in] target The targeted GPU architecture
320  */
321  void set_target(GPUTarget target)
322  {
323  _target = target;
324  }
325 
326  /** Set the targeted GPU architecture according to the CL device
327  *
328  * @param[in] device A CL device
329  */
330  void set_target(cl::Device &device);
331 
332  /** Get the targeted GPU architecture
333  *
334  * @return The targeted GPU architecture.
335  */
337  {
338  return _target;
339  }
340 
341  /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
342  *
343  * @return The maximum workgroup size value.
344  */
345  size_t get_max_workgroup_size();
346  /** Get the global work size given an execution window
347  *
348  * @param[in] window Execution window
349  *
350  * @return Global work size of the given execution window
351  */
352  static cl::NDRange gws_from_window(const Window &window);
353 
354 private:
355  /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
356  *
357  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
358  * @param[in] array Array to set as an argument of the object's kernel.
359  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
360  * @param[in] num_dimensions Number of dimensions of the @p array.
361  * @param[in] window Window the kernel will be executed on.
362  */
363  template <typename T, unsigned int dimension_size>
364  void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
365  /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
366  *
367  * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
368  * @param[in] tensor Tensor to set as an argument of the object's kernel.
369  * @param[in] window Window the kernel will be executed on.
370  */
371  template <unsigned int dimension_size>
372  void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
373 
374 protected:
375  cl::Kernel _kernel; /**< OpenCL kernel to run */
376  GPUTarget _target; /**< The targeted GPU */
377  std::string _config_id; /**< Configuration ID */
378  size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
379 private:
380  CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
381 };
382 
383 /** Add the kernel to the command queue with the given window.
384  *
385  * @note Depending on the size of the window, this might translate into several jobs being enqueued.
386  *
387  * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
388  *
389  * @param[in,out] queue OpenCL command queue.
390  * @param[in] kernel Kernel to enqueue
391  * @param[in] window Window the kernel has to process.
392  * @param[in] lws_hint (Optional) Local workgroup size requested. Default is based on the device target.
393  * @param[in] use_dummy_work_items (Optional) Use dummy work items in order to have two dimensional power of two NDRange. Default is false
394  * Note: it is kernel responsibility to check if the work-item is out-of-range
395  *
396  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
397  */
398 void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
399 
400 /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
401  *
402  * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
403  * @param[in] array Array to set as an argument of the object's kernel.
404  * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
405  * @param[in] num_dimensions Number of dimensions of the @p array.
406  * @param[in] window Window the kernel will be executed on.
407  */
408 template <typename T, unsigned int dimension_size>
409 void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
410 {
411  ARM_COMPUTE_ERROR_ON(array == nullptr);
412 
413  // Calculate offset to the start of the window
414  unsigned int offset_first_element = 0;
415 
416  for(unsigned int n = 0; n < num_dimensions; ++n)
417  {
418  offset_first_element += window[n].start() * strides[n];
419  }
420 
421  unsigned int idx_start = idx;
422  _kernel.setArg(idx++, array->cl_buffer());
423 
424  for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
425  {
426  _kernel.setArg<cl_uint>(idx++, strides[dimension]);
427  _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
428  }
429 
430  _kernel.setArg<cl_uint>(idx++, offset_first_element);
431 
432  ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
433  "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
434  ARM_COMPUTE_UNUSED(idx_start);
435 }
436 }
437 #endif /*ARM_COMPUTE_ICLKERNEL_H */
static constexpr unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
Definition: ICLKernel.h:198
static cl::NDRange gws_from_window(const Window &window)
Get the global work size given an execution window.
Definition: ICLKernel.cpp:140
Common information for all the kernels.
Definition: IKernel.h:33
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx ...
Definition: ICLKernel.h:135
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
static constexpr unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
Definition: ICLKernel.h:190
cl::NDRange get_lws() const
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
Definition: ICLKernel.h:233
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
Definition: ICLKernel.h:159
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
Definition: ICLKernel.cpp:32
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
Definition: ICLKernel.h:101
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
Definition: ICLKernel.h:276
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
Definition: ICLKernel.h:266
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
Definition: ICLKernel.h:297
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:255
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:114
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
< OpenCL tuner parameters
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
Definition: Error.h:457
Common interface for all the OpenCL kernels.
Definition: ICLKernel.h:46
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:172
Copyright (c) 2017-2021 Arm Limited.
void set_lws(cl::NDRange lws)
const std::string & config_id() const
Get the configuration ID.
Definition: ICLKernel.h:312
Interface for OpenCL Array.
Definition: ICLArray.h:35
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
Definition: ICLKernel.h:214
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
GPUTarget get_target() const
Get the targeted GPU architecture.
Definition: ICLKernel.h:336
void set_wbsm(cl_int wbsm)
static constexpr unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
Definition: ICLKernel.h:206
static constexpr unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
Definition: ICLKernel.h:222
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
Strides of an item in bytes.
Definition: Strides.h:37
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:148
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
ICLKernel()
Constructor.
Definition: ICLKernel.h:93
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
Wrapper to configure the Khronos OpenCL C++ header.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
Definition: ICLKernel.cpp:131
Tensor packing service.
Definition: ITensorPack.h:37
void set_target(GPUTarget target)
Set the targeted GPU architecture.
Definition: ICLKernel.h:321
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:124
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
Definition: ICLKernel.h:182
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
void set_wbsm_hint(const cl_int &wbsm_hint)
Set the workgroup batch size modifier hint.
Definition: ICLKernel.h:287
Describe a multidimensional execution window.
Definition: Window.h:39
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
Definition: ICLKernel.h:245