Compute Library
 19.08
ICLKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2019 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 #include "arm_compute/core/Error.h"
31 #include "arm_compute/core/Utils.h"
34 
35 #include <cstddef>
36 
37 using namespace arm_compute;
38 
39 void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
40 {
41  if(kernel.kernel()() == nullptr)
42  {
43  return;
44  }
45 
46  for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
47  {
48  ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
49  // Make sure that dimensions > Z are 1
50  ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
51  }
52 
53  cl::NDRange gws = ICLKernel::gws_from_window(window);
54 
55  // Check for empty NDRange
56  if(gws.dimensions() == 0)
57  {
58  return;
59  }
60 
61  // Use dummy work-items
62  if(use_dummy_work_items)
63  {
64  gws.get()[0] = get_next_power_two(gws[0]);
65  gws.get()[1] = get_next_power_two(gws[1]);
66  }
67 
68  cl::NDRange valid_lws;
69  if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
70  {
71  valid_lws = cl::NullRange;
72  }
73  else
74  {
75  valid_lws = lws_hint;
76  }
77 
78  cl::NDRange lws = cl::NullRange;
79 
80  if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
81  {
82  lws = valid_lws;
83  }
84 
85  queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
86 }
87 
88 template <unsigned int dimension_size>
89 void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
90 {
91  ARM_COMPUTE_ERROR_ON(tensor == nullptr);
92 
93  const ITensorInfo *info = tensor->info();
94  const Strides &strides = info->strides_in_bytes();
95 
96  // Calculate offset to the start of the window
97  unsigned int offset_first_element = info->offset_first_element_in_bytes();
98 
99  for(unsigned int n = 0; n < info->num_dimensions(); ++n)
100  {
101  offset_first_element += window[n].start() * strides[n];
102  }
103 
104  unsigned int idx_start = idx;
105  _kernel.setArg(idx++, tensor->cl_buffer());
106 
107  for(unsigned int d = 0; d < dimension_size; ++d)
108  {
109  _kernel.setArg<cl_uint>(idx++, strides[d]);
110  _kernel.setArg<cl_uint>(idx++, strides[d] * window[d].step());
111  }
112 
113  _kernel.setArg<cl_uint>(idx++, offset_first_element);
114 
115  ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
116  "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
117  ARM_COMPUTE_UNUSED(idx_start);
118 }
119 
120 #ifndef DOXYGEN_SKIP_THIS
121 template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
122 template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
123 template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window);
124 template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window);
125 #endif /* DOXYGEN_SKIP_THIS */
126 
127 void ICLKernel::set_target(cl::Device &device)
128 {
129  _target = get_target_from_device(device);
130 }
131 
133 {
134  if(_max_workgroup_size == 0)
135  {
136  _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
137  }
138  return _max_workgroup_size;
139 }
140 
141 cl::NDRange ICLKernel::gws_from_window(const Window &window)
142 {
143  if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
144  {
145  return cl::NullRange;
146  }
147 
148  cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
149  (window.y().end() - window.y().start()) / window.y().step(),
150  (window.z().end() - window.z().start()) / window.z().step());
151 
152  return gws;
153 }
static cl::NDRange gws_from_window(const Window &window)
Get the global work size given an execution window.
Definition: ICLKernel.cpp:141
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
Definition: ICLKernel.cpp:39
unsigned int get_next_power_two(unsigned int x)
Given an integer value, this function returns the next power of two.
Definition: Helpers.h:775
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
Definition: ICLKernel.h:87
constexpr int step() const
Return the step of the dimension.
Definition: Window.h:102
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:337
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
Definition: ITensorInfo.h:40
constexpr const Dimension & z() const
Alias to access the third dimension of the window.
Definition: Window.h:161
Common interface for all the OpenCL kernels.
Definition: ICLKernel.h:43
Copyright (c) 2017-2018 ARM Limited.
size_t max_local_workgroup_size(const cl::Kernel &kernel) const
Find the maximum number of local work items in a workgroup can be supported for the kernel.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:160
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
GPUTarget get_target_from_device(const cl::Device &device)
Helper function to get the GPU target from CL device.
Definition: CLHelpers.cpp:131
Strides of an item in bytes.
Definition: Strides.h:37
Interface for OpenCL tensor.
Definition: ICLTensor.h:42
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
Definition: Window.h:152
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
Definition: ICLKernel.cpp:132
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
void set_target(GPUTarget target)
Set the targeted GPU architecture.
Definition: ICLKernel.h:271
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:97
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
Definition: Dimensions.h:45
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:92
Describe a multidimensional execution window.
Definition: Window.h:39
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:143
#define ARM_COMPUTE_ERROR_ON_MSG(cond,...)
Definition: Error.h:328