23.11
|
Go to the documentation of this file.
36 const cl::NDRange &lws_hint,
37 bool use_dummy_work_items)
39 if (kernel.
kernel()() ==
nullptr)
44 for (
unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
51 cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
54 if (gws.dimensions() == 0)
61 cl::NDRange valid_lws;
64 valid_lws = cl::NullRange;
71 cl::NDRange lws = cl::NullRange;
73 if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
78 if (CLKernelLibrary::get().is_wbsm_supported())
82 queue.enqueueNDRangeKernel(kernel.
kernel(), cl::NullRange, gws, lws);
87 template <
unsigned int dimension_size>
96 unsigned int offset_first_element =
info->offset_first_element_in_bytes();
98 for (
unsigned int n = 0; n <
info->num_dimensions(); ++n)
103 unsigned int idx_start = idx;
104 _kernel.setArg(idx++,
tensor->cl_buffer());
106 for (
unsigned int d = 0; d < dimension_size; ++d)
112 _kernel.setArg<cl_uint>(idx++, offset_first_element);
115 "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel",
116 dimension_size, num_arguments_per_tensor<dimension_size>());
129 _kernel.setArg(idx++,
tensor->cl_buffer());
132 _kernel.setArg<cl_uint>(idx++, strides[1]);
133 _kernel.setArg<cl_uint>(idx++, strides[2]);
136 _kernel.setArg<cl_uint>(idx++,
info->dimension(0));
137 _kernel.setArg<cl_uint>(idx++,
info->dimension(1));
138 _kernel.setArg<cl_uint>(idx++,
info->dimension(2));
141 unsigned int offset_first_element =
info->offset_first_element_in_bytes();
142 _kernel.setArg<cl_uint>(idx++, offset_first_element);
154 _kernel.setArg(idx++,
tensor->cl_buffer());
157 _kernel.setArg<cl_uint>(idx++, strides[1]);
158 _kernel.setArg<cl_uint>(idx++, strides[2]);
159 _kernel.setArg<cl_uint>(idx++, strides[3]);
162 _kernel.setArg<cl_uint>(idx++,
info->dimension(0));
163 _kernel.setArg<cl_uint>(idx++,
info->dimension(1));
164 _kernel.setArg<cl_uint>(idx++,
info->dimension(2));
165 _kernel.setArg<cl_uint>(idx++,
info->dimension(3));
168 unsigned int offset_first_element =
info->offset_first_element_in_bytes();
169 _kernel.setArg<cl_uint>(idx++, offset_first_element);
172 #ifndef DOXYGEN_SKIP_THIS
173 template void ICLKernel::add_tensor_argument<1>(
unsigned &idx,
const ICLTensor *
tensor,
const Window &window);
174 template void ICLKernel::add_tensor_argument<2>(
unsigned &idx,
const ICLTensor *
tensor,
const Window &window);
175 template void ICLKernel::add_tensor_argument<3>(
unsigned &idx,
const ICLTensor *
tensor,
const Window &window);
176 template void ICLKernel::add_tensor_argument<4>(
unsigned &idx,
const ICLTensor *
tensor,
const Window &window);
177 template void ICLKernel::add_tensor_argument<5>(
unsigned &idx,
const ICLTensor *
tensor,
const Window &window);
187 if (_max_workgroup_size == 0)
191 return _max_workgroup_size;
198 return cl::NullRange;
205 if (use_dummy_work_items)
constexpr int start() const
Return the start of the dimension.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
constexpr int step() const
Return the step of the dimension.
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
Interface for OpenCL tensor.
unsigned int get_next_power_two(unsigned int x)
Given an integer value, this function returns the next power of two.
void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor)
Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides,...
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Strides of an item in bytes.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items)
Get the global work size given an execution window.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
void set_target(GPUTarget target)
Set the targeted GPU architecture.
bool is_broadcasted(size_t dimension) const
Return whether a dimension has been broadcasted.
#define ARM_COMPUTE_UNUSED(...)
Used to avoid unused-variable warnings.
GPUTarget get_target_from_device(const cl::Device &device)
Helper function to get the GPU target from CL device.
CLTensor * tensor
Pointer to the auxiliary tensor.
Common interface for all the OpenCL kernels.
void cache_gws(const cl::NDRange &gws)
Cache the latest gws used to enqueue this kernel.
const Window & window() const
The maximum window the kernel can be executed on.
void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides,...
Describe a multidimensional execution window.
Copyright (c) 2017-2023 Arm Limited.
void end(TokenStream &in, bool &valid)
size_t max_local_workgroup_size(const cl::Kernel &kernel) const
Find the maximum number of local work items in a workgroup that can be supported for the kernel.
Store the tensor's metadata.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
cl::NDRange get_cached_gws() const
Get the cached gws used to enqueue this kernel.
constexpr int end() const
Return the end of the dimension.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
constexpr const Dimension & z() const
Alias to access the third dimension of the window.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.