24 #ifndef ARM_COMPUTE_ICLKERNEL_H 25 #define ARM_COMPUTE_ICLKERNEL_H 44 bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
46 if(lws0.dimensions() != lws1.dimensions())
51 for(
size_t i = 0; i < lws0.dimensions(); ++i)
53 if(lws0.get()[i] != lws1.get()[i])
75 template <
unsigned int dimension_size>
76 constexpr
static unsigned int num_arguments_per_array()
78 return num_arguments_per_tensor<dimension_size>();
84 template <
unsigned int dimension_size>
85 constexpr
static unsigned int num_arguments_per_tensor()
87 return 2 + 2 * dimension_size;
95 using IKernel::configure;
115 _tuning_params_hint = tuning_params_hint;
119 _tuning_params_hint.
set_lws(default_lws_tune(window));
122 IKernel::configure(window);
155 template <
typename T>
158 add_array_argument<T, 1>(idx, array, strides, num_dimensions,
window);
168 add_tensor_argument<1>(idx, tensor,
window);
192 add_tensor_argument<2>(idx, tensor,
window);
216 add_tensor_argument<3>(idx, tensor,
window);
226 add_tensor_argument<4>(idx, tensor,
window);
234 return num_arguments_per_array<1>();
242 return num_arguments_per_tensor<1>();
250 return num_arguments_per_tensor<2>();
258 return num_arguments_per_tensor<3>();
266 return num_arguments_per_tensor<4>();
275 virtual void run(
const Window &window, cl::CommandQueue &queue)
296 template <
typename T>
299 _kernel.setArg(idx++, value);
311 _tuning_params_hint.
set_lws(lws_hint);
320 return _tuning_params_hint.
get_lws();
332 _tuning_params_hint.
set_wbsm(wbsm_hint);
341 return _tuning_params_hint.
get_wbsm();
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
 *
 * The arguments are set in this order: the array's OpenCL buffer, then for each of the
 * dimension_size dimensions the stride followed by the stride multiplied by the window step,
 * and finally the offset of the first element.
 *
 * @tparam T              Element type of the ICLArray.
 * @tparam dimension_size Number of dimensions for which stride arguments are set.
 *
 * @param[in,out] idx            Index at which to start adding the array's arguments; it is incremented once per argument set.
 * @param[in]     array          Array whose OpenCL buffer is set as the first argument.
 * @param[in]     strides        Strides of the array, indexed per dimension.
 * @param[in]     num_dimensions Number of dimensions accumulated into the offset of the first element.
 * @param[in]     window         Window whose per-dimension start and step are read.
 */
405 template <
typename T,
unsigned int dimension_size>
406 void add_array_argument(
unsigned int &idx,
const ICLArray<T> *array,
const Strides &strides,
unsigned int num_dimensions,
const Window &window);
/** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
 *
 * @note The definition is not part of this listing.
 *       NOTE(review): presumably it sets num_arguments_per_tensor<dimension_size>() arguments - confirm against the definition.
 *
 * @tparam dimension_size Dimensionality selector; this listing instantiates it with 1, 2, 3 and 4.
 *
 * @param[in,out] idx    Index at which to start adding the tensor's arguments (taken by non-const reference).
 * @param[in]     tensor Tensor whose parameters are added.
 * @param[in]     window Window forwarded by the caller.
 */
413 template <
unsigned int dimension_size>
414 void add_tensor_argument(
unsigned int &idx,
const ICLTensor *tensor,
const Window &window);
419 std::string _config_id;
420 size_t _max_workgroup_size;
451 template <
typename T,
unsigned int dimension_size>
452 void ICLKernel::add_array_argument(
unsigned &idx,
const ICLArray<T> *array,
const Strides &strides,
unsigned int num_dimensions,
const Window &window)
457 unsigned int offset_first_element = 0;
459 for(
unsigned int n = 0; n < num_dimensions; ++n)
461 offset_first_element += window[n].start() * strides[n];
464 unsigned int idx_start = idx;
465 _kernel.setArg(idx++, array->
cl_buffer());
467 for(
unsigned int dimension = 0; dimension < dimension_size; dimension++)
469 _kernel.setArg<cl_uint>(idx++, strides[dimension]);
470 _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
473 _kernel.setArg<cl_uint>(idx++, offset_first_element);
476 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
static constexpr unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
static cl::NDRange gws_from_window(const Window &window)
Get the global work size given an execution window.
Common information for all the kernels.
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
const Window & window() const
The maximum window the kernel can be executed on.
static constexpr unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
cl::NDRange get_lws() const
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx.
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
OpenCL tuner parameters.
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
Common interface for all the OpenCL kernels.
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
Copyright (c) 2017-2021 Arm Limited.
void set_lws(cl::NDRange lws)
const std::string & config_id() const
Get the configuration ID.
cl::NDRange default_ndrange() const
Return the default NDRange for the device.
Interface for OpenCL Array.
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
GPUTarget get_target() const
Get the targeted GPU architecture.
void set_wbsm(cl_int wbsm)
static constexpr unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
static constexpr unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Strides of an item in bytes.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
Interface for OpenCL tensor.
GPUTarget
Available GPU Targets.
Wrapper to configure the Khronos OpenCL C++ header.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
void set_target(GPUTarget target)
Set the targeted GPU architecture.
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
CLKernelType type() const
Returns the CL kernel type.
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the array's data.
void set_wbsm_hint(const cl_int &wbsm_hint)
Set the workgroup batch size modifier hint.
Describe a multidimensional execution window.
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.