24 #ifndef ARM_COMPUTE_ICLKERNEL_H 25 #define ARM_COMPUTE_ICLKERNEL_H 53 template <
unsigned int dimension_size>
54 constexpr
static unsigned int num_arguments_per_array()
56 return num_arguments_per_tensor<dimension_size>();
62 template <
unsigned int dimension_size>
63 constexpr
static unsigned int num_arguments_per_tensor()
65 return 2 + 2 * dimension_size;
67 using IKernel::configure;
87 _tuning_params_hint = tuning_params_hint;
88 IKernel::configure(
window);
94 : _kernel(nullptr), _target(
GPUTarget::
MIDGARD), _config_id(
arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
113 template <
typename T>
116 add_array_argument<T, 1>(idx, array, strides, num_dimensions,
window);
126 add_tensor_argument<1>(idx, tensor,
window);
150 add_tensor_argument<2>(idx, tensor,
window);
174 add_tensor_argument<3>(idx, tensor,
window);
184 add_tensor_argument<4>(idx, tensor,
window);
192 return num_arguments_per_array<1>();
200 return num_arguments_per_tensor<1>();
208 return num_arguments_per_tensor<2>();
216 return num_arguments_per_tensor<3>();
224 return num_arguments_per_tensor<4>();
254 template <
typename T>
257 _kernel.setArg(idx++, value);
278 return _tuning_params_hint.
get_lws();
299 return _tuning_params_hint.
get_wbsm();
363 template <
typename T,
unsigned int dimension_size>
371 template <
unsigned int dimension_size>
377 std::string _config_id;
378 size_t _max_workgroup_size;
398 void enqueue(cl::CommandQueue &queue, ICLKernel &kernel,
const Window &window,
const cl::NDRange &lws_hint =
CLKernelLibrary::get().default_ndrange(),
bool use_dummy_work_items =
false);
408 template <
typename T,
unsigned int dimension_size>
409 void ICLKernel::add_array_argument(
unsigned &idx,
const ICLArray<T> *array,
const Strides &strides,
unsigned int num_dimensions,
const Window &window)
414 unsigned int offset_first_element = 0;
416 for(
unsigned int n = 0; n < num_dimensions; ++n)
418 offset_first_element +=
window[n].start() * strides[n];
421 unsigned int idx_start = idx;
422 _kernel.setArg(idx++, array->
cl_buffer());
424 for(
unsigned int dimension = 0; dimension < dimension_size; dimension++)
426 _kernel.setArg<cl_uint>(idx++, strides[dimension]);
427 _kernel.setArg<cl_uint>(idx++, strides[dimension] *
window[dimension].step());
430 _kernel.setArg<cl_uint>(idx++, offset_first_element);
433 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
static constexpr unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
static cl::NDRange gws_from_window(const Window &window)
Get the global work size given an execution window.
Common information for all the kernels.
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx ...
const Window & window() const
The maximum window the kernel can be executed on.
static constexpr unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
cl::NDRange get_lws() const
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx.
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true, an error message is printed and an exception is thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
OpenCL tuner parameters
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
Common interface for all the OpenCL kernels.
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
Copyright (c) 2017-2021 Arm Limited.
void set_lws(cl::NDRange lws)
const std::string & config_id() const
Get the configuration ID.
Interface for OpenCL Array.
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
#define ARM_COMPUTE_UNUSED(...)
Avoids unused-variable warnings.
GPUTarget get_target() const
Get the targeted GPU architecture.
void set_wbsm(cl_int wbsm)
static constexpr unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
static constexpr unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Strides of an item in bytes.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
Interface for OpenCL tensor.
GPUTarget
Available GPU Targets.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
Wrapper to configure the Khronos OpenCL C++ header.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
void set_target(GPUTarget target)
Set the targeted GPU architecture.
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
void set_wbsm_hint(const cl_int &wbsm_hint)
Set the workgroup batch size modifier hint.
Describe a multidimensional execution window.
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.