24.02.1
|
Go to the documentation of this file.
24 #ifndef ARM_COMPUTE_ICLKERNEL_H
25 #define ARM_COMPUTE_ICLKERNEL_H
44 bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
46 if (lws0.dimensions() != lws1.dimensions())
51 for (
size_t i = 0; i < lws0.dimensions(); ++i)
53 if (lws0.get()[i] != lws1.get()[i])
74 template <
unsigned int dimension_size>
75 constexpr
static unsigned int num_arguments_per_array()
77 return num_arguments_per_tensor<dimension_size>();
83 template <
unsigned int dimension_size>
84 constexpr
static unsigned int num_arguments_per_tensor()
86 return 2 + 2 * dimension_size;
96 cl::NDRange default_lws_tune(
const Window &
window,
bool use_dummy_work_items)
101 using IKernel::configure;
123 _tuning_params_hint = tuning_params_hint;
129 _tuning_params_hint.
set_lws(default_lws_tune(
window,
false ));
132 IKernel::configure(
window);
141 _max_workgroup_size(0),
143 _tuning_params_hint(),
144 _cached_gws(
cl::NullRange)
171 template <
typename T>
175 unsigned int num_dimensions,
178 add_array_argument<T, 1>(idx, array, strides, num_dimensions,
window);
272 constexpr
unsigned int no_args_per_3d_tensor_nhw = 7u;
273 return no_args_per_3d_tensor_nhw;
289 constexpr
unsigned int no_args_per_4d_tensor_nhwc = 9u;
290 return no_args_per_4d_tensor_nhwc;
299 return num_arguments_per_array<1>();
307 return num_arguments_per_tensor<1>();
315 return num_arguments_per_tensor<2>();
323 return num_arguments_per_tensor<3>();
331 return num_arguments_per_tensor<4>();
361 template <
typename T>
364 _kernel.setArg(idx++, value);
385 return _tuning_params_hint.
get_lws();
406 return _tuning_params_hint.
get_wbsm();
483 template <
typename T,
unsigned int dimension_size>
484 void add_array_argument(
unsigned int &idx,
487 unsigned int num_dimensions,
495 template <
unsigned int dimension_size>
501 std::string _config_id;
502 size_t _max_workgroup_size;
506 cl::NDRange _cached_gws;
524 void enqueue(cl::CommandQueue &queue,
526 const Window &window,
528 bool use_dummy_work_items =
false);
538 template <
typename T,
unsigned int dimension_size>
539 void ICLKernel::add_array_argument(
545 unsigned int offset_first_element = 0;
547 for (
unsigned int n = 0; n < num_dimensions; ++n)
549 offset_first_element +=
window[n].start() * strides[n];
552 unsigned int idx_start = idx;
553 _kernel.setArg(idx++, array->
cl_buffer());
555 for (
unsigned int dimension = 0; dimension < dimension_size; dimension++)
557 _kernel.setArg<cl_uint>(idx++, strides[dimension]);
558 _kernel.setArg<cl_uint>(idx++, strides[dimension] *
window[dimension].step());
561 _kernel.setArg<cl_uint>(idx++, offset_first_element);
564 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
565 dimension_size, num_arguments_per_array<dimension_size>());
cl::NDRange get_lws() const
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
size_t get_max_workgroup_size()
Get the maximum workgroup size for the device the CLKernelLibrary uses.
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
constexpr static unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
constexpr static unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
Interface for OpenCL tensor.
void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor)
Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides,...
GPUTarget get_target() const
Get the targeted GPU architecture.
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
void set_wbsm_hint(const cl_int &wbsm_hint)
Set the workgroup batch size modifier hint.
Interface for OpenCL Array.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
Strides of an item in bytes.
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
const std::string & config_id() const
Get the configuration ID.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
void set_lws(cl::NDRange lws)
Common information for all the kernels.
constexpr static unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items)
Get the global work size given an execution window.
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx.
void set_target(GPUTarget target)
Set the targeted GPU architecture.
void add_5D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 5D tensor's parameters to the object's kernel's arguments starting from the index idx.
OpenCL tuner parameters
Wrapper to configure the Khronos OpenCL C++ header.
constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
Returns the number of arguments enqueued per NHWC 4D Tensor object.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
CLTensor * tensor
Pointer to the auxiliary tensor.
void set_wbsm(cl_int wbsm)
Common interface for all the OpenCL kernels.
void cache_gws(const cl::NDRange &gws)
Cache the latest gws used to enqueue this kernel.
constexpr static unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
const Window & window() const
The maximum window the kernel can be executed on.
GPUTarget
Available GPU Targets.
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides,...
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx ...
Describe a multidimensional execution window.
constexpr static unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
Copyright (c) 2017-2024 Arm Limited.
UNKNOWN
Unknown CL kernel type.
cl::NDRange default_ndrange() const
Return the default NDRange for the device.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
constexpr static unsigned int num_arguments_per_3d_tensor_nhw()
Returns the number of arguments enqueued per NHW 3D Tensor object.
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
cl::NDRange get_cached_gws() const
Get the cached gws used to enqueue this kernel.
CLKernelType type() const
Returns the CL kernel type.
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.