24 #ifndef ARM_COMPUTE_ICLKERNEL_H 25 #define ARM_COMPUTE_ICLKERNEL_H 40 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) 43 namespace experimental
45 namespace dynamic_fusion
48 struct ClExecutionDescriptor;
52 #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) 58 bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
60 if(lws0.dimensions() != lws1.dimensions())
65 for(
size_t i = 0; i < lws0.dimensions(); ++i)
67 if(lws0.get()[i] != lws1.get()[i])
88 template <
unsigned int dimension_size>
89 constexpr
static unsigned int num_arguments_per_array()
91 return num_arguments_per_tensor<dimension_size>();
97 template <
unsigned int dimension_size>
98 constexpr
static unsigned int num_arguments_per_tensor()
100 return 2 + 2 * dimension_size;
103 cl::NDRange default_lws_tune(
const Window &window)
108 using IKernel::configure;
116 void configure_internal(
const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0)
128 _tuning_params_hint = tuning_params_hint;
132 _tuning_params_hint.set_lws(default_lws_tune(window));
135 IKernel::configure(window);
168 template <
typename T>
171 add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
181 add_tensor_argument<1>(idx, tensor, window);
194 add_1D_tensor_argument(idx, tensor, window);
205 add_tensor_argument<2>(idx, tensor, window);
218 add_2D_tensor_argument(idx, tensor, window);
229 add_tensor_argument<3>(idx, tensor, window);
239 add_tensor_argument<4>(idx, tensor, window);
249 add_tensor_argument<5>(idx, tensor, window);
257 void add_3d_tensor_nhw_argument(
unsigned int &idx,
const ICLTensor *tensor);
265 constexpr
unsigned int no_args_per_3d_tensor_nhw = 7u;
266 return no_args_per_3d_tensor_nhw;
274 void add_4d_tensor_nhwc_argument(
unsigned int &idx,
const ICLTensor *tensor);
282 constexpr
unsigned int no_args_per_4d_tensor_nhwc = 9u;
283 return no_args_per_4d_tensor_nhwc;
292 return num_arguments_per_array<1>();
300 return num_arguments_per_tensor<1>();
308 return num_arguments_per_tensor<2>();
316 return num_arguments_per_tensor<3>();
324 return num_arguments_per_tensor<4>();
333 virtual void run(
const Window &window, cl::CommandQueue &queue)
350 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) 356 #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) 362 template <
typename T>
365 _kernel.setArg(idx++, value);
377 _tuning_params_hint.set_lws(lws_hint);
386 return _tuning_params_hint.get_lws();
398 _tuning_params_hint.set_wbsm(wbsm_hint);
407 return _tuning_params_hint.get_wbsm();
438 void set_target(cl::Device &device);
453 size_t get_max_workgroup_size();
460 static cl::NDRange gws_from_window(
const Window &window);
471 template <
typename T,
unsigned int dimension_size>
472 void add_array_argument(
unsigned int &idx,
const ICLArray<T> *array,
const Strides &strides,
unsigned int num_dimensions,
const Window &window);
479 template <
unsigned int dimension_size>
480 void add_tensor_argument(
unsigned int &idx,
const ICLTensor *tensor,
const Window &window);
485 std::string _config_id;
486 size_t _max_workgroup_size;
517 template <
typename T,
unsigned int dimension_size>
518 void ICLKernel::add_array_argument(
unsigned &idx,
const ICLArray<T> *array,
const Strides &strides,
unsigned int num_dimensions,
const Window &window)
523 unsigned int offset_first_element = 0;
525 for(
unsigned int n = 0;
n < num_dimensions; ++
n)
527 offset_first_element += window[
n].start() * strides[
n];
530 unsigned int idx_start = idx;
531 _kernel.setArg(idx++, array->
cl_buffer());
533 for(
unsigned int dimension = 0; dimension < dimension_size; dimension++)
535 _kernel.setArg<cl_uint>(idx++, strides[dimension]);
536 _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
539 _kernel.setArg<cl_uint>(idx++, offset_first_element);
542 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
static constexpr unsigned int num_arguments_per_1D_tensor()
Returns the number of arguments enqueued per 1D tensor object.
Common information for all the kernels.
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx ...
static constexpr unsigned int num_arguments_per_1D_array()
Returns the number of arguments enqueued per 1D array object.
virtual void run(const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx ...
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
static constexpr unsigned int num_arguments_per_3d_tensor_nhw()
Returns the number of arguments enqueued per NHW 3D Tensor object.
cl::Kernel & kernel()
Returns a reference to the OpenCL kernel of this object.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
void set_lws_hint(const cl::NDRange &lws_hint)
Set the Local-Workgroup-Size hint.
cl_int wbsm_hint() const
Return the workgroup batch size modifier hint.
virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
The execution is carried out through run_op method. But the run_op method needs to be extended to inc...
void add_argument(unsigned int &idx, T value)
Add the passed parameters to the object's kernel's arguments starting from the index idx...
void add_1D_array_argument(unsigned int &idx, const ICLArray< T > *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx...
cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
OpenCL tuner parameters.
#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg,...)
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
Common interface for all the OpenCL kernels.
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx...
Copyright (c) 2017-2022 Arm Limited.
const std::string & config_id() const
Get the configuration ID.
cl::NDRange default_ndrange() const
Return the default NDRange for the device.
Descriptor containing information required to run a single ClWorkload.
Interface for OpenCL Array.
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
GPUTarget get_target() const
Get the targeted GPU architecture.
static constexpr unsigned int num_arguments_per_2D_tensor()
Returns the number of arguments enqueued per 2D tensor object.
static constexpr unsigned int num_arguments_per_4D_tensor()
Returns the number of arguments enqueued per 4D tensor object.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Strides of an item in bytes.
static constexpr unsigned int num_arguments_per_4d_tensor_nhwc()
Returns the number of arguments enqueued per NHWC 4D Tensor object.
void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx...
Interface for OpenCL tensor.
GPUTarget
Available GPU Targets.
Wrapper to configure the Khronos OpenCL C++ header.
void set_target(GPUTarget target)
Set the targeted GPU architecture.
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx...
void add_5D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 5D tensor's parameters to the object's kernel's arguments starting from the index idx...
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx...
CLKernelType type() const
Returns the CL kernel type.
virtual const cl::Buffer & cl_buffer() const =0
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
void set_wbsm_hint(const cl_int &wbsm_hint)
Set the workgroup batch size modifier hint.
Describe a multidimensional execution window.
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...