24.02.1
|
Go to the documentation of this file.
37 namespace experimental
39 namespace dynamic_fusion
50 _attributes{attributes},
65 return "direct_conv2d";
76 std::string code = R
"_(
77 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
79 // IN_1(wei) {{weight}}
88 // OUT(dst, accum) {{dst}}
90 TILE(uint, M0, 1, g_dst_indirect_y);
93 #define _IWEI_WIDTH {{WEI_WIDTH}}
94 #define _IWEI_HEIGHT {{WEI_HEIGHT}}
95 #define _ISRC_WIDTH {{SRC_WIDTH}}
96 #define _ISRC_HEIGHT {{SRC_HEIGHT}}
97 #define _ISRC_CHANNELS {{SRC_CHANNELS}}
98 #define _IDST_WIDTH {{DST_WIDTH}}
99 #define _IDST_HEIGHT {{DST_HEIGHT}}
100 #define _IDST_CHANNELS {{DST_CHANNELS}}
101 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
103 TILE(int, M0, 1, xi);
104 TILE(int, M0, 1, yi);
106 // Convert the linear index to coordinate
107 LOOP_UNROLLING(int, i, 0, 1, M0,
109 xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
110 yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
111 xi[0].s[i] -= {{PAD_LEFT}};
112 yi[0].s[i] -= {{PAD_TOP}};
115 LOOP_UNROLLING(int, i, 0, 1, M0,
120 for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
122 int xk = i % _IWEI_WIDTH;
123 int yk = i / _IWEI_WIDTH;
125 TILE(int, 1, M0, my);
127 LOOP_UNROLLING(int, i, 0, 1, M0,
129 int x_s = xi[0].s[i] + xk;
130 int y_s = yi[0].s[i] + yk;
131 my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
132 my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
133 my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
134 my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
135 my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
136 my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
140 for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
142 TILE({{SRC_DATA_TYPE}}, M0, K0, a);
143 TILE({{WEI_DATA_TYPE}}, N0, K0, b);
145 LOOP_UNROLLING(int, i, 0, 1, M0,
147 a[i].v = {{ZERO_VALUE}};
150 LOOP_UNROLLING(int, i, 0, 1, N0,
152 b[i].v = {{ZERO_VALUE}};
155 T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
157 T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
159 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
166 for(; ck < _ISRC_CHANNELS; ++ck)
168 TILE({{SRC_DATA_TYPE}}, M0, 1, a);
169 TILE({{WEI_DATA_TYPE}}, N0, 1, b);
171 LOOP_UNROLLING(int, i, 0, 1, M0,
173 a[i].v = {{ZERO_VALUE}};
176 LOOP_UNROLLING(int, i, 0, 1, N0,
178 b[i].v = {{ZERO_VALUE}};
181 T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
183 T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
185 T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
195 #undef _ISRC_CHANNELS
198 #undef _IDST_CHANNELS
199 #undef _IY_MULTIPLIER
207 TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
209 T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
211 T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
216 LOOP_UNROLLING(int, i, 0, 1, M0,
218 g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
219 g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}});
222 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
262 lut[
"meta_kernel_id"] =
id();
263 lut[
"ACC_DATA_TYPE"] = _src->
data_type();
264 lut[
"SRC_DATA_TYPE"] = _src->
data_type();
265 lut[
"WEI_DATA_TYPE"] = _weight->
data_type();
267 lut[
"SRC_TENSOR_TYPE"] =
"BUFFER";
274 lut[
"WEI_TENSOR_TYPE"] =
"IMAGE";
279 lut[
"WEI_TENSOR_TYPE"] =
"BUFFER";
298 lut[
"STRIDE_X"] = _attributes.
stride().x();
299 lut[
"STRIDE_Y"] = _attributes.
stride().y();
301 lut[
"PAD_LEFT"] = _attributes.
pad().left;
302 lut[
"PAD_TOP"] = _attributes.
pad().top;
304 lut[
"ZERO_VALUE"] = 0;
314 const unsigned int n0 = root_window.
x().
step();
315 const unsigned int m0 = root_window.y().step();
317 const unsigned int partial_store_n0 = _dst->
dimension(0) % n0;
322 build_opts.
add_option(
"-cl-fast-relaxed-math");
328 build_opts.add_option(
"-cl-unsafe-math-optimizations");
349 std::string config_id{};
368 return std::set<std::string>{
"helpers.h",
"tile_helpers.h"};
Class to describe a number of elements in each dimension.
std::string to_string(T &&value)
Convert integer and float values to string.
@ Image_Export_To_ClImage2D
Conv2dAttributes & pad(const Padding2D &pad)
Set padding.
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
virtual const IGpuTemplateComponentWriter * template_writer() const
Get writer for the component.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
DataLayout
[DataLayout enum definition]
Component specific settings.
CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override
Generate the build options used in the component.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
static constexpr GpuKernelArgumentInfo::Type common_tensor_type
For now all kernel intermediate/destination tensors are expected to be of type Tensor_4D_t_Buffer.
std::unordered_map< Tag, TagVal > TagLUT
Tag lookup table.
constexpr int step() const
Return the step of the dimension.
ClComponentDirectConv2dSettings & fast_relaxed_math(bool fast_relaxed_math)
Set fast_relaxed_math flag.
std::string get_config_id() const override
Generate the component config id string used for tuning.
std::string lower_string(const std::string &val)
Lower a given string.
ComponentPtr get_root_component() const
Get the root (first) component of this group.
bool export_to_cl_image() const
Get export_to_cl_image flag.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
const ITensorInfo * get_any_dst_tensor() const
Get one of the destination tensors of this group.
An interface used by ClTemplateWriter to write source code for a kernel component.
This is a generic class that packs the arguments of an operator.
constexpr auto data_layout
GpuKernelArgumentInfo kernel_argument_info
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
size_t total_size_upper(size_t dimension) const
Collapses given dimension and above.
Conv2dAttributes & stride(const Size2D &stride)
Set stride.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
bool has_valid_id() const
Check if the tensor id is valid.
std::string get_component_code(const ComponentGroup &comp_group) const override
Generate kernel component code template.
void add_option(std::string option)
Adds option to the existing build option list.
ArgumentPack< ITensorInfo > tensors() const
Get tensor arguments.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
size_t total_size() const
Collapses all dimensions to a single linear total size.
void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
Declare all variables used by the component in the vtable.
TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
Generate the tag look-up table used to instantiate the component code.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
ComponentId id() const
Get component id.
Window get_window() const override
Generate the execution window for the component.
Contain information required to set up a kernel argument at run time.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Describe one of the image's dimensions with a start, end and step.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number greater than or equal to value that is a multiple of divisor.
TensorVariable get_variable(const ITensorInfo *tensor) const
Get the TensorVariable associated with tensor.
std::set< std::string > get_headers_list() const override
Generate the header list used in the component.
const std::string & string_from_data_layout(DataLayout dl)
Convert a data layout identity into a string.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
int32_t ComponentId
Uniquely identifies a kernel component within a workload.
size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension)
Get the index of the given dimension.
ClTemplateDirectConv2d(ComponentId id, const ArgumentPack< ITensorInfo > &tensors, const Attributes &attributes, const Settings &settings)
Constructor.
Describe a multidimensional execution window.
std::string get_name() const override
Generate kernel component name.
Copyright (c) 2017-2024 Arm Limited.
ClComponentDirectConv2dSettings & direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
Set direct convolution descriptor.
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension,...
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
A group of gpu kernel components to be fused together PRECONDITIONS:
virtual Window get_window() const
Generate the execution window for the component.
Type
Enumerate all the tensor arguments variants used by all kernel implementations.
A table of all the variables used in the kernel.
void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias="unnamed")
Declare a TensorVariable for a corresponding tensor info.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
DataType
Available data types.
@ Image_3D_Export_To_ClImage2D