24.02.1
|
Go to the documentation of this file.
31 namespace experimental
33 namespace dynamic_fusion
44 _attributes{attributes},
59 return "depthwise_conv2d";
68 std::string code = R
"_(
69 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
71 // IN_1(wei) {{weight}}
82 // OUT(dst, accum) {{dst}}
84 TILE(uint, M0, 1, g_dst_indirect_y);
87 #define _IWEI_WIDTH {{WEI_WIDTH}}
88 #define _IWEI_HEIGHT {{WEI_HEIGHT}}
89 #define _IDST_WIDTH {{arg_dst}}_w
90 #define _IDST_HEIGHT {{arg_dst}}_h
93 #define _IM0_B _IWEI_WIDTH
95 #define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1)))
99 const int yo = g_ind_2 % {{arg_dst}}_h;
100 const int bout = g_ind_2 / {{arg_dst}}_h;
105 int xi = g_ind_1 * {{STRIDE_X}};
106 int yi = yo * {{STRIDE_Y}};
110 LOOP_UNROLLING(int, i, 0, 1, M0,
119 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
125 for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
131 TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a);
133 LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
138 T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a);
140 TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b);
142 T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b);
144 LOOP_UNROLLING(int, m0, 0, 1, M0,
146 LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
153 {{dst}}[m0].v += a[xk + m0].v * b[xk].v;
159 {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v);
179 TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
181 T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}});
183 T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}});
188 LOOP_UNROLLING(int, i, 0, 1, M0,
190 g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
191 g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
192 g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
195 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
244 lut[
"meta_kernel_id"] =
id();
245 lut[
"ACC_DATA_TYPE"] = _src->
data_type();
246 lut[
"SRC_DATA_TYPE"] = _src->
data_type();
247 lut[
"WEI_DATA_TYPE"] = _weight->
data_type();
254 lut[
"SRC_TENSOR_TYPE"] =
"IMAGE";
257 lut[
"SRC_TENSOR_TYPE"] =
"BUFFER";
266 lut[
"WEI_TENSOR_TYPE"] =
"IMAGE";
269 lut[
"WEI_TENSOR_TYPE"] =
"BUFFER";
280 lut[
"STRIDE_X"] = _attributes.
stride().x();
281 lut[
"STRIDE_Y"] = _attributes.
stride().y();
283 lut[
"PAD_LEFT"] = _attributes.
pad().left;
284 lut[
"PAD_TOP"] = _attributes.
pad().top;
286 lut[
"DILATION_X"] = _attributes.
dilation().x();
287 lut[
"DILATION_Y"] = _attributes.
dilation().y();
300 const unsigned int n0 = _settings.
n0();
301 const unsigned int m0 = _settings.
m0();
304 const unsigned int partial_store_n0 = _dst->
dimension(0) % n0;
310 build_opts.
add_option(
"-cl-fast-relaxed-math");
316 build_opts.add_option(
"-cl-unsafe-math-optimizations");
330 std::string config_id{};
351 return std::set<std::string>{
"helpers.h",
"tile_helpers.h"};
Class to describe a number of elements in each dimension.
std::string to_string(T &&value)
Convert integer and float values to string.
ClComponentDepthwiseConv2dSettings & n0(unsigned int n0)
Set N0: number of columns processed by each thread.
@ Image_Export_To_ClImage2D
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
ClComponentDepthwiseConv2dSettings & export_input_to_cl_image(bool cl_image)
Set export_input_to_cl_image flag.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
ClComponentDepthwiseConv2dSettings & m0(unsigned int m0)
Set M0: number of rows processed by each thread.
std::unordered_map< Tag, TagVal > TagLUT
Tag lookup table.
std::string get_component_code(const ComponentGroup &comp_group) const override
Generate kernel component code template.
void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
Declare all variables used by the component in the vtable.
ClComponentDepthwiseConv2dSettings & export_weights_to_cl_image(bool cl_image)
Set export_weights_to_cl_image flag.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
Window get_window() const override
Generate the execution window for the component.
const ITensorInfo * get_any_dst_tensor() const
Get one of the destination tensors of this group.
An interface used by ClTemplateWriter to write source code for a kernel component.
This is a generic class that packs the arguments of an operator.
GpuKernelArgumentInfo kernel_argument_info
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
Generate the tag look-up table used to instantiate the component code.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override
Generate the build options used in the component.
std::set< std::string > get_headers_list() const override
Generate the header list used in the component.
bool has_valid_id() const
Check if the tensor id is valid.
Component specific settings.
void add_option(std::string option)
Adds option to the existing build option list.
Window collapse(const Window &full_window, size_t first, size_t last=Coordinates::num_max_dimensions) const
Collapse the dimensions between first and last.
ArgumentPack< ITensorInfo > tensors() const
Get tensor arguments.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
size_t total_size() const
Collapses all dimensions to a single linear total size.
ClComponentDepthwiseConv2dSettings & is_fma_available(bool is_fma_available)
Set is_fma_available flag.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
ComponentId id() const
Get component id.
Contains information required to set up a kernel argument at run time.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused-variable warnings.
ClTemplateDepthwiseConv2d(ComponentId id, const ArgumentPack< ITensorInfo > &tensors, const Attributes &attributes, const Settings &settings)
Constructor.
TensorVariable get_variable(const ITensorInfo *tensor) const
Get the TensorVariable associated with tensor.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
int32_t ComponentId
Uniquely identifies a kernel component within a workload.
Describe a multidimensional execution window.
ClComponentDepthwiseConv2dSettings & fast_relaxed_math(bool fast_relaxed_math)
Set fast_relaxed_math flag.
DepthwiseConv2dAttributes & depth_multiplier(const uint32_t &depth_multiplier)
Set depth multiplier.
Copyright (c) 2017-2024 Arm Limited.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
DepthwiseConv2dAttributes & pad(const Padding2D &pad)
Set padding.
A group of GPU kernel components to be fused together. PRECONDITIONS: (list truncated in this summary)
Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
std::string get_config_id() const override
Generate the component config id string used for tuning.
Type
Enumerates all the tensor argument variants used by all kernel implementations.
A table of all the variables used in the kernel.
void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias="unnamed")
Declare a TensorVariable for a corresponding tensor info.
std::string get_name() const override
Generate kernel component name.
DepthwiseConv2dAttributes & dilation(const Size2D &dilation)
Set dilation.
@ Image_3D_Export_To_ClImage2D
DepthwiseConv2dAttributes & stride(const Size2D &stride)
Set stride.