Arm Compute Library 22.11 — source listing of ClElementwiseKernelComponent.cpp (experimental dynamic fusion, OpenCL elementwise kernel component).
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
25 
27 #include "arm_compute/core/Error.h"
31 
32 namespace arm_compute
33 {
34 namespace experimental
35 {
36 namespace dynamic_fusion
37 {
39 {
40  return ComponentType::Simple;
41 }
42 
44 {
45  return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
46 }
47 
49 {
50  const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
51  const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
52  ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
53 
54  ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
55 
56  const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info);
57  const TensorShape &out_shape = broadcast_pair.first;
58 
59  auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type());
60 
62  // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
63  // This is in line with the collapsing convention used by Conv2d
64  output_shape.collapse(2U, 1U);
65  const unsigned int vector_size_byte_opencl = 16;
66  const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
67  Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
68 
69  return win;
70 }
71 
73 {
74  std::string code;
75  const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
76 
77  if(is_root)
78  {
79  return R"_(
80  //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
81  // IN_0(LHS) {{lhs}}
82  // IN_1(RHS) {{rhs}}
83  // OUT(dst, accum) {{dst}}
84 
85  // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
86  TILE({{DATA_TYPE}}, M0, N0, {{dst}});
87  {
88  TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
89  TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
90 
91  // Since mout maps to dimensions 1 (y) and dimension 2 (z) of the input tensor because of the collapsed window, bout maps to dimension 3 (w)
92  {{lhs}}_offset_first_element_in_bytes += bout * {{lhs}}_stride_w;
93  {{rhs}}_offset_first_element_in_bytes += bout * {{rhs}}_stride_w;
94 
95  T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
96  T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{rhs}}_stride_y, rhs_tile);
97 
98 #if defined(IS_BROADCAST)
99  T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
100 #else // !defined(IS_BROADCAST)
101  T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
102 #endif // defined(IS_BROADCAST)
103 
104  }
105  //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
106 )_";
107  }
108  else
109  {
110  return R"_(
111  //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
112  // IN_0/Out(Accumulator) {{acc}}
113  // IN_1(Addend) {{addend}}
114 
115  // acc = addend + acc (mix-precision, broadcast, boundary aware)
116  {
117  TILE({{DATA_TYPE}}, M0, N0, addend_tile);
118 
119  T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{addend}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{addend}}_stride_y, addend_tile);
120 
121 #if defined(IS_BROADCAST)
122  T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
123 #else // !defined(IS_BROADCAST)
124  T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
125 #endif // defined(IS_BROADCAST)
126  }
127  //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
128 )_";
129  }
130 }
131 
133 {
134  const auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
135  const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
136 
137  CLBuildOptions build_opts{};
138  const auto n0 = _blueprint->impl().get_execution_window().x().step();
139  const auto m0 = _blueprint->impl().get_execution_window().y().step();
140  const unsigned int partial_store_n0 = t_dst_info->dimension(0) % n0;
141  const bool is_broadcast = t_rhs_info->tensor_shape() != t_dst_info->tensor_shape();
142 
143  build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
144  build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
145  build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
146  build_opts.add_option_if(is_broadcast, "-DIS_BROADCAST");
147 
148  return build_opts;
149 }
150 
152 {
153  auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
154  std::string config_id{};
155  config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
156  config_id += "_";
157  config_id += support::cpp11::to_string(t_dst_info->dimension(0));
158  config_id += "_";
159  config_id += support::cpp11::to_string(t_dst_info->dimension(1));
160  config_id += "_";
161  config_id += lower_string(string_from_data_layout(t_dst_info->data_layout()));
162  return config_id;
163 }
164 
166 {
167  const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
168  vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
169  vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs");
170  if(is_root)
171  {
172  vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
173  }
174 }
175 
177 {
178  TagLUT lut{};
179  const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
180  ITensorInfo *t_addend_info = nullptr;
181  // Arguments and global shared variables
182  const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
183  if(is_root)
184  {
185  lut["lhs"] = vtable.get(_lhs);
186  lut["rhs"] = vtable.get(_rhs);
187  lut["dst"] = vtable.get(_dst);
188  t_addend_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
189  }
190  else
191  {
192  // Determine which link is the accumulator
193  Link accumulator;
194  Link addend;
195  if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic)
196  {
197  accumulator = _lhs;
198  addend = _rhs;
199  }
200  else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic)
201  {
202  accumulator = _rhs;
203  addend = _lhs;
204  }
205  else
206  {
207  ARM_COMPUTE_ERROR("Invalid elementwise component linking");
208  }
209  lut["acc"] = vtable.get(accumulator);
210  lut["addend"] = vtable.get(addend);
211  t_addend_info = _blueprint->impl().get_kernel_argument_info(addend.arg_id);
212  }
213  // Local build options
214  lut["meta_kernel_id"] = id();
215  lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type());
216 
217  switch(_desc.eltwise.op)
218  {
220  lut["ELTWISE_OP"] = "DIV";
221  break;
223  lut["ELTWISE_OP"] = "ADD";
224  break;
225  default:
226  ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
227  }
228 
229  // Set broadcast parameters
230  // PRE: All tensors are broadcast-compatible
231  const bool is_broadcast = t_addend_info->tensor_shape() != t_dst_info->tensor_shape();
232  if(is_broadcast)
233  {
234  // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
235  if(t_addend_info->dimension(0) == 1U && t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
236  {
237  lut["rhs_m0"] = "1";
238  lut["rhs_n0"] = "1";
239  lut["rhs_start_y"] = "0";
240  lut["rhs_start_x"] = "0";
241  }
242  else if(t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
243  {
244  lut["rhs_m0"] = "1";
245  lut["rhs_n0"] = "N0";
246  lut["rhs_start_y"] = "0";
247  lut["rhs_start_x"] = "cout";
248  }
249  else
250  {
251  ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions");
252  }
253  }
254  else
255  {
256  lut["rhs_m0"] = "M0";
257  lut["rhs_n0"] = "N0";
258  lut["rhs_start_y"] = "mout";
259  lut["rhs_start_x"] = "cout";
260  }
261  return lut;
262 }
263 } // namespace dynamic_fusion
264 } // namespace experimental
265 } // namespace arm_compute
266 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Shape of a tensor.
Definition: TensorShape.h:39
SharedVar get(const SharedVarLink &var_link) const
Get the SharedVar associated with var_link.
Definition: Common.h:206
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
Store the tensor's metadata.
Definition: ITensorInfo.h:40
std::string lower_string(const std::string &val)
Lower a given string.
Definition: Utils.cpp:353
static std::pair< TensorShape, ValidRegion > broadcast_shape_and_valid_region(const Infos &... infos)
If infos are broadcast-compatible tensor infos, return the broadcasted shape and the intersection of...
Definition: ITensorInfo.h:317
Copyright (c) 2017-2022 Arm Limited.
Describes all the info required to add a kernel argument at run time.
Definition: ClWorkload.h:70
virtual void allocate_shared_vars(SharedVarTable &vtable) const override
Allocate all shared variables used by the component in the vtable.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
Definition: Utils.cpp:135
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
A table of all the variables used in the kernel / blueprint Because we limit the DependencyGraph in t...
Definition: Common.h:92
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
std::string generate_config_id() const override
Generate config id of the component.
unsigned int num_elems_processed_per_iteration
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
Definition: CLHelpers.cpp:39
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
const std::string & string_from_data_layout(DataLayout dl)
Convert a data layout identity into a string.
Definition: Utils.cpp:123
virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override
Get the tag look-up table used to instantiate the component code.
void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name="unnamed")
Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID).
Definition: Common.h:153
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size.
Definition: Utils.h:1222
Describe a multidimensional execution window.
Definition: Window.h:39
void collapse(size_t n, size_t first=0)
Collapse the first n dimensions.
Definition: TensorShape.h:133