Compute Library
 22.08
ClFusedKernelGraph.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
25 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
26 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
32 #include "support/DeepCopy.h"
33 
34 #include <vector>
35 
36 namespace arm_compute
37 {
38 namespace experimental
39 {
40 namespace dynamic_fusion
41 {
42 struct ClKernelFusionGroup;
43 
44 /** A const view of a subgraph of the @ref ClKernelGraph to be fused together
45  *
46  */
48 {
49 public:
51 
52  ClKernelFusionGroup() = default;
54  : id{ id }, graph{}, fused_kernels{}, tensors{}
55  {
56  }
57  ~ClKernelFusionGroup() = default;
58 
59  void set_id(Id i)
60  {
61  id = i;
62  }
63 
64  Id add_fused_kernel(const ClKernel *kernel)
65  {
66  /// PRE: Acyclicity ensured by DependencyGraph
67  /// PRE: Connectedness ensured by DependencyGraph
68  /// PRE: Single-rootedness ensured by User
69  std::vector<Id> src_tensors;
70  for(const auto t : kernel->tensors().get_const_src_tensors())
71  {
72  auto id = graph.add_tensor(t->id);
73  if(tensors.find(id) == tensors.end())
74  {
75  tensors[id] = t;
76  }
77  src_tensors.push_back(id);
78  }
79  std::vector<Id> dst_tensors;
80  for(const auto t : kernel->tensors().get_const_dst_tensors())
81  {
82  auto id = graph.add_tensor(t->id);
83  if(tensors.find(id) == tensors.end())
84  {
85  tensors[id] = t;
86  }
87  dst_tensors.push_back(id);
88  }
89  auto id = graph.add_operator(src_tensors, dst_tensors);
90  fused_kernels[id.second] = kernel;
91  return id.second;
92  }
93 
94  const ClKernel *get_root_kernel() const
95  {
96  auto root_kernels = graph.get_root_ops();
97  ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
98  return fused_kernels.at(root_kernels.at(0));
99  }
100 
101  std::vector<const ClKernelTensor *> get_src_tensors() const
102  {
103  std::vector<const ClKernelTensor *> src_tensors;
104  for(auto tensor_id : graph.src_tensors())
105  {
106  src_tensors.push_back(tensors.at(tensor_id));
107  }
108  return src_tensors;
109  }
110 
111  std::vector<const ClKernelTensor *> get_dst_tensors() const
112  {
113  std::vector<const ClKernelTensor *> dst_tensors;
114  for(auto tensor_id : graph.dst_tensors())
115  {
116  dst_tensors.push_back(tensors.at(tensor_id));
117  }
118  return dst_tensors;
119  }
120 
121  friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
122  {
123  return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
124  }
125 
    Id              id{};    // Id of this group within the enclosing ClFusedKernelGraph's fg_dependency
    DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
    // NOTE(review): raw pointers look non-owning — kernels/tensors presumably owned by the
    // original ClKernelGraph (this class is documented as a const view); confirm lifetimes.
    std::map<Id, const ClKernel *>       fused_kernels{};
    std::map<Id, const ClKernelTensor *> tensors{};
130 };
131 
132 std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
133 
135 {
136 public:
138 
139  using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
140 
141  ClFusedKernelGraph() = default;
142  ~ClFusedKernelGraph() = default;
143  ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
144  ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
146  ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
147 
    /** Equality compares the underlying kernel graph pointer and the fusion groups only. */
    friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
    {
        /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
        ///       The fusion groups can already guarantee the equivalence of fusion
        /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
        return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
    }
155 
156  Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
157  {
158  auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
159  for(const auto k : fused_kernels)
160  {
161  fg->add_fused_kernel(k);
162  }
163  const auto src_tensors = fg->get_src_tensors();
164  const auto dst_tensors = fg->get_dst_tensors();
165  std::vector<Id> inputs{};
166  std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
167  {
168  return fg_dependency.add_tensor(kernel->id);
169  });
170  std::vector<Id> outputs{};
171  std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
172  {
173  return fg_dependency.add_tensor(kernel->id);
174  });
175  const auto id = fg_dependency.add_operator(inputs, outputs);
176  fg->set_id(id.second);
177  fusion_groups[id.second] = std::move(fg);
178  return id.second;
179  }
180 
    {
        /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
        ClKernelFusionGroup *fg_src{};
        ClKernelFusionGroup *fg_dst{};
        // Find fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        // Re-wire every incoming tensor of fg_dst so that fg_src absorbs fg_dst's inputs
        for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
        {
            if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
            {
                // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src

                // Before:
                // fg_src
                //     |
                //     ..  t1
                //     |   |
                //     -> fg_dst <-
                //
                // After:
                // fg_src <---t1
                //
                const auto st = link_src_tensors(fg_src->id, { t });
                if(!bool(st))
                {
                    return st;
                }
            }
            else
            {
                const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
                if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
                {
                    // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
                    // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))

                    // Before:
                    // fg_src
                    //     |
                    //     t0
                    //     |
                    //     -> fg_dst
                    //
                    // After:
                    // fg_src
                    //
                    const auto st = remove_fg_tensor(t);
                    if(!bool(st))
                    {
                        return st;
                    }
                }
                else
                {
                    // If the tensors ARE in between fg_src and fg_dst
                    // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src

                    // Before:
                    // fg_src
                    //     |
                    //     t0
                    //     |
                    //     |-----------
                    //     |          |
                    //     -> fg_dst  -> fg_other
                    //
                    // After:
                    // fg_src
                    //     |
                    //     t0
                    //     |
                    //     -> fg_other
                    //

                    // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
                    // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
                    // so that any unfused fg's (fg_other in this case) can read it.
                    // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
                    // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
                    // can be termed "dst tensors" to a fusion group
                    void(); // deliberate no-op: nothing to re-wire in this case
                }
            }
        }

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            // Link any outgoing tensors of fg_dst to fg_src

            // Before:
            // fg_src
            //     |
            //     ..
            //     |
            //     -> fg_dst
            //         |
            //         |--------
            //         |       |
            //         |-> t0  |-> t1
            //
            // After:
            // fg_src
            //     |
            //     |--------
            //     |       |
            //     |-> t0  |-> t1
            //
            const auto st = link_dst_tensors(fg_src->id, { t });
            if(!bool(st))
            {
                return st;
            }
        }

        // Merge fg_dst's graph into fg_src's graph
        for(const auto kernel : traverse(*fg_dst))
        {
            fg_src->add_fused_kernel(kernel);
        }

        // Finally drop fg_dst itself from the fused graph; its kernels now live in fg_src
        const auto st = remove_fg(fg_dst->id);
        return st;
    }
    {
        /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
        /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
        ///              Note that this does not apply to fusion groups
        /// ASSUMPTION2: Simple kernels' tile infos can be overridden (share with) that of the root kernel's
        /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
        /// INV0: All Fusion groups have a single root
        /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
        /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
        /// INV3: All non-roots are Simple kernels
        /// INV4: All non-roots' dst tensors have the same shape as that of the root kernel
        /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
        const ClKernelFusionGroup *fg_src{};
        const ClKernelFusionGroup *fg_dst{};

        // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
        // This guarantees INV0
        // This also finds fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        // Find unconnected tensors between fg_src and fg_dst
        // (dst tensors of fg_src that do not feed fg_dst directly)
        std::vector<Id> unconnected_tensors{};
        for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
        {
            if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
            {
                unconnected_tensors.push_back(t);
            }
        }

        // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
        // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
        for(const auto &t : unconnected_tensors)
        {
            if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
            }
        }

        // Check 2: All non-root fgs are simple. Ensure INV3
        if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
        }

        // Check 3: All non-roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
        const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
        ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
        const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            const auto t_info = original_graph->get_tensor(t)->desc;
            if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
            }
        }

        // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
        if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
        }

        // All checks passed: fg0 and fg1 can be fused
        return Status{};
    }
401 
    const ClKernelGraph *original_graph{}; // Non-owning view of the unfused kernel graph
    DependencyGraph      fg_dependency{};  // Dependency graph whose operators are fusion groups
    KernelFusionGroupMap fusion_groups{};  // Owns the fusion groups via deep_unique_ptr
    // Note: no need to store tensor pointers in the ClFusedKernelGraph, as they are stored inside the individual fusion groups.
406 
407 private:
408  Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
409  {
410  for(auto t : src_tensors)
411  {
412  fg_dependency.link_input(fg, t);
413  }
414  return Status{};
415  }
416  Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
417  {
418  for(auto t : dst_tensors)
419  {
420  fg_dependency.link_output(fg, t);
421  }
422  return Status{};
423  }
424  Status remove_fg(Id fg)
425  {
426  fg_dependency.remove_operator(fg);
427  fusion_groups.erase(fg);
428  return Status{};
429  }
430  Status remove_fg_tensor(Id tensor)
431  {
432  fg_dependency.remove_tensor(tensor);
433  return Status{};
434  }
435 };
436 
437 std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
438 std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
439 
440 std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
441 
442 Status fuse(ClFusedKernelGraph &fused_kernel_graph);
443 
444 Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
445 
446 Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
447 
448 } // namespace dynamic_fusion
449 } // namespace experimental
450 } // namespace arm_compute
451 #endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
452 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator Represented as...
Id add_tensor(Id merge_tensor=empty_id())
Add a new tensor.
Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
ITensorDescPack< ClKernelTensor > tensors() const
std::vector< Id > get_root_ops() const
Get all root ops.
Context (device capabilities, platform details) associated with a ClWorkload.
Definition: ClWorkload.h:162
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Status class.
Definition: Error.h:52
std::vector< const ClKernel * > traverse(const ClKernelFusionGroup &group)
Id add_fusion_group(const std::vector< const ClKernel *> &fused_kernels)
Copyright (c) 2017-2022 Arm Limited.
Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
Intermediate representation of the final, complete kernel source.
Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph)
std::vector< Id > src_tensors(Id op) const
Get source tensors to an operator.
bool is_in(const T &v, const std::vector< T > &vec)
Status fuse(ClFusedKernelGraph &fused_kernel_graph)
friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
bool have_different_dimensions(const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)
Definition: Validate.h:47
void end(TokenStream &in, bool &valid)
Definition: MLGOParser.cpp:290
std::vector< Id > dst_tensors(Id op) const
Get destination tensors to an operator.
std::pair< Status, DependencyGraph::Id > add_operator(const std::vector< Id > &inputs, const std::vector< Id > &outputs)
Add a new operator.
A const view of a subgraph of the ClKernelGraph to be fused together.
std::map< Id, utils::memory::deep_unique_ptr< ClKernelFusionGroup > > KernelFusionGroupMap
std::vector< const ClKernelTensor * > get_dst_tensors() const
std::pair< Status, ClFusedKernelGraph > init_fusion_graph(const ClKernelGraph &kernel_graph)
std::vector< const ClKernelTensor * > get_src_tensors() const
Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg)
friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)