40 #include <arm_compute/core/Types.h>
41 #include <arm_compute/runtime/Allocator.h>
54 return std::make_unique<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
61 return std::make_unique<NeonWorkloadFactory>(
62 PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager));
68 return std::make_unique<NeonWorkloadFactory>(
75 auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
80 auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
87 return std::make_unique<NeonWorkloadFactory>(
88 PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager));
94 auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
99 auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
105 return std::make_unique<NeonWorkloadFactory>(
149 auto it = subgraph.
end();
150 std::map<LayerGuid, Layer*> untouched;
152 while (it != subgraph.
begin())
155 Layer& base = *(PolymorphicDowncast<Layer*>(*it));
156 untouched.insert({base.
GetGuid(), &base});
160 while (it != subgraph.
begin())
163 Layer& base = *(PolymorphicDowncast<Layer*>(*it));
175 if (output->GetNumConnections() == 1)
177 for (
auto&& childInput : output->GetConnections())
180 (checkDataTypeInputandOutput(childInput->GetOwningLayer())))
182 Layer& child = childInput->GetOwningLayer();
184 auto* activationLayer = PolymorphicDowncast<ActivationLayer*>(&child);
186 const std::string name = std::string(
"fused-") + child.
GetName() + std::string(
"-into-") +
205 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
214 FuseConvolution2dLayer<Convolution2dLayer>(optimizationViews,
219 untouched.erase(baseLayer->
GetGuid());
220 untouched.erase(activationLayer->GetGuid());
226 PolymorphicDowncast<DepthwiseConvolution2dLayer*>(&base);
237 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
245 FuseDepthwiseConvolution2dLayer<DepthwiseConvolution2dLayer>(optimizationViews,
250 untouched.erase(baseLayer->
GetGuid());
251 untouched.erase(activationLayer->GetGuid());
268 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
276 FuseFullyConnectedLayer<FullyConnectedLayer>(optimizationViews,
281 untouched.erase(baseLayer->
GetGuid());
282 untouched.erase(activationLayer->GetGuid());
288 PolymorphicDowncast<BatchNormalizationLayer*>(&base);
292 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
293 baseLayer->
m_Mean->GetTensorInfo(),
295 baseLayer->
m_Beta->GetTensorInfo(),
296 baseLayer->
m_Gamma->GetTensorInfo(),
303 FuseBatchNormalizationLayer<BatchNormalizationLayer>(optimizationViews,
309 replacementLayer->
m_Beta = std::move(baseLayer->
m_Beta);
311 replacementLayer->
m_Mean = std::move(baseLayer->
m_Mean);
313 untouched.erase(baseLayer->
GetGuid());
314 untouched.erase(activationLayer->GetGuid());
319 AdditionLayer* baseLayer = PolymorphicDowncast<AdditionLayer*>(&base);
324 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
329 FuseAdditionLayer<AdditionLayer>(optimizationViews,
334 untouched.erase(baseLayer->
GetGuid());
335 untouched.erase(activationLayer->GetGuid());
340 DivisionLayer* baseLayer = PolymorphicDowncast<DivisionLayer*>(&base);
345 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
350 FuseDivisionLayer<DivisionLayer>(optimizationViews,
355 untouched.erase(baseLayer->
GetGuid());
356 untouched.erase(activationLayer->GetGuid());
366 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
371 FuseMultiplicationLayer<MultiplicationLayer>(optimizationViews,
376 untouched.erase(baseLayer->
GetGuid());
377 untouched.erase(activationLayer->GetGuid());
382 SubtractionLayer* baseLayer = PolymorphicDowncast<SubtractionLayer*>(&base);
387 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
392 FuseSubtractionLayer<SubtractionLayer>(optimizationViews,
397 untouched.erase(baseLayer->
GetGuid());
398 untouched.erase(activationLayer->GetGuid());
410 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
415 FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
421 untouched.erase(baseLayer->
GetGuid());
422 untouched.erase(activationLayer->GetGuid());
430 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
435 FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
441 untouched.erase(baseLayer->
GetGuid());
442 untouched.erase(activationLayer->GetGuid());
450 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
455 FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
461 untouched.erase(baseLayer->
GetGuid());
462 untouched.erase(activationLayer->GetGuid());
470 activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
475 FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
481 untouched.erase(baseLayer->
GetGuid());
482 untouched.erase(activationLayer->GetGuid());
496 ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
499 if (!reduceDescriptor.
m_vAxis.empty() && reduceDescriptor.
m_vAxis.size() > 1)
502 std::vector<IConnectableLayer*> layers = ChainReduceLayers<ReduceLayer>(optimizationViews,
507 ReplaceLayers<ReduceLayer>(optimizationViews, baseLayer, layers);
508 untouched.erase(baseLayer->
GetGuid());
515 ReshapeLayer* baseLayer = PolymorphicDowncast<ReshapeLayer*>(&base);
526 Layer* layerList[4] = {
nullptr,
nullptr,
nullptr,
nullptr};
529 if (IsLayerSequence<BinaryOperation>(base,
535 bool fuseReLu =
false;
536 unsigned int numInputs = 0;
537 unsigned int numOutputs = 0;
538 std::vector<TensorInfo> inputInfos;
539 std::vector<TensorInfo> outputInfos;
542 if (BuildAddMulAddTensorInfoLists<Layer>(layerList,
547 activationDescriptor,
553 {outputInfos.begin(), outputInfos.end()},
555 activationDescriptor);
558 std::string fusedName;
566 FusedLayer* addMulAddFusedLayer = PolymorphicDowncast<FusedLayer*>(addMulAddLayer);
568 std::make_shared<ActivationDescriptor>(*activationDescriptor));
572 std::vector<IConnectableLayer*> originalLayers;
573 for (
unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
575 if (layerList[layerIdx])
577 originalLayers.push_back(layerList[layerIdx]);
581 std::vector<SlotList> inputLayersSlotLists, outputLayersSlotLists;
582 BuildAddMulAddSlotLists<SlotList>(fuseReLu,
583 outputInfos.size() > 1,
584 inputLayersSlotLists,
585 outputLayersSlotLists);
587 ReplaceMultipleLayers<FusedLayer>(optimizationViews,
589 PolymorphicDowncast<FusedLayer*>(addMulAddLayer),
590 inputLayersSlotLists,
591 outputLayersSlotLists);
594 for (
unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
596 if (layerList[layerIdx])
598 untouched.erase(layerList[layerIdx]->GetGuid());
615 return optimizationViews;
625 auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
630 auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
639 return std::make_unique<DefaultAllocator>();